In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from langchain_community.document_loaders import PyPDFLoader
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.models import Word2Vec
from collections import Counter
from nltk import ngrams
from sklearn.manifold import TSNE

In [None]:
# Fetch the NLTK resources used by the cells below: tokenizer models for
# word_tokenize, the stop-word lists, and WordNet for the lemmatizer.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# on older releases that do not know this package the call just reports it
# and returns False, so requesting both keeps the notebook runnable on either.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def clean_text(text):
    """Normalise raw text into a space-separated string of lemmatised content words.

    Steps, in order: lower-case; blank out HTML-like tags; remove http(s)
    URLs; replace ASCII punctuation with spaces; collapse whitespace;
    tokenise; drop English stop words; lemmatise the remaining tokens.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    text = text.lower()
    # Tags become a space (not nothing) so "one<br>two" stays two words.
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'http[s]?://\S+', '', text)
    # Map every punctuation character to a space instead of deleting it.
    # Deleting fused neighbouring words ("well-known" -> "wellknown",
    # "end.next" -> "endnext") and turned contractions into tokens such as
    # "dont" that are not in the stop-word list; splitting yields "don" + "t",
    # both of which the stop-word filter below removes.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    # Collapse newlines, tabs and the spaces introduced above.
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(kept)


In [None]:
class DocumentProcessor:
    """Load a PDF and expose its text before and after cleaning."""

    def __init__(self,doc_path:str):
        """Create a PyPDFLoader for the PDF at ``doc_path``."""
        self.doc_path=doc_path
        self.loader=PyPDFLoader(self.doc_path)

    def process_document(self):
        """Read the PDF and return its raw and cleaned text.

        Returns:
            A ``(full_text, cleaned_text)`` tuple: the page contents joined
            with single spaces, and that same text passed through
            ``clean_text``.
        """
        # load() returns the documents as extracted (one per page for this
        # loader). load_and_split() would additionally run LangChain's default
        # text splitter, whose chunks overlap, so joining them back together
        # repeats text at every chunk boundary and inflates term counts.
        document_pages=self.loader.load()
        raw_pages=[doc.page_content for doc in document_pages]
        full_text=' '.join(raw_pages)
        cleaned_text=clean_text(full_text)
        return full_text, cleaned_text

In [None]:
def compute_cosine_similarity(texts_before,texts_after):
    """Cosine similarity between each text and its counterpart, in TF-IDF space.

    Args:
        texts_before: Iterable of strings.
        texts_after: Iterable of strings, aligned one-to-one with
            ``texts_before``.

    Returns:
        1-D numpy array whose i-th entry is the cosine similarity between
        ``texts_before[i]`` and ``texts_after[i]`` (empty array for empty
        input).

    Raises:
        ValueError: If the two collections differ in length.
    """
    texts_before = list(texts_before)
    texts_after = list(texts_after)
    if len(texts_before) != len(texts_after):
        # np.diag on a rectangular matrix would silently drop the extras.
        raise ValueError(
            "texts_before and texts_after must have the same length, got "
            f"{len(texts_before)} and {len(texts_after)}"
        )
    if not texts_before:
        return np.array([], dtype=float)

    # Fit on both collections: a vocabulary learned from texts_before alone
    # ignores every term that only occurs in texts_after, which hides real
    # differences and overstates the similarity.
    vectorizer = TfidfVectorizer(norm='l2')
    vectorizer.fit(texts_before + texts_after)
    tfidf_before = vectorizer.transform(texts_before)
    tfidf_after = vectorizer.transform(texts_after)

    # Rows are L2-normalised, so the row-wise dot product is the cosine
    # similarity of each aligned pair; this avoids building the full
    # n x n similarity matrix just to read its diagonal.
    pairwise = tfidf_before.multiply(tfidf_after).sum(axis=1)
    return np.asarray(pairwise, dtype=float).ravel()


In [None]:
def generate_word_clouds(text_before, text_after):
    """Show word clouds of the two texts side by side in one figure.

    Args:
        text_before: Text rendered in the left panel.
        text_after: Text rendered in the right panel.
    """
    # Build both clouds first, then lay them out left-to-right.
    clouds = [
        WordCloud(background_color='white').generate(text_before),
        WordCloud(background_color='white').generate(text_after),
    ]
    captions = ['before Cleaning', 'after cleaning']

    plt.figure(figsize=(12, 6))
    for position, (cloud, caption) in enumerate(zip(clouds, captions), start=1):
        plt.subplot(1, 2, position)
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(caption)
        plt.axis('off')
    plt.show()

In [None]:
def evaluate_topics(texts, num_topics=5, passes=20, chunksize=50):
    """Fit an LDA topic model on ``texts`` and score it with c_v coherence.

    Args:
        texts: Iterable of strings; each is lower-cased and tokenised.
        num_topics: Number of LDA topics.
        passes: Number of training passes, forwarded to LdaModel.
        chunksize: Training chunk size, forwarded to LdaModel.

    Returns:
        A ``(coherence, lda)`` tuple: the c_v coherence score and the
        fitted LdaModel.
    """
    token_lists = []
    for raw in texts:
        token_lists.append(word_tokenize(raw.lower()))

    vocabulary = Dictionary(token_lists)
    bow_corpus = list(map(vocabulary.doc2bow, token_lists))

    topic_model = LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=vocabulary,
        passes=passes,
        chunksize=chunksize,
        random_state=42,
        update_every=1,
    )
    scorer = CoherenceModel(
        model=topic_model,
        texts=token_lists,
        dictionary=vocabulary,
        coherence='c_v',
    )
    return scorer.get_coherence(), topic_model

In [None]:
def plot_term_frequency(text,title="term frequency",num_terms=30):
    """Bar-plot the most frequent whitespace-separated terms of ``text``.

    Args:
        text: Input string; terms are obtained with ``str.split()``.
        title: Figure title.
        num_terms: Maximum number of terms to show, most frequent first.

    Returns:
        None. Shows the figure, or prints a notice and skips plotting when
        there is nothing to plot.
    """
    top_terms = Counter(text.split()).most_common(num_terms)
    if not top_terms:
        # Unpacking zip(*[]) into two names raises ValueError, so bail out
        # before that with a message that says what actually happened.
        print(f"{title}: no terms to plot")
        return
    labels, values = zip(*top_terms)
    indexes = range(len(labels))
    width = 1
    plt.figure(figsize=(10,5))
    plt.bar(indexes, values, width)
    plt.xticks(indexes, labels, rotation='vertical')
    plt.title(title)
    plt.show()

In [None]:
def plot_ngram_frequency(text,n=2,num_ngrams=20,title="N-gram frequency"):
    """Bar-plot the most frequent word n-grams of ``text``.

    Args:
        text: Input string; tokenised with ``word_tokenize``.
        n: Size of the n-grams (2 = bigrams).
        num_ngrams: Maximum number of n-grams to show, most frequent first.
        title: Figure title.

    Returns:
        None. Shows the figure, or prints a notice and skips plotting when
        the text yields no n-grams.
    """
    words = word_tokenize(text)
    top_ngrams = Counter(ngrams(words, n)).most_common(num_ngrams)
    if not top_ngrams:
        # Happens when the text has fewer than n tokens; unpacking zip(*[])
        # below would otherwise fail with an opaque ValueError.
        print(f"{title}: no {n}-grams to plot ({len(words)} tokens in text)")
        return
    grams, values = zip(*top_ngrams)
    labels = [' '.join(gram) for gram in grams]
    indexes = range(len(labels))
    width = 1
    plt.figure(figsize=(12,6))
    plt.bar(indexes, values, width)
    plt.xticks(indexes, labels, rotation='vertical')
    plt.title(title)
    plt.show()

In [None]:
def visualize_embeddings(text, title="word embedding visualization",annotate_top_n=20):
    """Train Word2Vec on ``text`` and scatter-plot a 2-D t-SNE of its word vectors.

    Args:
        text: Input string; tokenised with ``word_tokenize``.
        title: Figure title.
        annotate_top_n: Number of most frequent words to label on the plot.

    Returns:
        None. Shows the figure, or prints a notice and skips plotting when
        the text has fewer than three distinct tokens.
    """
    import inspect  # local: only used for the scikit-learn compatibility check

    tokens = word_tokenize(text)
    distinct = len(set(tokens))
    if distinct < 3:
        # Too few points to lay out; Word2Vec / t-SNE would raise instead.
        print(f"{title}: need at least 3 distinct words, got {distinct}")
        return

    # gensim trains on at most 10,000 tokens of any single sentence, so a
    # whole document passed as one sentence is cropped and later words keep
    # their random initial vectors. Feed it in sentence-sized slices instead.
    max_sentence_len = 10_000
    sentences = [
        tokens[start:start + max_sentence_len]
        for start in range(0, len(tokens), max_sentence_len)
    ]
    model = Word2Vec(sentences, min_count=1, vector_size=50, window=3, workers=4)

    # Order words by corpus frequency so the first annotate_top_n are the
    # most frequent ones.
    word_frequencies = sorted(
        ((word, model.wv.get_vecattr(word, "count")) for word in model.wv.key_to_index),
        key=lambda item: item[1],
        reverse=True,
    )
    labels = [word for word, _ in word_frequencies]
    vectors = np.array([model.wv[word] for word in labels])

    # t-SNE requires perplexity < number of samples.
    perplexity = min(5, len(labels) - 1)
    # The iteration-count argument is called max_iter in newer scikit-learn
    # releases and n_iter in older ones; use whichever this version accepts.
    iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
    tsne_model = TSNE(
        perplexity=perplexity,
        n_components=2,
        init='pca',
        random_state=23,
        **{iter_kwarg: 2500},
    )
    new_values = tsne_model.fit_transform(vectors)

    plt.figure(figsize=(10,10))
    # One scatter call for all points instead of one call per word.
    plt.scatter(new_values[:, 0], new_values[:, 1])
    for i in range(min(annotate_top_n, len(labels))):
        plt.annotate(
            labels[i],
            xy=(new_values[i, 0], new_values[i, 1]),
            xytext=(5,2),
            textcoords='offset points',
            ha='right',
            va='bottom',
        )
    plt.title(title)
    plt.show()

In [None]:
# Load the PDF, keep both the raw and the cleaned text for the cells below,
# and report how similar the two are in TF-IDF space.
processor = DocumentProcessor(doc_path="doc.pdf")
text_before_cleaning, text_after_cleaning = processor.process_document()
similarity_scores = compute_cosine_similarity(
    texts_before=[text_before_cleaning],
    texts_after=[text_after_cleaning],
)
print("cosine similarity scores:", similarity_scores)


In [None]:
# Side-by-side word clouds of the raw and the cleaned text.
generate_word_clouds(
    text_before=text_before_cleaning,
    text_after=text_after_cleaning,
)

In [None]:
# Fit one LDA model per text variant and compare their c_v coherence.
coherence_before, lda_before = evaluate_topics(
    texts=[text_before_cleaning],
    passes=20,
    chunksize=50,
)
coherence_after, lda_after = evaluate_topics(
    texts=[text_after_cleaning],
    passes=20,
    chunksize=50,
)
print(f'coherence - before cleaning: {coherence_before}')
print(f'coherence - after cleaning: {coherence_after}')

In [None]:
# Most frequent terms, raw text first and cleaned text second.
plot_term_frequency(
    text_before_cleaning,
    title="term frequency before cleaning",
)
plot_term_frequency(
    text_after_cleaning,
    title="term frequency after cleaning",
)


In [None]:
# Most frequent bigrams, raw text first and cleaned text second.
plot_ngram_frequency(
    text=text_before_cleaning,
    n=2,
    title="bi-gram frequency before cleaning",
)
plot_ngram_frequency(
    text=text_after_cleaning,
    n=2,
    title="bi-gram frequency after cleaning",
)

In [None]:
# 2-D projections of the word embeddings, raw text first and cleaned text second.
visualize_embeddings(
    text_before_cleaning,
    title="embedding space before cleaning",
)
visualize_embeddings(
    text_after_cleaning,
    title="embedding space after cleaning",
)