In [None]:
!pip install tika
!pip install langchain
!pip install -U langchain-community
!pip install sentence-transformers
!pip install chromadb

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import tika
from tika import parser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# Fetch the sentence-tokenizer data that sent_tokenize relies on.
# Newer NLTK releases (3.9+) load the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so request both to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
def extract_text_from_pdf(pdf_path):
    """Extract the text content of a PDF via Tika.

    Args:
        pdf_path: Path of the file passed to ``parser.from_file``.

    Returns:
        The extracted text as a ``str``. An empty string is returned when
        the parse result carries no content, so callers always get a string.
    """
    tika.initVM()
    parsed = parser.from_file(pdf_path)
    # The 'content' entry may be None (or absent) when no text could be
    # extracted; normalise that to '' so tokenising/splitting gets a str.
    return parsed.get('content') or ''
def perform_semantic_chunking(text, num_clusters=3):
    """Cluster the sentences of ``text`` with K-Means over sentence embeddings.

    Args:
        text: Raw text; it is split into sentences with ``sent_tokenize``.
        num_clusters: Number of K-Means clusters to fit.

    Returns:
        A 4-tuple ``(clusters, embeddings, kmeans, model)``:
        ``clusters`` is a list of ``num_clusters`` lists of sentences,
        ``embeddings`` is the encoder output for all sentences,
        ``kmeans`` is the fitted ``KMeans`` estimator and ``model`` is the
        ``SentenceTransformer`` used for encoding.
    """
    sentence_list = sent_tokenize(text)
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    sentence_vectors = encoder.encode(sentence_list)

    # random_state is fixed so repeated runs yield the same assignment.
    estimator = KMeans(n_clusters=num_clusters, random_state=0)
    estimator.fit(sentence_vectors)

    # One bucket per cluster id; sentences keep their original order.
    grouped = [[] for _ in range(num_clusters)]
    for position, cluster_id in enumerate(estimator.labels_):
        grouped[cluster_id].append(sentence_list[position])

    return grouped, sentence_vectors, estimator, encoder

def visualize_clusters(embeddings, kmeans, num_clusters):
    """Show a 2-D PCA scatter plot of the embeddings, one series per cluster.

    Args:
        embeddings: Sentence embeddings that were clustered.
        kmeans: Fitted estimator whose ``labels_`` assign each embedding
            to a cluster.
        num_clusters: Number of cluster ids (``0 .. num_clusters - 1``)
            to draw.
    """
    # Project to two principal components purely for display.
    projected = PCA(n_components=2).fit_transform(embeddings)
    assignments = kmeans.labels_

    plt.figure(figsize=(10, 7))
    for cluster_id in range(num_clusters):
        members = projected[assignments == cluster_id]
        x_values = members[:, 0]
        y_values = members[:, 1]
        plt.scatter(x_values, y_values, label=f'cluster {cluster_id + 1}')

    plt.legend()
    plt.title('semantic clustering of sentences')
    plt.xlabel('pca component 1')
    plt.ylabel('pca component 2')
    plt.show()

def plot_similarity_scores(query_embeddings, doc_embeddings, query_texts, doc_texts):
    """Show one bar chart per query of its cosine similarity to every chunk.

    Args:
        query_embeddings: Iterable of query embeddings, one per query.
        doc_embeddings: Embeddings of the document chunks.
        query_texts: Query strings, indexed in step with
            ``query_embeddings``; used for the chart titles.
        doc_texts: Chunk texts. Not used by this function; kept in the
            signature for callers that pass it.
    """
    for position, vector in enumerate(query_embeddings):
        # Flatten the similarity result into one score per chunk.
        scores = util.pytorch_cos_sim(vector, doc_embeddings).flatten()
        chunk_positions = range(len(scores))

        plt.figure(figsize=(10, 5))
        plt.bar(chunk_positions, scores)
        plt.title(f'similarity scores for query: "{query_texts[position]}"')
        plt.xlabel('document chunk index')
        plt.ylabel('similarity score')
        plt.show()

def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into documents using ``CharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The documents produced by ``create_documents`` for ``text``.
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
    }
    return CharacterTextSplitter(**splitter_options).create_documents([text])

def create_embeddings_for_chunks(chunks):
    """Embed the text of every chunk with a HuggingFace MiniLM model on CPU.

    Args:
        chunks: Documents exposing a ``page_content`` attribute.

    Returns:
        A 3-tuple ``(chunks, chunk_embeddings, embeddings_model)``: the
        input chunks unchanged, the result of ``embed_documents`` over
        their texts, and the embedding model instance that produced them.
    """
    embedder = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-l6-v2',
        model_kwargs={'device': 'cpu'},
        # Embeddings are left un-normalised.
        encode_kwargs={'normalize_embeddings': False},
    )
    page_texts = [document.page_content for document in chunks]
    vectors = embedder.embed_documents(page_texts)
    return chunks, vectors, embedder

def perform_similarity_search(chunks, chunk_embeddings, queries, model):
    """Print the most similar chunk for each query, then plot all scores.

    Args:
        chunks: Documents exposing a ``page_content`` attribute.
        chunk_embeddings: Embeddings of ``chunks``, in the same order.
        queries: List of query strings.
        model: Encoder whose ``encode`` method turns the list of queries
            into one embedding per query.
    """
    doc_texts = [chunk.page_content for chunk in chunks]

    # Encode every query exactly once. The same embeddings drive both the
    # printed best match and the plots, so the two always agree and no
    # query is run through the model a second time.
    query_embeddings = model.encode(queries)

    for query, query_embedding in zip(queries, query_embeddings):
        similarities = util.pytorch_cos_sim(query_embedding, chunk_embeddings).flatten()
        # argmax yields a 0-dim tensor; convert to a plain int before
        # using it as a list index.
        most_similar_index = int(similarities.argmax())
        most_similar_doc = doc_texts[most_similar_index]
        print(f"\nQuery: {query}\nResult: {most_similar_doc}")

    plot_similarity_scores(query_embeddings, chunk_embeddings, queries, doc_texts)

def main(pdf_path='output.pdf', num_clusters=3, queries=None):
    """Run the full demo: extract, cluster, visualise, chunk and search.

    Args:
        pdf_path: PDF file to process. Defaults to ``'output.pdf'``.
        num_clusters: Number of sentence clusters to form and plot.
        queries: Query strings for the similarity search. When ``None``,
            two built-in example queries are used.
    """
    if queries is None:
        # Built inside the function to avoid a shared mutable default.
        queries = ["What is CNN vs ANN?", "What is an apple?"]

    pdf_text = extract_text_from_pdf(pdf_path)

    # Sentence-level clustering and its visualisation.
    clusters, embeddings, kmeans, model = perform_semantic_chunking(pdf_text, num_clusters)
    for i, cluster in enumerate(clusters):
        print(f"\nCluster{i+1}:")
        for sentence in cluster:
            print(f"-{sentence}")
    visualize_clusters(embeddings, kmeans, num_clusters)

    # Fixed-size chunking followed by query-to-chunk similarity search.
    # Queries are encoded with the sentence model returned above; the
    # embedding model returned alongside the chunks is not needed here.
    chunks = split_text_into_chunks(pdf_text)
    chunks, chunk_embeddings, _ = create_embeddings_for_chunks(chunks)
    perform_similarity_search(chunks, chunk_embeddings, queries, model)

# Run the demo only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()
