In [12]:
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import os

In [26]:
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are concatenated in document order. A newline is inserted between
    two pages whenever the earlier one does not already end with one, so the
    last line of a page is never fused with the first line of the next page.

    Args:
        pdf_file: Passed straight to ``PdfReader`` and echoed in the warnings.

    Returns:
        str: The combined page text, or ``''`` when no page yields any text.
        A warning is printed for every page without text and once more when
        the whole document produced no text.
    """
    page_texts = []
    pdf_reader = PdfReader(pdf_file)
    for i, page in enumerate(pdf_reader.pages):
        content = page.extract_text()
        if not content:
            print(f"Warning: No content extracted from page {i} of {pdf_file}")
            continue
        # Keep page boundaries as line boundaries without creating blank lines.
        if page_texts and not page_texts[-1].endswith('\n'):
            page_texts.append('\n')
        page_texts.append(content)

    raw_text = ''.join(page_texts)
    if not raw_text:
        print(f"Warning: No text extracted from {pdf_file}. Skipping this file.")
    return raw_text

In [27]:
def create_embeddings(text, model):
    """Split ``text`` into lines and embed every non-blank line.

    Args:
        text: String that is split on ``'\\n'``.
        model: Object exposing ``encode(list_of_str, convert_to_tensor=True)``.

    Returns:
        tuple: ``(sentences, embeddings)`` where ``sentences`` is the list of
        lines handed to ``model.encode`` and ``embeddings`` is whatever that
        call returned for them, so the two stay aligned by position.
    """
    lines = text.split('\n')
    # Blank / whitespace-only lines carry no content; embedding them only adds
    # meaningless entries to the index. If every line is blank, fall back to
    # the unfiltered list so the result is the same as before for that input.
    sentences = [line for line in lines if line.strip()] or lines
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentences, embeddings

In [28]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer; it is the `model` passed to create_embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [29]:
# Directory where the FAISS index and the extracted sentences are saved.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
output_directory = r"C:\Users\harsh\Desktop\Gen AI Projects\ChatPDF\Performance Measure\Sample DB"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(output_directory, exist_ok=True)

In [30]:
# Path of the sample PDF document.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
pdf_file = r"C:\Users\harsh\Downloads\Sample PDF.pdf"

In [37]:
# Extracting and Embedding contents from PDF
# The accumulators are reset here so re-running this cell does not duplicate entries.
all_sentences = []
all_embeddings = []

# Use the `pdf_file` path defined earlier in the notebook.
text = extract_text_from_pdf(pdf_file)
sentences, embeddings = create_embeddings(text, model)
# Keep sentences and embedding batches in the same order so row i of the
# stacked embeddings corresponds to all_sentences[i].
all_sentences.extend(sentences)
all_embeddings.append(embeddings)

In [38]:
#print(all_sentences[10:200])

In [39]:
# Stack the collected embedding batches into one 2-D matrix (one row per sentence).
# Batches may be torch tensors (possibly on a GPU), which NumPy cannot stack
# directly, so each one is moved to CPU and converted first.
embedding_batches = [
    batch.detach().cpu().numpy() if hasattr(batch, "detach") else np.asarray(batch)
    for batch in all_embeddings
]
# FAISS expects a C-contiguous float32 matrix.
all_embeddings = np.ascontiguousarray(np.vstack(embedding_batches), dtype=np.float32)

# Creating FAISS index
embedding_dim = all_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(all_embeddings)

In [40]:
# Persist the FAISS index next to the sentences it was built from.
faiss.write_index(index, os.path.join(output_directory, 'vector_store.index'))
# Write explicitly as UTF-8: the file is read back with encoding='utf-8', and the
# platform default encoding may differ or be unable to encode some characters.
# One sentence per line, in index order, so line i matches vector i.
with open(os.path.join(output_directory, 'sentences.txt'), 'w', encoding='utf-8') as f:
    for sentence in all_sentences:
        f.write(f"{sentence}\n")

In [2]:
# Loading the saved DB and Content

import faiss
import os
from sentence_transformers import SentenceTransformer
import re

# Folder holding the previously saved vector store and sentence file.
output_directory = r"C:\Users\harsh\Desktop\Gen AI Projects\ChatPDF\Performance Measure\Sample DB"

# Restore the FAISS index from disk.
index_path = os.path.join(output_directory, 'vector_store.index')
index = faiss.read_index(index_path)

In [4]:
# Load the same 'all-MiniLM-L6-v2' checkpoint name used when the index was built,
# so query embeddings come from the same model as the stored ones.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
# Read the stored sentences back, one list entry per line (line endings are kept).
sentences_path = os.path.join(output_directory, 'sentences.txt')
with open(sentences_path, 'r', encoding='utf-8') as f:
    sentences = list(f)

In [6]:
def search_similar_sentences(query, model, index, sentences, top_k=5):
    """Return the stored sentences closest to ``query`` in the vector index.

    Args:
        query: Question text to embed.
        model: Object whose ``encode([query], convert_to_tensor=True)`` returns
            a tensor supporting ``.cpu().detach().numpy()``.
        index: Object whose ``search(vectors, k)`` returns ``(distances, ids)``
            with one row per query vector.
        sentences: Sequence indexed by the ids returned from ``index.search``.
        top_k: Maximum number of hits to request.

    Returns:
        list[tuple]: ``(sentence, score)`` pairs in the order returned by the
        index; fewer than ``top_k`` pairs when the index has fewer entries.
        ``score`` is ``1 - distance``. For the ``IndexFlatL2`` index built in
        this notebook the distance is a squared L2 distance, so the score is a
        relative closeness value (higher is closer), not a true cosine
        similarity.
    """
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = query_embedding.cpu().detach().numpy()

    # Searching in Faiss index
    distances, ids = index.search(query_embedding_np, top_k)

    # Retrieving similar sentences
    results = []
    for distance, sentence_id in zip(distances[0], ids[0]):
        sentence_id = int(sentence_id)
        # FAISS pads the result with id -1 when it finds fewer than top_k
        # neighbours; indexing with -1 would silently return the last sentence.
        if sentence_id < 0:
            continue
        similar_sentence = sentences[sentence_id].strip()
        results.append((similar_sentence, 1 - distance))

    return results


In [7]:
# Function to ask questions and see similarity search result
def ask_multiple_questions(model, index, sentences, top_k=5):
    """Prompt for questions in a loop and print the closest stored sentences.

    Each entered question is passed to ``search_similar_sentences`` and the
    returned ``(sentence, score)`` pairs are printed, numbered from 1. The
    loop ends when the stripped input equals ``exit`` (case-insensitive).
    Nothing is returned.
    """
    prompt = "Enter your question (type 'exit' to quit): "
    query = input(prompt).strip()
    while query.lower() != 'exit':
        matches = search_similar_sentences(query, model, index, sentences, top_k=top_k)
        print(f"Query: {query}")
        for rank, match in enumerate(matches, start=1):
            sentence, score = match
            print(f"Similar Sentence {rank}: {sentence} (Similarity Score: {score:.4f})")
        # Blank line between consecutive questions.
        print()
        query = input(prompt).strip()

In [None]:
# Start the interactive question loop. ask_multiple_questions has no return
# statement, so `Question` is always None once the loop exits.
Question = ask_multiple_questions(model, index, sentences)

Enter your question (type 'exit' to quit):  Who was Tom's Friend?


Query: Who was Tom's Friend?
Similar Sentence 1: “No,” Tom said, “I can’t be your friend, because the (Similarity Score: 0.4959)
Similar Sentence 2: Tom walked to his friend Joe Harper’s house and played (Similarity Score: 0.3403)
Similar Sentence 3: Tom Sawyer (Similarity Score: 0.3296)
Similar Sentence 4: Then Tom went to school, but he was late. The teacher (Similarity Score: 0.3024)
Similar Sentence 5: Tom Sawyer lived with his aunt because his mother and (Similarity Score: 0.2923)



Enter your question (type 'exit' to quit):  At twelve o clock where was Tom?


Query: At twelve o clock where was Tom?
Similar Sentence 1: That night Tom went to bed at nine o’clock, but he (Similarity Score: 0.4182)
Similar Sentence 2: One Saturday afternoon Tom wanted to have an adventure (Similarity Score: 0.3186)
Similar Sentence 3: Then Tom went to school, but he was late. The teacher (Similarity Score: 0.2898)
Similar Sentence 4: Monday morning, Tom went to school. The children wanted to hear about his adventure, and Tom liked (Similarity Score: 0.2793)
Similar Sentence 5: Tom went out of the cave. Chapter 11    In the Cave Again (Similarity Score: 0.2598)



Enter your question (type 'exit' to quit):  Who was angry with Tom?


Query: Who was angry with Tom?
Similar Sentence 1: Saturday morning, Tom was not happy, but he started to (Similarity Score: 0.3717)
Similar Sentence 2: a    is Aunt Polly angry with Tom? (Similarity Score: 0.3690)
Similar Sentence 3: The teacher was angry again. “Tom Sawyer, stop (Similarity Score: 0.3632)
Similar Sentence 4: But Becky was angry with Tom. She walked away and (Similarity Score: 0.3224)
Similar Sentence 5: didn’t answer. Tom was unhappy. He didn’t go to school in the afternoon. (Similarity Score: 0.3120)

