In [47]:
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.chunk import RegexpParser
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from google.colab import files
import io

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    text = ""
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

# Function to perform overlapping chunking on text
def chunk_text(text, num_chunks=20):
    sentences = sent_tokenize(text)  # Split the text into sentences
    total_sentences = len(sentences)
    if num_chunks > total_sentences:
        num_chunks = total_sentences  # Reduce num_chunks if there are fewer sentences than chunks

    chunk_size = max(1, total_sentences // num_chunks)  # Ensure chunk_size is at least 1
    overlap = chunk_size // 2  # Define the overlap size

    # Ensure the step size (chunk_size - overlap) is at least 1
    step_size = max(1, chunk_size - overlap)

    chunks = []
    for i in range(0, total_sentences, step_size):
        chunk = " ".join(sentences[i:i + chunk_size])
        chunks.append(chunk)

    print(f"Total chunks created: {len(chunks)}")
    return chunks


# Train Word2Vec model
def train_word2vec(processed_sentences):
    model = Word2Vec(sentences=processed_sentences, vector_size=100, window=5, min_count=1, workers=4)
    return model

# Get sentence embeddings by averaging word vectors
def get_sentence_embedding(sentence, model):
    words = word_tokenize(sentence)
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Match input sentences with PDF sentences
def match_sentences(input_sentences, pdf_sentences, model):
    matched_sentences = []
    # Combine all chunks into a single string to create a context for extraction
    pdf_text = " ".join(pdf_sentences)
    pdf_words = word_tokenize(pdf_text)  # Tokenize the entire PDF text for precise indexing

    for input_sentence in input_sentences:
        input_embedding = get_sentence_embedding(input_sentence, model)
        similarities = []
        for pdf_sentence in pdf_sentences:
            pdf_embedding = get_sentence_embedding(pdf_sentence, model)
            similarity = cosine_similarity([input_embedding], [pdf_embedding])[0][0]
            similarities.append((pdf_sentence, similarity))

        # Find the most similar sentence
        best_match = max(similarities, key=lambda x: x[1])
        matched_sentence = best_match[0]

        # Locate the start of the matched sentence within the combined PDF text
        match_start_index = pdf_text.find(matched_sentence)
        if match_start_index == -1:
            continue  # Skip if the matched sentence is not found due to some error

        # Find word index for the start and end of the matched sentence in the overall PDF words list
        words_before_match = word_tokenize(pdf_text[:match_start_index])
        start_index = len(words_before_match)
        end_index = start_index + len(word_tokenize(matched_sentence))

        # Calculate the start and end indices to get roughly 50 words before and after the matched sentence
        context_start_index = max(0, start_index - 50)
        context_end_index = min(len(pdf_words), end_index + 50)

        # Extract the context words and form the context string
        context_words = pdf_words[context_start_index:context_end_index]
        context = ' '.join(context_words)

        matched_sentences.append((input_sentence, context, best_match[1]))

    return matched_sentences


def main():
    print("Please upload your PDF file:")
    uploaded = files.upload()
    pdf_file = next(iter(uploaded.values()))
    pdf_text = extract_text_from_pdf(io.BytesIO(pdf_file))

    # Preprocess and chunk the text
    pdf_sentences = chunk_text(pdf_text)  # Using the revised chunking method

    # Train Word2Vec model on the chunks
    model = train_word2vec(pdf_sentences)

    print("Enter your sentences separated by a newline. Type 'end' to finish:")
    input_sentences = []
    while True:
        line = input()
        if line.lower() == 'end':
            break
        input_sentences.append(line)

    # Match input sentences with PDF sentences
    results = match_sentences(input_sentences, pdf_sentences, model)

    # Display results
    print("\nMatched Sentences:")
    for input_sentence, context, similarity in results:
        print(f"Input: {input_sentence}")
        print(f"Context: {context}")
        print(f"Similarity: {similarity:.2f}")
        print("-" * 50)

if __name__ == "__main__":
    main()


Please upload your PDF file:


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Saving sample_doc(chunking).pdf to sample_doc(chunking) (22).pdf




Total chunks created: 35
Enter your sentences separated by a newline. Type 'end' to finish:
U.S. Government Bonds have not historically had credit-related defaults
end

Matched Sentences:
Input: U.S. Government Bonds have not historically had credit-related defaults
Context: Important Disclosures – Stone Ridge Longevity and Term Income ETFs The information in the preliminary prospectuses ( as filed with the Securities and Exchange Commission ) for the Stone Ridge Term Income ETFs ( as defined below ) is not complete and will change . The securities described herein for such funds may not be sold u ntil the registration statements become effective . This is not an offer to sell or the solicitation of an offer to buy securities and is not soliciting an offer to buy these securities in any state in which the offer , solicitation or sale would be unlawful . This is not an offer to sell or the solicitation of an offer to buy securities and is not soliciting an offer to buy these securities 