In [2]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this module relies on: the Punkt tokenizer
# models, the stop-word lists, and the averaged-perceptron POS tagger.
for _nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

def preprocess_text(text):
    """Split *text* into sentences and normalise the words of each one.

    Each sentence is word-tokenized. A token is kept only if it is purely
    alphanumeric and its lowercase form is not an English stop word; kept
    tokens are lowercased and Porter-stemmed.

    Returns a list holding one string per sentence, made of the surviving
    stems joined by single spaces (an empty string when nothing survives).
    """
    raw_sentences = sent_tokenize(text)

    english_stops = set(stopwords.words("english"))
    porter = PorterStemmer()

    def _normalise(sentence):
        # Reduce one sentence to its stemmed, stop-word-free content words.
        kept = []
        for token in nltk.word_tokenize(sentence):
            lowered = token.lower()
            if token.isalnum() and lowered not in english_stops:
                kept.append(porter.stem(lowered))
        return " ".join(kept)

    return [_normalise(sentence) for sentence in raw_sentences]

def generate_summary(document, target_length):
    """Build a summary from the leading preprocessed sentences of *document*.

    The document is run through ``preprocess_text`` and its sentences are
    taken in order for as long as the running word count stays at or below
    *target_length*. Selection stops at the first sentence that would push
    the count over the limit; later sentences are not considered.

    Returns the selected sentences joined by single spaces (possibly "").
    """
    picked = []
    words_used = 0

    for candidate in preprocess_text(document):
        candidate_size = len(candidate.split())

        # Guard clause: the first sentence that does not fit ends the summary.
        if words_used + candidate_size > target_length:
            break

        picked.append(candidate)
        words_used += candidate_size

    return " ".join(picked)

def generate_query(summary):
    """Derive a keyword query from *summary*.

    The summary is word-tokenized and part-of-speech tagged; every token
    whose tag begins with ``NN`` (nouns) or ``JJ`` (adjectives) is kept, in
    its original order.

    Returns the kept tokens joined by single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(summary))

    # str.startswith accepts a tuple, so one call covers both tag families.
    wanted_prefixes = ('NN', 'JJ')

    return " ".join(
        token for token, tag in tagged_tokens if tag.startswith(wanted_prefixes)
    )

def hierarchical_summarization_with_query(first_doc, second_doc):
    """Summarize *second_doc* within a word budget and derive a query from it.

    A fixed budget of 128 words is split between the two documents in
    proportion to their whitespace-delimited word counts. Only the share
    assigned to *second_doc* is spent here: *first_doc* contributes its
    length to the split but its text is not summarized by this function.

    *second_doc* is consumed in windows of words. Each window is summarized
    with ``generate_summary`` and the pieces are collated until the budget is
    used up or the document is exhausted. The collated text is passed through
    ``generate_summary`` once more, written to ``final_summary.txt`` in the
    current working directory, and fed to ``generate_query``.

    Returns:
        A ``(final_summary, generated_query)`` tuple of strings.
    """
    # Total budget, measured in words.
    context_window = 128

    # Measure the length of the two documents
    len_first_doc = len(first_doc.split())
    len_second_doc = len(second_doc.split())
    total_len = len_first_doc + len_second_doc

    # Compute target lengths proportionally. Two empty documents would
    # otherwise divide by zero, so the first share defaults to 0 in that case.
    if total_len:
        target_len_first_doc = int(len_first_doc / total_len * context_window)
    else:
        target_len_first_doc = 0
    target_len_second_doc = context_window - target_len_first_doc

    # Work on a list of words so that slicing uses the same unit (words) as
    # the budget; slicing the raw string would cut by characters instead.
    remaining_words = second_doc.split()
    summary_parts = []
    collated_length = 0

    # Strict "<" guarantees a positive budget, so every pass consumes at
    # least one word of the document and the loop always terminates.
    while remaining_words and collated_length < target_len_second_doc:
        budget = target_len_second_doc - collated_length

        # Take the next window of words and drop it from the pending text
        sliced_words = remaining_words[:budget]
        remaining_words = remaining_words[budget:]

        # Cap the partial summary at the remaining budget so the collated
        # summary never grows past the target for the second document.
        summary = generate_summary(" ".join(sliced_words), budget)

        if summary:
            summary_parts.append(summary)
            collated_length += len(summary.split())

    # Join with spaces so words of neighbouring partial summaries stay apart
    collated_summary = " ".join(summary_parts)

    # Final shrinking pass against the full word budget
    final_summary = generate_summary(collated_summary, context_window)

    # Save the document; an explicit encoding keeps non-ASCII words writable
    # regardless of the platform's default code page.
    with open("final_summary.txt", "w", encoding="utf-8") as file:
        file.write(final_summary)

    # Generate a query from the final summary
    generated_query = generate_query(final_summary)

    return final_summary, generated_query

# Input prompts: each document is read with a single input() call.
# The first document only influences how the word budget is split; the text
# that gets summarized is the second document.
user_first_doc = input("Enter your first document text: ")
user_second_doc = input("Enter your second document text: ")

# Runs the whole pipeline; as a side effect the call writes the summary to
# 'final_summary.txt' in the current working directory.
result_summary, generated_query = hierarchical_summarization_with_query(user_first_doc, user_second_doc)

# Echo the summary and the query derived from it.
print("\nFinal Summary saved to 'final_summary.txt'.\n", result_summary)
print("\nGenerated Query:", generated_query)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leuls\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leuls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\leuls\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Enter your first document text: This repository encompasses a Python implementation for generating document summaries and queries. Leveraging NLTK for basic natural language processing tasks and spaCy for named entity recognition, the code allows users to input text and a style reference. The operational pipeline involves tokenization, stemming, and frequency-based summarization. The resulting summary is collated and analyzed for named entities and sentiment, forming a query. The system is adaptable for diverse use cases, providing a simple yet functional approach to text summarization and query generation.Large Language Models (LLMs) have become integral for various natural language processing tasks. These models, such as GPT-3.5, are capable of understanding and generating human-like text. They are trained on vast amounts of diverse data to capture intricate language patterns and nuances. In this context, their applications range from chatbots and language translation to code generat