<a href="https://colab.research.google.com/github/NargesSamaeii/NLP_Assignment/blob/main/NLPAssignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np

# Fetch the NLTK resources used below: the stop-word corpus and the
# 'punkt' tokenizer data.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

def read_text(file_path):
    """Return the full contents of the UTF-8 text file at *file_path*."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def tokenize_sentences(text):
    """Split *text* into sentences with NLTK's ``sent_tokenize``."""
    sentence_list = sent_tokenize(text)
    return sentence_list

def tokenize_words(sentence):
    """Split *sentence* into word tokens with NLTK's ``word_tokenize``."""
    token_list = word_tokenize(sentence)
    return token_list

def _content_words(tokens, stopwords):
    """Return the set of lower-cased alphanumeric tokens that are not stop words."""
    return {
        token.lower()
        for token in tokens
        if token.isalnum() and token.lower() not in stopwords
    }


def sentence_similarity(sent1, sent2, stopwords):
    """Return the cosine similarity of two tokenized sentences.

    Each sentence is reduced to the set of its lower-cased alphanumeric
    tokens that are not stop words. The score is the cosine of the two
    binary bag-of-words vectors, which for sets ``A`` and ``B`` equals
    ``|A & B| / sqrt(|A| * |B|)``.

    Args:
        sent1: Iterable of word tokens of the first sentence. Pass tokens,
            not a raw string: iterating a string yields single characters.
        sent2: Iterable of word tokens of the second sentence.
        stopwords: Container of lower-cased words to ignore.

    Returns:
        The similarity as a float; ``0.0`` when either sentence has no
        content words left after filtering.
    """
    words1 = _content_words(sent1, stopwords)
    words2 = _content_words(sent2, stopwords)

    # A sentence without content words maps to the zero vector, for which the
    # cosine is undefined (0/0 -> NaN); treat it as sharing nothing instead.
    if not words1 or not words2:
        return 0.0

    # Set arithmetic gives the dot product and the norms of the binary
    # vectors directly, without building the vectors or scanning lists.
    return len(words1 & words2) / np.sqrt(len(words1) * len(words2))

def build_similarity_matrix(sentences, stopwords):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: Sequence of sentences, each either a raw string or an
            already tokenized list of words.
        stopwords: Container of lower-cased words ignored when comparing.

    Returns:
        A square NumPy array whose entry ``[i][j]`` is the similarity of
        sentences ``i`` and ``j``; the diagonal is left at zero.
    """
    # sentence_similarity iterates over its arguments, so a raw string would
    # be compared character by character. Tokenize string sentences here,
    # once per sentence rather than once per pair.
    tokenized = [
        tokenize_words(sentence) if isinstance(sentence, str) else sentence
        for sentence in sentences
    ]

    size = len(tokenized)
    matrix = np.zeros((size, size))

    # The similarity is symmetric: compute each pair once and mirror it.
    for i in range(size):
        for j in range(i + 1, size):
            score = sentence_similarity(tokenized[i], tokenized[j], stopwords)
            matrix[i][j] = score
            matrix[j][i] = score

    return matrix

def _slice_sentences(sentences, token_budget):
    """Group consecutive sentences into slices of at most *token_budget* tokens.

    A single sentence longer than the budget forms a slice of its own, so
    every sentence ends up in exactly one slice.
    """
    slices = []
    current_slice = []
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = len(tokenize_words(sentence))
        if current_slice and current_tokens + sentence_tokens > token_budget:
            slices.append(current_slice)
            current_slice = []
            current_tokens = 0
        current_slice.append(sentence)
        current_tokens += sentence_tokens
    if current_slice:
        slices.append(current_slice)
    return slices


def generate_summary(document_text, context_window_size, style_text=None,
                     output_path='summary.txt'):
    """Summarize a document down to a token budget and build a query from it.

    The document is cut into consecutive sentence slices of at most
    ``context_window_size`` word tokens, each slice is summarized with
    ``extractive_summarization``, and the slice summaries are joined with
    spaces. The collated summary is then summarized again until it has at
    most ``context_window_size`` word tokens.

    Args:
        document_text: Text of the document to summarize.
        context_window_size: Maximum size of the document summary, counted
            in word tokens as produced by ``tokenize_words``.
        style_text: Optional second text (the text itself, not a file path).
            Its extractive summary is appended to the returned summary.
        output_path: File the document summary is written to as UTF-8 text;
            pass ``None`` to skip writing.

    Returns:
        A ``(summary, query)`` tuple of strings.

    NOTE(review): the style summary is appended after the shrinking step and
    after the file is written, as before, so the returned summary can exceed
    ``context_window_size`` - confirm this is intended.
    """
    stopwords_list = set(stopwords.words('english'))
    sentences = tokenize_sentences(document_text)

    # Summarize each slice on its own, then collate the results.
    slice_summaries = []
    for sentence_slice in _slice_sentences(sentences, context_window_size):
        slice_text = ' '.join(sentence_slice)
        slice_summary = extractive_summarization(slice_text, stopwords_list)
        # The extractive step keeps only a fraction of the sentences and may
        # return an empty string for a very short slice; carry such a slice
        # over unchanged so its content still takes part in the shrinking.
        slice_summaries.append(slice_summary or slice_text)
    summary = ' '.join(slice_summaries)

    # Repeat shrinking until the summary fits the context window.
    summary_tokens = len(tokenize_words(summary))
    while summary_tokens > context_window_size:
        shrunk = extractive_summarization(summary, stopwords_list)
        shrunk_tokens = len(tokenize_words(shrunk))
        if shrunk_tokens >= summary_tokens:
            # No progress (e.g. an empty summary and a negative window):
            # stop instead of looping forever.
            break
        summary, summary_tokens = shrunk, shrunk_tokens

    # Save the document summary.
    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(summary)

    # Summarize the optional second document and append it.
    if style_text:
        style_summary = extractive_summarization(style_text, stopwords_list)
        summary = ' '.join(part for part in (summary, style_summary) if part)

    # Generate the query
    query = generate_query(summary)

    return summary, query

def extractive_summarization(text, stopwords_list, summary_ratio=0.3):
    """Return the highest-ranked sentences of *text* as a summary.

    Every sentence is scored by the sum of its similarities to all other
    sentences; the best-scoring ones are joined with spaces, highest score
    first.

    Args:
        text: Text to summarize.
        stopwords_list: Container of lower-cased words ignored when
            comparing sentences.
        summary_ratio: Fraction of the sentences to keep. The count is
            ``int(number_of_sentences * summary_ratio)``, so short texts
            can yield an empty summary.

    Returns:
        The summary string; empty when no sentence is selected.
    """
    sentences = tokenize_sentences(text)
    if not sentences:
        return ''

    sentence_matrix = build_similarity_matrix(sentences, stopwords_list)
    # An undefined similarity (NaN) would turn the whole row sum into NaN.
    # argsort places NaN last, so the reversal below would rank exactly those
    # sentences first. Count undefined similarities as zero instead.
    sentence_matrix = np.nan_to_num(sentence_matrix)

    # Rank sentences by their total similarity to the rest of the text.
    sentence_ranks = np.sum(sentence_matrix, axis=1)
    ranked_indices = np.argsort(sentence_ranks)[::-1]

    # Keep the requested share of sentences, clamped to a valid count.
    summary_length = int(len(sentences) * summary_ratio)
    summary_length = max(0, min(len(sentences), summary_length))

    return ' '.join(sentences[i] for i in ranked_indices[:summary_length])

def generate_query(summary):
    """Build the query string by prefixing *summary* with a fixed request line."""
    # Placeholder query logic: a constant header followed by the summary.
    request_header = "Please provide relevant information about:\n"
    return request_header + summary

# Example usage:
document_path = 'path_to_your_document.txt'  # Replace with the actual path to the document
context_window_size = 128  # Maximum summary size in word tokens; adjust to your requirements
document_text = read_text(document_path)

# Optional: style text. generate_summary expects the text itself, not a path,
# so the file is read here. Set style_text = None to skip it.
style_path = 'path_to_your_style_text.txt'  # Replace with the actual path to the style text file
style_text = read_text(style_path)

# Generate summary and query
generated_summary, generated_query = generate_summary(document_text, context_window_size, style_text)
print("Generated Summary:\n", generated_summary)
print("\nGenerated Query:\n", generated_query)