<a href="https://colab.research.google.com/github/NargesSamaeii/NLP_Assignment/blob/main/NLP_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import os

In [None]:
# Fetch the NLTK resources this notebook relies on: the English stop-word
# list and the Punkt sentence tokenizer data. NLTK 3.9+ loads the tokenizer
# from the 'punkt_tab' resource rather than the pickled 'punkt' models, so
# both are requested; an unknown resource on an older NLTK only reports an
# error and does not raise.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def read_and_tokenize_text(file_path):
    """Load a UTF-8 text file and tokenize its contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(text, sentences, words)`` where ``text`` is the raw file
        content, ``sentences`` is the list produced by ``sent_tokenize`` and
        ``words`` holds one ``word_tokenize`` token list per sentence.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()

    sentence_list = sent_tokenize(raw_text)
    tokens_per_sentence = list(map(word_tokenize, sentence_list))
    return raw_text, sentence_list, tokens_per_sentence

def calculate_sentence_similarity(sent1, sent2, stopwords):
    """Return the cosine similarity between two tokenized sentences.

    Each sentence is reduced to the set of its lower-cased alphanumeric
    tokens that are not stop words. The result is the cosine of the angle
    between the binary bag-of-words vectors of those two sets, which equals
    ``|A & B| / sqrt(|A| * |B|)``.

    Args:
        sent1: Iterable of word tokens for the first sentence. Passing a
            plain string iterates over its characters, not its words.
        sent2: Iterable of word tokens for the second sentence.
        stopwords: Container of lower-case words to ignore. Inside this
            function the name shadows the ``nltk.corpus.stopwords`` import;
            it is kept unchanged for caller compatibility.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when either sentence
        has no tokens left after filtering, rather than the NaN that dividing
        by a zero-length vector would produce.
    """
    def content_words(tokens):
        # Sets give O(1) membership and make the vectors implicitly binary.
        lowered = (token.lower() for token in tokens if token.isalnum())
        return {word for word in lowered if word not in stopwords}

    words1 = content_words(sent1)
    words2 = content_words(sent2)

    # A sentence made up only of stop words/punctuation has a zero vector;
    # its cosine with anything is undefined, so treat it as "no similarity".
    if not words1 or not words2:
        return 0.0

    shared = len(words1 & words2)
    return shared / (len(words1) * len(words2)) ** 0.5

def build_similarity_matrix(sentences, stopwords):
    """Return a square matrix of pairwise sentence similarities.

    Entry ``[i][j]`` holds the value of ``calculate_sentence_similarity``
    for ``sentences[i]`` and ``sentences[j]``. The diagonal stays at zero,
    so a sentence is never scored against itself.

    Args:
        sentences: Sequence of sentences in the form expected by
            ``calculate_sentence_similarity``.
        stopwords: Words to ignore, forwarded unchanged to the similarity
            function.

    Returns:
        A ``len(sentences) x len(sentences)`` NumPy array of floats.
    """
    count = len(sentences)
    similarity = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row == col:
                continue
            similarity[row][col] = calculate_sentence_similarity(left, right, stopwords)

    return similarity

def slice_and_summarize_document(sentences, target_lengths, stopwords_list, context_window_size):
    """Summarize a document slice by slice, then shrink it to the window.

    The running totals of ``target_lengths`` are used as sentence-index
    boundaries: slice ``i`` covers ``sentences[sum(target_lengths[:i]):
    sum(target_lengths[:i + 1])]``. Each non-empty slice is summarized with
    ``extractive_summarization`` and its sentences are appended to the
    result, skipping any sentence already included. The combined summary is
    then re-summarized until it has no more than ``context_window_size``
    tokens.

    Args:
        sentences: List of sentence strings making up the document.
        target_lengths: Sliceable sequence of per-slice sizes, counted in
            sentences.
        stopwords_list: Stop words forwarded to ``extractive_summarization``.
        context_window_size: Maximum number of ``word_tokenize`` tokens the
            returned summary may contain.

    Returns:
        The summary string. After the first pass every sentence is followed
        by a newline; if shrinking was needed, the text is whatever
        ``extractive_summarization`` returned last, which may be empty.
    """
    total_sentences = len(sentences)
    collected = []
    included_sentences = set()

    # Keep a running prefix sum rather than re-summing target_lengths on
    # every iteration. Iterations beyond len(target_lengths) would only ever
    # see an empty slice, so they are not visited at all.
    prefix = 0
    for length in target_lengths[:total_sentences]:
        slice_start = int(prefix)
        prefix += length
        slice_end = min(int(prefix), total_sentences)

        slice_sentences = sentences[slice_start:slice_end]
        if not slice_sentences:
            # Nothing to summarize; avoids adding a blank "sentence".
            continue

        sliced_document = ' '.join(slice_sentences)
        slice_summary = extractive_summarization(sliced_document, stopwords_list)

        for sent in slice_summary.split('\n'):
            # An empty summary splits into [''], which must not be recorded.
            if sent and sent not in included_sentences:
                collected.append(sent)
                included_sentences.add(sent)

    summary = ''.join(sent + '\n' for sent in collected)

    while len(word_tokenize(summary)) > context_window_size:
        reduced = extractive_summarization(summary, stopwords_list)
        if reduced == summary:
            # Summarization can no longer shrink the text (for example an
            # empty summary with a negative window); stop instead of looping
            # forever.
            break
        summary = reduced

    return summary

In [None]:
def generate_summary(document_text, context_window_size, style_text=None, output_path='summary.txt'):
    """Summarize a document file and build a query from the summary.

    Args:
        document_text: Path of the document file to summarize. Despite the
            name, this is passed to ``read_and_tokenize_text`` as a path.
        context_window_size: Maximum number of tokens allowed for the
            document summary.
        style_text: Optional path of a style-reference file. When given, an
            extractive summary of that file is appended to the summary.
        output_path: File the document summary is written to. Defaults to
            ``'summary.txt'``; pass ``None`` to skip writing.

    Returns:
        A tuple ``(summary, query)``. ``summary`` includes the style summary
        when ``style_text`` was supplied, and ``query`` is built from it by
        ``generate_query``.
    """
    document_text, sentences, _ = read_and_tokenize_text(document_text)
    stopwords_list = set(stopwords.words('english'))

    document_length = len(word_tokenize(document_text))
    # NOTE(review): len(sentence) is a character count while document_length
    # is a token count, so the two units are mixed here. Kept as-is because
    # the slicing step depends on these values; confirm the intended formula.
    target_lengths = [
        int(len(sentence) * (len(sentence) / document_length))
        for sentence in sentences
    ]

    summary = slice_and_summarize_document(sentences, target_lengths, stopwords_list, context_window_size)

    # The file receives the document summary only; the style summary is
    # appended afterwards and is not part of the written file.
    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(summary)

    if style_text:
        style_text, _, _ = read_and_tokenize_text(style_text)
        style_summary = extractive_summarization(style_text, stopwords_list)
        # Keep the style sentences on their own lines instead of fusing the
        # first one onto the last line of the document summary.
        if summary and style_summary and not summary.endswith('\n'):
            summary += '\n'
        summary += style_summary

    query = generate_query(summary)

    return summary, query

def extractive_summarization(text, stopwords_list, ratio=0.3):
    """Return the highest-ranked sentences of ``text`` as a summary.

    Sentences are ranked by the sum of their similarities to every other
    sentence, and the top ``ratio`` share of them is kept.

    Args:
        text: Text to summarize.
        stopwords_list: Stop words ignored when comparing sentences.
        ratio: Fraction of sentences to keep. The count is truncated with
            ``int``, so with the default of 0.3 a text of three or fewer
            sentences yields an empty summary.

    Returns:
        The selected sentences joined by newlines, most central first, or
        an empty string when no sentence is selected.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return ''

    # The similarity function compares word tokens; handing it the raw
    # sentence strings would make it compare individual characters.
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    sentence_matrix = build_similarity_matrix(tokenized, stopwords_list)

    # nansum keeps one undefined pairwise score from turning a whole row
    # into NaN, which would otherwise sort to the top of the ranking.
    sentence_ranks = np.nansum(sentence_matrix, axis=1)
    ranked_indices = np.argsort(sentence_ranks)[::-1]

    summary_length = int(len(sentences) * ratio)
    return '\n'.join(sentences[i] for i in ranked_indices[:summary_length])

def generate_query(summary):
    """Build the query text by prefixing ``summary`` with a fixed request line.

    Args:
        summary: Summary string to embed in the query.

    Returns:
        The request header followed by ``summary``.
    """
    header = "Please provide relevant information about:\n"
    return "".join((header, summary))

In [None]:
# Example usage: summarize 'example.txt' with a 4000-token budget.
document_path = 'example.txt'
context_window_size = 4000

# Pass a style file only when one is actually present on disk.
if os.path.exists('example_style_text.txt'):
    style_text_path = 'example_style_text.txt'
else:
    style_text_path = None

generated_summary, generated_query = generate_summary(
    document_path,
    context_window_size,
    style_text_path,
)
print("Generated Summary:\n", generated_summary)
print("\nGenerated Query:\n", generated_query)