In [1]:
from joblib import dump
import sys

import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import math

# Download NLTK resources.
# These calls run at module level, so they execute every time this code is
# run or imported. 'stopwords' supplies the lists read via stopwords.words()
# in frequency_table.
nltk.download('stopwords')
# NOTE(review): 'punkt' is presumably the tokenizer data behind
# sent_tokenize / word_tokenize used below -- confirm for the installed
# NLTK version.
nltk.download('punkt')

# Initialize stemmer: one shared PorterStemmer instance, used by
# frequency_table to normalise tokens.
ps = PorterStemmer()

def frequency_table(text, language):
    """Count the stemmed, non-stop-word tokens of ``text``.

    Args:
        text: The text (typically one sentence) to tokenize and count.
        language: Name of the NLTK stop-word list to load, passed to
            ``stopwords.words``.

    Returns:
        A dict mapping each kept stem to its number of occurrences. A token
        is kept when neither its lowercased form nor its stem is a stop
        word and its stem is longer than two characters.
    """
    stop_words = set(stopwords.words(language))
    word_freq = {}

    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        # The stop-word list holds unstemmed words, so test the surface form
        # before stemming; testing only the stem lets stop words through
        # whenever stemming alters them (Porter stemming can turn "this"
        # into "thi", for example).
        if lowered in stop_words:
            continue

        stem = ps.stem(lowered)
        # Still reject stems that are themselves stop words or too short, so
        # nothing that was filtered before is let through now.
        if stem in stop_words or len(stem) <= 2:
            continue

        word_freq[stem] = word_freq.get(stem, 0) + 1

    return word_freq

def frequency_matrix(sentences, language):
    """Build a per-sentence word-frequency table for a block of text.

    Args:
        sentences: The full text; it is split into sentences here with
            ``nltk.sent_tokenize``.
        language: Stop-word language forwarded to ``frequency_table``.

    Returns:
        A dict keyed by the first 15 characters of each sentence, whose
        values are the frequency tables of those sentences. Sentences that
        share the same 15-character prefix share one key, and the last such
        sentence's table is the one retained.
    """
    return {
        sentence[:15]: frequency_table(sentence, language)
        for sentence in nltk.sent_tokenize(sentences)
    }

def tf_matrix(freq_matrix):
    """Convert per-sentence counts into term-frequency values.

    Args:
        freq_matrix: Dict mapping a sentence key to a ``{term: count}`` dict.

    Returns:
        A dict with the same keys, where each count is divided by the number
        of distinct terms in that sentence's table and rounded to two
        decimals. An empty table maps to an empty table.
    """
    scaled = {}
    for key, counts in freq_matrix.items():
        distinct_terms = len(counts)
        scaled[key] = {
            term: round(count / distinct_terms, 2)
            for term, count in counts.items()
        }
    return scaled

def doc_frequency(freq_matrix):
    """Count, for every term, how many sentence tables contain it.

    Args:
        freq_matrix: Dict mapping a sentence key to a ``{term: count}`` dict.

    Returns:
        A dict mapping each term to the number of tables in which it appears
        as a key (the stored count itself is not inspected).
    """
    containing = {}
    for counts in freq_matrix.values():
        for term in counts:
            containing[term] = containing.get(term, 0) + 1
    return containing

def idf_matrix(freq_matrix):
    """Compute an inverse-document-frequency weight for every term.

    Each sentence table counts as one document. A term's weight is
    ``round(log(N / (1 + df)), 2)`` using the natural logarithm, where ``N``
    is the number of tables and ``df`` the number of tables in which the
    term has a count above zero. Because of the ``1 +`` in the denominator
    the weight is zero when ``df == N - 1`` and negative when ``df == N``.

    Args:
        freq_matrix: Dict mapping a sentence key to a ``{term: count}`` dict.

    Returns:
        A dict with the same keys and terms as ``freq_matrix`` holding the
        weight of each term; terms never seen with a positive count get 0.
    """
    n_docs = len(freq_matrix)

    # Number of tables in which each term occurs with a positive count.
    containing = {}
    for counts in freq_matrix.values():
        for term, count in counts.items():
            if count > 0:
                containing[term] = containing.get(term, 0) + 1

    def weight(term):
        if term not in containing:
            return 0  # Term has no positive count in any table
        return round(math.log(n_docs / (1 + containing[term])), 2)

    return {
        key: {term: weight(term) for term in counts}
        for key, counts in freq_matrix.items()
    }

def tf_idf_matrix(tf_matrix, idf_matrix):
    """Multiply term-frequency values by their matching IDF weights.

    Entries are matched by sentence key and by term rather than by their
    position in the dicts, so the result does not depend on the two inputs
    sharing the same insertion order.

    Args:
        tf_matrix: Dict mapping a sentence key to a ``{term: tf}`` dict.
        idf_matrix: Dict mapping a sentence key to a ``{term: idf}`` dict.

    Returns:
        A dict mapping each sentence key present in both inputs to a
        ``{term: round(tf * idf, 2)}`` dict. Sentences or terms that have no
        counterpart in ``idf_matrix`` are left out.
    """
    weighted = {}
    for sent, tf_table in tf_matrix.items():
        idf_table = idf_matrix.get(sent)
        if idf_table is None:
            # No IDF data for this sentence: nothing to multiply against.
            continue
        weighted[sent] = {
            word: round(tf_value * idf_table[word], 2)
            for word, tf_value in tf_table.items()
            if word in idf_table
        }
    return weighted

def score_sentence(tf_idf_matrix):
    """Score each sentence as the sum of its TF-IDF values.

    Args:
        tf_idf_matrix: Dict mapping a sentence key to a ``{term: weight}``
            dict.

    Returns:
        A dict mapping each sentence key to the total of its weights; a
        sentence with no terms scores 0.
    """
    return {
        key: sum(weights.values())
        for key, weights in tf_idf_matrix.items()
    }

def average_score(sent_score):
    """Return the mean sentence score, rounded to two decimals.

    Args:
        sent_score: Dict mapping a sentence key to its numeric score.

    Returns:
        The rounded arithmetic mean of the scores, or 0.0 when
        ``sent_score`` is empty (instead of raising ``ZeroDivisionError``).
    """
    if not sent_score:
        return 0.0
    return round(sum(sent_score.values()) / len(sent_score), 2)

def get_summary(sentences, sentence_score, threshold):
    """Concatenate the sentences whose score reaches ``threshold``.

    Args:
        sentences: Sentences in their original order.
        sentence_score: Dict mapping the first 15 characters of a sentence
            to its score.
        threshold: Minimum score (inclusive) for a sentence to be kept.

    Returns:
        The kept sentences joined in order, each preceded by a single space
        (so a non-empty summary starts with a space); an empty string when
        nothing qualifies.
    """
    def qualifies(sentence):
        key = sentence[:15]
        return key in sentence_score and sentence_score[key] >= threshold

    return "".join(" " + sentence for sentence in sentences if qualifies(sentence))

def lets_summarize(text, language='english', threshold_factor=1.15):
    """Produce an extractive summary of ``text`` from TF-IDF sentence scores.

    The text is split into sentences, each sentence is scored by summing the
    TF-IDF values of its terms, and the sentences whose score is at least
    ``threshold_factor`` times the average score are kept in their original
    order.

    Args:
        text: The text to summarize.
        language: Stop-word language forwarded to ``frequency_matrix``.
        threshold_factor: Multiplier applied to the average sentence score
            to obtain the selection threshold. Defaults to 1.15, the value
            that was previously hard-coded.

    Returns:
        The selected sentences concatenated by ``get_summary``, or a fixed
        explanatory message when no sentence could be scored.
    """
    sentences = sent_tokenize(text)
    freq_matrix = frequency_matrix(text, language)
    tf_matrix1 = tf_matrix(freq_matrix)
    idf1_matrix = idf_matrix(freq_matrix)
    tf_idf_matrix1 = tf_idf_matrix(tf_matrix1, idf1_matrix)
    sent_score = score_sentence(tf_idf_matrix1)

    # Nothing was scored (e.g. the text produced no sentences): bail out
    # before computing an average over an empty mapping.
    if not sent_score:
        return "No sentences scored. Unable to generate a summary."

    threshold = average_score(sent_score)
    return get_summary(sentences, sent_score, threshold_factor * threshold)

# Command-line entry point: expects the stop-word language followed by the
# text to summarize, and prints the resulting summary.
if __name__ == "__main__":
    # argv[0] is the script name, so two user arguments means len >= 3.
    if len(sys.argv) < 3:
        print("Usage: python summarization.py <language> <text>")
        sys.exit(1)
    
    language = sys.argv[1]
    text = sys.argv[2]
    
    summary = lets_summarize(text, language=language)
    print(summary)

# Persist the summarizer to disk. This statement is at module level, outside
# the __main__ guard, so it runs whenever this code is executed or imported.
# NOTE(review): serialising a plain function usually stores only a reference
# to it (module + name), not its code -- confirm the file can be loaded where
# this module is not importable.
dump(lets_summarize, 'summarization_model.joblib')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\azerty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\azerty\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['summarization_model.joblib']