In [122]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice



In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
def preprocessing(article):
    lines = article.split(".")   # splits the whole article into lines
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    article_preprocessed = []    # list that contains the main sentences after being preprocessed 
    for line in lines:
        line_preprocessed = []
        words_in_line = line.split()
        for word in words_in_line:
            if (word not in stopwords_english and word not in string.punctuation):   # make sure word is not a stop word
                                                                                     # and not a punctuation 
                word_stemmed = stemmer.stem(word)  
                line_preprocessed.append(word_stemmed)
        article_preprocessed.append(line_preprocessed)
    return article_preprocessed

In [14]:
with open("article_1.txt", "r", encoding="utf8") as file:
    contents = file.read()
    article_preprocessed = preprocessing(contents)

In [15]:
print(contents)

The “A” in UAW has long carried an elaborate set of meanings, both economic and social. Rooted in the vehicle factories of the Upper Midwest, the United Automobile Workers organized hundreds of thousands of blue-collar workers in America’s most important industry by the onset of the Second World War. That war generated aircraft industry jobs, many of them held by female Rosies who riveted airplane frames. By the end of the war, many of those workers had joined the UAW alongside workers from the farm equipment industry, giving the union a new full name: the United Automobile, Aircraft, and Agricultural Implement Workers. (Aircraft would later be changed to Aerospace during the Cold War missile race.)

Now, unofficially, that “A” also stands for academic workers, 48,000 of whom are on strike at the University of California. Though there has been no formal change to the UAW name since 1962, there is no doubt that the academy has become the most dynamic and successful venue for the union i

In [16]:
def convert_list_to_string(content):
    content_modified = []   # list of strings
    for line in content:
        line_as_string = " ".join(line)
        content_modified.append(line_as_string)
    return content_modified

In [74]:
def calculate_TF_IDF(content):
    
    vectorizer = TfidfVectorizer()   # Create a TfidfVectorizer object
    vectorizer.fit(content)   # Fit the vectorizer to the documents
    feature_names = vectorizer.get_feature_names()

#     print("Feature names:")   # Print the feature names
#     print(feature_names)   

    tfidf_matrix = vectorizer.transform(content)   # Transform the documents into a TF-IDF matrix
    np.set_printoptions(threshold=np.inf)
#     print("TF-IDF matrix:")   # Print the TF-IDF matrix
#     print(tfidf_matrix.toarray())
    return tfidf_matrix.toarray()
    

In [75]:
article_modified = convert_list_to_string(article_preprocessed)
calculate_TF_IDF(article_modified)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [108]:
def calculate_each_sentence_score(tf_idf_matrix):
    score_dict = {}
    for index, line in enumerate(tf_idf_matrix):
#         print(line)
#         print('at')
#         print(index)
        score = np.sum(line)
        score_dict[score] = index
        
    sorted_keys = sorted(score_dict.items(), reverse=True)
    score_dict_reversed = dict(sorted_keys)

#     print(score_dict_reversed)
    return score_dict_reversed

In [138]:
def summary_generation(article, score_dict_reversed, number_of_sentences):
    lines = article.split(".")   # list contains strings, where these strings are original lines
    
    score_dict = dict(islice(score_dict_reversed.items(), number_of_sentences))
    sort_data = sorted(score_dict.items(), key=lambda x: x[1])
    score_dict_ascending = dict(sort_data)
    print(score_dict_ascending)
    print(score_dict_reversed)
    output_list = []
    for key in score_dict_ascending:
        output_list.append(lines[score_dict_ascending[key]])
            
    output_string = ".".join(output_list)
    output_string += "."
    return output_string

In [140]:
article_modified = convert_list_to_string(article_preprocessed)
tf_idf_matrix = calculate_TF_IDF(article_modified)
dict_scores = calculate_each_sentence_score(tf_idf_matrix)
output = summary_generation(contents, dict_scores, 7)
print(output)

{4.85586067388996: 11, 5.737073337079719: 39, 5.513554685381946: 44, 7.02412776932971: 46, 4.868808928511606: 48, 5.152671936771097: 49, 5.14579250580587: 56}
{7.02412776932971: 46, 5.737073337079719: 39, 5.513554685381946: 44, 5.152671936771097: 49, 5.14579250580587: 56, 4.868808928511606: 48, 4.85586067388996: 11, 4.8474107125185935: 12, 4.6651755743163035: 53, 4.659850646983827: 8, 4.459960211336074: 3, 4.45025713707998: 1, 4.448901704242086: 36, 4.424294529503122: 40, 4.327644219427301: 24, 4.308989433653054: 31, 4.29575142912558: 16, 4.29179565044478: 29, 4.2868004094801115: 35, 4.201144898834351: 43, 4.183282324197345: 41, 4.121725590741891: 25, 4.0986091404287315: 38, 4.070226800181553: 18, 4.052688253391732: 9, 3.955871020702079: 6, 3.9361990487141405: 30, 3.8529217482183906: 28, 3.8261280129800426: 50, 3.7629160476937766: 15, 3.71259934392405: 17, 3.601634280663853: 55, 3.5727359392263986: 2, 3.5676480572726383: 26, 3.4319264522060497: 14, 3.4225344873703034: 34, 3.41986801849