In [1]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice



In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
def preprocessing(article):
    lines = article.split(".")   # splits the whole article into lines
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    article_preprocessed = []    # list that contains the main sentences after being preprocessed 
    for line in lines:
        line_preprocessed = []
        words_in_line = line.split()
        for word in words_in_line:
            if (word not in stopwords_english and word not in string.punctuation and word != 'the' and word != 'The'):   # make sure word is not a stop word
                                                                                     # and not a punctuation 
                word_stemmed = stemmer.stem(word)  
                line_preprocessed.append(word_stemmed)
        article_preprocessed.append(line_preprocessed)
    return article_preprocessed

In [27]:
with open("article_2.txt", "r", encoding="utf8") as file:
    contents = file.read()
    article_preprocessed = preprocessing(contents)

In [28]:
print(contents)

'Alcohol' is taken in almost all cool and cold climates, and to a very much less extent in hot ones. Thus, it is taken by people who live in the Himalaya Mountains, but not nearly so much by those who live in the plains of India. Alcohol is not necessary in any way to anybody. The regular use of alcohol, even in small quantities, tends to cause mischief in many ways to various organs of the body. It affects the liver, it weakens the mental powers, and lessens the general energy of the body. In addition, damage to the central nervous system and peripheral nervous system can occur from chronic alcohol abuse.


In [29]:
def convert_list_to_string(content):
    content_modified = []   # list of strings
    for line in content:
        line_as_string = " ".join(line)
        content_modified.append(line_as_string)
    return content_modified

In [30]:
def calculate_TF_IDF(content):
    
    vectorizer = TfidfVectorizer()   # Create a TfidfVectorizer object
    vectorizer.fit(content)   # Fit the vectorizer to the documents
    feature_names = vectorizer.get_feature_names()

    print("Feature names:")   # Print the feature names
    print(feature_names)   

    tfidf_matrix = vectorizer.transform(content)   # Transform the documents into a TF-IDF matrix
    np.set_printoptions(threshold=np.inf)
#     print("TF-IDF matrix:")   # Print the TF-IDF matrix
#     print(tfidf_matrix.toarray())
    return tfidf_matrix.toarray()
    

In [31]:
article_modified = convert_list_to_string(article_preprocessed)
calculate_TF_IDF(article_modified)

Feature names:
['abus', 'addition', 'affect', 'alcohol', 'almost', 'anybodi', 'bodi', 'caus', 'central', 'chronic', 'climates', 'cold', 'cool', 'damag', 'energi', 'even', 'extent', 'gener', 'himalaya', 'hot', 'in', 'india', 'it', 'less', 'lessen', 'live', 'liver', 'mani', 'mental', 'mischief', 'mountains', 'much', 'nearli', 'necessari', 'nervou', 'occur', 'one', 'organ', 'peopl', 'peripher', 'plain', 'powers', 'quantities', 'regular', 'small', 'system', 'taken', 'tend', 'thus', 'use', 'variou', 'way', 'weaken']


array([[0.        , 0.        , 0.        , 0.19720761, 0.32013214,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32013214, 0.32013214, 0.32013214, 0.        , 0.        ,
        0.        , 0.32013214, 0.        , 0.        , 0.32013214,
        0.        , 0.        , 0.        , 0.32013214, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.26573717, 0.        , 0.        , 0.        ,
        0.        , 0.32013214, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.26573717, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.28423219, 0.        ,
   

In [32]:
def calculate_each_sentence_score(tf_idf_matrix):
    score_dict = {}   # dictionary that stores keys as summation of tf_idf scores for all word in current line 
                      # and values of dictionary as index of the line in the article
    for index, line in enumerate(tf_idf_matrix):
#         print(line)
#         print('at')
#         print(index)
        score = np.sum(line)
        score_dict[score] = index
        
    sorted_keys = sorted(score_dict.items(), reverse=True)   # sort the dictionary by keys in the descending order
    score_dict_reversed = dict(sorted_keys)

#     print(score_dict_reversed)
    return score_dict_reversed

In [33]:
def summary_generation(article, score_dict_reversed, number_of_sentences):
    lines = article.split(".")   # list contains strings, where these strings are original lines
    
    score_dict = dict(islice(score_dict_reversed.items(), number_of_sentences))   # select only certain number of lines
                                                                                  # to be displayed 
        
    sort_data = sorted(score_dict.items(), key=lambda x: x[1])   # sort the dictionary by value (index of lines) in
                                                                 # the ascending order to display lines ordered as the 
                                                                 # original article 
    score_dict_ascending = dict(sort_data)
    print(score_dict_ascending)
    print(score_dict_reversed)
    output_list = []
    for key in score_dict_ascending:
        output_list.append(lines[score_dict_ascending[key]])
            
    output_string = ".".join(output_list)
    output_string += "."
    return output_string

In [40]:
article_modified = convert_list_to_string(article_preprocessed)
tf_idf_matrix = calculate_TF_IDF(article_modified)
dict_scores = calculate_each_sentence_score(tf_idf_matrix)
output = summary_generation(contents, dict_scores, 4)
print(output)

Feature names:
['abus', 'addition', 'affect', 'alcohol', 'almost', 'anybodi', 'bodi', 'caus', 'central', 'chronic', 'climates', 'cold', 'cool', 'damag', 'energi', 'even', 'extent', 'gener', 'himalaya', 'hot', 'in', 'india', 'it', 'less', 'lessen', 'live', 'liver', 'mani', 'mental', 'mischief', 'mountains', 'much', 'nearli', 'necessari', 'nervou', 'occur', 'one', 'organ', 'peopl', 'peripher', 'plain', 'powers', 'quantities', 'regular', 'small', 'system', 'taken', 'tend', 'thus', 'use', 'variou', 'way', 'weaken']
{3.289739028647383: 0, 3.7169746704965743: 3, 3.1580345719945706: 4, 3.117254743580706: 5}
{3.7169746704965743: 3, 3.289739028647383: 0, 3.1580345719945706: 4, 3.117254743580706: 5, 3.0299640067036613: 1, 1.967269629670797: 2, 0.0: 6}
'Alcohol' is taken in almost all cool and cold climates, and to a very much less extent in hot ones. The regular use of alcohol, even in small quantities, tends to cause mischief in many ways to various organs of the body. It affects the liver, it 