In [4]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice



In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [193]:
def preprocessing(article):
    lines = article.split(".")   # splits the whole article into lines
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    article_preprocessed = []    # list that contains the main sentences after being preprocessed 
    for line in lines:
        line_preprocessed = []
        words_in_line = line.split()
        for word in words_in_line:
            if (word not in stopwords_english and word not in string.punctuation):   # make sure word is not a stop word
                                                                                     # and not a punctuation 
                word_stemmed = stemmer.stem(word)  
                line_preprocessed.append(word_stemmed)
        article_preprocessed.append(line_preprocessed)
    return article_preprocessed

In [194]:
with open("article_2.txt", "r", encoding="utf8") as file:
    contents = file.read()
    article_preprocessed = preprocessing(contents)
    print(article_preprocessed)

[["'alcohol'", 'taken', 'almost', 'cool', 'cold', 'climates,', 'much', 'less', 'extent', 'hot', 'one'], ['thus,', 'taken', 'peopl', 'live', 'himalaya', 'mountains,', 'nearli', 'much', 'live', 'plain', 'india'], ['alcohol', 'necessari', 'way', 'anybodi'], ['the', 'regular', 'use', 'alcohol,', 'even', 'small', 'quantities,', 'tend', 'caus', 'mischief', 'mani', 'way', 'variou', 'organ', 'bodi'], ['it', 'affect', 'liver,', 'weaken', 'mental', 'powers,', 'lessen', 'gener', 'energi', 'bodi'], ['in', 'addition,', 'damag', 'central', 'nervou', 'system', 'peripher', 'nervou', 'system', 'occur', 'chronic', 'alcohol', 'abus'], []]


In [195]:
print(contents)

'Alcohol' is taken in almost all cool and cold climates, and to a very much less extent in hot ones. Thus, it is taken by people who live in the Himalaya Mountains, but not nearly so much by those who live in the plains of India. Alcohol is not necessary in any way to anybody. The regular use of alcohol, even in small quantities, tends to cause mischief in many ways to various organs of the body. It affects the liver, it weakens the mental powers, and lessens the general energy of the body. In addition, damage to the central nervous system and peripheral nervous system can occur from chronic alcohol abuse.


In [196]:
def convert_list_to_string(content):
    content_modified = []   # list of strings
    for line in content:
        line_as_string = " ".join(line)
        content_modified.append(line_as_string)
    return content_modified

In [197]:
def calculate_TF_IDF(content):
    
    vectorizer = TfidfVectorizer()   # Create a TfidfVectorizer object
    vectorizer.fit(content)   # Fit the vectorizer to the documents
    feature_names = vectorizer.get_feature_names()

#     print("Feature names:")   # Print the feature names
#     print(feature_names)   

    tfidf_matrix = vectorizer.transform(content)   # Transform the documents into a TF-IDF matrix
    np.set_printoptions(threshold=np.inf)
#     print("TF-IDF matrix:")   # Print the TF-IDF matrix
#     print(tfidf_matrix.toarray())
    return tfidf_matrix.toarray()
    

In [198]:
article_modified = convert_list_to_string(article_preprocessed)
calculate_TF_IDF(article_modified)

array([[0.        , 0.        , 0.        , 0.19720761, 0.32013214,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32013214, 0.32013214, 0.32013214, 0.        , 0.        ,
        0.        , 0.32013214, 0.        , 0.        , 0.32013214,
        0.        , 0.        , 0.        , 0.32013214, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.26573717, 0.        , 0.        , 0.        ,
        0.        , 0.32013214, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.26573717, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.28423219, 0. 

In [199]:
def calculate_each_sentence_score(tf_idf_matrix):
    score_dict = {}   # dictionary that stores keys as summation of tf_idf scores for all word in current line 
                      # and values of dictionary as index of the line in the article
    for index, line in enumerate(tf_idf_matrix):
#         print(line)
#         print('at')
#         print(index)
        score = np.sum(line)
        score_dict[score] = index
        
    sorted_keys = sorted(score_dict.items(), reverse=True)   # sort the dictionary by keys in the descending order
    score_dict_reversed = dict(sorted_keys)

#     print(score_dict_reversed)
    return score_dict_reversed

In [200]:
def summary_generation(article, score_dict_reversed, number_of_sentences):
    lines = article.split(".")   # list contains strings, where these strings are original lines
    
    score_dict = dict(islice(score_dict_reversed.items(), number_of_sentences))   # select only certain number of lines
                                                                                  # to be displayed 
        
    sort_data = sorted(score_dict.items(), key=lambda x: x[1])   # sort the dictionary by value (index of lines) in
                                                                 # the ascending order to display lines ordered as the 
                                                                 # original article 
    score_dict_ascending = dict(sort_data)
    print(score_dict_ascending)
    print(score_dict_reversed)
    output_list = []
    for key in score_dict_ascending:
        output_list.append(lines[score_dict_ascending[key]])
            
    output_string = ".".join(output_list)
    output_string += "."
    return output_string

In [201]:
article_modified = convert_list_to_string(article_preprocessed)
tf_idf_matrix = calculate_TF_IDF(article_modified)
dict_scores = calculate_each_sentence_score(tf_idf_matrix)
output = summary_generation(contents, dict_scores, 3)
print(output)

{3.289739028647383: 0, 3.8489437778305033: 3, 3.1580345719945706: 4}
{3.8489437778305033: 3, 3.289739028647383: 0, 3.1580345719945706: 4, 3.117254743580706: 5, 3.0299640067036613: 1, 1.967269629670797: 2, 0.0: 6}
'Alcohol' is taken in almost all cool and cold climates, and to a very much less extent in hot ones. The regular use of alcohol, even in small quantities, tends to cause mischief in many ways to various organs of the body. It affects the liver, it weakens the mental powers, and lessens the general energy of the body.


In [202]:
def sentence_length(content):
    max_length = 0
    for sentence in content:
        print(sentence)
        if len(sentence) > max_length:
            max_length = len(sentence)
            
    sentence_length_feature = []
    for sentence in content:
        sentence_length_feature.append(len(sentence) / max_length)
    return sentence_length_feature

In [203]:
sentence_length(article_preprocessed)

["'alcohol'", 'taken', 'almost', 'cool', 'cold', 'climates,', 'much', 'less', 'extent', 'hot', 'one']
['thus,', 'taken', 'peopl', 'live', 'himalaya', 'mountains,', 'nearli', 'much', 'live', 'plain', 'india']
['alcohol', 'necessari', 'way', 'anybodi']
['the', 'regular', 'use', 'alcohol,', 'even', 'small', 'quantities,', 'tend', 'caus', 'mischief', 'mani', 'way', 'variou', 'organ', 'bodi']
['it', 'affect', 'liver,', 'weaken', 'mental', 'powers,', 'lessen', 'gener', 'energi', 'bodi']
['in', 'addition,', 'damag', 'central', 'nervou', 'system', 'peripher', 'nervou', 'system', 'occur', 'chronic', 'alcohol', 'abus']
[]


[0.7333333333333333,
 0.7333333333333333,
 0.26666666666666666,
 1.0,
 0.6666666666666666,
 0.8666666666666667,
 0.0]

In [214]:
with open("001_original.txt", "r", encoding="utf8") as file:
    original = file.read()
    article_preprocessed = preprocessing(original)
    print(original)

with open("001_summarized.txt", "r", encoding="utf8") as file:
    summarized = file.read()
    print(summarized)

Claxton hunting first major medal.

British hurdler Sarah Claxton is confident she can win her first major medal at next month's European Indoor Championships in Madrid.

The 25-year-old has already smashed the British record over 60m hurdles twice this season, setting a new mark of 7.96 seconds to win the AAAs title. "I am quite confident," said Claxton. "But I take each race as it comes. "As long as I keep up my training but not do too much I think there is a chance of a medal." Claxton has won the national 60m hurdles title for the past three years but has struggled to translate her domestic success to the international stage. Now, the Scotland-born athlete owns the equal fifth-fastest time in the world this year. And at last week's Birmingham Grand Prix, Claxton left European medal favourite Russian Irina Shevchenko trailing in sixth spot.

For the first time, Claxton has only been preparing for a campaign over the hurdles - which could explain her leap in form. In previous seasons

In [215]:
def generate_Y_labels(original, summarized):
    Y_list = []
    original_list = re.split(r"\n\n|\.", original)
    original_list_removed_empty = [x for x in original_list if x.strip()]   # removes empty elements
    original_list_no_quotation = [x.replace('"', '') for x in original_list_removed_empty]
    original_list_no_first_space = [x.lstrip() for x in original_list_no_quotation]
    
    summarized_list = summarized.replace('"', '')
    summarized_list = summarized.split(".")
    summarized_list_removed_empty = [x for x in summarized_list if x.strip()]   # removes empty elements
    summarized_list_no_quotation = [x.replace('"', '') for x in summarized_list_removed_empty]
    summarized_list_no_first_space = [x.lstrip() for x in summarized_list_no_quotation]
    
    print(original_list_no_first_space)
    print(summarized_list_no_first_space)
    
    print(len(original_list_no_first_space))
    print(len(summarized_list_no_first_space))
    for sentences in original_list_no_first_space:
        if sentences in summarized_list_no_first_space:
            Y_list.append(1)
            print(sentences)
            
        else:
            Y_list.append(0)
    return Y_list

In [216]:
Y = generate_Y_labels(original, summarized)
print(Y)

['Claxton hunting first major medal', "British hurdler Sarah Claxton is confident she can win her first major medal at next month's European Indoor Championships in Madrid", 'The 25-year-old has already smashed the British record over 60m hurdles twice this season, setting a new mark of 7', '96 seconds to win the AAAs title', 'I am quite confident, said Claxton', 'But I take each race as it comes', 'As long as I keep up my training but not do too much I think there is a chance of a medal', 'Claxton has won the national 60m hurdles title for the past three years but has struggled to translate her domestic success to the international stage', 'Now, the Scotland-born athlete owns the equal fifth-fastest time in the world this year', "And at last week's Birmingham Grand Prix, Claxton left European medal favourite Russian Irina Shevchenko trailing in sixth spot", 'For the first time, Claxton has only been preparing for a campaign over the hurdles - which could explain her leap in form', 'In

In [220]:
def generate_X_labels(preprocessed_artcile):
    sentence_length_feature = sentence_length(preprocessed_artcile)
    
    article_modified = convert_list_to_string(article_preprocessed)
    tf_idf_matrix = calculate_TF_IDF(article_modified)
    score = []
    for index, line in enumerate(tf_idf_matrix):
#         print(line)
#         print('at')
#         print(index)
        score.append(np.sum(line))
    matrix = np.column_stack((sentence_length_feature, score))
    matrix = matrix[:len(matrix)-1]
    return matrix

In [221]:
X = generate_X_labels(article_preprocessed)
print(X)

['claxton', 'hunt', 'first', 'major', 'medal']
['british', 'hurdler', 'sarah', 'claxton', 'confid', 'win', 'first', 'major', 'medal', 'next', "month'", 'european', 'indoor', 'championship', 'madrid']
['the', '25-year-old', 'alreadi', 'smash', 'british', 'record', '60m', 'hurdl', 'twice', 'season,', 'set', 'new', 'mark', '7']
['96', 'second', 'win', 'aaa', 'titl']
['"i', 'quit', 'confident,"', 'said', 'claxton']
['"but', 'i', 'take', 'race', 'come']
['"a', 'long', 'i', 'keep', 'train', 'much', 'i', 'think', 'chanc', 'medal']
['claxton', 'nation', '60m', 'hurdl', 'titl', 'past', 'three', 'year', 'struggl', 'translat', 'domest', 'success', 'intern', 'stage']
['now,', 'scotland-born', 'athlet', 'own', 'equal', 'fifth-fastest', 'time', 'world', 'year']
['and', 'last', "week'", 'birmingham', 'grand', 'prix,', 'claxton', 'left', 'european', 'medal', 'favourit', 'russian', 'irina', 'shevchenko', 'trail', 'sixth', 'spot']
['for', 'first', 'time,', 'claxton', 'prepar', 'campaign', 'hurdl', 'coul

In [237]:
m = len(Y)  # training set size
m2 = len(X)
print(m)
print(m2)
nn_input_dim = 2  # input layer dimensionality (we have two input features)
nn_output_dim = 1  # output layer dimensionality (we have one output)

# Gradient descent parameters
alpha = 0.2  # learning rate for gradient descent
print(Y)
print(X)

13
13
[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1]
[[0.29411765 2.1925485 ]
 [0.88235294 3.82819379]
 [0.82352941 3.85198754]
 [0.29411765 2.2306669 ]
 [0.29411765 1.95123377]
 [0.29411765 1.99637852]
 [0.58823529 2.62763431]
 [0.82352941 3.69742079]
 [0.52941176 3.30152319]
 [1.         4.08286008]
 [0.64705882 3.27426007]
 [0.82352941 4.10771955]
 [0.76470588 3.42691045]]


In [223]:
def sigmoid(x):
    # TODO 1: Compute the sigmoid function at the given x (~1 line)
    # For example: sigmoid(2) should compute the value of sigmoid function at x = 2.
    # Hint: Use np.exp instead of math.exp to allow for vectorization.
    #----------------------------------------------------------------------------------------------
    sig = (1/(1+np.exp(-x)))
    #----------------------------------------------------------------------------------------------
    
    return sig

In [239]:
def build_model(nn_hdim, num_passes=20000, print_loss=False):
    
    # This function learns parameters for the neural network and returns the model.
    # - nn_hdim: Number of nodes in the hidden layer
    # - num_passes: Number of iterations (epochs) through the training data for gradient descent
    # - print_loss: If True, print the loss every 1000 iterations

    # Initialize the parameters to random values. We need to learn these at the end.
    np.random.seed(0)
    W1 = np.random.randn(nn_hdim, nn_input_dim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((nn_hdim, 1))
    W2 = np.random.randn(nn_output_dim, nn_hdim) / np.sqrt(nn_hdim)
    b2 = np.zeros((nn_output_dim, 1))

    # This is what we return at the end
    model = {}

    # Batch Gradient descent (We accumulate the loss for each training point before updating the weights)
    # For each iteration:
    for i in range(0, num_passes):
        DW1 = 0
        DW2 = 0
        Db1 = 0
        Db2 = 0
        cost = 0
        # Loop on every training example...
        for j in range(0, m):
            a0 = X[j, :].reshape(-1, 1)  # Every training example is a column vector.
            y = Y[j]
            
            # TODO 2: Apply forward propagation on every training example a0 (a column vector 2x1) with its
            # corresponding label y. It is required to compute z1, a1, z2, and a2
            #----------------------------------------------------------------------------------------------
            # Forward propagation
            z1 = np.dot(W1 , a0 )+ b1
            a1 = np.tanh(z1)
            z2 = np.dot(W2 , a1) + b2
            a2 = sigmoid(z2)
            #----------------------------------------------------------------------------------------------

            # TODO 3: Compute the cost/loss function for every training example (Hint: use np.log)
            # ---------------------------------------------------------------------------------------------
            cost_j = -1 * ((np.log(a2) * y + (1-y)* np.log(1-a2)))
            # ---------------------------------------------------------------------------------------------

            # TODO 4: Derive the equations of backpropagation to find dW2, db2, dW1, and db1.
            # Hint: Check the dimensions at each step. 
            # Hint: For element-wise multiplication use *, for matrix multiplication use @
            # Example: y = A * B performs element wise multiplication 
            #          y = A @ B performs matrix multiplication
            # ---------------------------------------------------------------------------------------------
            da2 =  ( -y/a2  + (1-y)/(1-a2) )
            dz2 =  da2 * a2 * ( 1 - a2)
            dW2 = np.dot(dz2 , a1.T)
            db2 = dz2

            da1 =  np.dot(dz2,W2).T
            dz1 = np.multiply(da1 , 1 - np.square(a1) )
            dW1 = np.dot(dz1 , a0.T )
            db1 = dz1
            # ---------------------------------------------------------------------------------------------
            
            # Accumulating the sum of dW1, db1, dW2, db2 and cost_j into the variables DW1, Db1, DW2, Db2 and cost
            # for all training set. 
            DW1 += dW1
            DW2 += dW2
            Db2 += db2
            Db1 += db1
            cost += cost_j
        
        # Averaging DW1, DW2, Db1, Db2 and cost over the m training examples. 
        DW1 /= m
        DW2 /= m
        Db1 /= m
        Db2 /= m
        cost /= m

        # TODO 5: Perform the gradient descent parameter update.
        # ---------------------------------------------------------------------------------------------------
        # Gradient descent parameter update
        W1 -= alpha * DW1
        b1 -= alpha * Db1
        W2 -= alpha * DW2
        b2 -= alpha * Db2
        # ---------------------------------------------------------------------------------------------------

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # Optionally print the loss.
        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, cost))
    print(model)
    return model

In [240]:
# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    a0 = x.T
    
    # TODO 6 (aka TODO 2): Apply forward propagation on every test example a0 (a column vector 2x1) with its
    #  corresponding label y. It is required to compute z1, a1, z2, and a2  (SAME AS TODO2).
    # -----------------------------------------------------------------------------------------------
    z1 = np.dot(W1 , a0) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(W2 , a1) + b2
    a2 = sigmoid(z2)
    # ------------------------------------------------------------------------------------------------
    # Applying a threshold of 0.5 (i.e. predictions greater than 0.5 are mapped to 1, and 0 otherwise)
    prediction = np.round(a2)
    
    return prediction

In [245]:
model = build_model(nn_hdim=8, num_passes=10001, print_loss=True)

Loss after iteration 0: 0.684393
Loss after iteration 1000: 0.614555
Loss after iteration 2000: 0.596470
Loss after iteration 3000: 0.574862
Loss after iteration 4000: 0.548112
Loss after iteration 5000: 0.509054
Loss after iteration 6000: 0.469818
Loss after iteration 7000: 0.443109
Loss after iteration 8000: 0.409340
Loss after iteration 9000: 0.316174
Loss after iteration 10000: 0.254303
{'W1': array([[12.26954788, -3.45243096],
       [ 0.5801395 ,  0.74633741],
       [ 1.36176148, -4.17164096],
       [ 0.16700643, -0.52786444],
       [ 0.63121089,  1.02068698],
       [ 0.87423084,  0.96460447],
       [-0.24798193, -1.19908236],
       [ 0.95918156, -2.93361678]]), 'b1': array([[ 4.48743384],
       [-0.52572212],
       [15.03638251],
       [ 0.45146368],
       [-2.00637842],
       [-2.01112914],
       [ 2.25139129],
       [ 7.87686656]]), 'W2': array([[ 6.4886588 , -1.19787487,  3.58297149,  0.85084144, -2.63104263,
        -2.19928309,  2.62615517, -5.27983905]]), 'b2'