In [1]:
import sys
import os
import re
import string
import numpy as np
import math
import nltk
import io
import csv
import pandas as pd
import trax
import textwrap
wrapper = textwrap.TextWrapper(width=70)


from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice
from rouge import Rouge
from trax import layers as tlayer
from trax.fastmath import numpy as trax_np
from trax.supervised import training





In [2]:
# Fetch the NLTK data used by the cells below. When a package is already installed
# the call only reports that it is up to date, so re-running this cell is harmless.
# NOTE(review): proper_nouns_number() calls nltk.pos_tag, which presumably needs an
# additional tagger resource that is not downloaded here — confirm on a fresh machine.
nltk.download('stopwords')  # English stop-word list read via stopwords.words('english')
nltk.download('punkt')  # tokenizer data for sent_tokenize / word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mohammedzaki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/mohammedzaki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Extractive Summary

In [3]:
def preprocessing(article):
    """Turn raw article text into stemmed, stop-word-free sentences.

    The text is split into sentences.  In every sentence all characters that are
    neither ASCII letters nor whitespace are removed, the remainder is
    word-tokenised, tokens found in the English stop-word list or in
    ``string.punctuation`` are dropped, and the surviving tokens are Porter-stemmed.
    Tokens are compared against the stop-word list as-is; no case folding happens
    before that check.

    Args:
        article: The article text as one string.

    Returns:
        A list with one entry per sentence that still has at least one word,
        each entry being the list of stemmed words of that sentence.  Sentences
        that end up empty are left out.
    """
    porter = PorterStemmer()
    ignored_words = stopwords.words('english')

    processed_sentences = []
    for raw_sentence in sent_tokenize(article):
        letters_only = re.sub(r"[^a-zA-Z\s]+", "", raw_sentence)
        stems = [
            porter.stem(token)
            for token in word_tokenize(letters_only)
            if not (token in ignored_words or token in string.punctuation)
        ]
        # Keep only sentences that still contain something after filtering.
        if stems:
            processed_sentences.append(stems)

    return processed_sentences


In [4]:
def convert_list_to_string(sentences):  # converts list of lists to list of strings
    """Join every tokenised sentence into a single space-separated string.

    Args:
        sentences: List of sentences, each a list of word strings.

    Returns:
        A list of strings, one per input sentence, in the same order.
    """
    return [' '.join(tokens) for tokens in sentences]

In [5]:
### Feature 1

def calculate_TF_IDF(content):
    """Score every sentence by the TF-IDF weight of its words.

    Each sentence is treated as one document:

    * tf(word, sentence)  = occurrences of the word in the sentence / sentence length
    * idf(word)           = log10(number of sentences / sentences containing the word)
    * score(sentence)     = sum of tf * idf over every word *occurrence* in the
      sentence (a word that appears c times is therefore added c times).

    The scores are finally divided by the largest score so that the best sentence
    gets 1.0.

    Args:
        content: List of sentences, each a list of (preprocessed) word strings.

    Returns:
        A 1-D float ``numpy`` array with one normalised score per sentence.  When
        no sentence has a positive score (for example a single-sentence article,
        where every idf is 0) the scores are returned as zeros instead of NaN, and
        an empty input yields an empty array.
    """
    num_sentences = len(content)
    if num_sentences == 0:
        return np.zeros(0)

    # Word counts per sentence and, for every word, the number of sentences it
    # appears in.  Plain dicts are used instead of item-wise writes into a
    # DataFrame: chained `df[col][row] += ...` assignments are not guaranteed to
    # reach the frame.
    counts_per_sentence = []
    sentences_with_word = {}
    for sentence in content:
        counts = {}
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
        counts_per_sentence.append(counts)
        for word in counts:
            sentences_with_word[word] = sentences_with_word.get(word, 0) + 1

    idf = {
        word: np.log10(num_sentences / containing)
        for word, containing in sentences_with_word.items()
    }

    tf_idf = np.zeros(num_sentences)
    for i, sentence in enumerate(content):
        counts = counts_per_sentence[i]
        for word in sentence:
            tf_idf[i] += (counts[word] / len(sentence)) * idf[word]

    # Normalise by the best score; skip the division when it would be 0/0.
    highest = tf_idf.max()
    if highest > 0:
        tf_idf = tf_idf / highest

    return tf_idf

In [6]:
### Feature 2

def sentence_length(article_preprocessed):
    """Relative length of every sentence (length / length of the longest sentence).

    Args:
        article_preprocessed: List of sentences, each a list of word strings.

    Returns:
        A list of floats in [0, 1], one per sentence.  If no sentence contains a
        word, every entry is 0.0 (instead of dividing by zero); an empty input
        gives an empty list.
    """
    # The length is the number of whitespace-separated tokens of the joined
    # sentence, i.e. the same count obtained by joining the words with spaces
    # and splitting the result again.
    word_counts = [len(' '.join(sentence).split()) for sentence in article_preprocessed]

    longest = max(word_counts, default=0)
    if longest == 0:
        # Same convention as proper_nouns_number: no signal -> all-zero feature.
        return [0.0 for _ in word_counts]

    return [count / longest for count in word_counts]

In [7]:
### Feature 3

def proper_nouns_number(sentences):
    """Relative number of proper nouns per sentence.

    A sentence is only scored when, after removing every character that is not an
    ASCII letter or whitespace, it still contains at least one token that is
    neither an English stop word nor punctuation; other sentences are skipped
    (the same filter ``preprocessing`` applies, so both stay aligned).  For every
    scored sentence the tokens tagged ``'NNP'`` by ``nltk.pos_tag`` are counted
    and the counts are divided by the largest count.

    Args:
        sentences: List of raw sentence strings.

    Returns:
        A list of floats in [0, 1], one per scored sentence.  All entries are 0.0
        when no proper noun was found anywhere, and the list is empty when there
        is nothing to score.
    """
    if len(sentences) == 0:
        return []

    stopwords_english = stopwords.words('english')
    proper_noun_counts = []
    for sentence in sentences:
        letters_only = re.sub(r"[^a-zA-Z\s]+", "", sentence)
        has_content_word = any(
            word not in stopwords_english and word not in string.punctuation
            for word in word_tokenize(letters_only)
        )
        if not has_content_word:
            continue
        # Tagging is done on the untouched sentence, not on the stripped copy.
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        proper_noun_counts.append(sum(1 for _, pos in tagged if pos == 'NNP'))

    # Computed once; `default` covers the case where every sentence was skipped.
    highest = max(proper_noun_counts, default=0)
    if highest == 0:
        return [0.0] * len(proper_noun_counts)

    return [count / highest for count in proper_noun_counts]

In [8]:
# Try proper_nouns_number on the first business article.
# The context manager closes the file even if reading raises.
with io.open("train_original/business/article (" + str(1) +").txt", "r", encoding='utf8') as article_file:
    article_file.readline()  # the first line of the file is read and discarded
    article = article_file.read()
article = sent_tokenize(article)
article[0] = article[0][1:] # to remove the \n
num_proper_nouns = proper_nouns_number(article)
print("Number of proper nouns:", num_proper_nouns)

Number of proper nouns: [0.6, 0.2, 0.2, 0.6, 0.8, 0.2, 0.0, 0.2, 0.4, 1.0, 0.4, 0.8, 0.2, 0.4, 0.2, 0.6, 0.2, 0.0, 0.6, 0.4]


In [9]:
def generate_X_labels(article_preprocessed, original_sentences):
    """Assemble the per-sentence feature matrix of one article.

    Args:
        article_preprocessed: Output of ``preprocessing`` (list of word lists).
        original_sentences: The raw sentence strings of the same article.

    Returns:
        A 2-D array with one row per sentence and three columns, in this order:
        TF-IDF score, relative sentence length, relative proper-noun count.
    """
    feature_columns = (
        calculate_TF_IDF(article_preprocessed),      # feature 1 (tf_idf)
        sentence_length(article_preprocessed),       # feature 2 (sentence_length)
        proper_nouns_number(original_sentences),     # feature 3 (proper nouns number)
    )
    return np.column_stack(feature_columns)

In [10]:
def generate_Y_labels(original, summarized):
    """Label every sentence of the article with 1 if the summary contains it.

    Args:
        original: Full article text.
        summarized: Reference summary text.

    Returns:
        A tuple ``(labels, original_sentences)``: ``labels`` holds one 0/1 entry
        per article sentence (1 when the sentence is a substring of some summary
        sentence), and ``original_sentences`` is the list of article sentences
        with the first character of the first sentence removed.
    """
    original_sentences = sent_tokenize(original)
    original_sentences[0] = original_sentences[0][1:] # to remove the \n
    summary_sentences = sent_tokenize(summarized)

    labels = [
        1 if any(sentence in summary_sentence for summary_sentence in summary_sentences) else 0
        for sentence in original_sentences
    ]

    return labels, original_sentences
    

In [11]:
X_matrix = []
X = []
Y = []
sentences = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

# Build one feature row (X) and one 0/1 label (Y) per sentence from articles
# 1..50 of every category.
for article_type in article_types:
    for i in range (1, 51):
        # `with` closes each file even when reading or decoding fails.
        with io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf8') as article_file:
            article_file.readline()  # the first line of the file is read and discarded
            article = article_file.read()

        with io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf8') as summarized_file:
            summarized = summarized_file.read()

        article_preprocessed = preprocessing(article)
        original_sentences = sent_tokenize(article)
        original_sentences[0] = original_sentences[0][1:] # to remove the \n
        X_i = generate_X_labels(article_preprocessed, original_sentences)
        Y_i, original_list_no_first_space = generate_Y_labels(article, summarized)

        # Features skip sentences without content words while labels cover every
        # sentence, so the two can differ in length; this is only reported here.
        if(len(X_i) != len(Y_i)):
            print('Error! features and labels are not equal in length')

        Y.extend(Y_i)
        X_matrix.extend(X_i)
        sentences.extend(original_list_no_first_space)


for x in X_matrix:
    X.append(x.tolist())

# Kept as np.matrix: rows stay 2-D, which the training/prediction cells rely on
# when they take X[j, :] and x.T.
X = np.matrix(X)

m = len(X)

print(len(X))
print(len(Y))

4179
4179


In [12]:
# Inspect the assembled data: container types plus the first 50 feature rows and labels.
print(type(X))
print(X[: 50, :])
print(type(Y))
print(Y[: 50])

<class 'numpy.matrix'>
[[0.73203128 0.54545455 0.6       ]
 [0.87185081 0.63636364 0.2       ]
 [0.69813911 0.36363636 0.2       ]
 [0.75779078 0.59090909 0.6       ]
 [0.74727107 0.31818182 0.8       ]
 [0.72061857 0.27272727 0.2       ]
 [0.78105634 0.45454545 0.        ]
 [0.69638967 0.63636364 0.2       ]
 [0.8741924  0.81818182 0.4       ]
 [0.78987147 0.59090909 1.        ]
 [0.69569929 0.40909091 0.4       ]
 [0.95439123 1.         0.8       ]
 [0.74587596 0.45454545 0.2       ]
 [0.87394958 0.77272727 0.4       ]
 [0.74736969 0.63636364 0.2       ]
 [0.7488841  0.5        0.6       ]
 [0.79524522 0.40909091 0.2       ]
 [1.         0.59090909 0.        ]
 [0.77195353 0.77272727 0.6       ]
 [0.84899475 0.40909091 0.4       ]
 [0.71124193 0.75       0.375     ]
 [0.93292871 0.66666667 0.375     ]
 [0.8066192  0.375      0.375     ]
 [0.6972612  0.29166667 0.        ]
 [0.89068626 1.         1.        ]
 [0.8456085  0.95833333 0.75      ]
 [0.8264392  0.54166667 0.        ]
 [0.7

In [13]:
nn_input_dim = 3 # input layer size (three input features: tf-idf, sentence length, proper-noun count)
nn_output_dim = 1  # output layer size (we have one output)

# Gradient descent parameters
alpha = 0.1  # learning rate for gradient descent

In [14]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), evaluated element-wise on x."""
    return 1 / (np.exp(-x) + 1)

def softmax(x):
    """Softmax of x along axis 0.

    For a 1-D input the result is the usual softmax vector; for a 2-D input each
    column is normalised independently (the sum is taken over axis 0).

    The maximum subtracted before exponentiating is taken along the same axis as
    the normalising sum.  Subtracting one global maximum instead would let a
    column that lies far below it underflow to all zeros and produce 0/0 = NaN.
    """
    shifted = x - np.max(x, axis=0)  # largest entry of every column becomes 0
    e_x = np.exp(shifted)
    return e_x / np.sum(e_x, axis=0)

In [15]:
# Quick check: dividing an ndarray by a scalar divides every element.
my_matrix = np.array([[10, 20, 30],
                      [40, 50, 60],
                      [70, 80, 90]])

divisor = 5
print(my_matrix / divisor)

[[ 2.  4.  6.]
 [ 8. 10. 12.]
 [14. 16. 18.]]


In [22]:

def build_model(nn_hdim, num_passes=20000, print_loss=False):
    """Train a one-hidden-layer network with batch gradient descent.

    Architecture: input -> tanh hidden layer -> sigmoid output, trained with the
    binary cross-entropy loss.  The function reads these notebook-level names:
    ``X`` (one training example per row), ``Y`` (labels), ``m`` (number of
    examples), ``alpha`` (learning rate), ``nn_input_dim`` and ``nn_output_dim``.

    Args:
        nn_hdim: Number of nodes in the hidden layer.
        num_passes: Number of iterations (epochs) through the training data.
        print_loss: If True, print the mean loss every 1000 iterations.

    Returns:
        A dict with the learned parameters under the keys 'W1', 'b1', 'W2' and
        'b2'.  The dict is empty when ``num_passes`` is 0.
    """
    # Initialize the parameters to random values (fixed seed, so runs repeat).
    np.random.seed(0)
    W1 = np.random.randn(nn_hdim, nn_input_dim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((nn_hdim, 1))
    W2 = np.random.randn(nn_output_dim, nn_hdim) / np.sqrt(nn_hdim)
    b2 = np.zeros((nn_output_dim, 1))

    # This is what we return at the end
    model = {}

    # Keeps log() finite in the loss when the sigmoid output is exactly 0 or 1.
    eps = 1e-12

    # Batch gradient descent: gradients of all examples are accumulated before
    # the weights are updated once per pass.
    for i in range(0, num_passes):
        DW1 = 0
        DW2 = 0
        Db1 = 0
        Db2 = 0
        cost = 0.0
        for j in range(0, m):
            a0 = X[j, :].reshape(-1, 1)  # training example as a column vector (nn_input_dim x 1)
            y = Y[j]

            # Forward propagation
            z1 = np.dot(W1, a0) + b1
            a1 = np.tanh(z1)
            z2 = np.dot(W2, a1) + b2
            a2 = sigmoid(z2)

            # Cross-entropy loss of this example (output clipped only for the log).
            a2_safe = np.clip(a2, eps, 1 - eps)
            cost_j = -1 * (np.log(a2_safe) * y + (1 - y) * np.log(1 - a2_safe))

            # Backpropagation.
            # dL/da2 = -y/a2 + (1-y)/(1-a2) and da2/dz2 = a2 * (1 - a2); their
            # product simplifies to a2 - y.  Using the simplified form avoids the
            # division by zero (inf * 0 = NaN) of the two-step version once the
            # sigmoid saturates.
            dz2 = a2 - y
            dW2 = np.dot(dz2, a1.T)
            db2 = dz2

            # dL/da1 = W2^T . dz2 (same values as dot(dz2, W2).T for one output).
            da1 = np.dot(W2.T, dz2)
            dz1 = np.multiply(da1, 1 - np.square(a1))  # tanh'(z1) = 1 - a1^2
            dW1 = np.dot(dz1, a0.T)
            db1 = dz1

            # Accumulate the gradients and the loss over the training set.
            DW1 += dW1
            DW2 += dW2
            Db2 += db2
            Db1 += db1
            cost += float(np.sum(cost_j))  # plain float, so "%f" below formats it directly

        # Averaging DW1, DW2, Db1, Db2 and cost over the m training examples.
        DW1 /= m
        DW2 /= m
        Db1 /= m
        Db2 /= m
        cost /= m

        # Gradient descent parameter update
        W1 -= alpha * DW1
        b1 -= alpha * Db1
        W2 -= alpha * DW2
        b2 -= alpha * Db2

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # Optionally print the loss (the mean over the pass that just finished).
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, cost))

    return model


In [23]:
# Helper function: forward pass that returns the sigmoid output (not rounded to 0 or 1)
def predict(model, x):
    """Run the trained network on x and return the output activation.

    Args:
        model: Dict holding the parameters 'W1', 'b1', 'W2' and 'b2'.
        x: Input examples, one per row; it is transposed before the first layer.

    Returns:
        The sigmoid activation of the output layer, one column per example.
    """
    W1, b1, W2, b2 = (model[key] for key in ('W1', 'b1', 'W2', 'b2'))
    hidden = np.tanh(np.dot(W1, x.T) + b1)
    return sigmoid(np.dot(W2, hidden) + b2)

# def predict(model, x):
#     W1, b1, W2, b2, W3, b3, W4, b4, W5, b5, W6, b6, W7, b7 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3'], + \
#                                                              model['W4'], model['b4'], model['W5'], model['b5'], model['W6'], model['b6'], + \
#                                                              model['W7'], model['b7']
#     a0 = x.T
    
#     # TODO 6 (aka TODO 2): Apply forward propagation on every test example a0 (a column vector 2x1) with its
#     #  corresponding label y. It is required to compute z1, a1, z2, and a2  (SAME AS TODO2).
#     # -----------------------------------------------------------------------------------------------
#     z1 = np.dot(W1 , a0)+ b1
#     a1 = np.tanh(z1)
#     z2 = np.dot(W2 , a1)+ b2
#     a2 = np.tanh(z2)
#     z3 = np.dot(W3 , a2)+ b3
#     a3 = np.tanh(z3)
#     z4 = np.dot(W4 , a3)+ b4
#     a4 = np.tanh(z4)
#     z5 = np.dot(W5 , a4)+ b5
#     a5 = np.tanh(z5)
#     z6 = np.dot(W6 , a5)+ b6
#     a6 = np.tanh(z6)
#     z7 = np.dot(W7 , a6) + b7
#     a7 = sigmoid(z7)
#     # ------------------------------------------------------------------------------------------------
#     # Applying a threshold of 0.5 (i.e. predictions greater than 0.5 are mapped to 1, and 0 otherwise)
#     # prediction = np.round(a2)
#     prediction = a7    
#     return prediction

In [24]:
# Train with 8 hidden units for 3001 passes; print_loss=True reports the loss every 1000 passes.
# NOTE(review): the output stored under this cell shows printed arrays and ends in a
# KeyboardInterrupt, which the build_model defined above would not print — it looks
# like it was produced by an earlier version of the function; confirm by re-running.
model = build_model(nn_hdim= 8, num_passes = 3001, print_loss=True)

2.23606797749979
[[0.73203128]
 [0.54545455]
 [1.8       ]]
[[0.28311261]
 [0.26044848]
 [0.45643891]]
2.23606797749979
[[0.87185081]
 [0.63636364]
 [0.6       ]]
[[0.35899334]
 [0.32310934]
 [0.31789733]]
2.23606797749979
[[0.69813911]
 [0.36363636]
 [0.6       ]]
[[0.35484682]
 [0.30554346]
 [0.33960972]]
2.23606797749979
[[0.75779078]
 [0.59090909]
 [1.8       ]]
[[0.28394282]
 [0.26352312]
 [0.45253405]]
2.23606797749979
[[0.74727107]
 [0.31818182]
 [2.4       ]]
[[0.25513512]
 [0.21058697]
 [0.53427791]]
2.23606797749979
[[0.72061857]
 [0.27272727]
 [0.6       ]]
[[0.36153689]
 [0.29591167]
 [0.34255144]]
2.23606797749979
[[0.78105634]
 [0.45454545]
 [0.        ]]
[[0.38920765]
 [0.33633003]
 [0.27446232]]
2.23606797749979
[[0.69638967]
 [0.63636364]
 [0.6       ]]
[[0.34114303]
 [0.33210706]
 [0.32674991]]
2.23606797749979
[[0.8741924 ]
 [0.81818182]
 [1.2       ]]
[[0.31927278]
 [0.31137474]
 [0.36935248]]
2.23606797749979
[[0.78987147]
 [0.59090909]
 [3.        ]]
[[0.21730647]

2.23606797749979
[[0.84330297]
 [0.70588235]
 [0.6       ]]
[[0.35244822]
 [0.33144018]
 [0.3161116 ]]
2.23606797749979
[[0.90414848]
 [0.82352941]
 [0.        ]]
[[0.37993915]
 [0.36648484]
 [0.25357601]]
2.23606797749979
[[0.81575249]
 [0.47058824]
 [0.6       ]]
[[0.36166589]
 [0.30993378]
 [0.32840033]]
2.23606797749979
[[0.8398632 ]
 [0.41176471]
 [0.        ]]
[[0.3979881 ]
 [0.32864253]
 [0.27336936]]
2.23606797749979
[[0.80778499]
 [1.        ]
 [3.        ]]
[[0.21029366]
 [0.22917046]
 [0.56053589]]
2.23606797749979
[[0.90551338]
 [0.47058824]
 [0.        ]]
[[0.40156652]
 [0.33058664]
 [0.26784684]]
2.23606797749979
[[0.91374318]
 [0.76470588]
 [2.4       ]]
[[0.25777326]
 [0.24115234]
 [0.50107441]]
2.23606797749979
[[0.80320292]
 [0.58823529]
 [0.6       ]]
[[0.35442495]
 [0.32193838]
 [0.32363667]]
2.23606797749979
[[1.        ]
 [0.41176471]
 [0.6       ]]
[[0.38389304]
 [0.29509495]
 [0.32101201]]
2.23606797749979
[[0.87931461]
 [0.82352941]
 [0.6       ]]
[[0.34990309]

2.23606797749979
[[0.79417995]
 [0.7       ]
 [1.28571429]]
[[0.31205049]
 [0.29918031]
 [0.3887692 ]]
2.23606797749979
[[0.93334742]
 [0.9       ]
 [2.14285714]]
[[0.27006845]
 [0.26607069]
 [0.46386086]]
2.23606797749979
[[0.70555804]
 [0.52173913]
 [2.14285714]]
[[0.26158551]
 [0.24094168]
 [0.49747282]]
2.23606797749979
[[0.96125613]
 [0.69565217]
 [0.85714286]]
[[0.35180199]
 [0.31240074]
 [0.33579727]]
2.23606797749979
[[0.85949167]
 [0.52173913]
 [1.07142857]]
[[0.33792632]
 [0.29055136]
 [0.37152232]]
2.23606797749979
[[0.69123575]
 [0.56521739]
 [1.92857143]]
[[0.27142371]
 [0.25655011]
 [0.47202618]]
2.23606797749979
[[0.84044495]
 [0.73913043]
 [1.71428571]]
[[0.29121788]
 [0.27831748]
 [0.43046464]]
2.23606797749979
[[0.875     ]
 [0.69565217]
 [1.07142857]]
[[0.33170302]
 [0.30613717]
 [0.36215981]]
2.23606797749979
[[0.96523359]
 [0.34782609]
 [0.        ]]
[[0.41525589]
 [0.31506592]
 [0.26967819]]
2.23606797749979
[[0.89557797]
 [0.86956522]
 [2.14285714]]
[[0.267719 ]


2.23606797749979
[[0.95786347]
 [0.6       ]
 [3.        ]]
[[0.23017232]
 [0.19613179]
 [0.57369589]]
2.23606797749979
[[0.7458678]
 [0.25     ]
 [1.5      ]]
[[0.3122844]
 [0.2501737]
 [0.4375419]]
2.23606797749979
[[0.87583666]
 [0.6       ]
 [0.        ]]
[[0.39064583]
 [0.3453103 ]
 [0.26404387]]
2.23606797749979
[[0.7486895]
 [0.35     ]
 [0.5      ]]
[[0.36610777]
 [0.30631941]
 [0.32757283]]
2.23606797749979
[[0.84025725]
 [0.8       ]
 [2.        ]]
[[0.27308075]
 [0.26820831]
 [0.45871094]]
2.23606797749979
[[0.80860037]
 [0.5       ]
 [0.        ]]
[[0.38946288]
 [0.33925712]
 [0.27128   ]]
2.23606797749979
[[0.98590695]
 [0.65      ]
 [1.5       ]]
[[0.3206153 ]
 [0.27589485]
 [0.40348986]]
2.23606797749979
[[0.93311574]
 [0.5       ]
 [0.        ]]
[[0.40278228]
 [0.33185593]
 [0.26536179]]
2.23606797749979
[[0.76094203]
 [0.45      ]
 [2.        ]]
[[0.27696282]
 [0.24100695]
 [0.48203023]]
2.23606797749979
[[0.84295586]
 [0.55      ]
 [0.5       ]]
[[0.36562884]
 [0.3207

2.23606797749979
[[0.5086136 ]
 [0.18181818]
 [0.        ]]
[[0.37585712]
 [0.32475196]
 [0.29939092]]
2.23606797749979
[[0.49655927]
 [0.25454545]
 [0.66666667]]
[[0.33596932]
 [0.30150541]
 [0.36252527]]
2.23606797749979
[[0.4917614 ]
 [0.30909091]
 [0.66666667]]
[[0.33300969]
 [0.30688677]
 [0.36010354]]
2.23606797749979
[[0.50975926]
 [0.32727273]
 [0.33333333]]
[[0.35139959]
 [0.32386072]
 [0.3247397 ]]
2.23606797749979
[[0.56462124]
 [0.34545455]
 [0.33333333]]
[[0.35607838]
 [0.32283345]
 [0.32108818]]
2.23606797749979
[[0.52929912]
 [0.25454545]
 [0.        ]]
[[0.37402812]
 [0.33078129]
 [0.29519059]]
2.23606797749979
[[0.59461264]
 [0.18181818]
 [0.33333333]]
[[0.36749149]
 [0.30554372]
 [0.32696479]]
2.23606797749979
[[0.52419349]
 [0.16363636]
 [0.33333333]]
[[0.36110617]
 [0.30733119]
 [0.33156264]]
2.23606797749979
[[0.5553575 ]
 [0.38181818]
 [0.33333333]]
[[0.35325652]
 [0.32687745]
 [0.31986603]]
2.23606797749979
[[0.52970854]
 [0.21818182]
 [2.        ]]
[[0.26315941]

2.23606797749979
[[0.54526714]
 [0.66666667]
 [0.33333333]]
[[0.33722654]
 [0.3560412 ]
 [0.30673226]]
2.23606797749979
[[0.52474034]
 [0.55555556]
 [1.33333333]]
[[0.28989534]
 [0.29391804]
 [0.41618661]]
2.23606797749979
[[1.        ]
 [0.38888889]
 [0.        ]]
[[0.41661911]
 [0.31699158]
 [0.26638931]]
2.23606797749979
[[0.56458752]
 [0.55555556]
 [1.66666667]]
[[0.27525703]
 [0.27414745]
 [0.45059552]]
2.23606797749979
[[0.65486999]
 [0.61111111]
 [0.        ]]
[[0.366738  ]
 [0.35963087]
 [0.27363113]]
2.23606797749979
[[0.3845702 ]
 [0.38888889]
 [0.33333333]]
[[0.33565157]
 [0.33630047]
 [0.32804796]]
2.23606797749979
[[0.80497954]
 [0.66666667]
 [0.        ]]
[[0.37911865]
 [0.35637866]
 [0.26450269]]
2.23606797749979
[[0.53354982]
 [0.38888889]
 [0.        ]]
[[0.36696232]
 [0.34397358]
 [0.2890641 ]]
2.23606797749979
[[0.74714195]
 [0.54166667]
 [0.85714286]]
[[0.33753813]
 [0.30790372]
 [0.35455815]]
2.23606797749979
[[0.93075824]
 [0.66666667]
 [2.14285714]]
[[0.27714956]

2.23606797749979
[[0.67033714]
 [0.17391304]
 [0.        ]]
[[0.39340842]
 [0.31508445]
 [0.29150712]]
2.23606797749979
[[0.77851796]
 [1.        ]
 [2.4       ]]
[[0.23985649]
 [0.26483065]
 [0.49531286]]
2.23606797749979
[[1.        ]
 [0.47826087]
 [0.6       ]]
[[0.38050377]
 [0.30131833]
 [0.3181779 ]]
2.23606797749979
[[0.73610474]
 [0.52173913]
 [0.6       ]]
[[0.3509348 ]
 [0.31885397]
 [0.33021123]]
2.23606797749979
[[0.78855372]
 [0.60869565]
 [1.2       ]]
[[0.32002704]
 [0.29529372]
 [0.38467924]]
2.23606797749979
[[0.80803529]
 [0.43478261]
 [0.        ]]
[[0.39323799]
 [0.33278323]
 [0.27397877]]
2.23606797749979
[[0.85245722]
 [0.72222222]
 [0.75      ]]
[[0.34499004]
 [0.32547078]
 [0.32953919]]
2.23606797749979
[[0.80213527]
 [0.72222222]
 [2.25      ]]
[[0.25802057]
 [0.24896221]
 [0.49301722]]
2.23606797749979
[[0.98319918]
 [0.77777778]
 [0.        ]]
[[0.39116654]
 [0.35683238]
 [0.25200108]]
2.23606797749979
[[0.63269776]
 [0.33333333]
 [0.75      ]]
[[0.34146537]

2.23606797749979
[[0.75770637]
 [0.5       ]
 [1.8       ]]
[[0.28694781]
 [0.25571168]
 [0.45734052]]
2.23606797749979
[[0.66532645]
 [1.        ]
 [5.4       ]]
[[0.09550147]
 [0.11092034]
 [0.79357819]]
2.23606797749979
[[0.78605888]
 [0.77777778]
 [9.        ]]
[[0.02416548]
 [0.02407615]
 [0.95175838]]
2.23606797749979
[[0.78221104]
 [0.55555556]
 [0.        ]]
[[0.38337344]
 [0.346418  ]
 [0.27020857]]
2.23606797749979
[[0.73913058]
 [0.72222222]
 [3.6       ]]
[[0.17898629]
 [0.17763796]
 [0.64337574]]
2.23606797749979
[[0.60321954]
 [0.5       ]
 [5.4       ]]
[[0.09525158]
 [0.0909546 ]
 [0.81379382]]
2.23606797749979
[[0.69002362]
 [0.66666667]
 [0.        ]]
[[0.36709477]
 [0.36328023]
 [0.269625  ]]
2.23606797749979
[[0.87636543]
 [0.88888889]
 [9.        ]]
[[0.02510561]
 [0.02524661]
 [0.94964778]]
2.23606797749979
[[1.        ]
 [0.88888889]
 [0.        ]]
[[0.38596161]
 [0.36725172]
 [0.24678668]]
2.23606797749979
[[0.69487743]
 [0.77777778]
 [7.2       ]]
[[0.0490689 ]

KeyboardInterrupt: 

In [19]:
# Show the learned parameters.
# NOTE(review): the stored output lists a W2 with several rows, whereas build_model
# above creates W2 with nn_output_dim (= 1) rows — presumably this output belongs to
# a different model version; confirm.
print(model)

{'W1': array([[ 1.00928635,  0.07351834,  0.33735752],
       [ 1.22295925,  1.07735298, -0.62299389],
       [ 0.98602486, -0.35478846, -0.01500833],
       [ 0.20291243,  0.41523832,  0.8577783 ],
       [ 1.01529708, -0.39351811,  0.36468252],
       [ 0.3986278 ,  0.48836652,  0.04839573],
       [ 0.33036743, -1.15896626, -1.47832569],
       [ 0.81102003,  0.18049328, -0.46458352]]), 'b1': array([[-0.51391236],
       [-0.10592739],
       [-0.3485713 ],
       [ 0.1619751 ],
       [-0.42119977],
       [-0.21587941],
       [-0.12101779],
       [-0.25500703]]), 'W2': array([[ 1.35851593, -0.97104069,  0.15589377, -0.11237024,  0.94256831,
         0.73375435,  0.26557206,  0.21639691],
       [-0.53982657, -1.07545507, -0.29505734,  0.16863559,  0.65774359,
         0.71130925, -0.36213994, -0.26265189],
       [-0.63385341, -0.78829021, -1.05576679,  1.09969719, -0.34537992,
        -0.19548553, -0.78949401,  0.47523255],
       [-0.98597515, -0.14743318, -0.47773171,  0.2193

In [17]:
# # print(model)
# model tf-idf not normalized
# model = {'W1': np.array([[ 1.38785383,  0.49082933],
#        [ 0.55641748,  1.62806027],
#        [ 1.85376828, -0.3225326 ],
#        [ 1.78168709,  0.05507265],
#        [ 1.5035727 ,  0.15089058],
#        [-0.21027833,  1.92211576],
#        [ 0.46820238,  0.16111129],
#        [ 0.84196808,  0.12407808]]), 'b1': np.array([[ 0.38819622],
#        [-0.02713244],
#        [-0.8429403 ],
#        [-0.91256074],
#        [-0.66510711],
#        [ 0.01664907],
#        [ 0.03959506],
#        [-0.22607371]]), 'W2': np.array([[ 0.6266244 ,  0.50983672, -1.1677148 , -1.47645926, -1.39277239,
#          1.53280975,  0.17978621, -0.50006151]]), 'b2': np.array([[0.58550391]])}

# model = {'W1': np.array([[ 1.18336695,  0.1250435 ], # 3 layers
#        [ 0.38733117,  1.8530943 ],
#        [ 1.75886377, -0.58510996],
#        [ 0.54240451, -0.05259935],
#        [-0.09248666,  0.2141493 ],
#        [ 0.0432581 ,  0.9329085 ],
#        [ 0.66859945,  0.25386436],
#        [ 0.5554128 ,  0.36237266]]), 'b1': np.array([[-0.28405656],
#        [-0.40425758],
#        [-0.99870768],
#        [ 0.05080746],
#        [-0.09605534],
#        [-0.26051699],
#        [-0.18459905],
#        [-0.28472492]]), 'W2': np.array([[ 1.13457184, -0.20822159,  0.38311448, -0.51971462, -1.7927709 ,
#          0.47739742,  0.70361151, -0.44614564],
#        [ 1.63177288, -1.0197636 , -0.08924496, -0.14037122,  1.09361726,
#          1.04981611,  0.09540724,  0.2347938 ],
#        [-0.67150498, -1.45686798, -0.20758122,  0.09659014,  0.85981688,
#          0.80570778, -0.296603  , -0.22930296],
#        [-0.8237228 , -1.15021352, -1.24466209,  1.34648287, -0.36599564,
#         -0.38097044, -0.92866868,  0.50709689],
#        [-1.05799442, -0.01422579, -0.90377796,  0.28580989, -0.35451199,
#         -0.7717647 , -0.02472574,  0.26142491],
#        [ 0.01077594,  0.59710575, -0.83069778, -0.40593486, -0.49098818,
#         -0.20682705, -0.76479984, -1.38033905],
#        [ 0.03789108, -0.00967381, -1.58485507,  0.14915192, -0.62770378,
#          0.15492018,  0.36470344, -0.01501329],
#        [ 0.78647832, -1.02272337,  0.42132782, -0.45981076, -0.62587553,
#         -0.50757541, -0.22120812,  0.02481316]]), 'b2': np.array([[-0.05359941],
#        [ 0.12717962],
#        [-0.061023  ],
#        [-0.05207335],
#        [ 0.31889328],
#        [ 0.25685478],
#        [ 0.1398867 ],
#        [-0.04797326]]), 'W3': np.array([[-0.80349617,  0.71868534,  0.20718625, -1.20396728,  1.00463432,
#          1.57134532,  1.02363706, -0.37870051],
#        [-0.89039626,  0.50355204, -0.02357477,  1.08788051,  0.29202038,
#          0.60301908,  0.07305096,  0.60210272],
#        [ 0.03802195,  1.28299045,  0.06325746,  0.27166014,  1.30692966,
#         -1.09726151, -1.02922169,  0.86082546],
#        [-0.90207611,  1.29694031, -0.26202875, -0.47270786,  1.46564552,
#          1.30205973,  1.4459055 ,  0.57549995],
#        [-0.66125529,  1.43636039, -0.21285799,  0.59702511,  0.6944864 ,
#         -0.04123769,  0.48235273,  0.71124213],
#        [ 0.263836  , -0.77833376,  0.21475251,  0.94851392, -0.48556096,
#         -0.10818819, -0.30816457,  1.30529932],
#        [ 0.487098  ,  0.38296521, -0.62201678,  0.30699157, -0.51424632,
#          0.09400846, -0.3431704 ,  0.40476813],
#        [ 0.39364728, -0.2458679 ,  0.37127184, -0.72310426, -1.03654804,
#          0.2336883 ,  0.0376808 ,  0.47617952]]), 'b3': np.array([[ 0.14588719],
#        [-0.32320868],
#        [ 0.02159727],
#        [-0.02307498],
#        [ 0.08842675],
#        [ 0.00334234],
#        [ 0.09982744],
#        [-0.13245038]]), 'W4': np.array([[ 1.27121312,  0.57705831, -0.72331337,  0.69760112, -0.86182132,
#          0.17466716, -0.23174607,  0.40275134]]), 'b4': np.array([[-0.47140409]])}
# model tf-idf normalized
# model = {'W1': np.array([[ 1.29643591,  0.45074824],
#        [ 0.70773118,  1.58460852],
#        [ 2.67738766, -1.74837318],
#        [ 1.33109077,  0.10529432],
#        [ 0.69949236,  0.23108707],
#        [-0.35599099,  2.07436754],
#        [ 0.47995731,  0.24472955],
#        [ 0.63720221,  0.2369889 ]]), 'b1': np.array([[ 0.24555316],
#        [-0.02617257],
#        [-1.61662627],
#        [-1.31429877],
#        [-0.17949485],
#        [ 0.07467697],
#        [ 0.13714999],
#        [ 0.03253189]]), 'W2': np.array([[-0.20112114,  0.1343918 , -2.29593635, -1.68775395, -1.66578869,
#          1.04315377, -0.56503497, -1.11453418]]), 'b2': np.array([[-0.64778329]])}

# using three features
# Hard-coded parameters for the three-feature network: W1 is 8x3, b1 is 8x1,
# W2 is 1x8 and b2 is 1x1 (8 hidden units, one output).  This assignment replaces
# whatever `model` held before, so predict() can be used without retraining.
# NOTE(review): presumably pasted from the output of an earlier training run — confirm.
model = {'W1': np.array([[ 0.88364761,  0.50919285,  0.74467075],
       [ 1.73773793,  0.26511113, -0.42124022],
       [ 1.27773621, -0.38888054,  0.15784937],
       [ 0.42125748, -0.03326718,  0.75696027],
       [-0.45589358,  0.14883678,  0.1730209 ],
       [-1.09943191,  1.10288614, -0.40660062],
       [ 0.72254153, -1.09129596, -1.81751967],
       [ 0.62326718,  0.42956105, -0.33338668]]), 'b1': np.array([[ 0.19366842],
       [-1.42823596],
       [-0.61464812],
       [-0.06618449],
       [-0.08272171],
       [ 0.36715538],
       [-0.16069309],
       [-0.07659377]]), 'W2': np.array([[ 0.20531547, -2.00741573, -1.51597711, -0.52825433,  0.52933292,
         1.53705344, -1.19439849, -0.52011505]]), 'b2': np.array([[-0.33145915]])}

# model = {'W1': np.array([[ 1.00928635,  0.07351834,  0.33735752],
#        [ 1.22295925,  1.07735298, -0.62299389],
#        [ 0.98602486, -0.35478846, -0.01500833],
#        [ 0.20291243,  0.41523832,  0.8577783 ],
#        [ 1.01529708, -0.39351811,  0.36468252],
#        [ 0.3986278 ,  0.48836652,  0.04839573],
#        [ 0.33036743, -1.15896626, -1.47832569],
#        [ 0.81102003,  0.18049328, -0.46458352]]), 'b1': np.array([[-0.51391236],
#        [-0.10592739],
#        [-0.3485713 ],
#        [ 0.1619751 ],
#        [-0.42119977],
#        [-0.21587941],
#        [-0.12101779],
#        [-0.25500703]]), 'W2': np.array([[ 1.35851593, -0.97104069,  0.15589377, -0.11237024,  0.94256831,
#          0.73375435,  0.26557206,  0.21639691],
#        [-0.53982657, -1.07545507, -0.29505734,  0.16863559,  0.65774359,
#          0.71130925, -0.36213994, -0.26265189],
#        [-0.63385341, -0.78829021, -1.05576679,  1.09969719, -0.34537992,
#         -0.19548553, -0.78949401,  0.47523255],
#        [-0.98597515, -0.14743318, -0.47773171,  0.21938979, -0.27989207,
#         -0.77835357,  0.08281961,  0.20780316],
#        [-0.20320588,  0.21184209, -0.67758596, -0.33464305, -0.61047779,
#         -0.15416586, -0.57452362, -1.08887969],
#        [ 0.12172396, -0.18249314, -0.91940673,  0.31264724, -0.52283903,
#          0.05546179,  0.39303603,  0.11006103],
#        [ 0.68402001, -0.65819284,  0.26562118, -0.34388363, -0.48762405,
#         -0.31086456, -0.22521898,  0.06481267],
#        [-0.65736824,  0.34454364,  0.37049441, -0.87105138,  0.93958582,
#          0.90621985,  0.82987161, -0.22734991]]), 'b2': np.array([[-0.15796601],
#        [ 0.14593599],
#        [-0.00072719],
#        [ 0.07853822],
#        [ 0.06125897],
#        [ 0.08521379],
#        [ 0.0907268 ],
#        [-0.1053139 ]]), 'W3': np.array([[-0.66315763,  0.5986112 , -0.23659771,  0.72007548,  0.13126273,
#          0.56238905,  0.19212277,  0.39739259],
#        [-0.06366396,  1.05627536,  0.2198348 ,  0.24174283,  1.19611142,
#         -0.7073278 , -0.75042594,  0.55417268],
#        [-0.72291238,  1.10397896, -0.19008599, -0.44811444,  1.14392895,
#          0.84933982,  1.04643006,  0.4743812 ],
#        [-0.51979034,  1.09537478, -0.11891471,  0.43810554,  0.56904785,
#         -0.09581734,  0.32877147,  0.49645784],
#        [ 0.26006801, -0.62983226,  0.15069594,  0.76957945, -0.42501927,
#         -0.14996383, -0.23207742,  0.98800541],
#        [ 0.38307001,  0.19245472, -0.49243511,  0.24277335, -0.41572917,
#         -0.0218554 , -0.39613362,  0.35940041],
#        [ 0.40077362, -0.10797163,  0.33521092, -0.71358431, -0.82792958,
#          0.21613124,  0.09056291,  0.24054516],
#        [ 1.41178703,  0.56401253, -0.61384489,  0.66371555, -0.75280848,
#         -0.35031787,  0.0044621 ,  0.92773551]]), 'b3': np.array([[ 0.07006033],
#        [ 0.00255995],
#        [ 0.07800188],
#        [ 0.06595736],
#        [ 0.0260233 ],
#        [ 0.07603515],
#        [ 0.03325726],
#        [-0.0277332 ]]), 'W4': np.array([[-0.45719148, -0.56542934, -0.11932111, -0.42308547,  0.62379089,
#         -0.58830891, -0.60687405, -0.22897998],
#        [-0.3257999 ,  1.13041884,  0.51849831,  0.01422794, -0.67528605,
#          0.46219173, -0.57722297, -0.97333546],
#        [ 0.73181123,  0.20880507,  0.57310122,  0.22330418,  0.49438932,
#         -0.39361292, -0.64482415,  0.39337319],
#        [-0.48871429, -0.43726208, -0.27809285, -0.0120847 , -0.20131573,
#         -0.80859086, -0.3455871 , -1.32272963],
#        [ 0.33439694, -0.95918524, -0.68092927, -0.00561404, -0.39742893,
#          0.92948566, -0.7158546 ,  0.1339824 ],
#        [-0.11594168, -0.74646884,  0.20702841, -0.19300647,  0.35330977,
#          0.41952688,  1.30807659,  0.69380312],
#        [-0.20406831, -0.02854681,  0.6761973 ,  0.39208154,  0.36752163,
#         -0.98910881, -0.09216579, -0.48172531],
#        [ 0.19353222, -0.1132903 ,  0.51203663,  0.19261513,  0.53062717,
#         -0.17384195, -0.53533687, -0.17073551]]), 'b4': np.array([[-0.00324443],
#        [ 0.02433509],
#        [ 0.00042344],
#        [ 0.00845328],
#        [ 0.04700034],
#        [ 0.09360225],
#        [ 0.01624685],
#        [ 0.04891615]]), 'W5': np.array([[-0.03752244,  0.16296613,  1.39573272, -0.06501207, -0.51932995,
#         -0.14867124, -0.29999274,  0.32335211],
#        [-0.92835562, -0.09720688,  0.03880068, -0.09048357, -0.22572581,
#          0.00955438, -0.91347079, -0.34074669],
#        [-0.27774345,  0.24520023, -0.67686833,  0.52038343,  0.85364309,
#         -1.18817036,  0.23387813,  0.40081473],
#        [-0.39509793, -0.15669169, -0.06063954, -0.18507325, -0.2256856 ,
#         -1.04158751,  0.72451379,  0.67124921],
#        [-0.49979437, -0.91715063,  0.22004419, -0.52299259,  0.17418406,
#         -0.10224603,  0.31120054,  0.31810261],
#        [-0.41395898, -0.7860536 , -0.92075935,  0.34723825, -0.70460716,
#         -0.30253207, -0.34382726, -0.03934298],
#        [-1.10245628,  0.40875926,  0.44689074,  0.23192093, -0.4197032 ,
#         -0.24715129,  0.41193466, -1.47120391],
#        [ 1.11342939,  0.23620975, -0.36219564, -0.20216174,  0.29587611,
#         -0.06760388, -1.17832412,  1.18860966]]), 'b5': np.array([[-0.06829037],
#        [ 0.07285708],
#        [ 0.00732926],
#        [ 0.02027343],
#        [ 0.09361624],
#        [-0.00022931],
#        [-0.14982764],
#        [-0.02508978]]), 'W6': np.array([[-0.10874748,  0.60781621, -0.43410092,  0.9001778 ,  0.2008005 ,
#          0.36663817, -0.56410758,  0.68757072],
#        [ 0.39590289,  0.76785818, -0.37497232, -0.28026259,  1.36900287,
#         -0.6248279 , -0.04870613,  0.65719644],
#        [-0.02943083,  0.39201566, -0.32173434,  0.15622132, -0.70536944,
#          1.0061769 , -0.09826958, -0.34559808],
#        [ 0.37508881, -0.24855596, -0.79242951, -0.79728988,  0.41810817,
#         -0.08612814, -0.05840617,  0.60086487],
#        [-0.59131343, -0.4569847 , -0.16285025,  0.10703118, -0.05642005,
#         -0.20230203, -0.03237355, -0.08672535],
#        [-0.4289636 , -0.47431029,  0.16615187, -0.50655073, -0.6763771 ,
#         -0.16886415, -0.08446271,  1.29492643],
#        [-0.48407495,  0.58046073,  0.35185158, -0.78859705,  0.47892983,
#         -0.64646197, -1.59024412,  0.39264152],
#        [-1.19535072,  0.29742344, -0.49836439,  0.96059391,  0.68329159,
#         -0.16537882, -0.34602103, -0.70352471]]), 'b6': np.array([[ 0.03768639],
#        [-0.00080971],
#        [ 0.0725182 ],
#        [ 0.00425324],
#        [-0.0573086 ],
#        [ 0.01525027],
#        [ 0.05398683],
#        [ 0.18315841]]), 'W7': np.array([[-0.41014971, -0.59947343, -0.31180109, -0.30886923,  0.05648754,
#         -0.04304903, -0.628244  , -0.99417394]]), 'b7': np.array([[-0.44491248]])}

In [18]:
def test(article_preprocessed_test, original_test, summarized_text, compression_ratio, original_sentences, file_number = 0):
    """Score the neural-network summary and the TF-IDF baseline against a reference summary.

    Args:
        article_preprocessed_test: preprocessed article handed to ``generate_X_labels``.
        original_test: raw article text.
        summarized_text: reference summary used as the ROUGE target.
        compression_ratio: fraction of sentences to keep (rounded up).
        original_sentences: sentence list used to build the features; afterwards
            it is replaced by the list returned from ``generate_Y_labels``.
        file_number: unused; kept so existing callers keep working.

    Returns:
        Six ROUGE F-scores: rouge-1, rouge-2, rouge-l for the network summary,
        followed by the same three for the TF-IDF summary.
    """
    X_test = generate_X_labels(article_preprocessed_test, original_sentences)
    prediction = predict(model, X_test)
    # Only the sentence list returned by generate_Y_labels is used; the labels are ignored.
    _, original_sentences = generate_Y_labels(original_test, summarized_text)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))

    # Sentence indices ordered from highest to lowest predicted score. The
    # selection is left in score order (not document order).
    ranked = np.argsort(prediction[0])[::-1]
    # Slicing (instead of indexing 0..n-1) avoids an IndexError when there are
    # fewer predictions than requested sentences.
    selected = ranked[:max(num_sentences_summarized, 0)]

    # NOTE(review): sentences are concatenated with no separator, unlike the
    # ' '.join used by summary_using_tf_idf_only and extractive_summary.
    # Confirm this is intended before changing it: it affects the ROUGE tokens.
    output_sentences = ''.join(original_sentences[index] for index in selected)

    rouge = Rouge()
    scores_nn = rouge.get_scores(output_sentences, summarized_text)[0]

    # Baseline: same number of sentences chosen by TF-IDF score only.
    output_tf_idf = summary_using_tf_idf_only(original_test, num_sentences_summarized)
    scores_tf_idf = rouge.get_scores(output_tf_idf, summarized_text)[0]

    return (
        scores_nn['rouge-1']['f'],
        scores_nn['rouge-2']['f'],
        scores_nn['rouge-l']['f'],
        scores_tf_idf['rouge-1']['f'],
        scores_tf_idf['rouge-2']['f'],
        scores_tf_idf['rouge-l']['f'],
    )

In [19]:
def summary_using_tf_idf_only(text, num_sentences):
    """Build an extractive summary from the highest TF-IDF-scored sentences.

    Args:
        text: raw article text.
        num_sentences: maximum number of sentences to keep.

    Returns:
        The selected sentences, in document order, joined by single spaces.
        If fewer sentences are available than requested, all of them are used.
    """
    article_sentences = sent_tokenize(text)
    article_preprocessed = preprocessing(text)
    sentence_scores = calculate_TF_IDF(article_preprocessed)

    # NOTE(review): preprocessing() drops sentences that have no words left
    # after cleaning, so score indices presumably refer to the kept sentences
    # and can be shifted relative to article_sentences — verify.
    ranked = np.argsort(sentence_scores)[::-1]
    # Keep the top-scoring indices, then restore the article's sentence order.
    selected = sorted(ranked[:max(num_sentences, 0)])

    # Iterating over the selection (rather than range(num_sentences)) avoids an
    # IndexError when the article has fewer sentences than requested.
    return ' '.join(article_sentences[index] for index in selected)

In [20]:
# Summarize a sample article with the TF-IDF baseline (7 sentences).
# The context manager closes the file; the previous `article_file.close`
# (without parentheses) never actually called close().
with io.open("cr7.txt", "r", encoding='utf8') as article_file:
    article = article_file.read()
summary = summary_using_tf_idf_only(article, 7)
print(summary)

Real Madrid forward Cristiano Ronaldo has said that he is the "best player in history" after winning his fifth Ballon d'Or on Thursday. Ronaldo picked up the award for the second year in a row to equal the record of Barcelona star Lionel Messi, and he said he does not believe any player is better than him. He told France Football (h/t Goal's Robin Bairner): "I've never seen anyone better than me. I play well with both feet, I’m quick, powerful, good with the head, I score goals, I make assists. That says something, doesn’t it? Legends like Floyd Mayweather [Jr.] and LeBron James don’t get to their perfect level by chance. To be at the top and to stay there, you have to have more talent than the others."


In [21]:
# testing on BBC news summary

rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range(101, 380):
        # Context managers guarantee the files are closed even if reading fails.
        article_path = f"train_original/{article_type}/article ({i}).txt"
        with io.open(article_path, "r", encoding='utf8') as article_file:
            article_file.readline()  # skip the first line of the file
            article = article_file.read()
        article_preprocessed = preprocessing(article)

        summary_path = f"train_summary/{article_type}/summary ({i}).txt"
        with io.open(summary_path, "r", encoding='utf8') as summarized_file:
            summarized = summarized_file.read()

        original_sentences = sent_tokenize(article)
        original_sentences[0] = original_sentences[0][1:] # to remove the \n
        rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
            article_preprocessed, article, summarized, 0.35, original_sentences, i)

        rouge_1_list_nn.append(rouge_1_nn)
        rouge_2_list_nn.append(rouge_2_nn)
        rouge_l_list_nn.append(rouge_l_nn)

        rouge_1_list_tf_idf.append(rouge_1_tf_idf)
        rouge_2_list_tf_idf.append(rouge_2_tf_idf)
        rouge_l_list_tf_idf.append(rouge_l_tf_idf)


print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

Using nn
Average Rouge 1 score is: 0.768111
Average Rouge 2 score is: 0.695166
Average Rouge l score is: 0.764052
Using tf_idf only
Average Rouge 1 score is: 0.376493
Average Rouge 2 score is: 0.211553
Average Rouge l score is: 0.356842


In [22]:
# testing on cnn dailymail dataset
rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

df = pd.read_csv('test.csv')
articles_cnn = df['article']
summaries_cnn = df['highlights']

# Evaluate at most this many articles. The bound is applied to the range
# itself instead of incrementing the loop variable and breaking.
MAX_CNN_ARTICLES = 1000

for i in range(min(MAX_CNN_ARTICLES, len(articles_cnn))):
    article_cnn_preprocessed = preprocessing(articles_cnn[i])
    original_sentences = sent_tokenize(articles_cnn[i])
    rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
        article_cnn_preprocessed, articles_cnn[i], summaries_cnn[i], 0.35, original_sentences, i)

    rouge_1_list_nn.append(rouge_1_nn)
    rouge_2_list_nn.append(rouge_2_nn)
    rouge_l_list_nn.append(rouge_l_nn)

    rouge_1_list_tf_idf.append(rouge_1_tf_idf)
    rouge_2_list_tf_idf.append(rouge_2_tf_idf)
    rouge_l_list_tf_idf.append(rouge_l_tf_idf)



print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

    

Using nn
Average Rouge 1 score is: 0.289055
Average Rouge 2 score is: 0.114925
Average Rouge l score is: 0.274278
Using tf_idf only
Average Rouge 1 score is: 0.196080
Average Rouge 2 score is: 0.053615
Average Rouge l score is: 0.181958


In [23]:
def extractive_summary(article, compression_ratio):
    """Summarize an article by keeping its highest-scored sentences.

    Sentences are scored with ``predict(model, ...)`` on the features built by
    ``generate_X_labels``; ``model`` is the module-level weights dictionary.

    Args:
        article: raw article text.
        compression_ratio: fraction of the article's sentences to keep
            (rounded up).

    Returns:
        The selected sentences, in document order, joined by single spaces.
    """
    original_sentences = sent_tokenize(article)
    article_preprocessed_entered = preprocessing(article)
    X_test_entered = generate_X_labels(article_preprocessed_entered, original_sentences)
    summary_predicted = predict(model, X_test_entered)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))

    # Indices from highest to lowest predicted score; keep the top ones and
    # restore the article's sentence order.
    ranked = np.argsort(summary_predicted[0])[::-1]
    selected = sorted(ranked[:max(num_sentences_summarized, 0)])

    # Iterating over the selection (rather than range(n)) avoids an IndexError
    # when fewer predictions exist than sentences requested.
    return ' '.join(original_sentences[index] for index in selected)
    

In [24]:
# Summarize the sample article with the neural extractive model.
# The context manager closes the file; the previous `article_file.close`
# (without parentheses) never actually called close().
with io.open("cr7.txt", "r", encoding='utf8') as article_file:
    article_file.readline()  # skip the first line of the file
    article = article_file.read()
summary = extractive_summary(article, 0.35)
print(summary)


Ronaldo picked up the award for the second year in a row to equal the record of Barcelona star Lionel Messi, and he said he does not believe any player is better than him. He told France Football (h/t Goal's Robin Bairner): "I've never seen anyone better than me. No footballer can do the things I can. "There’s no player more complete than me. But I tell you: there’s no one more complete than me. Legends like Floyd Mayweather [Jr.] and LeBron James don’t get to their perfect level by chance.


In [25]:
# Summarize a second sample article with the neural extractive model.
# The context manager closes the file; the previous `article_file.close`
# (without parentheses) never actually called close().
with io.open("godzilla.txt", "r", encoding='utf8') as article_file:
    article = article_file.read()
summary = extractive_summary(article, 0.35)
print(summary)

An actor dressed as the giant creature breathed smoke over photographers on Monday as Godzilla received the 2,271st star on Hollywood Boulevard. "Godzilla should thank you for this historical and monumental star," said Final Wars producer Shogo Tomiyama. Hollywood's honorary mayor, Johnny Grant, said: "I do hereby proclaim this Godzilla Day in Hollywood. The premiere of Godzilla: Final Wars at Grauman's Chinese Theatre followed the ceremony on Hollywood Boulevard. Director Ryuhei Kitamura said it may not be Godzilla's final outing, as it has been billed. And producer Shogo Tomiyama added: "So long as Godzilla can fascinate people, I believe he will be resurrected by new generations of filmmakers in the future."


# Abstractive Summary

In [2]:
# Stream factories over the cnn_dailymail dataset; each yields
# (article, highlights) pairs. They differ only in the `train` flag.
train_fn = trax.data.TFDS(
    'cnn_dailymail',
    data_dir='abstractive_summary_data/',
    keys=('article', 'highlights'),
    train=True,
)

eval_fn = trax.data.TFDS(
    'cnn_dailymail',
    data_dir='abstractive_summary_data/',
    keys=('article', 'highlights'),
    train=False,
)

2023-06-03 01:58:33.413458: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
# Locations used by the abstractive summarizer.
# Directory and file name of the subword vocabulary (used by tokenize/detokenize).
VOCAB_DIR = 'abstractive_summary_data/'
VOCAB_FILE = 'summarize32k.subword.subwords'
# Directory the pretrained weights (model.pkl.gz) are loaded from.
MODEL_DIR = 'abstractive_summary_data/pretrained_model'
# Output directory of the fine-tuning loop; fine-tuned weights are loaded from here.
FINE_TUNED_DIR = 'abstractive_summary_data/fine_tuning_weights'

In [3]:
def tokenize(input_str):
    """Encode a string into a list of token ids with a trailing 1 appended.

    The ids come from ``trax.data.tokenize`` using the vocabulary configured in
    VOCAB_DIR / VOCAB_FILE.
    """
    token_stream = trax.data.tokenize(
        iter([input_str]), vocab_dir=VOCAB_DIR, vocab_file=VOCAB_FILE)
    token_ids = list(next(token_stream))
    token_ids.append(1)  # same marker the generation loop stops on
    return token_ids

In [4]:
def detokenize(integers):
    """Decode token ids back to text and wrap it with the shared ``wrapper``."""
    decoded_text = trax.data.detokenize(
        integers, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
    return wrapper.fill(decoded_text)

In [6]:
def preprocess_abstractive(stream):
    """Turn (article, summary) token pairs into language-model training triples.

    For each pair, yields ``(tokens, tokens, mask)`` where ``tokens`` is
    ``article + [1, 0] + summary + [1]`` and ``mask`` is 0 over the article and
    the two separator ids, and 1 over the summary and its trailing 1.
    """
    for article, summary in stream:
        article_ids = list(article)
        summary_ids = list(summary)
        tokens = np.array(article_ids + [1, 0] + summary_ids + [1])
        # Zeros cover the article plus the [1, 0] separator; ones cover the
        # summary plus its final 1.
        loss_mask = np.array(
            [0] * (len(article_ids) + 2) + [1] * (len(summary_ids) + 1))
        yield tokens, tokens, loss_mask

# Input pipeline: tokenize both fields, build (inputs, targets, mask) triples,
# then apply the length filter with a limit of 4096.
input_pipeline = trax.data.Serial(
    trax.data.Tokenize(vocab_dir=VOCAB_DIR, vocab_file=VOCAB_FILE),
    preprocess_abstractive,
    trax.data.FilterByLength(4096),
)

eval_stream = None  # placeholder so both streams are declared together
train_stream = input_pipeline(train_fn())
eval_stream = input_pipeline(eval_fn())

In [7]:
# Length buckets and the batch size used for each one. batch_sizes has one
# more entry than boundaries.
# NOTE(review): presumably the last batch size applies to sequences longer
# than the final boundary — confirm against the BucketByLength docs.
boundaries = [128, 256, 512, 1024]
batch_sizes = [16, 8, 4, 2, 1]

# Create the streams.
train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes
)(train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes
)(eval_stream)

In [5]:
def DecoderBlock(model_dim, feed_forward_depth, heads_num, activation_function, mode, dropout):
    """Build one Transformer decoder block as a list of two residual sub-layers.

    Args:
        model_dim: width of the block's input and output features.
        feed_forward_depth: width of the hidden dense layer in the feed-forward part.
        heads_num: number of attention heads; each head gets
            ``model_dim // heads_num`` features.
        activation_function: layer constructor called with no arguments
            (generally ReLU).
        mode: passed to the dropout layers.
        dropout: dropout rate.

    Returns:
        ``[Residual(causal self-attention), Residual(feed-forward)]``.
    """
    features_dim = model_dim
    heads_dim = features_dim // heads_num

    def _split_heads(x):
        # (batch_size, seqlen, heads_num x heads_dim) ->
        # (batch_size x heads_num, seqlen, heads_dim) to allow matrix multiplication per head.
        batch_size, seqlen = x.shape[0], x.shape[1]
        x = trax_np.reshape(x, (batch_size, seqlen, heads_num, heads_dim))
        x = trax_np.transpose(x, (0, 2, 1, 3))
        return trax_np.reshape(x, (-1, seqlen, heads_dim))

    def _causal_attention(query, key, value):
        # Scaled dot-product scores, computed once and reused below.
        seqlen = query.shape[-2]
        scores = trax_np.matmul(query, trax_np.swapaxes(key, -1, -2)) / trax_np.sqrt(query.shape[-1])
        # Lower-triangular mask: a position may only attend to itself and earlier positions.
        causal_mask = trax_np.tril(trax_np.ones((1, seqlen, seqlen), dtype=trax_np.bool_), k=0)
        masked_scores = trax_np.where(causal_mask, scores, trax_np.full_like(scores, -1e9))
        # Softmax written as exp(x - logsumexp(x)).
        log_normalizer = trax.fastmath.logsumexp(masked_scores, axis=-1, keepdims=True)
        return trax_np.matmul(trax_np.exp(masked_scores - log_normalizer), value)

    def _merge_heads(x):
        # Inverse of _split_heads:
        # (batch_size x heads_num, seqlen, heads_dim) -> (batch_size, seqlen, heads_num x heads_dim)
        seqlen = x.shape[1]
        x = trax_np.reshape(x, (-1, heads_num, seqlen, heads_dim))
        x = trax_np.transpose(x, (0, 2, 1, 3))
        return trax_np.reshape(x, (-1, seqlen, heads_num * heads_dim))

    # One weightless layer instance reused by the three towers.
    compute_wq_wk_wv = tlayer.Fn('AttnHeads', _split_heads, n_out=1)

    # First residual: normalized causal self-attention followed by dropout.
    # Second residual: normalized two-layer feed-forward block with dropout.
    return [
      tlayer.Residual(
          tlayer.LayerNorm(),
          tlayer.Serial(
            tlayer.Branch( # creates three towers for one input, takes activations and creates queries keys and values
                [tlayer.Dense(features_dim), compute_wq_wk_wv], # queries
                [tlayer.Dense(features_dim), compute_wq_wk_wv], # keys
                [tlayer.Dense(features_dim), compute_wq_wk_wv], # values
            ),
            tlayer.Fn('DotProductAttn', _causal_attention, n_out=1),
            tlayer.Fn('AttnOutput', _merge_heads, n_out=1), # to allow for parallel
            tlayer.Dense(features_dim) # Final dense layer
            ),
          tlayer.Dropout(rate = dropout, mode = mode)
        ),
      tlayer.Residual(
            [
                tlayer.LayerNorm(),
                tlayer.Dense(feed_forward_depth),
                activation_function(), # Generally ReLU
                tlayer.Dropout(rate = dropout, mode = mode),
                tlayer.Dense(model_dim),
                tlayer.Dropout(rate = dropout,mode = mode)
            ]
        )
      ]

In [6]:
def TransformerLM(vocab_size=33300, model_dim=512, feed_forward_depth=2048,
                  layers_num=6, heads_num=8, dropout=0.1, max_len=4096,
                  mode='train', activation_function=tlayer.Relu):
    """Assemble the decoder-only Transformer language model.

    The model shifts its input right, embeds it with dropout and positional
    encoding, applies ``layers_num`` decoder blocks, then layer normalization,
    a dense projection to ``vocab_size`` and a log-softmax.

    Returns:
        A ``tlayer.Serial`` combinator containing the whole model.
    """
    embedding_layers = [
        tlayer.Embedding(vocab_size, model_dim),
        tlayer.Dropout(rate=dropout, mode=mode),
        tlayer.PositionalEncoding(max_len=max_len, mode=mode),
    ]

    stacked_blocks = []
    for _ in range(layers_num):
        stacked_blocks.append(
            DecoderBlock(model_dim, feed_forward_depth, heads_num,
                         activation_function, mode, dropout))

    model_layers = [
        tlayer.ShiftRight(mode=mode),
        embedding_layers,
        stacked_blocks,
        tlayer.LayerNorm(),
        tlayer.Dense(vocab_size),
        tlayer.LogSoftmax(),
    ]
    return tlayer.Serial(*model_layers)

In [10]:
# Fine-tuning setup: start from the pretrained weights and define the
# training and evaluation tasks.

abstractive_model = TransformerLM()

# Load the pre-trained weights
abstractive_model.init_from_file(f'{MODEL_DIR}/model.pkl.gz', weights_only=True)

# Training task: cross-entropy loss, Adam with learning rate 1e-7,
# checkpoint every 5 steps.
train_task = training.TrainTask(
    labeled_data=train_batch_stream,
    loss_layer=tlayer.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(1e-7),
    n_steps_per_checkpoint=5,
)

# Evaluation task: reports cross-entropy loss and accuracy on the eval batches.
eval_task = training.EvalTask(
    labeled_data=eval_batch_stream,
    metrics=[tlayer.CrossEntropyLoss(), tlayer.Accuracy()],
)

In [None]:
# Run 100 fine-tuning steps; output goes to FINE_TUNED_DIR.
loop = training.Loop(
    abstractive_model,
    train_task,
    eval_tasks=[eval_task],
    output_dir=FINE_TUNED_DIR,
)
loop.run(100)

  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 55144980
Step      1: Ran 1 train steps in 66.42 secs
Step      1: train CrossEntropyLoss |  3.92982721


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  CrossEntropyLoss |  4.05336428
Step      1: eval          Accuracy |  0.54263568

Step      5: Ran 4 train steps in 161.34 secs
Step      5: train CrossEntropyLoss |  3.19703913
Step      5: eval  CrossEntropyLoss |  4.02799606
Step      5: eval          Accuracy |  0.53191489

Step     10: Ran 5 train steps in 164.63 secs
Step     10: train CrossEntropyLoss |  3.01276636
Step     10: eval  CrossEntropyLoss |  3.62366676
Step     10: eval          Accuracy |  0.51886791

Step     15: Ran 5 train steps in 175.24 secs
Step     15: train CrossEntropyLoss |  3.29948545
Step     15: eval  CrossEntropyLoss |  5.08440685
Step     15: eval          Accuracy |  0.42105263

Step     20: Ran 5 train steps in 128.95 secs
Step     20: train CrossEntropyLoss |  3.45422411
Step     20: eval  CrossEntropyLoss |  4.29900599
Step     20: eval          Accuracy |  0.36563876

Step     25: Ran 5 train steps in 166.01 secs
Step     25: train CrossEntropyLoss |  3.22532773
Step     25: ev

In [7]:
# Get the model architecture
pretrained_model = TransformerLM(mode='eval')

# Load the pre-trained weights. The trailing semicolon keeps the notebook from
# echoing the call's return value (the loaded weight arrays) as cell output.
pretrained_model.init_from_file(MODEL_DIR + '/model.pkl.gz', weights_only=True);

((((), (), ()),
  array([[ 0.00185256, -0.00332041, -0.00516087, ...,  0.02195336,
           0.01051251, -0.00567034],
         [-0.00739519,  0.00183287,  0.00196656, ...,  0.05615862,
           0.00132284, -0.00291792],
         [ 0.01545155,  0.00096676, -0.00778854, ..., -0.01111389,
           0.01328059, -0.00775655],
         ...,
         [ 0.01310026,  0.0053312 , -0.00110446, ...,  0.0018376 ,
          -0.01304161,  0.01059895],
         [ 0.01033204,  0.00199992, -0.00960302, ..., -0.00053245,
          -0.00377625,  0.00046856],
         [ 0.00082984, -0.00329287, -0.00404815, ...,  0.00028511,
          -0.00377798, -0.00656456]], dtype=float32),
  (),
  array([[ 0.01798599,  0.00387794,  0.00273774, ..., -0.07271248,
           0.01859818,  0.0302424 ],
         [ 0.05207964,  0.02332034,  0.00967251, ..., -0.02486727,
           0.0165352 , -0.00352491],
         [ 0.03194407, -0.02127359, -0.00068408, ..., -0.01532798,
           0.01319801, -0.00402305],
         ..

In [8]:
# Get the model architecture
abstractive_fine_tuned_model = TransformerLM(mode='eval')

# Load the fine-tuned weights written by the training loop. The trailing
# semicolon keeps the notebook from echoing the call's return value (the
# loaded weight arrays) as cell output.
abstractive_fine_tuned_model.init_from_file(FINE_TUNED_DIR + '/model.pkl.gz', weights_only=True);

((((), (), ()),
  array([[ 0.00185314, -0.00331989, -0.00516132, ...,  0.02194645,
           0.0105072 , -0.00566858],
         [-0.00739478,  0.00183274,  0.00196596, ...,  0.05613796,
           0.0013204 , -0.00291684],
         [ 0.01544436,  0.0009663 , -0.00778807, ..., -0.01110793,
           0.01327326, -0.0077543 ],
         ...,
         [ 0.013095  ,  0.00532906, -0.00110402, ...,  0.00183687,
          -0.0130364 ,  0.0105947 ],
         [ 0.0103279 ,  0.00199912, -0.00959918, ..., -0.00053224,
          -0.00377475,  0.00046837],
         [ 0.00082951, -0.00329155, -0.00404653, ...,  0.000285  ,
          -0.00377647, -0.00656193]], dtype=float32),
  (),
  array([[ 0.01797621,  0.00387489,  0.00273348, ..., -0.07268105,
           0.01858758,  0.0302281 ],
         [ 0.05205658,  0.02330974,  0.00966574, ..., -0.0248561 ,
           0.01652611, -0.00352417],
         [ 0.03192862, -0.02126632, -0.00068625, ..., -0.01532226,
           0.01319057, -0.00402066],
         ..

In [9]:
def abstractive_summary(article, model, max_new_tokens=100):
    """Generate a summary by greedy decoding, one token at a time.

    The prompt is the tokenized article (which ends with the id 1 added by
    ``tokenize``) followed by a 0. Tokens are then generated until the model
    emits the id 1 or ``max_new_tokens`` tokens have been produced.

    Args:
        article: text to summarize.
        model: callable taking an ``(inputs, targets)`` tuple of batched token
            arrays and returning ``(output, _)``, where ``output[0, position, :]``
            holds the scores for the token at ``position``.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The detokenized summary, without the terminating id.
    """
    end_token = 1
    article_summary = tokenize(article)
    article_summary.append(0)
    start_summary = len(article_summary)

    for _ in range(max_new_tokens):
        current_length = len(article_summary)
        # Pad with zeros up to the next power of two strictly greater than the
        # current length, so at least one padded position follows the prompt.
        padded_length = 2 ** int(np.ceil(np.log2(current_length + 1)))
        padded = article_summary + [0] * (padded_length - current_length)
        padded_batch = np.array(padded)[None, :]
        output, _ = model((padded_batch, padded_batch))
        # Scores for the first position after the tokens produced so far.
        log_probs = output[0, current_length, :]
        generated_word = int(np.argmax(log_probs))
        article_summary.append(generated_word)
        if generated_word == end_token:
            break

    summary = article_summary[start_summary:]
    # Strip the terminator only when it is really there; if generation stopped
    # at the length cap, the last token is real content and must be kept.
    if summary and summary[-1] == end_token:
        summary = summary[:-1]
    return detokenize(summary)


In [10]:
# Quick check of the fine-tuned model on a two-sentence input.
test_sentence = "It was a sunny day when I went to the market to buy some flowers. But I only found roses, not tulips."
wrapped_sentence = wrapper.fill(test_sentence)
print(wrapped_sentence, '\n')
generated_summary = abstractive_summary(test_sentence, abstractive_fine_tuned_model)
print(generated_summary)

It was a sunny day when I went to the market to buy some flowers. But
I only found roses, not tulips. 

: I just found roses, not tulips.


In [11]:
# Longer sample article: print it wrapped, then print the summary generated
# by the fine-tuned model.
article = "It’s the posing craze sweeping the U.S. after being brought to fame by skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert Pujols - and even Republican politician Rick Perry. But now four students at Riverhead High School on Long Island, New York, have been suspended for dropping to a knee and taking up a prayer pose to mimic Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll were all suspended for one day because the ‘Tebowing’ craze was blocking the hallway and presenting a safety hazard to students. Scroll down for video. Banned: Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured left) were all suspended for one day by Riverhead High School on Long Island, New York, for their tribute to Broncos quarterback Tim Tebow. Issue: Four of the pupils were suspended for one day because they allegedly did not heed to warnings that the 'Tebowing' craze at the school was blocking the hallway and presenting a safety hazard to students."
print(wrapper.fill(article), '\n')
print(abstractive_summary(article, abstractive_fine_tuned_model))

It’s the posing craze sweeping the U.S. after being brought to fame by
skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert
Pujols - and even Republican politician Rick Perry. But now four
students at Riverhead High School on Long Island, New York, have been
suspended for dropping to a knee and taking up a prayer pose to mimic
Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel,
Tyler Carroll and Connor Carroll were all suspended for one day
because the ‘Tebowing’ craze was blocking the hallway and presenting a
safety hazard to students. Scroll down for video. Banned: Jordan
Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured
left) were all suspended for one day by Riverhead High School on Long
Island, New York, for their tribute to Broncos quarterback Tim Tebow.
Issue: Four of the pupils were suspended for one day because they
school was blocking the hallway and presenting a safety hazard to
students. 

Jordan Fulcoly, Wayne Drexel, Ty

In [12]:
def test_abstractive(original_test, summarized_text, fine_tuned_model, pretrained_model):
    """Summarize one text with both models, print the summaries and score them.

    Each model's summary is produced with `abstractive_summary`, printed
    (each followed by a dashed separator line) and compared against
    `summarized_text` with `Rouge().get_scores`.

    Args:
        original_test: text to summarize.
        summarized_text: reference summary the generated ones are scored against.
        fine_tuned_model: first model handed to `abstractive_summary`.
        pretrained_model: second model handed to `abstractive_summary`.

    Returns:
        A 6-tuple with the 'f' entries of 'rouge-1', 'rouge-2' and 'rouge-l'
        for the fine-tuned summary, followed by the same three entries for
        the pretrained summary.
    """
    # Generate both summaries before printing anything, fine-tuned model first.
    generated = [
        abstractive_summary(original_test, model)
        for model in (fine_tuned_model, pretrained_model)
    ]

    print(generated[0])
    print('--------------------------')

    print(generated[1])
    print('-----------------------------------------------------------')

    scorer = Rouge()
    f_values = []
    for candidate in generated:
        # get_scores returns a list; the first element holds the per-metric dict.
        metrics = scorer.get_scores(candidate, summarized_text)[0]
        for metric_name in ('rouge-1', 'rouge-2', 'rouge-l'):
            f_values.append(metrics[metric_name]['f'])

    return tuple(f_values)

In [13]:
# testing on BBC news summary
# Scores articles 101-120 of every BBC category with both models and prints
# the average ROUGE 'f' values: first three lines for the fine-tuned model,
# last three lines for the pretrained model.

rouge_1_list_transformer = []
rouge_2_list_transformer = []
rouge_l_list_transformer = []

rouge_1_list_transformer_pretrained = []
rouge_2_list_transformer_pretrained = []
rouge_l_list_transformer_pretrained = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range(101, 121):

        # `with` closes the file even if reading raises, so no handle is leaked.
        with io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf8') as article_file:
            # The first line is read and discarded (presumably the headline - confirm against the dataset).
            article_file.readline()
            article = article_file.read()

        with io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf8') as summarized_file:
            summarized = summarized_file.read()

        rouge_1_transformer, rouge_2_transformer, rouge_l_transformer, rouge_1_pretrained, rouge_2_pretrained, rouge_l_pretrained = test_abstractive(article, summarized, abstractive_fine_tuned_model, pretrained_model)

        rouge_1_list_transformer.append(rouge_1_transformer)
        rouge_2_list_transformer.append(rouge_2_transformer)
        rouge_l_list_transformer.append(rouge_l_transformer)
        rouge_1_list_transformer_pretrained.append(rouge_1_pretrained)
        rouge_2_list_transformer_pretrained.append(rouge_2_pretrained)
        rouge_l_list_transformer_pretrained.append(rouge_l_pretrained)


print('Using transformer')
# Fine-tuned model averages.
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_transformer)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_transformer)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_transformer)))
# Pretrained model averages (printed with the same label text as above).
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_transformer_pretrained)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_transformer_pretrained)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_transformer_pretrained)))

The move was criticised by some analysts and 10st figures. The move
was criticised by some analysts and 10st figures. The move was
criticised by some analysts.10; The rate hike was the first since
2003.
--------------------------
The move was criticised by some analysts and some analysts. It was
criticised by some analysts and some analysts. The move was ill-timed
given data showing the economy grew just 0.1% between October and
December.
-----------------------------------------------------------
The Titan corporation was accused of funnelling more than $2m into the
2001 re-election campaign of President Mathieu Kerekou. The company,
which has agreed to pay $28.5m in criminal penalties, was accused of
funnelling more than $2m into the 2001 re-election campaign of
President Mathieu Kerekou. The company says it no longer tolerates
such practices.
--------------------------
The Titan corporation was accused of funnelling more than $2m into the
2001 re-election campaign of President Mathi

Icl and Wellbutrin to a weak dollar hit profits, but global sales were
up 1% in 2004. Company says it had absorbed over £1.5bn of lost sales
to generics but still managing to grow the business.
--------------------------
Icl and Wellbutrin to a weak dollar hit profits, but global sales were
up 1% in 2004. Company says it had absorbed over £1.5bn of lost sales
to generics but still managing to grow the business.
-----------------------------------------------------------
SMFG, and Daiwa Securities jumped amid speculation that two of Japan's
biggest financial companies will merge. The merger would be a good
move, he added.
--------------------------
SMFG, and Daiwa Securities jumped amid speculation that two of Japan's
biggest financial companies will merge. The merger "would be a good
move," he added.
-----------------------------------------------------------
German-US carmaker saw fourth quarter operating profits fall to 785m
euros. Firm said it was determined to retain Mercedes' posi

Police arrested Marion "Suge" Knight for allegedly marijuana
possession. He was arrested in Barstow, California, on Saturday
following an alleged traffic offence. He was arrested on Saturday
after performing an illegal U-turn and a search of his car allegedly
found marijuana.
--------------------------
Police arrested Marion "Suge" Knight for allegedly marijuana
possession. He was arrested in Barstow, California, on Saturday
following an alleged traffic offence. He was arrested on Saturday
after performing an illegal U-turn and a search of his car allegedly
found marijuana.
-----------------------------------------------------------
Sir Elton played a 2,700-strong audience on Sunday at the French
capital's Bastille opera house. The singer played piano accompaniment
throughout the concert. The singer also played a 2,700-strong audience
on Sunday at the French capital's Bastille opera house.
--------------------------
Sir Elton played a 2,700-strong audience on Sunday at the French
capit

Home Office minister said 'UK Muslims should accept that people of
Islamic appearance are more likely to be stopped and searched by
police. She said Muslim groups would be asked what other
qualifications and skills. Comes after Muslim groups have repeatedly
claimed their communities are being victimised under terror laws.
--------------------------
Home Office minister said 'UK Muslims should accept that people of
Islamic appearance are more likely to be stopped and searched by
police. She said Muslim groups would be asked what other
qualifications and skills. Comes after Muslim groups have repeatedly
claimed their communities are being victimised under terror laws.
-----------------------------------------------------------
Tory leader says English local authorities spent more than £3bn. Mr
Howard is expected to tell activists in Kent voters' tolerance. Other
parties and refugee agencies have already attacked Tory plans for
annual limit on numbers.
--------------------------
Tory lead

Lord Scarman was one of the "great advocates of our generation" He
also chaired the 1969 tribunal set up to investigate civil
disturbances in Northern Ireland. Lord Scarman was one of the "great
advocates of our generation" He also chaired the 1969 tribunal set up
to investigate civil disturbances in Northern Ireland.
--------------------------
Lord Scarman conducted the inquiry into the 1981 Brixton riots. He was
one of the "great advocates of our generation" He was one of the
"great advocates of our generation" Lord Scarman died aged 93 from
93.10sec .
-----------------------------------------------------------
Prince Harry was in a privileged position and said he should
apologise. Spoke to the palace after he said he should apologise in
person. Spoke to the palace after the Duchess of Cambridge's comments.
--------------------------
Prince Harry was in a privileged position and said he should
apologise. Spoke to the palace after he said he should apologise in
person. Spoke to the pa

Shaun Wright-Phillips will start against the Netherlands on the left.
Eriksson also revealed that Wes Brown will start alongside Jamie
Carragher. Andy Johnson and Stewart Downing also get a chance to
impress.
--------------------------
Shaun Wright-Phillips will start against the Netherlands on the left.
Eriksson also revealed that Wes Brown will start alongside Jamie
Carragher. Andy Johnson and Stewart Downing will start for the game.
-----------------------------------------------------------
Arjen Robben sustained a broken metatarsal in the Premiership win at
Blackburn. The 21-year-old missed the first three months of the season
with a similar injury. The 21-year-old missed the first three months
of the season with a similar injury.
--------------------------
Arjen Robben sustained a broken metatarsal in the Premiership win at
Blackburn. The 21-year-old missed the first three months of the season
with a similar injury. The 21-year-old missed the first three months
of the season with

Indian writer Rohit Gupta edits a group blog without Borders. He
created the blogger's blog without Borders. He was using SMS text
messages from affected areas of Sri Lanka. Mr Gupta and his fellow
bloggers switched gears.
--------------------------
Indian writer Rohit Gupta edits a group blog without Borders. He
created the blogger's blog without Borders. He was using SMS text
messages from affected areas of Sri Lanka. Mr Gupta and fellow
bloggers switched gears.
-----------------------------------------------------------
10; The Wolong Nature Reserve in the Sichuan Province of southwest
China is home to 20% of the remaining 1,500 giant pandas in the world.
The network has been developed by Intel, working closely with the
staff at Wolong. It includes a 802.11b wireless network and a video
monitoring system using five cameras to observe pandas around the
clock.
--------------------------
10st panda population is getting a helping hand from wireless internet
network. The network has bee

Suspect arrested in Camberley, Surrey, on Wednesday. He has now been
bailed to return to a Surrey police station in December. All 28 people
detained globally are suspected of being involved in an internet-based
network. Police in the US, who have indicted 19 people in Newark, New
Jersey, New Jersey, New Jersey.
--------------------------
Suspect arrested in Camberley, Surrey, on Wednesday. He has now been
bailed to return to a Surrey police station in December. All 28 people
detained globally are suspected of being involved in an internet-based
network. Police say the fraud caused losses of more than $4m.
-----------------------------------------------------------
Using transformer
Average Rouge 1 score is: 0.331689
Average Rouge 2 score is: 0.201642
Average Rouge l score is: 0.323505
Average Rouge 1 score is: 0.318883
Average Rouge 2 score is: 0.193023
Average Rouge l score is: 0.311299


In [17]:
# testing on cnn dailymail dataset
# Scores the first NUM_CNN_SAMPLES rows of test.csv with both models and prints
# the average ROUGE 'f' values: first three lines for the fine-tuned model,
# last three lines for the pretrained model.

# Number of (article, highlights) rows to evaluate; each row runs both models.
NUM_CNN_SAMPLES = 30

rouge_1_list_transformer = []
rouge_2_list_transformer = []
rouge_l_list_transformer = []

# Reset the pretrained-model lists too: the BBC evaluation cell appends to the
# same names, so without this the averages below would mix both datasets.
rouge_1_list_transformer_pretrained = []
rouge_2_list_transformer_pretrained = []
rouge_l_list_transformer_pretrained = []

df = pd.read_csv('test.csv')
articles_cnn = df['article']
summaries_cnn = df['highlights']

# islice bounds the loop up front, so every article that is summarized is also
# scored (an early `break` placed before the appends would drop the last one).
for article_cnn, summary_cnn in islice(zip(articles_cnn, summaries_cnn), NUM_CNN_SAMPLES):
    rouge_1_transformer, rouge_2_transformer, rouge_l_transformer, rouge_1_pretrained, rouge_2_pretrained, rouge_l_pretrained = test_abstractive(article_cnn, summary_cnn, abstractive_fine_tuned_model, pretrained_model)

    rouge_1_list_transformer.append(rouge_1_transformer)
    rouge_2_list_transformer.append(rouge_2_transformer)
    rouge_l_list_transformer.append(rouge_l_transformer)
    rouge_1_list_transformer_pretrained.append(rouge_1_pretrained)
    rouge_2_list_transformer_pretrained.append(rouge_2_pretrained)
    rouge_l_list_transformer_pretrained.append(rouge_l_pretrained)


print('Using transformer')
# Fine-tuned model averages.
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_transformer)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_transformer)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_transformer)))
# Pretrained model averages (printed with the same label text as above).
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_transformer_pretrained)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_transformer_pretrained)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_transformer_pretrained)))

A U.S consumer advisory group set up by the Department of
Transportation. The government is happy to set standards for animals
flying on planes. But some experts are questioning whether planes are
putting passengers at risk.
--------------------------
A U.S consumer advisory group set up by the Department of
Transportation. The government is happy to set standards for flying on
planes. But the DOT and FAA take a stand for humane treatment of
passengers.
-----------------------------------------------------------
Rahul Kumar, 17, climbed into the lions' enclosure at a zoo in
Ahmedabad. He was drunk and 'thought I'd stand a good chance' He was
sitting near the enclosure when he made a dash for the lions, adding:
'Guards had earlier spotted him close to the enclosure'
--------------------------
Rahul Kumar, 17, climbed into the lions' enclosure at a zoo in
Ahmedabad. He was drunk and 'thought I'd stand a good chance' He was
sitting near the enclosure when he made a dash for the lions, add

Ian Bell was on the second day of England's opening tour match. Ian
Bell was on the second day of England's opening tour match. Ian Bell
was on the second day of the St Kitts and Nevis Invitational XI vs
England tour match .
--------------------------
Ian Bell was on the second day of England's opening tour match. Ian
Bell was on the second day of England's opening tour match. Ian Bell
was on the second day of the St Kitts and Nevis Invitational XI vs
England tour match .
-----------------------------------------------------------
A senior U.S. official says the U.S. has made no serious effort to
negotiate for the 73-year-old development expert's release. The
Pakistani official says the capture by al Qaeda made it hard for the
U.S. to negotiate. A senior Pakistani official says the capture by al
Qaeda made it hard for the U.S. to negotiate.
--------------------------
A senior U.S. official says the U.S. has made no serious effort to
negotiate for the 73-year-old development expert's re

ISIS is a problem that is off the charts historically, he says. He
says the U.S. has sent the United States into "uncharted territory"
when it comes to the terror group. He says the U.S. has expanded its
reach to Libya, Egypt and Yemen.
--------------------------
ISIS is a problem that is off the charts historically, he says. He
says the U.S. has sent the United States into "uncharted territory"
when it comes to the terror group. He says the U.S. has expanded its
reach to Libya, Egypt and Yemen.
-----------------------------------------------------------
David Cameron has been secretly using hair dye to turn back the years
Nigel Farage suggested today. Mr Farage said he was 'jealous' of how
Tory leader has reversed the greying process.
--------------------------
David Cameron has been secretly using hair dye to turn back the years
Nigel Farage. Mr Farage said he was 'jealous' of how Tory leader has
reversed the greying process.
----------------------------------------------------------