In [1]:
import re
import string
import numpy as np
import math
import nltk
import io
import csv
import pandas as pd


from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice
from rouge import Rouge



In [2]:
# nltk.download('stopwords')
# nltk.download()  # uncomment these lines once they are not downloaded

In [3]:
def preprocessing(article):
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    article_preprocessed = []
    sentences = sent_tokenize(article)
    for sentence in sentences:
        sentence_preprocessed = []
        sentence = re.sub(r"[^a-zA-Z\s]+", "", sentence)
        words = word_tokenize(sentence)
        for word in words:
            if (word not in stopwords_english and word not in string.punctuation):
                word_stemmed = stemmer.stem(word)  
                sentence_preprocessed.append(word_stemmed)
#         sentence_preprocessed = " ".join(sentence_preprocessed)
        if sentence_preprocessed:
            article_preprocessed.append(sentence_preprocessed)
            
        
        
#     words = [word_tokenize(sent) for sent in sentences]
#     words_without_stopwords = [[word for word in sent if word not in stopwords.words('english')] for sent in words]
    return article_preprocessed


In [4]:
def convert_list_to_string(sentences):  # converts list of lists to list of strings
    sentences_modified = []   # list of strings
    for sentence in sentences:
        sentence_modified = ' '.join(sentence)
        sentences_modified.append(sentence_modified)
    return sentences_modified

In [5]:
### Feature 1

def calculate_TF_IDF(content):
    flat_words = [word for sent in content for word in sent]
    words_set = set(flat_words)
    words_num = len(words_set)
#     print(words_set)
    tf = pd.DataFrame(np.zeros((len(content), words_num)), columns = words_set)
    for i in range (len(content)):
        for w in content[i]:
                      tf[w][i] += 1/len(content[i])
                      
    idf = {}
    
    for word in words_set:
        num_docs = 0
        for i in range(len(content)):
            if word in content[i]:
                num_docs += 1
                
        idf[word] = np.log10(len(content) / num_docs)
        
    tf_idf = np.zeros(len(content))
    
    for i in range (len(content)):
        for word in content[i]:
            tf_idf[i] += tf[word][i] * idf[word]
            
#     print(tf_idf/max(tf_idf))
    tf_idf = tf_idf/max(tf_idf)  # might be commented (this normalizes the tf-idf)
            
    return tf_idf

In [6]:
# article = [['data', 'science', 'is' ,'one', 'of', 'the', 'most', 'important', 'fields', 'of', 'science'],
#            ['this', 'is', 'one', 'of', 'the', 'best', 'data', 'science', 'courses'],
#            ['data', 'scientists', 'analyze', 'data']]

# x = calculate_TF_IDF(article)
# print(x)

In [7]:
# article_file = io.open("articles/original (" + str(1) +").txt", "r", encoding='utf-8-sig')
# article_file.readline()
# article = article_file.read()
# article_file.close()

# # print(article)
# article_preprocessed = preprocessing(article)
# x = calculate_TF_IDF(article)
# print(x)

In [8]:
### Feature 2

def sentence_length(article_preprocessed):
    article_preprocessed = convert_list_to_string(article_preprocessed)
    max_length = 0
    for sentence in article_preprocessed:
        # print(sentence)
        if len(sentence.split()) > max_length:
            max_length = len(sentence.split())
            
    sentence_length_feature = []
    for sentence in article_preprocessed:
        sentence_length_feature.append(len(sentence.split()) / max_length)

#     sentence_length_feature = []
#     for sentence in article_preprocessed:
#         sentence_length_feature.append(1 / len(sentence.split()))

    return sentence_length_feature

In [9]:
# article_file = io.open("articles/original (" + str(1) +").txt", "r", encoding='utf-8-sig')
# article_file.readline()
# art = article_file.read()
# article_file.close()

# art = preprocessing(art)
# art = convert_list_to_string(art)
# sentence_length(article_preprocessed)

In [10]:
def generate_X_labels(article_preprocessed):
    # feature 1 (tf_idf)
#     word_scores = calculate_TF_IDF(article_preprocessed)
#     tf_idf_score_feature = calculate_each_sentence_score(article_preprocessed, word_scores)
    tf_idf_score_feature = calculate_TF_IDF(article_preprocessed)
    
    
    # feature 2 (sentence_length)
    sentence_length_feature = sentence_length(article_preprocessed)
    
    matrix = np.column_stack((tf_idf_score_feature, sentence_length_feature))
#     matrix = np.array(tf_idf_score_feature).reshape(len(tf_idf_score_feature), 1)

    return matrix

In [11]:
# article_file = io.open("articles/original (" + str(1) +").txt", "r", encoding='utf-8-sig')
# article_file.readline()
# article = article_file.read()
# article_file.close()

# art = preprocessing(article)
# art = convert_list_to_string(art)
# print(generate_X_labels(article_preprocessed))
# print(len(generate_X_labels(article_preprocessed)))

In [12]:
def generate_Y_labels(original, summarized):
    Y_list = []
    original_sentences = sent_tokenize(original)
    original_sentences[0] = original_sentences[0][1:] # to remove the \n
    summarized_sentences = sent_tokenize(summarized)
    
    for original_sentence in original_sentences:
        added = 0
        for summarized_sentence in summarized_sentences:
            if original_sentence in summarized_sentence:
                Y_list.append(1)
                added = 1
                break
        if added == 0:
            Y_list.append(0)
    
    return Y_list, original_sentences
    

In [19]:
# article_file = io.open("articles/original (" + str(301) +").txt", "r", encoding='utf-8-sig')
# article_file.readline()
# article = article_file.read()
# article_file.close()

# summarized_file = io.open("articles/summarized (" + str(301) +").txt", "r", encoding='utf-8-sig')
# summarized = summarized_file.read()
# summarized_file.close()

# Y,_ = generate_Y_labels(article, summarized)
# print(Y)
# print(len(Y))

In [13]:
X_matrix = []
X = []
Y = []
sentences = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range (1, 51):   # loading business articles
        article_file = io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf-8-sig')
        article_file.readline()
        article = article_file.read()
        article_file.close()

        summarized_file = io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf-8-sig')
        summarized = summarized_file.read()
        summarized_file.close()

        article_preprocessed = preprocessing(article)
    #     article_preprocessed = convert_list_to_string(article_preprocessed)
        X_i = generate_X_labels(article_preprocessed)
        Y_i, original_list_no_first_space = generate_Y_labels(article, summarized)

        if(len(X_i) != len(Y_i)):
            print('Error! features and labels are not equal in length')

        Y.extend(Y_i)
        X_matrix.extend(X_i)
        sentences.extend(original_list_no_first_space)
    

for x in X_matrix:
    X.append(x.tolist())
    
X = np.matrix(X)

m = len(X)

print(len(X))
print(len(Y))

4179
4179


In [14]:
print(type(X))
print(X[: 10, :])
print(type(Y))
print(Y[: 10])

<class 'numpy.matrix'>
[[0.73203128 0.54545455]
 [0.87185081 0.63636364]
 [0.69813911 0.36363636]
 [0.75779078 0.59090909]
 [0.74727107 0.31818182]
 [0.72061857 0.27272727]
 [0.78105634 0.45454545]
 [0.69638967 0.63636364]
 [0.8741924  0.81818182]
 [0.78987147 0.59090909]]
<class 'list'>
[1, 0, 1, 1, 0, 0, 1, 1, 0, 0]


In [15]:
nn_input_dim = 2 # input layer size (we have two input features)
nn_output_dim = 1  # output layer size (we have one output)

# Gradient descent parameters
alpha = 0.1  # learning rate for gradient descent
# print(Y)
# print(X)

In [16]:
def sigmoid(x):
    # TODO 1: Compute the sigmoid function at the given x (~1 line)
    # For example: sigmoid(2) should compute the value of sigmoid function at x = 2.
    # Hint: Use np.exp instead of math.exp to allow for vectorization.
    #----------------------------------------------------------------------------------------------
    sig = (1/(1+np.exp(-x)))
    #----------------------------------------------------------------------------------------------
    
    return sig


In [30]:
def build_model(nn_hdim, num_passes=20000, print_loss=False):
    
    # This function learns parameters for the neural network and returns the model.
    # - nn_hdim: Number of nodes in the hidden layer
    # - num_passes: Number of iterations (epochs) through the training data for gradient descent
    # - print_loss: If True, print the loss every 1000 iterations

    # Initialize the parameters to random values. We need to learn these at the end.
    np.random.seed(0)
    W1 = np.random.randn(nn_hdim, nn_input_dim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((nn_hdim, 1))
    W2 = np.random.randn(nn_hdim, nn_hdim) / np.sqrt(nn_input_dim)
    b2 = np.zeros((nn_hdim, 1))
    W3 = np.random.randn(nn_hdim, nn_hdim) / np.sqrt(nn_input_dim)
    b3 = np.zeros((nn_hdim, 1))
    W4 = np.random.randn(nn_output_dim, nn_hdim) / np.sqrt(nn_hdim)
    b4 = np.zeros((nn_output_dim, 1))

    # This is what we return at the end
    model = {}

    # Batch Gradient descent (We accumulate the loss for each training point before updating the weights)
    # For each iteration:
    for i in range(0, num_passes):
        DW1 = 0
        DW2 = 0
        DW3 = 0
        DW4 = 0
        Db1 = 0
        Db2 = 0
        Db3 = 0
        Db4 = 0
        cost = 0
        # Loop on every training example...
        for j in range(0, m):
            a0 = X[j, :].reshape(-1, 1)  # Every training example is a column vector.
            y = Y[j]
            
            # TODO 2: Apply forward propagation on every training example a0 (a column vector 2x1) with its
            # corresponding label y. It is required to compute z1, a1, z2, and a2
            #----------------------------------------------------------------------------------------------
            # Forward propagation
            z1 = np.dot(W1 , a0 )+ b1
            a1 = np.tanh(z1)
            z2 = np.dot(W2 , a1 )+ b2
            a2 = np.tanh(z2)
            z3 = np.dot(W3 , a2 )+ b3
            a3 = np.tanh(z3)
            z4 = np.dot(W4 , a3) + b4
            a4 = sigmoid(z4)
            #----------------------------------------------------------------------------------------------

            # TODO 3: Compute the cost/loss function for every training example (Hint: use np.log)
            # ---------------------------------------------------------------------------------------------
            cost_j = -1 * ((np.log(a4) * y + (1-y)* np.log(1-a4)))
            # ---------------------------------------------------------------------------------------------

            # TODO 4: Derive the equations of backpropagation to find dW2, db2, dW1, and db1.
            # Hint: Check the dimensions at each step. 
            # Hint: For element-wise multiplication use *, for matrix multiplication use @
            # Example: y = A * B performs element wise multiplication 
            #          y = A @ B performs matrix multiplication
            # ---------------------------------------------------------------------------------------------
            da4 =  ( -y/a4  + (1-y)/(1-a4) )
            dz4 =  da4 * a4 * ( 1 - a4)
            dW4 = np.dot(dz4 , a3.T)
            db4 = dz4
            
            da3 =  np.dot(W4.T, dz4)
            dz3 = np.multiply(da3 , 1 - np.square(a3) )
            dW3 = np.dot(dz3 , a2.T )
            db3 = dz3
            
            da2 =  np.dot(W3.T, dz3)
            dz2 = np.multiply(da2 , 1 - np.square(a2) )
            dW2 = np.dot(dz2 , a1.T )
            db2 = dz2

            da1 =  np.dot(W2.T, dz2)
            dz1 = np.multiply(da1 , 1 - np.square(a1) )
            dW1 = np.dot(dz1 , a0.T )
            db1 = dz1
            
            # ---------------------------------------------------------------------------------------------
            
            # Accumulating the sum of dW1, db1, dW2, db2 and cost_j into the variables DW1, Db1, DW2, Db2 and cost
            # for all training set. 
            DW1 += dW1
            DW2 += dW2
            DW3 += dW3
            DW4 += dW4
            Db4 += db4
            Db3 += db3
            Db2 += db2
            Db1 += db1
            cost += cost_j
        
        # Averaging DW1, DW2, Db1, Db2 and cost over the m training examples. 
        DW1 /= m
        DW2 /= m
        DW3 /= m
        DW4 /= m
        Db1 /= m
        Db2 /= m
        Db3 /= m
        Db4 /= m
        cost /= m

        # TODO 5: Perform the gradient descent parameter update.
        # ---------------------------------------------------------------------------------------------------
        # Gradient descent parameter update
        W1 -= alpha * DW1
        b1 -= alpha * Db1
        W2 -= alpha * DW2
        b2 -= alpha * Db2
        W3 -= alpha * DW3
        b3 -= alpha * Db3
        W4 -= alpha * DW4
        b4 -= alpha * Db4
        # ---------------------------------------------------------------------------------------------------

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2, 'W3': W3, 'b3': b3, 'W4': W4, 'b4': b4}

        # Optionally print the loss.
        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, cost))

    return model


In [17]:
# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    a0 = x.T
    
    # TODO 6 (aka TODO 2): Apply forward propagation on every test example a0 (a column vector 2x1) with its
    #  corresponding label y. It is required to compute z1, a1, z2, and a2  (SAME AS TODO2).
    # -----------------------------------------------------------------------------------------------
    z1 = np.dot(W1 , a0) + b1
    a1 = np.tanh(z1)
    z2 = np.dot(W2 , a1) + b2
    a2 = sigmoid(z2)
    # ------------------------------------------------------------------------------------------------
    # Applying a threshold of 0.5 (i.e. predictions greater than 0.5 are mapped to 1, and 0 otherwise)
#     prediction = np.round(a2)
    prediction = a2
    
    return prediction

# def predict(model, x):
#     W1, b1, W2, b2, W3, b3, W4, b4 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3'], model['W4'], model['b4']
#     a0 = x.T
    
#     # TODO 6 (aka TODO 2): Apply forward propagation on every test example a0 (a column vector 2x1) with its
#     #  corresponding label y. It is required to compute z1, a1, z2, and a2  (SAME AS TODO2).
#     # -----------------------------------------------------------------------------------------------
#     z1 = np.dot(W1 , a0 )+ b1
#     a1 = np.tanh(z1)
#     z2 = np.dot(W2 , a1 )+ b2
#     a2 = np.tanh(z2)
#     z3 = np.dot(W3 , a2 )+ b3
#     a3 = np.tanh(z3)
#     z4 = np.dot(W4 , a3) + b4
#     a4 = sigmoid(z4)
#     # ------------------------------------------------------------------------------------------------
#     # Applying a threshold of 0.5 (i.e. predictions greater than 0.5 are mapped to 1, and 0 otherwise)
#     # prediction = np.round(a2)
#     prediction = a4    
#     return prediction

In [32]:
model = build_model(nn_hdim= 8, num_passes = 10001, print_loss=True)

Loss after iteration 0: 0.670913
Loss after iteration 1000: 0.566402
Loss after iteration 2000: 0.563649
Loss after iteration 3000: 0.563039
Loss after iteration 4000: 0.562859
Loss after iteration 5000: 0.562725
Loss after iteration 6000: 0.562606
Loss after iteration 7000: 0.562495
Loss after iteration 8000: 0.562387
Loss after iteration 9000: 0.562281
Loss after iteration 10000: 0.562172


In [16]:
print(model)

NameError: name 'model' is not defined

In [18]:
# # print(model)
# model tf-idf not normalized
# model = {'W1': np.array([[ 1.38785383,  0.49082933],
#        [ 0.55641748,  1.62806027],
#        [ 1.85376828, -0.3225326 ],
#        [ 1.78168709,  0.05507265],
#        [ 1.5035727 ,  0.15089058],
#        [-0.21027833,  1.92211576],
#        [ 0.46820238,  0.16111129],
#        [ 0.84196808,  0.12407808]]), 'b1': np.array([[ 0.38819622],
#        [-0.02713244],
#        [-0.8429403 ],
#        [-0.91256074],
#        [-0.66510711],
#        [ 0.01664907],
#        [ 0.03959506],
#        [-0.22607371]]), 'W2': np.array([[ 0.6266244 ,  0.50983672, -1.1677148 , -1.47645926, -1.39277239,
#          1.53280975,  0.17978621, -0.50006151]]), 'b2': np.array([[0.58550391]])}

# model = {'W1': np.array([[ 1.18336695,  0.1250435 ], # 3 layers
#        [ 0.38733117,  1.8530943 ],
#        [ 1.75886377, -0.58510996],
#        [ 0.54240451, -0.05259935],
#        [-0.09248666,  0.2141493 ],
#        [ 0.0432581 ,  0.9329085 ],
#        [ 0.66859945,  0.25386436],
#        [ 0.5554128 ,  0.36237266]]), 'b1': np.array([[-0.28405656],
#        [-0.40425758],
#        [-0.99870768],
#        [ 0.05080746],
#        [-0.09605534],
#        [-0.26051699],
#        [-0.18459905],
#        [-0.28472492]]), 'W2': np.array([[ 1.13457184, -0.20822159,  0.38311448, -0.51971462, -1.7927709 ,
#          0.47739742,  0.70361151, -0.44614564],
#        [ 1.63177288, -1.0197636 , -0.08924496, -0.14037122,  1.09361726,
#          1.04981611,  0.09540724,  0.2347938 ],
#        [-0.67150498, -1.45686798, -0.20758122,  0.09659014,  0.85981688,
#          0.80570778, -0.296603  , -0.22930296],
#        [-0.8237228 , -1.15021352, -1.24466209,  1.34648287, -0.36599564,
#         -0.38097044, -0.92866868,  0.50709689],
#        [-1.05799442, -0.01422579, -0.90377796,  0.28580989, -0.35451199,
#         -0.7717647 , -0.02472574,  0.26142491],
#        [ 0.01077594,  0.59710575, -0.83069778, -0.40593486, -0.49098818,
#         -0.20682705, -0.76479984, -1.38033905],
#        [ 0.03789108, -0.00967381, -1.58485507,  0.14915192, -0.62770378,
#          0.15492018,  0.36470344, -0.01501329],
#        [ 0.78647832, -1.02272337,  0.42132782, -0.45981076, -0.62587553,
#         -0.50757541, -0.22120812,  0.02481316]]), 'b2': np.array([[-0.05359941],
#        [ 0.12717962],
#        [-0.061023  ],
#        [-0.05207335],
#        [ 0.31889328],
#        [ 0.25685478],
#        [ 0.1398867 ],
#        [-0.04797326]]), 'W3': np.array([[-0.80349617,  0.71868534,  0.20718625, -1.20396728,  1.00463432,
#          1.57134532,  1.02363706, -0.37870051],
#        [-0.89039626,  0.50355204, -0.02357477,  1.08788051,  0.29202038,
#          0.60301908,  0.07305096,  0.60210272],
#        [ 0.03802195,  1.28299045,  0.06325746,  0.27166014,  1.30692966,
#         -1.09726151, -1.02922169,  0.86082546],
#        [-0.90207611,  1.29694031, -0.26202875, -0.47270786,  1.46564552,
#          1.30205973,  1.4459055 ,  0.57549995],
#        [-0.66125529,  1.43636039, -0.21285799,  0.59702511,  0.6944864 ,
#         -0.04123769,  0.48235273,  0.71124213],
#        [ 0.263836  , -0.77833376,  0.21475251,  0.94851392, -0.48556096,
#         -0.10818819, -0.30816457,  1.30529932],
#        [ 0.487098  ,  0.38296521, -0.62201678,  0.30699157, -0.51424632,
#          0.09400846, -0.3431704 ,  0.40476813],
#        [ 0.39364728, -0.2458679 ,  0.37127184, -0.72310426, -1.03654804,
#          0.2336883 ,  0.0376808 ,  0.47617952]]), 'b3': np.array([[ 0.14588719],
#        [-0.32320868],
#        [ 0.02159727],
#        [-0.02307498],
#        [ 0.08842675],
#        [ 0.00334234],
#        [ 0.09982744],
#        [-0.13245038]]), 'W4': np.array([[ 1.27121312,  0.57705831, -0.72331337,  0.69760112, -0.86182132,
#          0.17466716, -0.23174607,  0.40275134]]), 'b4': np.array([[-0.47140409]])}
# model tf-idf normalized
model = {'W1': np.array([[ 1.29643591,  0.45074824],
       [ 0.70773118,  1.58460852],
       [ 2.67738766, -1.74837318],
       [ 1.33109077,  0.10529432],
       [ 0.69949236,  0.23108707],
       [-0.35599099,  2.07436754],
       [ 0.47995731,  0.24472955],
       [ 0.63720221,  0.2369889 ]]), 'b1': np.array([[ 0.24555316],
       [-0.02617257],
       [-1.61662627],
       [-1.31429877],
       [-0.17949485],
       [ 0.07467697],
       [ 0.13714999],
       [ 0.03253189]]), 'W2': np.array([[-0.20112114,  0.1343918 , -2.29593635, -1.68775395, -1.66578869,
         1.04315377, -0.56503497, -1.11453418]]), 'b2': np.array([[-0.64778329]])}

In [23]:
def test(article_preprocessed_test, original_test, summarized_text, compression_ratio, file_number = 0):
    X_test = generate_X_labels(article_preprocessed_test)
    predicton = predict(model, X_test)
    Y_test, original_sentences = generate_Y_labels(original_test, summarized_text)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))
    
    
    highest = np.argsort(predicton[0]) [::-1]
    highest = highest[: num_sentences_summarized]
#     highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    output_indices = []
    

    
    for i in range (0, num_sentences_summarized):
        output_sentences.append(original_sentences[highest[i]])
        output_indices.append(highest[i])
        
    output_sentences = ''.join(output_sentences)
    
    rouge = Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    scores_nn = rouge.get_scores(output_sentences, summarized_text)
    
    rouge_1_nn = scores_nn['rouge-1']['f']
    rouge_2_nn = scores_nn['rouge-2']['f']
    rouge_l_nn = scores_nn['rouge-l']['f']
    
#     print('article number: %d' % (file_number))
#     print('nn accuracy')
#     print('Rouge 1 score is: %f' % (rouge_1_nn))
#     print('Rouge 2 score is: %f' % (rouge_2_nn))
#     print('Rouge l score is: %f' % (rouge_l_nn))
    
    
    
    ### classified using TF_idf score

    output_tf_idf = summary_using_tf_idf_only(original_test, num_sentences_summarized)
    
    scores_tf_idf = rouge.get_scores(output_tf_idf, summarized_text)
    
    rouge_1_tf_idf = scores_tf_idf['rouge-1']['f']
    rouge_2_tf_idf = scores_tf_idf['rouge-2']['f']
    rouge_l_tf_idf = scores_tf_idf['rouge-l']['f']
    
#     print('t-idf accuracy')
#     print('Rouge 1 score is: %f' % (rouge_1_tf_idf))
#     print('Rouge 2 score is: %f' % (rouge_2_tf_idf))
#     print('Rouge l score is: %f' % (rouge_l_tf_idf))

#     print(summarized_text)
#     print('-----')
#     print(output_sentences)
#     print('------------------------------')
    
    return rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf

In [24]:
def summary_using_tf_idf_only(text, num_sentences):
    article_sentences = sent_tokenize(text)
    article_preprocessed = preprocessing(text)
    sentence_scores = calculate_TF_IDF(article_preprocessed)
    highest = np.argsort(sentence_scores) [::-1]
    highest = highest[: num_sentences]
    highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    
    for i in range (0, num_sentences):
        output_sentences.append(article_sentences[highest[i]])
        
    
    output_sentences = ' '.join(output_sentences)
    
    return output_sentences

In [25]:
article_file = io.open("cr7.txt", "r", encoding='utf-8-sig')
article = article_file.read()
article_file.close
summary = summary_using_tf_idf_only(article, 7)
print(summary)

Real Madrid forward Cristiano Ronaldo has said that he is the "best player in history" after winning his fifth Ballon d'Or on Thursday. Ronaldo picked up the award for the second year in a row to equal the record of Barcelona star Lionel Messi, and he said he does not believe any player is better than him. He told France Football (h/t Goal's Robin Bairner): "I've never seen anyone better than me. I play well with both feet, I’m quick, powerful, good with the head, I score goals, I make assists. That says something, doesn’t it? Legends like Floyd Mayweather [Jr.] and LeBron James don’t get to their perfect level by chance. To be at the top and to stay there, you have to have more talent than the others."


In [26]:
# precision_nn = []
# recall_nn = []
# precision_tf_idf = []
# recall_tf_idf = []

rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range(101, 131):

        article_file = io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf-8-sig')
        article_file.readline()
        article = article_file.read()
        article_preprocessed = preprocessing(article)
        article_file.close()

        summarized_file = io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf-8-sig')
        summarized = summarized_file.read()
        summarized_file.close()

        rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
            article_preprocessed, article, summarized, 0.35, i)

        rouge_1_list_nn.append(rouge_1_nn)
        rouge_2_list_nn.append(rouge_2_nn)
        rouge_l_list_nn.append(rouge_l_nn)

        rouge_1_list_tf_idf.append(rouge_1_tf_idf)
        rouge_2_list_tf_idf.append(rouge_2_tf_idf)
        rouge_l_list_tf_idf.append(rouge_l_tf_idf)


print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

# print('Neural network accuracy: ')
# print('Average precision score is: %f' % (np.average(precision_nn)))
# print('Average recall score is: %f' % (np.average(recall_nn)))
      
# print('Classical approach accuracy using tf-idf: ')
# print('Average precision score is: %f' % (np.average(precision_tf_idf)))
# print('Average recall score is: %f' % (np.average(recall_tf_idf)))

Using nn
Average Rouge 1 score is: 0.742477
Average Rouge 2 score is: 0.660360
Average Rouge l score is: 0.545702
Using tf_idf only
Average Rouge 1 score is: 0.371251
Average Rouge 2 score is: 0.134294
Average Rouge l score is: 0.213952


In [27]:
rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

df = pd.read_csv('test.csv')
articles_cnn = df['article']
summaries_cnn = df['highlights']

i = 0
for article_cnn, summary_cnn in zip(articles_cnn, summaries_cnn):
    article_cnn_preprocessed = preprocessing(article_cnn)
    rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
        article_cnn_preprocessed, article_cnn, summary_cnn, 0.35, i)
    i += 1
    if (i == 300):
        break
    
    rouge_1_list_nn.append(rouge_1_nn)
    rouge_2_list_nn.append(rouge_2_nn)
    rouge_l_list_nn.append(rouge_l_nn)

    rouge_1_list_tf_idf.append(rouge_1_tf_idf)
    rouge_2_list_tf_idf.append(rouge_2_tf_idf)
    rouge_l_list_tf_idf.append(rouge_l_tf_idf)

print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

    

Using nn
Average Rouge 1 score is: 0.336225
Average Rouge 2 score is: 0.138137
Average Rouge l score is: 0.277759
Using tf_idf only
Average Rouge 1 score is: 0.264159
Average Rouge 2 score is: 0.074046
Average Rouge l score is: 0.209739


In [28]:
def summarize(article, compression_ratio):
    original_sentences = sent_tokenize(article)
    article_preprocessed_entered = preprocessing(article)
    X_test_entered = generate_X_labels(article_preprocessed_entered)
    summary_predicted = predict(model, X_test_entered)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))
    
    
    highest = np.argsort(summary_predicted[0]) [::-1]
    highest = highest[: num_sentences_summarized]
    highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    
    for i in range (0, num_sentences_summarized):
        output_sentences.append(original_sentences[highest[i]])
        
    
    output_sentences = ' '.join(output_sentences)
    
    return output_sentences
    

In [29]:
article_file = io.open("cr7.txt", "r", encoding='utf-8-sig')
article = article_file.read()
article_file.close
# print(article)
summary = summarize(article, 0.35)
print(summary)

Real Madrid forward Cristiano Ronaldo has said that he is the "best player in history" after winning his fifth Ballon d'Or on Thursday. Ronaldo picked up the award for the second year in a row to equal the record of Barcelona star Lionel Messi, and he said he does not believe any player is better than him. He told France Football (h/t Goal's Robin Bairner): "I've never seen anyone better than me. No footballer can do the things I can. "There’s no player more complete than me. But I tell you: there’s no one more complete than me.


In [30]:
article_file = io.open("godzilla.txt", "r", encoding='utf-8-sig')
article = article_file.read()
article_file.close
# print(article)
summary = summarize(article, 0.35)
print(summary)

An actor dressed as the giant creature breathed smoke over photographers on Monday as Godzilla received the 2,271st star on Hollywood Boulevard. "Godzilla should thank you for this historical and monumental star," said Final Wars producer Shogo Tomiyama. Hollywood's honorary mayor, Johnny Grant, said: "I do hereby proclaim this Godzilla Day in Hollywood. The premiere of Godzilla: Final Wars at Grauman's Chinese Theatre followed the ceremony on Hollywood Boulevard. Director Ryuhei Kitamura said it may not be Godzilla's final outing, as it has been billed. And producer Shogo Tomiyama added: "So long as Godzilla can fascinate people, I believe he will be resurrected by new generations of filmmakers in the future."
