In [1]:
import re
import string
import numpy as np
import math
import nltk
import io
import csv
import pandas as pd


from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice
from rouge import Rouge



In [2]:
# nltk.download('stopwords')
# nltk.download()  # uncomment these lines once they are not downloaded

In [3]:
def preprocessing(article):
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    article_preprocessed = []
    sentences = sent_tokenize(article)
    for sentence in sentences:
        sentence_preprocessed = []
        sentence = re.sub(r"[^a-zA-Z\s]+", "", sentence)
        words = word_tokenize(sentence)
        for word in words:
            if (word not in stopwords_english and word not in string.punctuation):
                word_stemmed = stemmer.stem(word)  
                sentence_preprocessed.append(word_stemmed)
#         sentence_preprocessed = " ".join(sentence_preprocessed)
        if sentence_preprocessed:
            article_preprocessed.append(sentence_preprocessed)
            
        
        
#     words = [word_tokenize(sent) for sent in sentences]
#     words_without_stopwords = [[word for word in sent if word not in stopwords.words('english')] for sent in words]
    return article_preprocessed


In [4]:
article_file = io.open("articles/original (" + str(1) +").txt", "r", encoding='utf-8-sig')
article_file.readline()
article = article_file.read()
article_file.close()

# print(article)
article_preprocessed = preprocessing(article)
print(article_preprocessed)
print(len(article_preprocessed))

[['quarterli', 'profit', 'us', 'media', 'giant', 'timewarn', 'jump', 'bn', 'three', 'month', 'decemb', 'yearearli'], ['the', 'firm', 'one', 'biggest', 'investor', 'googl', 'benefit', 'sale', 'highspe', 'internet', 'connect', 'higher', 'advert', 'sale'], ['timewarn', 'said', 'fourth', 'quarter', 'sale', 'rose', 'bn', 'bn'], ['it', 'profit', 'buoy', 'oneoff', 'gain', 'offset', 'profit', 'dip', 'warner', 'bro', 'less', 'user', 'aol'], ['time', 'warner', 'said', 'friday', 'own', 'searchengin', 'googl'], ['but', 'internet', 'busi', 'aol', 'mix', 'fortun'], ['it', 'lost', 'subscrib', 'fourth', 'quarter', 'profit', 'lower', 'preced', 'three', 'quarter'], ['howev', 'compani', 'said', 'aol', 'underli', 'profit', 'except', 'item', 'rose', 'back', 'stronger', 'internet', 'advertis', 'revenu'], ['it', 'hope', 'increas', 'subscrib', 'offer', 'onlin', 'servic', 'free', 'timewarn', 'internet', 'custom', 'tri', 'sign', 'aol', 'exist', 'custom', 'highspe', 'broadband'], ['timewarn', 'also', 'restat', '

In [5]:
def convert_list_to_string(sentences):  # converts list of lists to list of strings
    sentences_modified = []   # list of strings
    for sentence in sentences:
        sentence_modified = ' '.join(sentence)
        sentences_modified.append(sentence_modified)
    return sentences_modified

In [6]:
### Feature 1

def calculate_TF_IDF(content):
    flat_words = [word for sent in content for word in sent]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(flat_words)
    feature_names = tfidf_vectorizer.get_feature_names()
    word_scores = {}
    for i, feature in enumerate(feature_names):
        word_scores[feature] = tfidf_matrix[:, i].sum()
    return word_scores
    

In [7]:
def calculate_each_sentence_score(article_preprocessed, word_scores):
    sentence_scores = []
    for sent in article_preprocessed:
        score = 0
        for word in sent:
            score += word_scores.get(word, 0)
        sentence_scores.append(score)
        
    sentence_scores = sentence_scores / max(sentence_scores)
    return sentence_scores

In [8]:
def generate_summary(sentences, sentence_scores, num_sentences):
    top_sentences_idx = sorted(range(len(sentence_scores)), key=lambda i: sentence_scores[i], reverse=True)[:num_sentences]
    summary = [sentences[i] for i in top_sentences_idx]
    return ' '.join(summary)

In [9]:
def test_tf_idf(file_name): # using tf-idf
    
    article_file = io.open("articles/original (" + str(file_name) +").txt", "r", encoding='utf-8-sig')
    article_file.readline()
    article = article_file.read()
    article_file.close()
    
    sentences = sent_tokenize(article)
    sentences[0] = sentences[0][1:]
    
    summarized_file = io.open("articles/summarized (" + str(file_name) +").txt", "r", encoding='utf-8-sig')
    summarized_original = summarized_file.read()
    summarized_file.close()

    article_preprocessed = preprocessing(article)
    word_scores = calculate_TF_IDF(article_preprocessed)
    sentence_scores = calculate_each_sentence_score(article_preprocessed, word_scores)
    summary = generate_summary(sentences, sentence_scores, 7)
    
    print(sentence_scores)
    rouge = Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    scores_tf_idf = rouge.get_scores(summary, summarized_original)
    
    rouge_1_tf_idf = scores_tf_idf['rouge-1']['f']
    rouge_2_tf_idf = scores_tf_idf['rouge-2']['f']
    rouge_l_tf_idf = scores_tf_idf['rouge-l']['f']
    print('t-idf accuracy')
    print('Rouge 1 score is: %f' % (rouge_1_tf_idf))
    print('Rouge 2 score is: %f' % (rouge_2_tf_idf))
    print('Rouge l score is: %f' % (rouge_l_tf_idf))

In [10]:
test_tf_idf(301)

[0.75       0.28571429 0.64285714 0.96428571 0.85714286 0.60714286
 0.75       0.82142857 0.64285714 1.         0.64285714 0.32142857
 0.35714286 0.96428571]
t-idf accuracy
Rouge 1 score is: 0.709402
Rouge 2 score is: 0.629310
Rouge l score is: 0.529915




In [11]:
### Feature 2

def sentence_length(article_preprocessed):
    article_preprocessed = convert_list_to_string(article_preprocessed)
    max_length = 0
    for sentence in article_preprocessed:
        # print(sentence)
        if len(sentence.split()) > max_length:
            max_length = len(sentence.split())
            
    sentence_length_feature = []
    for sentence in article_preprocessed:
        sentence_length_feature.append(len(sentence.split()) / max_length)

#     sentence_length_feature = []
#     for sentence in article_preprocessed:
#         sentence_length_feature.append(1 / len(sentence.split()))

    return sentence_length_feature

In [12]:
article_file = io.open("articles/original (" + str(1) +").txt", "r", encoding='utf-8-sig')
article_file.readline()
art = article_file.read()
article_file.close()

art = preprocessing(art)
art = convert_list_to_string(art)
sentence_length(article_preprocessed)

[0.5454545454545454,
 0.6363636363636364,
 0.36363636363636365,
 0.5909090909090909,
 0.3181818181818182,
 0.2727272727272727,
 0.45454545454545453,
 0.6363636363636364,
 0.8181818181818182,
 0.5909090909090909,
 0.4090909090909091,
 1.0,
 0.45454545454545453,
 0.7727272727272727,
 0.6363636363636364,
 0.5,
 0.4090909090909091,
 0.5909090909090909,
 0.7727272727272727,
 0.4090909090909091]

In [13]:
def generate_X_labels(article_preprocessed):
    # feature 1 (tf_idf)
    word_scores = calculate_TF_IDF(article_preprocessed)
    tf_idf_score_feature = calculate_each_sentence_score(article_preprocessed, word_scores)
    
    # feature 2 (sentence_length)
    sentence_length_feature = sentence_length(article_preprocessed)
    
    matrix = np.column_stack((tf_idf_score_feature, sentence_length_feature))
#     matrix = np.array(tf_idf_score_feature).reshape(len(tf_idf_score_feature), 1)

    return matrix

In [14]:
article_file = io.open("articles/original (" + str(1) +").txt", "r", encoding='utf-8-sig')
article_file.readline()
article = article_file.read()
article_file.close()

art = preprocessing(article)
art = convert_list_to_string(art)
print(generate_X_labels(article_preprocessed))
print(len(generate_X_labels(article_preprocessed)))

[[0.79069767 0.54545455]
 [0.62790698 0.63636364]
 [0.81395349 0.36363636]
 [0.97674419 0.59090909]
 [0.34883721 0.31818182]
 [0.37209302 0.27272727]
 [0.76744186 0.45454545]
 [0.95348837 0.63636364]
 [1.         0.81818182]
 [0.58139535 0.59090909]
 [0.60465116 0.40909091]
 [0.81395349 1.        ]
 [0.88372093 0.45454545]
 [0.53488372 0.77272727]
 [0.81395349 0.63636364]
 [0.62790698 0.5       ]
 [0.39534884 0.40909091]
 [0.48837209 0.59090909]
 [0.86046512 0.77272727]
 [0.65116279 0.40909091]]
20


In [15]:
def generate_Y_labels(original, summarized):
    Y_list = []
    original_sentences = sent_tokenize(original)
    original_sentences[0] = original_sentences[0][1:] # to remove the \n
    summarized_sentences = sent_tokenize(summarized)
    
    for original_sentence in original_sentences:
        added = 0
        for summarized_sentence in summarized_sentences:
            if original_sentence in summarized_sentence:
                Y_list.append(1)
                added = 1
                break
        if added == 0:
            Y_list.append(0)
    
    return Y_list, original_sentences
    

In [16]:
article_file = io.open("articles/original (" + str(301) +").txt", "r", encoding='utf-8-sig')
article_file.readline()
article = article_file.read()
article_file.close()

summarized_file = io.open("articles/summarized (" + str(301) +").txt", "r", encoding='utf-8-sig')
summarized = summarized_file.read()
summarized_file.close()

Y,_ = generate_Y_labels(article, summarized)
print(Y)
print(len(Y))

[1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0]
14


In [17]:
X_matrix = []
X = []
Y = []
sentences = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range (1, 51):   # loading business articles
        article_file = io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf-8-sig')
        article_file.readline()
        article = article_file.read()
        article_file.close()

        summarized_file = io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf-8-sig')
        summarized = summarized_file.read()
        summarized_file.close()

        article_preprocessed = preprocessing(article)
    #     article_preprocessed = convert_list_to_string(article_preprocessed)
        X_i = generate_X_labels(article_preprocessed)
        Y_i, original_list_no_first_space = generate_Y_labels(article, summarized)

        if(len(X_i) != len(Y_i)):
            print('Error! features and labels are not equal in length')

        Y.extend(Y_i)
        X_matrix.extend(X_i)
        sentences.extend(original_list_no_first_space)
    

for x in X_matrix:
    X.append(x.tolist())
    
X = np.matrix(X)

m = len(X)
# m = 500
print(len(X))
print(len(Y))

4179
4179


In [18]:
nn_input_dim = 2 # input layer size (we have two input features)
nn_output_dim = 1  # output layer size (we have one output)

# Gradient descent parameters
alpha = 0.1  # learning rate for gradient descent
# print(Y)
# print(X)

In [19]:
def sigmoid(x):
    # TODO 1: Compute the sigmoid function at the given x (~1 line)
    # For example: sigmoid(2) should compute the value of sigmoid function at x = 2.
    # Hint: Use np.exp instead of math.exp to allow for vectorization.
    #----------------------------------------------------------------------------------------------
    sig = (1/(1+np.exp(-x)))
    #----------------------------------------------------------------------------------------------
    
    return sig


In [20]:
def build_model(nn_hdim, num_passes=20000, print_loss=False):
    
    np.random.seed(0)
    W1 = np.random.randn(nn_hdim, nn_input_dim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((nn_hdim, 1))
    W2 = np.random.randn(nn_output_dim, nn_hdim) / np.sqrt(nn_hdim)
    b2 = np.zeros((nn_output_dim, 1))

    model = {}

    for i in range(0, num_passes):
        DW1 = 0
        DW2 = 0
        Db1 = 0
        Db2 = 0
        cost = 0

        for j in range(0, m):
            a0 = X[j, :].reshape(-1, 1)  # Every training example is a column vector.
            y = Y[j]
            
            z1 = np.dot(W1 , a0 )+ b1
            a1 = np.tanh(z1)
            z2 = np.dot(W2 , a1) + b2
            a2 = sigmoid(z2)
            
#             if (i == num_passes -1 ):
#                 print('True value: %f, got: %f'% (y, a2))

            cost_j = -1 * ((np.log(a2) * y + (1-y)* np.log(1-a2)))

            da2 =  ( -y/a2  + (1-y)/(1-a2) )
            dz2 =  da2 * a2 * ( 1 - a2)
            dW2 = np.dot(dz2 , a1.T)
            db2 = dz2

            da1 =  np.dot(dz2,W2).T
            dz1 = np.multiply(da1 , 1 - np.square(a1) )
            dW1 = np.dot(dz1 , a0.T )
            db1 = dz1

            DW1 += dW1
            DW2 += dW2
            Db2 += db2
            Db1 += db1
            cost += cost_j
        
        DW1 /= m
        DW2 /= m
        Db1 /= m
        Db2 /= m
        cost /= m

        W1 -= alpha * DW1
        b1 -= alpha * Db1
        W2 -= alpha * DW2
        b2 -= alpha * Db2

        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, cost))

    return model

In [21]:
# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    a0 = x.T
    
    # TODO 6 (aka TODO 2): Apply forward propagation on every test example a0 (a column vector 2x1) with its
    #  corresponding label y. It is required to compute z1, a1, z2, and a2  (SAME AS TODO2).
    # -----------------------------------------------------------------------------------------------
    z1 = np.dot(W1 , a0) + b1
    a1 = np.tanh(z1)
    z2 = np.dot(W2 , a1) + b2
    a2 = sigmoid(z2)
    # ------------------------------------------------------------------------------------------------
    # Applying a threshold of 0.5 (i.e. predictions greater than 0.5 are mapped to 1, and 0 otherwise)
#     prediction = np.round(a2)
    prediction = a2
    
    return prediction

In [22]:
model = build_model(nn_hdim= 8, num_passes = 10001, print_loss=True)

Loss after iteration 0: 0.703959
Loss after iteration 1000: 0.480920
Loss after iteration 2000: 0.477243
Loss after iteration 3000: 0.476367
Loss after iteration 4000: 0.476015
Loss after iteration 5000: 0.475833
Loss after iteration 6000: 0.475719
Loss after iteration 7000: 0.475636
Loss after iteration 8000: 0.475568
Loss after iteration 9000: 0.475507
Loss after iteration 10000: 0.475452


In [23]:
print(model)

{'W1': array([[ 2.65985065, -0.5365754 ],
       [ 0.6183042 ,  1.65977413],
       [ 1.9195185 , -1.87884063],
       [ 0.69353291, -0.08007882],
       [-1.08322081,  1.40649314],
       [ 0.20895092,  1.01007634],
       [ 1.3257662 , -0.30570778],
       [ 0.11674894,  0.35820707]]), 'b1': array([[-0.56912543],
       [ 0.02103428],
       [-0.24905846],
       [ 0.1409427 ],
       [ 0.2860866 ],
       [-0.05592954],
       [-0.06771996],
       [ 0.113594  ]]), 'W2': array([[ 2.3577487 , -0.3403027 ,  2.22138009,  0.28017802, -1.93652971,
        -0.08309288,  1.22371814, -0.24322531]]), 'b2': array([[-0.39537232]])}


In [24]:
def test(article_preprocessed_test, original_test, summarized_text, compression_ratio, file_number = 0):
    X_test = generate_X_labels(article_preprocessed_test)
    predicton = predict(model, X_test)
    Y_test, original_sentences = generate_Y_labels(original_test, summarized_text)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))
    
    
    highest = np.argsort(predicton[0]) [::-1]
    highest = highest[: num_sentences_summarized]
#     highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    output_indices = []
    

    
    for i in range (0, num_sentences_summarized):
        output_sentences.append(original_sentences[highest[i]])
        output_indices.append(highest[i])
        
    output_sentences = ''.join(output_sentences)
    
    rouge = Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    scores_nn = rouge.get_scores(output_sentences, summarized_text)
    
    rouge_1_nn = scores_nn['rouge-1']['f']
    rouge_2_nn = scores_nn['rouge-2']['f']
    rouge_l_nn = scores_nn['rouge-l']['f']
    
#     print('article number: %d' % (file_number))
#     print('nn accuracy')
#     print('Rouge 1 score is: %f' % (rouge_1_nn))
#     print('Rouge 2 score is: %f' % (rouge_2_nn))
#     print('Rouge l score is: %f' % (rouge_l_nn))
    
    
    
    ### classified using TF_idf score

    output_tf_idf = extractive_summary(original_test, num_sentences_summarized)
    
    scores_tf_idf = rouge.get_scores(output_tf_idf, summarized_text)
    
    rouge_1_tf_idf = scores_tf_idf['rouge-1']['f']
    rouge_2_tf_idf = scores_tf_idf['rouge-2']['f']
    rouge_l_tf_idf = scores_tf_idf['rouge-l']['f']
    
#     print('t-idf accuracy')
#     print('Rouge 1 score is: %f' % (rouge_1_tf_idf))
#     print('Rouge 2 score is: %f' % (rouge_2_tf_idf))
#     print('Rouge l score is: %f' % (rouge_l_tf_idf))
    
    return rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf

In [25]:
def extractive_summary(text, num_sentences):
    # Preprocess the text
    text = text.lower()
    text = ''.join(c for c in text if c not in '1234567890')
    sentences = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sentences]
    words_without_stopwords = [[word for word in sent if word not in stopwords.words('english')] for sent in words]
    
    # Calculate TF-IDF scores
    flat_words = [word for sent in words_without_stopwords for word in sent]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(flat_words)
    feature_names = tfidf_vectorizer.get_feature_names()
    word_scores = {}
    for i, feature in enumerate(feature_names):
        word_scores[feature] = tfidf_matrix[:, i].sum()
    
    # Calculate sentence scores
    sentence_scores = []
    for sent in words_without_stopwords:
        score = 0
        for word in sent:
            score += word_scores.get(word, 0)
        sentence_scores.append(score)
    
    # Select top N sentences with highest scores
    top_sentences_idx = sorted(range(len(sentence_scores)), key=lambda i: sentence_scores[i], reverse=True)[:num_sentences]
    summary = [sentences[i] for i in top_sentences_idx]
    return ' '.join(summary)


In [26]:
# precision_nn = []
# recall_nn = []
# precision_tf_idf = []
# recall_tf_idf = []

rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range(101, 131):

        article_file = io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf-8-sig')
        article_file.readline()
        article = article_file.read()
        article_preprocessed = preprocessing(article)
        article_file.close()

        summarized_file = io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf-8-sig')
        summarized = summarized_file.read()
        summarized_file.close()

        rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
            article_preprocessed, article, summarized, 0.35, i)

        rouge_1_list_nn.append(rouge_1_nn)
        rouge_2_list_nn.append(rouge_2_nn)
        rouge_l_list_nn.append(rouge_l_nn)

        rouge_1_list_tf_idf.append(rouge_1_tf_idf)
        rouge_2_list_tf_idf.append(rouge_2_tf_idf)
        rouge_l_list_tf_idf.append(rouge_l_tf_idf)


print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

# print('Neural network accuracy: ')
# print('Average precision score is: %f' % (np.average(precision_nn)))
# print('Average recall score is: %f' % (np.average(recall_nn)))
      
# print('Classical approach accuracy using tf-idf: ')
# print('Average precision score is: %f' % (np.average(precision_tf_idf)))
# print('Average recall score is: %f' % (np.average(recall_tf_idf)))

Using nn
Average Rouge 1 score is: 0.786239
Average Rouge 2 score is: 0.716092
Average Rouge l score is: 0.574576
Using tf_idf only
Average Rouge 1 score is: 0.686983
Average Rouge 2 score is: 0.575671
Average Rouge l score is: 0.503824


In [49]:
rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

df = pd.read_csv('test.csv')
articles_cnn = df['article']
summaries_cnn = df['highlights']

i = 0
for article_cnn, summary_cnn in zip(articles_cnn, summaries_cnn):
    article_cnn_preprocessed = preprocessing(article_cnn)
    rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
        article_cnn_preprocessed, article_cnn, summary_cnn, 0.35, i)
    i += 1
    if (i == 300):
        break
    
    rouge_1_list_nn.append(rouge_1_nn)
    rouge_2_list_nn.append(rouge_2_nn)
    rouge_l_list_nn.append(rouge_l_nn)

    rouge_1_list_tf_idf.append(rouge_1_tf_idf)
    rouge_2_list_tf_idf.append(rouge_2_tf_idf)
    rouge_l_list_tf_idf.append(rouge_l_tf_idf)

print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))
p

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

    

<class 'pandas.core.series.Series'>




Using nn
Average Rouge 1 score is: 0.326038
Average Rouge 2 score is: 0.130973
Average Rouge l score is: 0.268904
Using tf_idf only
Average Rouge 1 score is: 0.309745
Average Rouge 2 score is: 0.109878
Average Rouge l score is: 0.251321


In [45]:
def TextToSummarize(article, compression_ratio):
    original_sentences = sent_tokenize(article)
    article_preprocessed_entered = preprocessing(article)
    X_test_entered = generate_X_labels(article_preprocessed_entered)
    summary_predicted = predict(model, X_test_entered)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))
    
    
    highest = np.argsort(summary_predicted[0]) [::-1]
    highest = highest[: num_sentences_summarized]
    highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    

    
    for i in range (0, num_sentences_summarized):
        output_sentences.append(original_sentences[highest[i]])
        
    output_sentences = ''.join(output_sentences)
    
    return output_sentences
    

In [48]:
article_file = io.open("cr7.txt", "r", encoding='utf-8-sig')
article = article_file.read()
article_file.close
# print(article)
summary = TextToSummarize(article, 0.35)
print(summary)

Real Madrid forward Cristiano Ronaldo has said that he is the "best player in history" after winning his fifth Ballon d'Or on Thursday.Ronaldo picked up the award for the second year in a row to equal the record of Barcelona star Lionel Messi, and he said he does not believe any player is better than him.He told France Football (h/t Goal's Robin Bairner): "I've never seen anyone better than me."There’s no player more complete than me."No one has won as many individual trophies as me.It’s the sum of many things.
