In [1]:
import re
import string
import numpy as np
import math
import nltk

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice



In [2]:
# Fetch the NLTK stop-word corpus; preprocessing() reads it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def preprocessing(article):
    """Split an article into sentences and reduce each to a list of stemmed tokens.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    list[list[str]]
        One list of Porter-stemmed tokens per sentence, in article order.
        Sentences that end up with no tokens are omitted.
    """
    # Quote characters are removed before splitting, so they never appear in tokens.
    article = article.replace('"', "").replace("'", "")

    # A sentence ends at a blank line or at a full stop that is not followed by a
    # digit (so decimals such as "1.13" stay in one piece).
    lines = re.split(r"\n\n|\.(?!\d)", article)

    stemmer = PorterStemmer()
    # A set gives constant-time membership tests instead of scanning the stop-word
    # list once per token.
    stopwords_english = set(stopwords.words('english'))

    # NOTE(review): the stop-word test compares the token as written, so a
    # capitalised word such as "The" passes through (the sample output contains
    # 'the'). Left unchanged: filtering more tokens could change which sentences
    # survive and shift the row indices other cells rely on.
    # NOTE(review): `word not in string.punctuation` is a substring test against the
    # punctuation string, not a per-character check; kept as-is.
    article_preprocessed = []
    for line in lines:
        line_preprocessed = [
            stemmer.stem(word)
            for word in line.split()
            if word not in stopwords_english and word not in string.punctuation
        ]
        if line_preprocessed:   # skip sentences with no remaining tokens
            article_preprocessed.append(line_preprocessed)
    return article_preprocessed

In [4]:
# Load the first article and tokenise it; `contents` and `article_preprocessed`
# are reused by later cells.
with open("original (" + str(1) +").txt", "r", encoding="utf8") as file:
    contents = file.read()
    article_preprocessed = preprocessing(contents)
    
# Print each tokenised sentence, then how many sentences were kept.
for sentence in article_preprocessed:
    print(sentence)
print(len(article_preprocessed))

['ad', 'sale', 'boost', 'time', 'warner', 'profit']
['quarterli', 'profit', 'us', 'media', 'giant', 'timewarn', 'jump', '76%', '$1.13bn', '(£600m)', 'three', 'month', 'december,', '$639m', 'year-earli']
['the', 'firm,', 'one', 'biggest', 'investor', 'google,', 'benefit', 'sale', 'high-spe', 'internet', 'connect', 'higher', 'advert', 'sale']
['timewarn', 'said', 'fourth', 'quarter', 'sale', 'rose', '2%', '$11.1bn', '$10.9bn']
['it', 'profit', 'buoy', 'one-off', 'gain', 'offset', 'profit', 'dip', 'warner', 'bros,', 'less', 'user', 'aol']
['time', 'warner', 'said', 'friday', 'own', '8%', 'search-engin', 'googl']
['but', 'internet', 'business,', 'aol,', 'mix', 'fortun']
['it', 'lost', '464,000', 'subscrib', 'fourth', 'quarter', 'profit', 'lower', 'preced', 'three', 'quarter']
['however,', 'compani', 'said', 'aol', 'underli', 'profit', 'except', 'item', 'rose', '8%', 'back', 'stronger', 'internet', 'advertis', 'revenu']
['it', 'hope', 'increas', 'subscrib', 'offer', 'onlin', 'servic', 'free

In [5]:
# Show the raw article text for comparison with the tokenised sentences printed above.
print(contents)

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL

In [6]:
def convert_list_to_string(content):
    """Turn a list of token lists into a list of space-joined strings.

    Parameters
    ----------
    content : list[list[str]]
        One token list per sentence.

    Returns
    -------
    list[str]
        One string per sentence, tokens separated by a single space.
    """
    return [" ".join(tokens) for tokens in content]

In [7]:
# Show the tokenised sentences of the first article re-joined into plain strings.
print(convert_list_to_string(article_preprocessed))

['ad sale boost time warner profit', 'quarterli profit us media giant timewarn jump 76% $1.13bn (£600m) three month december, $639m year-earli', 'the firm, one biggest investor google, benefit sale high-spe internet connect higher advert sale', 'timewarn said fourth quarter sale rose 2% $11.1bn $10.9bn', 'it profit buoy one-off gain offset profit dip warner bros, less user aol', 'time warner said friday own 8% search-engin googl', 'but internet business, aol, mix fortun', 'it lost 464,000 subscrib fourth quarter profit lower preced three quarter', 'however, compani said aol underli profit except item rose 8% back stronger internet advertis revenu', 'it hope increas subscrib offer onlin servic free timewarn internet custom tri sign aol exist custom high-spe broadband', 'timewarn also restat 2000 2003 result follow probe us secur exchang commiss (sec), close conclud', 'time warner fourth quarter profit slightli better analyst expect', 'but film divis saw profit slump 27% $284m, help box-

In [8]:
### Feature 1

def calculate_TF_IDF(content):
    """Compute a dense TF-IDF matrix with one row per sentence.

    Parameters
    ----------
    content : list[str]
        Sentences as strings; each one is treated as a separate document.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per entry of ``content`` and one column per
        term learned by the vectorizer.
    """
    vectorizer = TfidfVectorizer()
    # fit_transform learns the vocabulary and builds the matrix in one step. The
    # former get_feature_names() call was unused and no longer exists in
    # scikit-learn >= 1.2, where it raised AttributeError.
    tfidf_matrix = vectorizer.fit_transform(content)

    # Kept for compatibility with existing cells: this changes NumPy's global print
    # settings so arrays are printed in full instead of being summarised.
    np.set_printoptions(threshold=np.inf)
    return tfidf_matrix.toarray()
    

In [9]:
# Build the sentence strings for the first article and display their TF-IDF matrix.
article_modified = convert_list_to_string(article_preprocessed)
calculate_TF_IDF(article_modified)



array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.51618949, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.45459347,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [10]:
def calculate_each_sentence_score(tf_idf_matrix):
    """Rank sentences by the sum of their TF-IDF weights.

    Parameters
    ----------
    tf_idf_matrix : iterable of array-like
        One row of TF-IDF weights per sentence.

    Returns
    -------
    dict
        Maps each row sum to the index of its row, ordered from the highest
        sum to the lowest.

    Notes
    -----
    The row sum is the dictionary key, so rows with identical sums share one
    entry and only the last such row index is kept.
    """
    index_by_score = {np.sum(row): position for position, row in enumerate(tf_idf_matrix)}
    ranked_pairs = sorted(index_by_score.items(), reverse=True)   # highest score first
    return dict(ranked_pairs)

In [11]:
def summary_generation(article, score_dict_reversed, number_of_sentences):
    """Build an extractive summary from the best-scoring sentences.

    Parameters
    ----------
    article : str
        Original article text.
    score_dict_reversed : dict
        Maps score -> sentence index, already ordered from best to worst
        (as returned by calculate_each_sentence_score).
    number_of_sentences : int
        How many entries to take from the front of ``score_dict_reversed``.

    Returns
    -------
    str
        The selected sentences in article order, joined with ". " and
        terminated by ".".

    Raises
    ------
    IndexError
        If a selected index is beyond the number of sentences found in ``article``.
    """
    # Use the same boundaries as preprocessing() (blank line, or a full stop not
    # followed by a digit). Splitting on every "." cut decimals such as "1.13" in
    # two and shifted every later index.
    segments = re.split(r"\n\n|\.(?!\d)", article)
    # Blank segments are dropped, since they can never carry a score.
    # NOTE(review): preprocessing() also drops sentences made only of stop words or
    # punctuation; such a sentence would still shift indices here -- confirm with
    # the data.
    lines = [segment.strip() for segment in segments if segment.strip()]

    # Take the best-scoring entries, then restore article order.
    top_scored = islice(score_dict_reversed.items(), number_of_sentences)
    selected_indices = sorted(index for _score, index in top_scored)

    return ". ".join(lines[index] for index in selected_indices) + "."

In [12]:
# End-to-end demo of the TF-IDF-only summariser on the first article:
# tokenised sentences -> TF-IDF matrix -> per-sentence scores -> 8-sentence summary.
article_modified = convert_list_to_string(article_preprocessed)
for sentence in article_modified:
    print(sentence)
tf_idf_matrix = calculate_TF_IDF(article_modified)
dict_scores = calculate_each_sentence_score(tf_idf_matrix)
output = summary_generation(contents, dict_scores, 8)
print(output)

ad sale boost time warner profit
quarterli profit us media giant timewarn jump 76% $1.13bn (£600m) three month december, $639m year-earli
the firm, one biggest investor google, benefit sale high-spe internet connect higher advert sale
timewarn said fourth quarter sale rose 2% $11.1bn $10.9bn
it profit buoy one-off gain offset profit dip warner bros, less user aol
time warner said friday own 8% search-engin googl
but internet business, aol, mix fortun
it lost 464,000 subscrib fourth quarter profit lower preced three quarter
however, compani said aol underli profit except item rose 8% back stronger internet advertis revenu
it hope increas subscrib offer onlin servic free timewarn internet custom tri sign aol exist custom high-spe broadband
timewarn also restat 2000 2003 result follow probe us secur exchang commiss (sec), close conclud
time warner fourth quarter profit slightli better analyst expect
but film divis saw profit slump 27% $284m, help box-offic flop alexand catwoman, sharp con

In [13]:
### Feature 2

def sentence_length(content):
    """Return each sentence's length relative to the longest sentence.

    Parameters
    ----------
    content : list
        Sentences; anything supporting ``len`` works (token lists or strings).

    Returns
    -------
    list[float]
        ``len(sentence) / max_length`` for every sentence, in input order.
        If every sentence is empty all values are 0.0; an empty input gives
        an empty list.
    """
    lengths = [len(sentence) for sentence in content]
    longest = max(lengths, default=0)
    if longest == 0:
        # Nothing to normalise by; previously this raised ZeroDivisionError.
        return [0.0] * len(lengths)
    return [length / longest for length in lengths]

In [14]:
# Feature 2 for the first article: token count of each sentence divided by the
# token count of the longest sentence.
sentence_length(article_preprocessed)

[0.25,
 0.625,
 0.5833333333333334,
 0.375,
 0.5416666666666666,
 0.3333333333333333,
 0.25,
 0.4583333333333333,
 0.625,
 0.75,
 0.625,
 0.375,
 1.0,
 0.5416666666666666,
 0.7083333333333334,
 0.6666666666666666,
 0.4583333333333333,
 0.4166666666666667,
 0.5833333333333334,
 0.7083333333333334,
 0.375]

In [15]:
### Feature 3

def numerical_data(content):
    """Measure how much numeric content each sentence carries.

    Parameters
    ----------
    content : list[str]
        Sentences as strings.

    Returns
    -------
    list[float]
        For each sentence: the number of digit runs divided by the character
        length of the sentence, both measured after removing "," and ".".
        A sentence that is empty after that removal scores 0.0.
    """
    numerical_data_feature = []
    for sentence in content:
        # Removing separators makes "1.13" or "464,000" count as a single number.
        stripped = sentence.replace(",", "").replace(".", "")
        if not stripped:
            # Only separators (or nothing) left; previously a ZeroDivisionError.
            numerical_data_feature.append(0.0)
            continue
        digit_runs = re.findall(r'\d+', stripped)
        numerical_data_feature.append(len(digit_runs) / len(stripped))
    return numerical_data_feature

In [16]:
# Feature 3 for the first article, followed by the sentence strings it was computed from.
print(numerical_data(convert_list_to_string(article_preprocessed)))
print(convert_list_to_string(article_preprocessed))

[0.0, 0.0392156862745098, 0.0, 0.05454545454545454, 0.0, 0.02040816326530612, 0.0, 0.013888888888888888, 0.01020408163265306, 0.0, 0.021052631578947368, 0.0, 0.013071895424836602, 0.05813953488372093, 0.0, 0.02040816326530612, 0.0, 0.017857142857142856, 0.012048192771084338, 0.0, 0.0]
['ad sale boost time warner profit', 'quarterli profit us media giant timewarn jump 76% $1.13bn (£600m) three month december, $639m year-earli', 'the firm, one biggest investor google, benefit sale high-spe internet connect higher advert sale', 'timewarn said fourth quarter sale rose 2% $11.1bn $10.9bn', 'it profit buoy one-off gain offset profit dip warner bros, less user aol', 'time warner said friday own 8% search-engin googl', 'but internet business, aol, mix fortun', 'it lost 464,000 subscrib fourth quarter profit lower preced three quarter', 'however, compani said aol underli profit except item rose 8% back stronger internet advertis revenu', 'it hope increas subscrib offer onlin servic free timewar

In [17]:
# Load article 1 again together with its reference summary; `original` and
# `summarized` feed generate_Y_labels() below.
with open("original (" + str(1) +").txt", "r", encoding="utf8") as file:
    original = file.read()
    article_preprocessed = preprocessing(original)

with open("summarized (" + str(1) +").txt", "r", encoding="utf8") as file:
    summarized = file.read()
#     print(summarized)

In [48]:
def _split_into_clean_sentences(text, boundary_pattern):
    """Split ``text`` on ``boundary_pattern``, strip quotes and leading whitespace, drop empties."""
    cleaned_sentences = []
    for piece in re.split(boundary_pattern, text):
        cleaned = piece.replace('"', '').replace("'", '').lstrip()
        if cleaned:
            cleaned_sentences.append(cleaned)
    return cleaned_sentences


def generate_Y_labels(original, summarized):
    """Label every sentence of an article by whether the reference summary contains it.

    Parameters
    ----------
    original : str
        Full article text. Sentences end at a blank line or at a full stop
        that is not followed by a digit.
    summarized : str
        Reference summary text. Sentences end at a full stop that is not
        followed by a digit.

    Returns
    -------
    tuple[list[int], list[str]]
        ``(Y_list, sentences)``: ``sentences`` are the cleaned article sentences
        (quote characters and leading whitespace removed, empty pieces dropped)
        and ``Y_list[i]`` is 1 when ``sentences[i]`` matches a cleaned summary
        sentence exactly, else 0.
    """
    original_sentences = _split_into_clean_sentences(original, r"\n\n|\.(?!\d)")
    # Only exact membership is needed, so a set avoids rescanning the summary
    # for every article sentence.
    summary_sentences = set(_split_into_clean_sentences(summarized, r"\.(?!\d)"))

    Y_list = [1 if sentence in summary_sentences else 0 for sentence in original_sentences]
    return Y_list, original_sentences

In [49]:
# Note: Y here is the whole (labels, sentences) tuple returned by generate_Y_labels,
# not just the label list.
Y = generate_Y_labels(original, summarized)
print(Y)

([0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0], ['Ad sales boost Time Warner profit', 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier', 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales', 'TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn', 'Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL', 'Time Warner said on Friday that it now owns 8% of search-engine Google', 'But its own internet business, AOL, had has mixed fortunes', 'It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters', 'However, the company said AOLs underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues', 'It hopes to increase subscribers by offering t

In [50]:
def generate_X_labels(preprocessed_artcile):
    """Assemble the feature matrix for one article (despite the name, these are inputs, not labels).

    Parameters
    ----------
    preprocessed_artcile : list[list[str]]
        Tokenised sentences, as returned by preprocessing().

    Returns
    -------
    numpy.ndarray
        One row per sentence with three columns:
        0. sum of the sentence's TF-IDF weights, divided by the largest such sum;
        1. relative sentence length from sentence_length();
        2. numeric-content ratio from numerical_data().
    """
    sentences_as_text = convert_list_to_string(preprocessed_artcile)

    length_column = sentence_length(preprocessed_artcile)
    numeric_column = numerical_data(sentences_as_text)

    # Collapse each TF-IDF row to one score, then scale by the best-scoring sentence.
    row_sums = [np.sum(row) for row in calculate_TF_IDF(sentences_as_text)]
    tfidf_column = row_sums / max(row_sums)

    return np.column_stack((tfidf_column, length_column, numeric_column))

In [21]:
# X = generate_X_labels(article_preprocessed)
# print(X)

In [22]:
# m = len(Y)  # training set size
# m2 = len(X)
# print(m)
# print(m2)
nn_input_dim = 3  # input layer dimensionality: three features per sentence (TF-IDF score, sentence length, numeric ratio)
nn_output_dim = 1  # output layer dimensionality (we have one output)

# Gradient descent parameters
alpha = 0.2  # learning rate for gradient descent
# print(Y)
# print(X)

In [23]:
def sigmoid(x):
    """Evaluate the logistic function 1 / (1 + e^-x).

    Works element-wise on NumPy arrays as well as on scalars, because the
    exponential is taken with ``np.exp``.
    """
    return 1 / (1 + np.exp(-x))


In [24]:
# Build the training set from articles 1..30.
# X_matrix collects one feature row per sentence, Y the matching 0/1 labels and
# `sentences` the cleaned sentence text returned alongside the labels.
X_matrix = []
X = []
Y = []
sentences = []
for i in range (1, 31):
    with open("original (" + str(i) +").txt", "r", encoding="utf8") as file:
        original_test = file.read()
        article_preprocessed_test = preprocessing(original_test)
    
    with open("summarized (" + str(i) +").txt", "r", encoding="utf8") as file:
        summarized_text = file.read()
    
    # Features come from the tokenised article, labels from the raw text.
    X_i = generate_X_labels(article_preprocessed_test)
    Y_i, original_list_no_first_space = generate_Y_labels(original_test, summarized_text)
    Y.extend(Y_i)
    X_matrix.extend(X_i)
    sentences.extend(original_list_no_first_space)

# Convert every feature row (a NumPy array) to a plain list before stacking.
for x in X_matrix:
    X.append(x.tolist())
    
X = np.matrix(X)

# build_model() reads X, Y and m as globals.
# NOTE(review): X rows and Y labels are produced by two independent sentence
# splitters; the two lengths printed below should be equal -- confirm.
m = len(X)
print(len(X))
print(len(Y))
# print(type(x))


# #     predicton = predict(model, X_test)
# #     print(predicton)

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier
TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn
Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL
It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters
However, the company said AOLs underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues
Time Warners fourth quarter profits were slightly better than analysts expectations
For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn
For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins
The dollar has hit its highest level against the euro in almost three months after the Federal Reserve 

The Producers, starring Nathan Lane and Lee Evans, is up for best musical at the ceremony at the National Theatre
Ferris - best known for her television roles in programmes such as The Darling Buds of May - has made the shortlist for her role in Notes on Falling Leaves, at the Royal Court Theatre
Meanwhile, Richard Griffiths, who plays Hector in The History Boys at the National Theatre, will battle it out for the best actor award with Douglas Hodge (Dumb Show) and Stanley Townsend (Shining City)
Festen is also shortlisted in the best designer category where Ian MacNeil, Jean Kalman and Paul Arditti will be up against Hildegard Bechtler, for Iphigenia at Aulis, and Paul Brown, for False Servant
The plot of an international bestseller that thousands of readers are likely to receive as a Christmas present is laughable, a clergyman has said
Despite enjoying Dan Browns conspiracy theory, the Bishop said there was a lack of evidence to back up its claims
A great deal of credible evidence pro

In [25]:
def build_model(nn_hdim, num_passes=20000, print_loss=False):
    """Train a one-hidden-layer network (tanh hidden units, sigmoid output) by batch gradient descent.

    The training data and hyper-parameters are read from notebook globals:
    ``X`` (feature rows), ``Y`` (labels), ``m`` (number of training rows),
    ``alpha`` (learning rate), ``nn_input_dim`` and ``nn_output_dim``; the
    output activation is the notebook's ``sigmoid``.

    Parameters
    ----------
    nn_hdim : int
        Number of hidden units.
    num_passes : int, optional
        Number of full passes over the training set.
    print_loss : bool, optional
        When True, print the mean loss every 1000 passes.

    Returns
    -------
    dict
        ``{'W1', 'b1', 'W2', 'b2'}`` holding the trained parameters. With
        ``num_passes=0`` these are the initial, untrained parameters.

    Notes
    -----
    On the final pass the true label and the prediction of every training
    row are printed.
    """
    # Fixed seed so the initial weights, and therefore training, are reproducible.
    np.random.seed(0)
    W1 = np.random.randn(nn_hdim, nn_input_dim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((nn_hdim, 1))
    W2 = np.random.randn(nn_output_dim, nn_hdim) / np.sqrt(nn_hdim)
    b2 = np.zeros((nn_output_dim, 1))

    # Keeps log() away from 0 if the sigmoid saturates to exactly 0 or 1.
    eps = 1e-12

    for i in range(0, num_passes):
        # Gradient and cost accumulators for this pass.
        DW1 = 0
        DW2 = 0
        Db1 = 0
        Db2 = 0
        cost = 0

        for j in range(0, m):
            a0 = X[j, :].reshape(-1, 1)  # Every training example is a column vector.
            y = Y[j]

            # Forward propagation.
            z1 = np.dot(W1, a0) + b1
            a1 = np.tanh(z1)
            z2 = np.dot(W2, a1) + b2
            a2 = sigmoid(z2)

            if i == num_passes - 1:
                print('True value: %f, got: %f'% (y, a2))

            # Cross-entropy loss on a clipped activation, so a saturated output
            # gives a large finite loss instead of NaN.
            a2_safe = np.clip(a2, eps, 1 - eps)
            cost_j = -1 * (np.log(a2_safe) * y + (1 - y) * np.log(1 - a2_safe))

            # Back propagation. For sigmoid + cross-entropy,
            # (-y/a2 + (1-y)/(1-a2)) * a2 * (1-a2) simplifies to a2 - y, which
            # avoids dividing by zero when a2 is exactly 0 or 1.
            dz2 = a2 - y
            dW2 = np.dot(dz2, a1.T)
            db2 = dz2

            da1 = np.dot(dz2, W2).T
            dz1 = np.multiply(da1, 1 - np.square(a1))   # tanh'(z1) = 1 - a1^2
            dW1 = np.dot(dz1, a0.T)
            db1 = dz1

            DW1 += dW1
            DW2 += dW2
            Db2 += db2
            Db1 += db1
            cost += cost_j

        # Average over the training set.
        DW1 /= m
        DW2 /= m
        Db1 /= m
        Db2 /= m
        cost /= m

        # Gradient descent step.
        W1 -= alpha * DW1
        b1 -= alpha * Db1
        W2 -= alpha * DW2
        b2 -= alpha * Db2

        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, cost))

    return {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

In [26]:
# Forward pass of the trained network; returns raw sigmoid activations (no 0.5 threshold is applied).
def predict(model, x):
    """Run the network forward on a batch of examples.

    Parameters
    ----------
    model : dict
        Parameters under the keys 'W1', 'b1', 'W2' and 'b2'.
    x : array-like
        Examples to score; it is transposed before use, so each row of ``x``
        is treated as one example.

    Returns
    -------
    The output-layer sigmoid activations, one column per example. Values are
    not rounded to 0/1.
    """
    W1, b1, W2, b2 = (model[key] for key in ('W1', 'b1', 'W2', 'b2'))

    hidden_activation = np.tanh(np.dot(W1, x.T) + b1)
    output_activation = sigmoid(np.dot(W2, hidden_activation) + b2)
    return output_activation

In [27]:
# Train with 8 hidden units; `model` is read as a global by test().
model = build_model(nn_hdim=8, num_passes=10001, print_loss=True)

Loss after iteration 0: 0.788027
Loss after iteration 1000: 0.621704
Loss after iteration 2000: 0.613705
Loss after iteration 3000: 0.609747
Loss after iteration 4000: 0.607805
Loss after iteration 5000: 0.606754
Loss after iteration 6000: 0.606078
Loss after iteration 7000: 0.605563
Loss after iteration 8000: 0.605124
Loss after iteration 9000: 0.604730
True value: 0.000000, got: 0.078829
True value: 1.000000, got: 0.500359
True value: 0.000000, got: 0.477612
True value: 1.000000, got: 0.382963
True value: 1.000000, got: 0.466114
True value: 0.000000, got: 0.258032
True value: 0.000000, got: 0.079332
True value: 1.000000, got: 0.422188
True value: 1.000000, got: 0.491781
True value: 0.000000, got: 0.496465
True value: 0.000000, got: 0.494049
True value: 1.000000, got: 0.317869
True value: 0.000000, got: 0.493070
True value: 1.000000, got: 0.496553
True value: 0.000000, got: 0.485680
True value: 1.000000, got: 0.501338
True value: 0.000000, got: 0.419537
True value: 0.000000, got: 0.39

In [57]:
def test(file_number, compression_ratio):
    """Summarise one article with the trained network and print precision/recall.

    Reads ``original (<file_number>).txt`` and ``summarized (<file_number>).txt``,
    scores every sentence with the global ``model``, prints the selected
    sentences (best score first) and compares the selection with the labels
    from generate_Y_labels().

    Parameters
    ----------
    file_number : int
        Number used in the two file names.
    compression_ratio : float
        Fraction of the article's sentences to keep (rounded up).

    Returns
    -------
    None
        All results are printed.
    """
    with open("original (" + str(file_number) +").txt", "r", encoding="utf8") as file:
        original_test = file.read()
    article_preprocessed_test = preprocessing(original_test)

    with open("summarized (" + str(file_number) +").txt", "r", encoding="utf8") as file:
        summarized_text = file.read()

    X_test = generate_X_labels(article_preprocessed_test)
    predicton = predict(model, X_test)
    print(predicton)
    Y_test, _ = generate_Y_labels(original_test, summarized_text)
    print(Y_test)
    print(len(Y_test))

    # Sentence indices ordered from highest to lowest predicted score.
    highest = np.argsort(predicton[0]) [::-1]
    print(highest)

    sentences = [x for x in re.split(r"\n\n|\.(?!\d)", original_test) if x]

    # A closing quote that follows a full stop lands at the start of the next
    # piece; move it back to the sentence it closes. Start at 1: the first piece
    # has no predecessor (index -1 would silently modify the LAST sentence).
    for i in range(1, len(sentences)):
        if sentences[i][0] == '"':
            sentences[i] = sentences[i][1:]
            sentences[i - 1] += '"'

    # Drop pieces that hold only quotes/whitespace, mirroring generate_Y_labels(),
    # so positions here line up with the positions in Y_test.
    sentences = [s for s in sentences if s.replace('"', '').replace("'", '').strip()]

    num_sentences_summarized = max(0, math.ceil(compression_ratio * len(sentences)))

    # Ignore ranked indices with no matching sentence instead of raising IndexError.
    # NOTE(review): that can only happen if preprocessing() and the split above
    # disagree on the number of sentences.
    ranked_indices = [int(index) for index in highest if index < len(sentences)]
    output_indices = ranked_indices[:num_sentences_summarized]
    output_sentences = [sentences[index] for index in output_indices]

    output_sentences = '. '.join(output_sentences)
    output_sentences += '.'
    print(output_sentences)

    Y_true_indices = [i for i, x in enumerate(Y_test) if x == 1]
    selected = set(output_indices)

    correct = sum(1 for true_index in Y_true_indices if true_index in selected)
    missed = len(Y_true_indices) - correct
    wrong = len(output_indices) - correct

    # Guard the ratios: nothing selected, or no positive labels, gives 0.0
    # rather than ZeroDivisionError.
    precision = correct / (correct + wrong) if output_indices else 0.0
    recall = correct / (correct + missed) if Y_true_indices else 0.0
    print('correct: %f , wrong: %f , missed: %f' % (correct, wrong, missed))
    print('Precision for document: %i is : %f' % (file_number, precision))
    print('Recall for document: %i is : %f \n' % (file_number, recall))    

In [58]:
# highest = np.argsort(predicton[0]) [::-1]
# print(highest)
# lines = original_test.split('.')
# output = []
# for i in range (0, 6):
#     output.append(lines[highest[i]])
# print(output)
# Evaluate the trained model on four articles, keeping 35% of the sentences in each.
test(1, 0.35)
test(16, 0.35)
test(17, 0.35)
test(8, 0.35)

[[0.07882752 0.50035964 0.47761027 0.38296827 0.4661135  0.25803492
  0.07932993 0.42218952 0.49178138 0.49646511 0.49404899 0.31787093
  0.49307375 0.49655514 0.4856775  0.50133849 0.41953713 0.39224178
  0.48359601 0.49042407 0.25670863]]
[0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0]
21
[15  1 13  9 10 12  8 19 14 18  2  4  7 16 17  3 11  5 20  6  0]
 For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins. Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.  It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband.  TimeWarner also has to restate 2000 and 2003 re