In [1]:
import sys
import os
import re
import string
import numpy as np
import math
import nltk
import io
import csv
import pandas as pd
import trax
import textwrap
wrapper = textwrap.TextWrapper(width=70)


from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import islice
from rouge import Rouge
from trax import layers as tlayer
from trax.fastmath import numpy as trax_np
from trax.supervised import training





In [2]:
# nltk.download('stopwords')
nltk.download('stopwords')  # uncomment these lines once they are not downloaded
nltk.download('punkt')  # uncomment these lines once they are not downloaded

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mohammedzaki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/mohammedzaki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Extractive Summary

In [3]:
def preprocessing(article):
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    article_preprocessed = []
    sentences = sent_tokenize(article)
    for sentence in sentences:
        sentence_preprocessed = []
        sentence = re.sub(r"[^a-zA-Z\s]+", "", sentence)
        words = word_tokenize(sentence)
        for word in words:
            if (word not in stopwords_english and word not in string.punctuation):
                word_stemmed = stemmer.stem(word)  
                sentence_preprocessed.append(word_stemmed)
        if sentence_preprocessed:
            article_preprocessed.append(sentence_preprocessed)
            
    return article_preprocessed


In [4]:
def convert_list_to_string(sentences):  # converts list of lists to list of strings
    sentences_modified = []   # list of strings
    for sentence in sentences:
        sentence_modified = ' '.join(sentence)
        sentences_modified.append(sentence_modified)
    return sentences_modified

In [5]:
### Feature 1

def calculate_TF_IDF(content):
    flat_words = [word for sent in content for word in sent]
    words_set = set(flat_words)
    words_num = len(words_set)
    tf = pd.DataFrame(np.zeros((len(content), words_num)), columns = list(words_set))
    for i in range (len(content)):
        for w in content[i]:
                      tf[w][i] += 1/len(content[i])
                      
    idf = {}
    
    for word in words_set:
        num_docs = 0
        for i in range(len(content)):
            if word in content[i]:
                num_docs += 1
                
        idf[word] = np.log10(len(content) / num_docs)
        
    tf_idf = np.zeros(len(content))
    
    for i in range (len(content)):
        for word in content[i]:
            tf_idf[i] += tf[word][i] * idf[word]
            
    tf_idf = tf_idf/max(tf_idf)  # might be commented (this normalizes the tf-idf)
            
    return tf_idf

In [6]:
### Feature 2

def sentence_length(article_preprocessed):
    article_preprocessed = convert_list_to_string(article_preprocessed)
    max_length = 0
    for sentence in article_preprocessed:
        if len(sentence.split()) > max_length:
            max_length = len(sentence.split())
            
    sentence_length_feature = []
    for sentence in article_preprocessed:
        sentence_length_feature.append(len(sentence.split()) / max_length)


    return sentence_length_feature

In [7]:
def generate_X_labels(article_preprocessed):
    # feature 1 (tf_idf)
    tf_idf_score_feature = calculate_TF_IDF(article_preprocessed)
    
    
    # feature 2 (sentence_length)
    sentence_length_feature = sentence_length(article_preprocessed)
    
    matrix = np.column_stack((tf_idf_score_feature, sentence_length_feature))

    return matrix

In [8]:
def generate_Y_labels(original, summarized):
    Y_list = []
    original_sentences = sent_tokenize(original)
    original_sentences[0] = original_sentences[0][1:] # to remove the \n
    summarized_sentences = sent_tokenize(summarized)
    
    for original_sentence in original_sentences:
        added = 0
        for summarized_sentence in summarized_sentences:
            if original_sentence in summarized_sentence:
                Y_list.append(1)
                added = 1
                break
        if added == 0:
            Y_list.append(0)
    
    return Y_list, original_sentences
    

In [9]:
X_matrix = []
X = []
Y = []
sentences = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range (1, 51):   # loading business articles
        article_file = io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf-8-sig')
        article_file.readline()
        article = article_file.read()
        article_file.close()

        summarized_file = io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf-8-sig')
        summarized = summarized_file.read()
        summarized_file.close()

        article_preprocessed = preprocessing(article)
        X_i = generate_X_labels(article_preprocessed)
        Y_i, original_list_no_first_space = generate_Y_labels(article, summarized)

        if(len(X_i) != len(Y_i)):
            print('Error! features and labels are not equal in length')

        Y.extend(Y_i)
        X_matrix.extend(X_i)
        sentences.extend(original_list_no_first_space)
    

for x in X_matrix:
    X.append(x.tolist())
    
X = np.matrix(X)

m = len(X)

print(len(X))
print(len(Y))

4179
4179


In [10]:
print(type(X))
print(X[: 10, :])
print(type(Y))
print(Y[: 10])

<class 'numpy.matrix'>
[[0.73203128 0.54545455]
 [0.87185081 0.63636364]
 [0.69813911 0.36363636]
 [0.75779078 0.59090909]
 [0.74727107 0.31818182]
 [0.72061857 0.27272727]
 [0.78105634 0.45454545]
 [0.69638967 0.63636364]
 [0.8741924  0.81818182]
 [0.78987147 0.59090909]]
<class 'list'>
[1, 0, 1, 1, 0, 0, 1, 1, 0, 0]


In [11]:
nn_input_dim = 2 # input layer size (we have two input features)
nn_output_dim = 1  # output layer size (we have one output)

# Gradient descent parameters
alpha = 0.1  # learning rate for gradient descent

In [12]:
def sigmoid(x):
    sig = (1/(1+np.exp(-x)))
    return sig


In [14]:
def build_model(nn_hdim, num_passes=20000, print_loss=False):
    np.random.seed(0)
    W1 = np.random.randn(nn_hdim, nn_input_dim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((nn_hdim, 1))
    W2 = np.random.randn(nn_hdim, nn_hdim) / np.sqrt(nn_input_dim)
    b2 = np.zeros((nn_hdim, 1))
    W3 = np.random.randn(nn_hdim, nn_hdim) / np.sqrt(nn_input_dim)
    b3 = np.zeros((nn_hdim, 1))
    W4 = np.random.randn(nn_output_dim, nn_hdim) / np.sqrt(nn_hdim)
    b4 = np.zeros((nn_output_dim, 1))

    # This is what we return at the end
    model = {}

    # Batch Gradient descent (We accumulate the loss for each training point before updating the weights)
    # For each iteration:
    for i in range(0, num_passes):
        DW1 = 0
        DW2 = 0
        DW3 = 0
        DW4 = 0
        Db1 = 0
        Db2 = 0
        Db3 = 0
        Db4 = 0
        cost = 0
        # Loop on every training example...
        for j in range(0, m):
            a0 = X[j, :].reshape(-1, 1)  # Every training example is a column vector.
            y = Y[j]
            
            # TODO 2: Apply forward propagation on every training example a0 (a column vector 2x1) with its
            # corresponding label y. It is required to compute z1, a1, z2, and a2
            #----------------------------------------------------------------------------------------------
            # Forward propagation
            z1 = np.dot(W1 , a0 )+ b1
            a1 = np.tanh(z1)
            z2 = np.dot(W2 , a1 )+ b2
            a2 = np.tanh(z2)
            z3 = np.dot(W3 , a2 )+ b3
            a3 = np.tanh(z3)
            z4 = np.dot(W4 , a3) + b4
            a4 = sigmoid(z4)
            #----------------------------------------------------------------------------------------------

            # TODO 3: Compute the cost/loss function for every training example (Hint: use np.log)
            # ---------------------------------------------------------------------------------------------
            cost_j = -1 * ((np.log(a4) * y + (1-y)* np.log(1-a4)))
            # ---------------------------------------------------------------------------------------------

            # TODO 4: Derive the equations of backpropagation to find dW2, db2, dW1, and db1.
            # Hint: Check the dimensions at each step. 
            # Hint: For element-wise multiplication use *, for matrix multiplication use @
            # Example: y = A * B performs element wise multiplication 
            #          y = A @ B performs matrix multiplication
            # ---------------------------------------------------------------------------------------------
            da4 =  ( -y/a4  + (1-y)/(1-a4) )
            dz4 =  da4 * a4 * ( 1 - a4)
            dW4 = np.dot(dz4 , a3.T)
            db4 = dz4
            
            da3 =  np.dot(W4.T, dz4)
            dz3 = np.multiply(da3 , 1 - np.square(a3) )
            dW3 = np.dot(dz3 , a2.T )
            db3 = dz3
            
            da2 =  np.dot(W3.T, dz3)
            dz2 = np.multiply(da2 , 1 - np.square(a2) )
            dW2 = np.dot(dz2 , a1.T )
            db2 = dz2

            da1 =  np.dot(W2.T, dz2)
            dz1 = np.multiply(da1 , 1 - np.square(a1) )
            dW1 = np.dot(dz1 , a0.T )
            db1 = dz1
            
            # ---------------------------------------------------------------------------------------------
            
            # Accumulating the sum of dW1, db1, dW2, db2 and cost_j into the variables DW1, Db1, DW2, Db2 and cost
            # for all training set. 
            DW1 += dW1
            DW2 += dW2
            DW3 += dW3
            DW4 += dW4
            Db4 += db4
            Db3 += db3
            Db2 += db2
            Db1 += db1
            cost += cost_j
        
        # Averaging DW1, DW2, Db1, Db2 and cost over the m training examples. 
        DW1 /= m
        DW2 /= m
        DW3 /= m
        DW4 /= m
        Db1 /= m
        Db2 /= m
        Db3 /= m
        Db4 /= m
        cost /= m

        # TODO 5: Perform the gradient descent parameter update.
        # ---------------------------------------------------------------------------------------------------
        # Gradient descent parameter update
        W1 -= alpha * DW1
        b1 -= alpha * Db1
        W2 -= alpha * DW2
        b2 -= alpha * Db2
        W3 -= alpha * DW3
        b3 -= alpha * Db3
        W4 -= alpha * DW4
        b4 -= alpha * Db4
        # ---------------------------------------------------------------------------------------------------

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2, 'W3': W3, 'b3': b3, 'W4': W4, 'b4': b4}

        # Optionally print the loss.
        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, cost))

    return model


In [13]:
# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    a0 = x.T
    z1 = np.dot(W1 , a0) + b1
    a1 = np.tanh(z1)
    z2 = np.dot(W2 , a1) + b2
    a2 = sigmoid(z2)
    prediction = a2
    
    return prediction

# def predict(model, x):
#     W1, b1, W2, b2, W3, b3, W4, b4 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3'], model['W4'], model['b4']
#     a0 = x.T
    
#     # TODO 6 (aka TODO 2): Apply forward propagation on every test example a0 (a column vector 2x1) with its
#     #  corresponding label y. It is required to compute z1, a1, z2, and a2  (SAME AS TODO2).
#     # -----------------------------------------------------------------------------------------------
#     z1 = np.dot(W1 , a0 )+ b1
#     a1 = np.tanh(z1)
#     z2 = np.dot(W2 , a1 )+ b2
#     a2 = np.tanh(z2)
#     z3 = np.dot(W3 , a2 )+ b3
#     a3 = np.tanh(z3)
#     z4 = np.dot(W4 , a3) + b4
#     a4 = sigmoid(z4)
#     # ------------------------------------------------------------------------------------------------
#     # Applying a threshold of 0.5 (i.e. predictions greater than 0.5 are mapped to 1, and 0 otherwise)
#     # prediction = np.round(a2)
#     prediction = a4    
#     return prediction

In [32]:
model = build_model(nn_hdim= 8, num_passes = 10001, print_loss=True)

Loss after iteration 0: 0.670913
Loss after iteration 1000: 0.566402
Loss after iteration 2000: 0.563649
Loss after iteration 3000: 0.563039
Loss after iteration 4000: 0.562859
Loss after iteration 5000: 0.562725
Loss after iteration 6000: 0.562606
Loss after iteration 7000: 0.562495
Loss after iteration 8000: 0.562387
Loss after iteration 9000: 0.562281
Loss after iteration 10000: 0.562172


In [16]:
print(model)

NameError: name 'model' is not defined

In [14]:
# # print(model)
# model tf-idf not normalized
# model = {'W1': np.array([[ 1.38785383,  0.49082933],
#        [ 0.55641748,  1.62806027],
#        [ 1.85376828, -0.3225326 ],
#        [ 1.78168709,  0.05507265],
#        [ 1.5035727 ,  0.15089058],
#        [-0.21027833,  1.92211576],
#        [ 0.46820238,  0.16111129],
#        [ 0.84196808,  0.12407808]]), 'b1': np.array([[ 0.38819622],
#        [-0.02713244],
#        [-0.8429403 ],
#        [-0.91256074],
#        [-0.66510711],
#        [ 0.01664907],
#        [ 0.03959506],
#        [-0.22607371]]), 'W2': np.array([[ 0.6266244 ,  0.50983672, -1.1677148 , -1.47645926, -1.39277239,
#          1.53280975,  0.17978621, -0.50006151]]), 'b2': np.array([[0.58550391]])}

# model = {'W1': np.array([[ 1.18336695,  0.1250435 ], # 3 layers
#        [ 0.38733117,  1.8530943 ],
#        [ 1.75886377, -0.58510996],
#        [ 0.54240451, -0.05259935],
#        [-0.09248666,  0.2141493 ],
#        [ 0.0432581 ,  0.9329085 ],
#        [ 0.66859945,  0.25386436],
#        [ 0.5554128 ,  0.36237266]]), 'b1': np.array([[-0.28405656],
#        [-0.40425758],
#        [-0.99870768],
#        [ 0.05080746],
#        [-0.09605534],
#        [-0.26051699],
#        [-0.18459905],
#        [-0.28472492]]), 'W2': np.array([[ 1.13457184, -0.20822159,  0.38311448, -0.51971462, -1.7927709 ,
#          0.47739742,  0.70361151, -0.44614564],
#        [ 1.63177288, -1.0197636 , -0.08924496, -0.14037122,  1.09361726,
#          1.04981611,  0.09540724,  0.2347938 ],
#        [-0.67150498, -1.45686798, -0.20758122,  0.09659014,  0.85981688,
#          0.80570778, -0.296603  , -0.22930296],
#        [-0.8237228 , -1.15021352, -1.24466209,  1.34648287, -0.36599564,
#         -0.38097044, -0.92866868,  0.50709689],
#        [-1.05799442, -0.01422579, -0.90377796,  0.28580989, -0.35451199,
#         -0.7717647 , -0.02472574,  0.26142491],
#        [ 0.01077594,  0.59710575, -0.83069778, -0.40593486, -0.49098818,
#         -0.20682705, -0.76479984, -1.38033905],
#        [ 0.03789108, -0.00967381, -1.58485507,  0.14915192, -0.62770378,
#          0.15492018,  0.36470344, -0.01501329],
#        [ 0.78647832, -1.02272337,  0.42132782, -0.45981076, -0.62587553,
#         -0.50757541, -0.22120812,  0.02481316]]), 'b2': np.array([[-0.05359941],
#        [ 0.12717962],
#        [-0.061023  ],
#        [-0.05207335],
#        [ 0.31889328],
#        [ 0.25685478],
#        [ 0.1398867 ],
#        [-0.04797326]]), 'W3': np.array([[-0.80349617,  0.71868534,  0.20718625, -1.20396728,  1.00463432,
#          1.57134532,  1.02363706, -0.37870051],
#        [-0.89039626,  0.50355204, -0.02357477,  1.08788051,  0.29202038,
#          0.60301908,  0.07305096,  0.60210272],
#        [ 0.03802195,  1.28299045,  0.06325746,  0.27166014,  1.30692966,
#         -1.09726151, -1.02922169,  0.86082546],
#        [-0.90207611,  1.29694031, -0.26202875, -0.47270786,  1.46564552,
#          1.30205973,  1.4459055 ,  0.57549995],
#        [-0.66125529,  1.43636039, -0.21285799,  0.59702511,  0.6944864 ,
#         -0.04123769,  0.48235273,  0.71124213],
#        [ 0.263836  , -0.77833376,  0.21475251,  0.94851392, -0.48556096,
#         -0.10818819, -0.30816457,  1.30529932],
#        [ 0.487098  ,  0.38296521, -0.62201678,  0.30699157, -0.51424632,
#          0.09400846, -0.3431704 ,  0.40476813],
#        [ 0.39364728, -0.2458679 ,  0.37127184, -0.72310426, -1.03654804,
#          0.2336883 ,  0.0376808 ,  0.47617952]]), 'b3': np.array([[ 0.14588719],
#        [-0.32320868],
#        [ 0.02159727],
#        [-0.02307498],
#        [ 0.08842675],
#        [ 0.00334234],
#        [ 0.09982744],
#        [-0.13245038]]), 'W4': np.array([[ 1.27121312,  0.57705831, -0.72331337,  0.69760112, -0.86182132,
#          0.17466716, -0.23174607,  0.40275134]]), 'b4': np.array([[-0.47140409]])}
# model tf-idf normalized
model = {'W1': np.array([[ 1.29643591,  0.45074824],
       [ 0.70773118,  1.58460852],
       [ 2.67738766, -1.74837318],
       [ 1.33109077,  0.10529432],
       [ 0.69949236,  0.23108707],
       [-0.35599099,  2.07436754],
       [ 0.47995731,  0.24472955],
       [ 0.63720221,  0.2369889 ]]), 'b1': np.array([[ 0.24555316],
       [-0.02617257],
       [-1.61662627],
       [-1.31429877],
       [-0.17949485],
       [ 0.07467697],
       [ 0.13714999],
       [ 0.03253189]]), 'W2': np.array([[-0.20112114,  0.1343918 , -2.29593635, -1.68775395, -1.66578869,
         1.04315377, -0.56503497, -1.11453418]]), 'b2': np.array([[-0.64778329]])}

In [15]:
def test(article_preprocessed_test, original_test, summarized_text, compression_ratio, file_number = 0):
    X_test = generate_X_labels(article_preprocessed_test)
    predicton = predict(model, X_test)
    Y_test, original_sentences = generate_Y_labels(original_test, summarized_text)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))
    
    
    highest = np.argsort(predicton[0]) [::-1]
    highest = highest[: num_sentences_summarized]
#     highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    output_indices = []
    

    
    for i in range (0, num_sentences_summarized):
        output_sentences.append(original_sentences[highest[i]])
        output_indices.append(highest[i])
        
    output_sentences = ''.join(output_sentences)
    
    rouge = Rouge()
#     rouge = Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    scores_nn = rouge.get_scores(output_sentences, summarized_text)
    scores_nn = scores_nn[0]
    
    rouge_1_nn = scores_nn['rouge-1']['f']
    rouge_2_nn = scores_nn['rouge-2']['f']
    rouge_l_nn = scores_nn['rouge-l']['f']
    
    
    
    ### classified using TF_idf score

    output_tf_idf = summary_using_tf_idf_only(original_test, num_sentences_summarized)
    
    scores_tf_idf = rouge.get_scores(output_tf_idf, summarized_text)
    scores_tf_idf = scores_tf_idf[0]
    
    rouge_1_tf_idf = scores_tf_idf['rouge-1']['f']
    rouge_2_tf_idf = scores_tf_idf['rouge-2']['f']
    rouge_l_tf_idf = scores_tf_idf['rouge-l']['f']

    
    return rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf

In [16]:
def summary_using_tf_idf_only(text, num_sentences):
    article_sentences = sent_tokenize(text)
    article_preprocessed = preprocessing(text)
    sentence_scores = calculate_TF_IDF(article_preprocessed)
    highest = np.argsort(sentence_scores) [::-1]
    highest = highest[: num_sentences]
    highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    
    for i in range (0, num_sentences):
        output_sentences.append(article_sentences[highest[i]])
        
    
    output_sentences = ' '.join(output_sentences)
    
    return output_sentences

In [17]:
article_file = io.open("cr7.txt", "r", encoding='utf-8-sig')
article = article_file.read()
article_file.close
summary = summary_using_tf_idf_only(article, 7)
print(summary)

Real Madrid forward Cristiano Ronaldo has said that he is the "best player in history" after winning his fifth Ballon d'Or on Thursday. Ronaldo picked up the award for the second year in a row to equal the record of Barcelona star Lionel Messi, and he said he does not believe any player is better than him. He told France Football (h/t Goal's Robin Bairner): "I've never seen anyone better than me. I play well with both feet, I’m quick, powerful, good with the head, I score goals, I make assists. That says something, doesn’t it? Legends like Floyd Mayweather [Jr.] and LeBron James don’t get to their perfect level by chance. To be at the top and to stay there, you have to have more talent than the others."


In [18]:
# testing on BBC news summary

rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

article_types = ['business', 'entertainment', 'politics', 'sport', 'tech']

for article_type in article_types:
    for i in range(101, 131):

        article_file = io.open("train_original/" + article_type + "/article (" + str(i) +").txt", "r", encoding='utf-8-sig')
        article_file.readline()
        article = article_file.read()
        article_preprocessed = preprocessing(article)
        article_file.close()

        summarized_file = io.open("train_summary/" + article_type + "/summary (" + str(i) +").txt", "r", encoding='utf-8-sig')
        summarized = summarized_file.read()
        summarized_file.close()

        rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
            article_preprocessed, article, summarized, 0.35, i)

        rouge_1_list_nn.append(rouge_1_nn)
        rouge_2_list_nn.append(rouge_2_nn)
        rouge_l_list_nn.append(rouge_l_nn)

        rouge_1_list_tf_idf.append(rouge_1_tf_idf)
        rouge_2_list_tf_idf.append(rouge_2_tf_idf)
        rouge_l_list_tf_idf.append(rouge_l_tf_idf)


print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

Using nn
Average Rouge 1 score is: 0.779896
Average Rouge 2 score is: 0.712791
Average Rouge l score is: 0.777029
Using tf_idf only
Average Rouge 1 score is: 0.361936
Average Rouge 2 score is: 0.188567
Average Rouge l score is: 0.341960


In [19]:
# testing on cnn dailymail dataset
rouge_1_list_nn = []
rouge_2_list_nn = []
rouge_l_list_nn = []

rouge_1_list_tf_idf = []
rouge_2_list_tf_idf = []
rouge_l_list_tf_idf = []

df = pd.read_csv('test.csv')
articles_cnn = df['article']
summaries_cnn = df['highlights']

i = 0
for article_cnn, summary_cnn in zip(articles_cnn, summaries_cnn):
    article_cnn_preprocessed = preprocessing(article_cnn)
    rouge_1_nn, rouge_2_nn, rouge_l_nn, rouge_1_tf_idf, rouge_2_tf_idf, rouge_l_tf_idf = test(
        article_cnn_preprocessed, article_cnn, summary_cnn, 0.35, i)
    i += 1
    if (i == 300):
        break
    
    rouge_1_list_nn.append(rouge_1_nn)
    rouge_2_list_nn.append(rouge_2_nn)
    rouge_l_list_nn.append(rouge_l_nn)

    rouge_1_list_tf_idf.append(rouge_1_tf_idf)
    rouge_2_list_tf_idf.append(rouge_2_tf_idf)
    rouge_l_list_tf_idf.append(rouge_l_tf_idf)

print('Using nn')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_nn)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_nn)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_nn)))

print('Using tf_idf only')
print('Average Rouge 1 score is: %f' % (np.average(rouge_1_list_tf_idf)))
print('Average Rouge 2 score is: %f' % (np.average(rouge_2_list_tf_idf)))
print('Average Rouge l score is: %f' % (np.average(rouge_l_list_tf_idf)))

    

Using nn
Average Rouge 1 score is: 0.287608
Average Rouge 2 score is: 0.111131
Average Rouge l score is: 0.273179
Using tf_idf only
Average Rouge 1 score is: 0.192190
Average Rouge 2 score is: 0.050362
Average Rouge l score is: 0.177994


In [20]:
def summarize(article, compression_ratio):
    original_sentences = sent_tokenize(article)
    article_preprocessed_entered = preprocessing(article)
    X_test_entered = generate_X_labels(article_preprocessed_entered)
    summary_predicted = predict(model, X_test_entered)
    num_sentences_summarized = math.ceil(compression_ratio * len(original_sentences))
    
    
    highest = np.argsort(summary_predicted[0]) [::-1]
    highest = highest[: num_sentences_summarized]
    highest = sorted(highest) # uncomment to arrange the article
    output_sentences = []
    
    for i in range (0, num_sentences_summarized):
        output_sentences.append(original_sentences[highest[i]])
        
    
    output_sentences = ' '.join(output_sentences)
    
    return output_sentences
    

In [21]:
article_file = io.open("cr7.txt", "r", encoding='utf-8-sig')
article = article_file.read()
article_file.close
summary = summarize(article, 0.35)
print(summary)

Real Madrid forward Cristiano Ronaldo has said that he is the "best player in history" after winning his fifth Ballon d'Or on Thursday. Ronaldo picked up the award for the second year in a row to equal the record of Barcelona star Lionel Messi, and he said he does not believe any player is better than him. He told France Football (h/t Goal's Robin Bairner): "I've never seen anyone better than me. No footballer can do the things I can. "There’s no player more complete than me. But I tell you: there’s no one more complete than me.


In [22]:
article_file = io.open("godzilla.txt", "r", encoding='utf-8-sig')
article = article_file.read()
article_file.close
summary = summarize(article, 0.35)
print(summary)

An actor dressed as the giant creature breathed smoke over photographers on Monday as Godzilla received the 2,271st star on Hollywood Boulevard. "Godzilla should thank you for this historical and monumental star," said Final Wars producer Shogo Tomiyama. Hollywood's honorary mayor, Johnny Grant, said: "I do hereby proclaim this Godzilla Day in Hollywood. The premiere of Godzilla: Final Wars at Grauman's Chinese Theatre followed the ceremony on Hollywood Boulevard. Director Ryuhei Kitamura said it may not be Godzilla's final outing, as it has been billed. And producer Shogo Tomiyama added: "So long as Godzilla can fascinate people, I believe he will be resurrected by new generations of filmmakers in the future."


# Abstractive Summary

In [2]:
train_fn = trax.data.TFDS('cnn_dailymail', data_dir='data/', keys=('article', 'highlights'), train=True)

eval_fn = trax.data.TFDS('cnn_dailymail', data_dir='data/', keys=('article', 'highlights'), train=False)

2023-05-13 16:32:14.740312: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [3]:
VOCAB_DIR = 'data/'
VOCAB_FILE = 'summarize32k.subword.subwords'
MODEL_DIR = 'model'

In [4]:
def tokenize(input_str):
    integers =  next(trax.data.tokenize(iter([input_str]), vocab_dir = VOCAB_DIR, vocab_file = VOCAB_FILE))
    return list(integers) + [1]

In [5]:
def detokenize(integers):
    sentence = trax.data.detokenize(integers, vocab_dir = VOCAB_DIR, vocab_file = VOCAB_FILE)
    return wrapper.fill(sentence)

In [6]:
def preprocess_abstractive(stream):
    for (article, summary) in stream:
        combined = np.array(list(article) + [1, 0] + list(summary) + [1])
        mask = [0] * (len(list(article)) + 2) + [1] * (len(list(summary)) + 1)
        yield combined, combined, np.array(mask)

# You can combine a few data preprocessing steps into a pipeline like this.
input_pipeline = trax.data.Serial(trax.data.Tokenize(vocab_dir = VOCAB_DIR, vocab_file = VOCAB_FILE), preprocess_abstractive, trax.data.FilterByLength(4096))

train_stream = input_pipeline(train_fn())
eval_stream = input_pipeline(eval_fn())

In [7]:
# train_input, train_target, train_mask = next(train_stream)

# assert sum((train_input - train_target)**2) == 0  # They are the same in Language Model (LM).
# # prints mask, 0s on article, 1s on summary
# print(f'Single example mask:\n\n {train_mask}')
# print(f'Single example:\n\n {detokenize(train_input)}')

In [8]:
boundaries =  [128, 256,  512, 1024]
batch_sizes = [16,    8,    4,    2, 1]

# Create the streams.
train_batch_stream = trax.data.BucketByLength(boundaries, batch_sizes)(train_stream)

eval_batch_stream = trax.data.BucketByLength(boundaries, batch_sizes)(eval_stream)

In [9]:
def DotProductAttention(query, key, value, mask):
    """Dot product self-attention.
    Args:
        query (jax.interpreters.xla.DeviceArray): array of query representations with shape (L_q by d)
        key (jax.interpreters.xla.DeviceArray): array of key representations with shape (L_k by d)
        value (jax.interpreters.xla.DeviceArray): array of value representations with shape (L_k by d) where L_v = L_k
        mask (jax.interpreters.xla.DeviceArray): attention-mask, gates attention with shape (L_q by L_k)

    Returns:
        jax.interpreters.xla.DeviceArray: Self-attention array for q, k, v arrays. (L_q by L_k)
    """

    assert query.shape[-1] == key.shape[-1] == value.shape[-1], "Embedding dimensions of q, k, v aren't all the same"

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # Save depth/dimension of the query embedding for scaling down the dot product

    # Calculate scaled query key dot product according to formula above
    dots = trax_np.matmul(query, trax_np.swapaxes(key, -1, -2)) / trax_np.sqrt(query.shape[-1])
    
    # Apply the mask
    if mask is not None: # The 'None' in this line does not need to be replaced
        dots = trax_np.where(mask, dots, trax_np.full_like(dots, -1e9))
    
    # Softmax formula implementation
    # Use trax.fastmath.logsumexp of dots to avoid underflow by division by large numbers
    # Hint: Last axis should be used and keepdims should be True
    # Note: softmax = e^(dots - logsumexp(dots)) = E^dots / sumexp(dots)
    logsumexp = trax.fastmath.logsumexp(dots, axis=-1, keepdims=True)

    # Take exponential of dots minus logsumexp to get softmax
    # Use jnp.exp()
    dots = trax_np.exp(dots - logsumexp)

    # Multiply dots by value to get self-attention
    # Use jnp.matmul()
    attention = trax_np.matmul(dots, value)

    ## END CODE HERE ###
    
    return attention

In [10]:
def compute_attention_heads_closure(n_heads, d_head):
    """ Function that simulates environment inside CausalAttention function.
    Args:
        d_head (int):  dimensionality of heads.
        n_heads (int): number of attention heads.
    Returns:
        function: compute_attention_heads function
    """

    def compute_attention_heads(x):
        """ Compute the attention heads.
        Args:
            x (jax.interpreters.xla.DeviceArray): tensor with shape (batch_size, seqlen, n_heads X d_head).
        Returns:
            jax.interpreters.xla.DeviceArray: reshaped tensor with shape (batch_size X n_heads, seqlen, d_head).
        """
        ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
        
        # Size of the x's batch dimension
        batch_size = x.shape[0]
        # Length of the sequence
        # Should be size of x's first dimension without counting the batch dim
        seqlen = x.shape[1]
        # Reshape x using jnp.reshape()
        # batch_size, seqlen, n_heads*d_head -> batch_size, seqlen, n_heads, d_head
        x = trax_np.reshape(x, (batch_size, seqlen, n_heads, d_head))
        # Transpose x using jnp.transpose()
        # batch_size, seqlen, n_heads, d_head -> batch_size, n_heads, seqlen, d_head
        # Note that the values within the tuple are the indexes of the dimensions of x and you must rearrange them
        x = trax_np.transpose(x, (0, 2, 1, 3))
        # Reshape x using jnp.reshape()
        # batch_size, n_heads, seqlen, d_head -> batch_size*n_heads, seqlen, d_head
        x = trax_np.reshape(x, (-1, seqlen, d_head))
        
        ### END CODE HERE ###
        
        return x
    
    return compute_attention_heads

In [11]:
def dot_product_self_attention(q, k, v):
    """ Masked dot product self attention.
    Args:
        q (jax.interpreters.xla.DeviceArray): queries.
        k (jax.interpreters.xla.DeviceArray): keys.
        v (jax.interpreters.xla.DeviceArray): values.
    Returns:
        jax.interpreters.xla.DeviceArray: masked dot product self attention tensor.
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    
    # Hint: mask size should be equal to L_q. Remember that q has shape (batch_size, L_q, d)
    # NOTE: there is a revision underway with the autograder to tolerate better indexing. 
    # Until then, please index q.shape using negative values (this is equivalent to counting from right to left)
    mask_size = q.shape[-2]

    # Creates a matrix with ones below the diagonal and 0s above. It should have shape (1, mask_size, mask_size)
    # Notice that 1's and 0's get casted to True/False by setting dtype to jnp.bool_
    # Use jnp.tril() - Lower triangle of an array and jnp.ones()
    mask = trax_np.tril(trax_np.ones((1, mask_size, mask_size), dtype=trax_np.bool_), k=0)
    
    ### END CODE HERE ###
    
    return DotProductAttention(q, k, v, mask)

In [12]:
def compute_attention_output_closure(n_heads, d_head):
    """ Function that simulates environment inside CausalAttention function.
    Args:
        d_head (int):  dimensionality of heads.
        n_heads (int): number of attention heads.
    Returns:
        function: compute_attention_output function
    """
    
    def compute_attention_output(x):
        """ Compute the attention output.
        Args:
            x (jax.interpreters.xla.DeviceArray): tensor with shape (batch_size X n_heads, seqlen, d_head).
        Returns:
            jax.interpreters.xla.DeviceArray: reshaped tensor with shape (batch_size, seqlen, n_heads X d_head).
        """
        ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
        
        # Length of the sequence
        # Should be size of x's first dimension without counting the batch dim
        seqlen = x.shape[1]
        # Reshape x using jnp.reshape() to shape (batch_size, n_heads, seqlen, d_head)
        x = trax_np.reshape(x, ( -1, n_heads, seqlen, d_head))
        # Transpose x using trax_np.transpose() to shape (batch_size, seqlen, n_heads, d_head)
        x = trax_np.transpose(x, ( 0, 2, 1 , 3))
        
        ### END CODE HERE ###
        
        # Reshape to allow to concatenate the heads
        return trax_np.reshape(x, (-1, seqlen, n_heads * d_head))
    
    return compute_attention_output

In [13]:
def CausalAttention(d_feature, 
                    n_heads, 
                    compute_attention_heads_closure=compute_attention_heads_closure,
                    dot_product_self_attention=dot_product_self_attention,
                    compute_attention_output_closure=compute_attention_output_closure,
                    mode='train'):
    """Transformer-style multi-headed causal attention.

    Args:
        d_feature (int):  dimensionality of feature embedding.
        n_heads (int): number of attention heads.
        compute_attention_heads_closure (function): Closure around compute_attention heads.
        dot_product_self_attention (function): dot_product_self_attention function. 
        compute_attention_output_closure (function): Closure around compute_attention_output. 
        mode (str): 'train' or 'eval'.

    Returns:
        trax.layers.combinators.Serial: Multi-headed self-attention model.
    """
    
    assert d_feature % n_heads == 0
    d_head = d_feature // n_heads

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    
    # HINT: The second argument to tl.Fn() is an uncalled function (without the parentheses)
    # Since you are dealing with closures you might need to call the outer 
    # function with the correct parameters to get the actual uncalled function.
    ComputeAttentionHeads = tlayer.Fn('AttnHeads', compute_attention_heads_closure(n_heads, d_head), n_out=1)
        

    return tlayer.Serial(
        tlayer.Branch( # creates three towers for one input, takes activations and creates queries keys and values
            [tlayer.Dense(d_feature), ComputeAttentionHeads], # queries
            [tlayer.Dense(d_feature), ComputeAttentionHeads], # keys
            [tlayer.Dense(d_feature), ComputeAttentionHeads], # values
        ),
        
        tlayer.Fn('DotProductAttn', dot_product_self_attention, n_out=1), # takes QKV
        # HINT: The second argument to tl.Fn() is an uncalled function
        # Since you are dealing with closures you might need to call the outer 
        # function with the correct parameters to get the actual uncalled function.
        tlayer.Fn('AttnOutput', compute_attention_output_closure(n_heads, d_head), n_out=1), # to allow for parallel
        tlayer.Dense(d_feature) # Final dense layer
    )

    ### END CODE HERE ###

In [14]:
def DecoderBlock(d_model, d_ff, n_heads,
                 dropout, mode, ff_activation):
    """Returns a list of layers that implements a Transformer decoder block.

    The input is an activation tensor.

    Args:
        d_model (int):  depth of embedding.
        d_ff (int): depth of feed-forward layer.
        n_heads (int): number of attention heads.
        dropout (float): dropout rate (how much to drop out).
        mode (str): 'train' or 'eval'.
        ff_activation (function): the non-linearity in feed-forward layer.

    Returns:
        list: list of trax.layers.combinators.Serial that maps an activation tensor to an activation tensor.
    """
    
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    
    # Create masked multi-head attention block using CausalAttention function
    causal_attention = CausalAttention( 
                        d_model,
                        n_heads=n_heads,
                        mode=mode
                        )

    # Create feed-forward block (list) with two dense layers with dropout and input normalized
    feed_forward = [ 
        # Normalize layer inputs
        tlayer.LayerNorm(),
        # Add first feed forward (dense) layer (don't forget to set the correct value for n_units)
        tlayer.Dense(d_ff),
        # Add activation function passed in as a parameter (you need to call it!)
        ff_activation(), # Generally ReLU
        # Add dropout with rate and mode specified (i.e., don't use dropout during evaluation)
        tlayer.Dropout(rate=dropout, mode=mode),
        # Add second feed forward layer (don't forget to set the correct value for n_units)
        tlayer.Dense(d_model),
        # Add dropout with rate and mode specified (i.e., don't use dropout during evaluation)
        tlayer.Dropout(rate=dropout,mode=mode)
    ]

    # Add list of two Residual blocks: the attention with normalization and dropout and feed-forward blocks
    return [
      tlayer.Residual(
          # Normalize layer input
          tlayer.LayerNorm(),
          # Add causal attention block previously defined (without parentheses)
          causal_attention,
          # Add dropout with rate and mode specified
          tlayer.Dropout(rate=dropout, mode=mode)
        ),
      tlayer.Residual(
          # Add feed forward block (without parentheses)
          feed_forward
        ),
      ]
    ### END CODE HERE ###

In [15]:
def TransformerLM(vocab_size=33300,
                  d_model=512,
                  d_ff=2048,
                  n_layers=6,
                  n_heads=8,
                  dropout=0.1,
                  max_len=4096,
                  mode='train',
                  ff_activation=tlayer.Relu):
    """Returns a Transformer language model.

    The input to the model is a tensor of tokens. (This model uses only the
    decoder part of the overall Transformer.)

    Args:
        vocab_size (int): vocab size.
        d_model (int):  depth of embedding.
        d_ff (int): depth of feed-forward layer.
        n_layers (int): number of decoder layers.
        n_heads (int): number of attention heads.
        dropout (float): dropout rate (how much to drop out).
        max_len (int): maximum symbol length for positional encoding.
        mode (str): 'train', 'eval' or 'predict', predict mode is for fast inference.
        ff_activation (function): the non-linearity in feed-forward layer.

    Returns:
        trax.layers.combinators.Serial: A Transformer language model as a layer that maps from a tensor of tokens
        to activations over a vocab set.
    """
    
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    
    # Embedding inputs and positional encoder
    positional_encoder = [ 
        # Add embedding layer of dimension (vocab_size, d_model)
        tlayer.Embedding(vocab_size, d_model),
        # Use dropout with rate and mode specified
        tlayer.Dropout(rate=dropout, mode=mode),
        # Add positional encoding layer with maximum input length and mode specified
        tlayer.PositionalEncoding(max_len=max_len, mode=mode)]

    # Create stack (list) of decoder blocks with n_layers with necessary parameters
    decoder_blocks = [ 
        DecoderBlock(d_model, d_ff, n_heads,
                    dropout, mode, ff_activation) for _ in range(n_layers)]

    # Create the complete model as written in the figure
    return tlayer.Serial(
        # Use teacher forcing (feed output of previous step to current step)
        tlayer.ShiftRight(mode=mode), # Specify the mode!
        # Add positional encoder
        positional_encoder,
        # Add decoder blocks
        decoder_blocks,
        # Normalize layer
        tlayer.LayerNorm(),

        # Add dense layer of vocab_size (since need to select a word to translate to)
        # (a.k.a., logits layer. Note: activation already set by ff_activation)
        tlayer.Dense(vocab_size),
        # Get probabilities with Logsoftmax
        tlayer.LogSoftmax()
    )

    ### END CODE HERE ###

In [16]:
def training_loop(TransformerLM, train_gen, eval_gen, output_dir):
    '''
    Input:
        TransformerLM (trax.layers.combinators.Serial): The model you are building.
        train_gen (generator): Training stream of data.
        eval_gen (generator): Evaluation stream of data.
        output_dir (str): folder to save your file.
        
    Returns:
        trax.supervised.training.Loop: Training loop.
    '''
    output_dir = os.path.expanduser(output_dir)  # trainer is an object
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000, max_value=0.01)

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    train_task = training.TrainTask( 
      labeled_data=train_gen, # The training generator
      loss_layer=tlayer.CrossEntropyLoss(), # Loss function 
      optimizer=trax.optimizers.Adam(0.01), # Optimizer (Don't forget to set LR to 0.01)
      lr_schedule=lr_schedule,
      n_steps_per_checkpoint=10
    )

    eval_task = training.EvalTask( 
      labeled_data=eval_gen, # The evaluation generator
      metrics=[tlayer.CrossEntropyLoss(), tlayer.Accuracy()] # CrossEntropyLoss and Accuracy
    )

    ### END CODE HERE ###

    loop = training.Loop(TransformerLM(),
                         train_task,
                         eval_tasks=[eval_task],
                         output_dir=output_dir)
    
    return loop

In [23]:
loop = training_loop(TransformerLM, train_batch_stream, eval_batch_stream, MODEL_DIR)
loop.run(10)

  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 55144980
Step      1: Ran 1 train steps in 46.49 secs
Step      1: train CrossEntropyLoss |  10.44148540


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  CrossEntropyLoss |  10.42037392
Step      1: eval          Accuracy |  0.00000000

Step     10: Ran 9 train steps in 172.55 secs
Step     10: train CrossEntropyLoss |  10.30172920
Step     10: eval  CrossEntropyLoss |  10.13704491
Step     10: eval          Accuracy |  0.07446808


In [17]:
# Get the model architecture
model = TransformerLM(mode='eval')

# Load the pre-trained weights
model.init_from_file('model/model.pkl.gz', weights_only=True)

((((), (), ()),
  array([[ 0.00185256, -0.00332041, -0.00516087, ...,  0.02195336,
           0.01051251, -0.00567034],
         [-0.00739519,  0.00183287,  0.00196656, ...,  0.05615862,
           0.00132284, -0.00291792],
         [ 0.01545155,  0.00096676, -0.00778854, ..., -0.01111389,
           0.01328059, -0.00775655],
         ...,
         [ 0.01310026,  0.0053312 , -0.00110446, ...,  0.0018376 ,
          -0.01304161,  0.01059895],
         [ 0.01033204,  0.00199992, -0.00960302, ..., -0.00053245,
          -0.00377625,  0.00046856],
         [ 0.00082984, -0.00329287, -0.00404815, ...,  0.00028511,
          -0.00377798, -0.00656456]], dtype=float32),
  (),
  array([[ 0.01798599,  0.00387794,  0.00273774, ..., -0.07271248,
           0.01859818,  0.0302424 ],
         [ 0.05207964,  0.02332034,  0.00967251, ..., -0.02486727,
           0.0165352 , -0.00352491],
         [ 0.03194407, -0.02127359, -0.00068408, ..., -0.01532798,
           0.01319801, -0.00402305],
         ..

In [18]:
def next_symbol(cur_output_tokens, model):
    """Returns the next symbol for a given sentence.

    Args:
        cur_output_tokens (list): tokenized sentence with EOS and PAD tokens at the end.
        model (trax.layers.combinators.Serial): The transformer model.

    Returns:
        int: tokenized symbol.
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    
    # current output tokens length
    token_length = len(cur_output_tokens)
    # calculate the minimum power of 2 big enough to store token_length
    # HINT: use np.ceil() and np.log2()
    # add 1 to token_length so np.log2() doesn't receive 0 when token_length is 0
    padded_length = 2**int(np.ceil(np.log2(token_length + 1)))

    # Fill cur_output_tokens with 0's until it reaches padded_length
    padded = cur_output_tokens + [0] * (padded_length - token_length)
    padded_with_batch = np.array(padded)[None, :] # Don't replace this 'None'! This is a way of setting the batch dim

    # model expects a tuple containing two padded tensors (with batch)
    output, _ = model((padded_with_batch, padded_with_batch))  
    # HINT: output has shape (1, padded_length, vocab_size)
    # To get log_probs you need to index output with 0 in the first dim
    # token_length in the second dim and all of the entries for the last dim.
    log_probs = output[0, token_length, :]
    
    ### END CODE HERE ###
    
    return int(np.argmax(log_probs))

In [19]:
def greedy_decode(input_sentence, model):
    """Greedy decode function.

    Args:
        input_sentence (string): a sentence or article.
        model (trax.layers.combinators.Serial): Transformer model.

    Returns:
        string: summary of the input.
    """
    
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # Use tokenize()
    cur_output_tokens = tokenize(input_sentence) + [0]
    generated_output = [] 
    cur_output = 0 
    EOS = 1 
    
    while cur_output != EOS:
        # Get next symbol
        cur_output = next_symbol(cur_output_tokens, model)
        # Append next symbol to original sentence
        cur_output_tokens.append(cur_output)
        # Append next symbol to generated sentence
        generated_output.append(cur_output)
#         print(detokenize(generated_output))
    
    ### END CODE HERE ###
    
    return detokenize(generated_output)

In [20]:
test_sentence = "It was a sunny day when I went to the market to buy some flowers. But I only found roses, not tulips."
print(wrapper.fill(test_sentence), '\n')
print(greedy_decode(test_sentence, model))

It was a sunny day when I went to the market to buy some flowers. But
I only found roses, not tulips. 

: I just found roses, not tulips.<EOS>


In [21]:
article = "It’s the posing craze sweeping the U.S. after being brought to fame by skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert Pujols - and even Republican politician Rick Perry. But now four students at Riverhead High School on Long Island, New York, have been suspended for dropping to a knee and taking up a prayer pose to mimic Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll were all suspended for one day because the ‘Tebowing’ craze was blocking the hallway and presenting a safety hazard to students. Scroll down for video. Banned: Jordan Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured left) were all suspended for one day by Riverhead High School on Long Island, New York, for their tribute to Broncos quarterback Tim Tebow. Issue: Four of the pupils were suspended for one day because they allegedly did not heed to warnings that the 'Tebowing' craze at the school was blocking the hallway and presenting a safety hazard to students."
print(wrapper.fill(article), '\n')
print(greedy_decode(article, model))

It’s the posing craze sweeping the U.S. after being brought to fame by
skier Lindsey Vonn, soccer star Omar Cummings, baseball player Albert
Pujols - and even Republican politician Rick Perry. But now four
students at Riverhead High School on Long Island, New York, have been
suspended for dropping to a knee and taking up a prayer pose to mimic
Denver Broncos quarterback Tim Tebow. Jordan Fulcoly, Wayne Drexel,
Tyler Carroll and Connor Carroll were all suspended for one day
because the ‘Tebowing’ craze was blocking the hallway and presenting a
safety hazard to students. Scroll down for video. Banned: Jordan
Fulcoly, Wayne Drexel, Tyler Carroll and Connor Carroll (all pictured
left) were all suspended for one day by Riverhead High School on Long
Island, New York, for their tribute to Broncos quarterback Tim Tebow.
Issue: Four of the pupils were suspended for one day because they
school was blocking the hallway and presenting a safety hazard to
students. 

Jordan Fulcoly, Wayne Drexel, Ty