In [104]:
import pandas as pd
import numpy as np
import re
from os import listdir
from nltk.corpus import stopwords

from pickle import dump, load

from keras.models import Model
from keras.layers import Input, LSTM, Dense

Using TensorFlow backend.


In [78]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [79]:
# split a document into news story and highlights
def split_story(doc):
	# find first highlight
	index = doc.find('@highlight')
	# split into story and highlights
	story, highlights = doc[:index], doc[index:].split('@highlight')
	# strip extra white space around each highlight
	highlights = [h.strip() for h in highlights if len(h) > 0]
	return story, highlights

In [80]:
# load all stories in a directory
def load_stories(directory):
	all_stories = list()
	for name in listdir(directory):
		filename = directory + '/' + name
		# load document
		doc = load_doc(filename)
		# split into story and highlights
		story, highlights = split_story(doc)
		# store
		all_stories.append({'story':story, 'highlights':highlights})
	return all_stories

In [81]:
reviews = pd.DataFrame.from_dict(load_stories('../../ProjectDataset/cnn/examples/'))

print(reviews.shape)

print(reviews.head())

print(reviews.isnull().sum())

(20, 2)
                                          highlights  \
0  [The 15 new cardinals will be installed on Feb...   
1  [NEW: Bermudan premier: "Above all, this was a...   
2  [A 4-year-old boy is the latest victim of a ma...   
3  [NEW: Kyle White: "Without this team, there wo...   
4  [Captive boys and men were rescued from an Isl...   

                                               story  
0  (CNN)For the second time during his papacy, Po...  
1  HAMILTON, Bermuda (CNN) -- Four Chinese nation...  
2  Kathmandu, Nepal (CNN) -- A ferocious leopard ...  
3  (CNN) -- Kyle White now has two pieces of meta...  
4  (CNN) -- The 54 men and 14 boys rescued after ...  
highlights    0
story         0
dtype: int64


In [82]:
display(reviews.head())

Unnamed: 0,highlights,story
0,[The 15 new cardinals will be installed on Feb...,"(CNN)For the second time during his papacy, Po..."
1,"[NEW: Bermudan premier: ""Above all, this was a...","HAMILTON, Bermuda (CNN) -- Four Chinese nation..."
2,[A 4-year-old boy is the latest victim of a ma...,"Kathmandu, Nepal (CNN) -- A ferocious leopard ..."
3,"[NEW: Kyle White: ""Without this team, there wo...",(CNN) -- Kyle White now has two pieces of meta...
4,[Captive boys and men were rescued from an Isl...,(CNN) -- The 54 men and 14 boys rescued after ...


In [83]:
reviews = reviews.dropna()

reviews = reviews.reset_index(drop=True)

display(reviews.head())

for i in range(5):

    print("Review #",i+1)

    display(reviews.story[i])

    display(reviews.highlights[i])

print()

Unnamed: 0,highlights,story
0,[The 15 new cardinals will be installed on Feb...,"(CNN)For the second time during his papacy, Po..."
1,"[NEW: Bermudan premier: ""Above all, this was a...","HAMILTON, Bermuda (CNN) -- Four Chinese nation..."
2,[A 4-year-old boy is the latest victim of a ma...,"Kathmandu, Nepal (CNN) -- A ferocious leopard ..."
3,"[NEW: Kyle White: ""Without this team, there wo...",(CNN) -- Kyle White now has two pieces of meta...
4,[Captive boys and men were rescued from an Isl...,(CNN) -- The 54 men and 14 boys rescued after ...


Review # 1


'(CNN)For the second time during his papacy, Pope Francis has announced a new group of bishops and archbishops set to become cardinals -- and they come from all over the world.\n\nPope Francis said Sunday that he would hold a meeting of cardinals on February 14 "during which I will name 15 new Cardinals who, coming from 13 countries from every continent, manifest the indissoluble links between the Church of Rome and the particular Churches present in the world," according to Vatican Radio.\n\nNew cardinals are always important because they set the tone in the church and also elect the next pope, CNN Senior Vatican Analyst John L. Allen said. They are sometimes referred to as the princes of the Catholic Church.\n\nThe new cardinals come from countries such as Ethiopia, New Zealand and Myanmar.\n\n"This is a pope who very much wants to reach out to people on the margins, and you clearly see that in this set," Allen said. "You\'re talking about cardinals from typically overlooked places, 

['The 15 new cardinals will be installed on February 14',
 'They come from countries such as Myanmar and Tonga',
 "No Americans made the list this time or the previous time in Francis' papacy"]

Review # 2


'HAMILTON, Bermuda (CNN) -- Four Chinese nationals of Uyghur ethnicity who had been held at the U.S. military\'s Guantanamo Bay, Cuba, detention facility have been resettled in Bermuda, officials said Thursday.\n\nAttorney General Eric Holder says the U.S. is "extremely grateful to the government of Bermuda."\n\n"Above all, this was a humanitarian act," Bermudan Premier Ewart Brown told CNN in an interview at his Cabinet office in Hamilton, Bermuda. "We don\'t see it as quid pro quo."\n\nThe four were twice cleared for release -- once by the Bush administration and again this year, according to a Justice Department statement.\n\nThey were among 17 Uyghur detainees at the facility set up to hold terror suspects.\n\nThe four were flown by private plane Wednesday night from Cuba to Bermuda and were accompanied by U.S. and Bermudan representatives as well as their attorneys, according to Susan Baker Manning, part of the men\'s legal team.\n\nPresident Obama has pledged to close the Guantan

['NEW: Bermudan premier: "Above all, this was a humanitarian act"',
 'Uyghurs are native Chinese Muslims; the detainees were apprehended in Pakistan',
 'China urges U.S. to hand over all 17 Uyghurs held at Guantanamo Bay, Cuba',
 'Official says U.S. still negotiating with Palau to take remaining 13 Uyghurs']

Review # 3


'Kathmandu, Nepal (CNN) -- A ferocious leopard may have killed 15 people in Nepal in a 15-month span, its latest victim a 4-year-old boy that the creature dragged away into the jungle to eat.\n\nThe head of boy was found in the forest a kilometer from his home Saturday morning, said Kamal Prasad Kharel, the police chief of the Baitadi district, an area about 600 kilometers (373 miles) west of Kathmandu.\n\nThe grisly discovery, which came after teams of people searched for the child, marks the 15th victim in the past 15 months in that remote district in western Nepal.\n\nThe police chief suspects that a single man-eating leopard is responsible for the deaths. If not, there are at most two of the man-eating creatures around, he believes.\n\nMaheshwor Dhakal, an ecologist at the Department of National Parks and Wildlife Conservation in Kathmandu, agreed that it is unusual to find more than one or two man-eating animals in one area. Most leopards live on wild prey.\n\nMore human victims c

['A 4-year-old boy is the latest victim of a man-eating leopard, a local police chief says',
 'He suspects one leopard is behind the deaths of 15 people in the past 15 months',
 'A reward has been offered to anyone who captures or kills the man-eating creature',
 'Leopards are common in low mountain areas of Nepal but usually eat wild prey like deer']

Review # 4


'(CNN) -- Kyle White now has two pieces of metal to wear -- one, a bracelet inscribed with the names of his six comrades killed in an ambush in Afghanistan, the other, a Medal of Honor given to him for his valor that ensured that death toll wasn\'t higher.\n\nSpeaking minutes after President Barack Obama gave him the highest military honor, White insisted the two emblems are equally significant. They both represent his family on that day six years ago -- the seven others who, like him, survived as well as those who did not.\n\nThe former Army sergeant said Tuesday he owes it to these men, whom he calls "my heroes," to live his life well, even now that he\'s left the military, and with honor.\n\n"Though I am still uncomfortable with hearing my name and the word \'hero\' in the same sentence, I am now ready for the challenge of proudly wearing this piece of blue fabric and carved metal with the same reverence that I wear the bracelet. And I vow to live up to the responsibility of doing s

['NEW: Kyle White: "Without this team, there would be no Medal of Honor"',
 'NEW: He vows to "live up to the responsibility" of having the top military award',
 'NEW: Obama calls White "a soldier who embodies the courage of his generation"',
 'The Army vet, then 20, braved enemy fire to save his wounded comrades in Afghanistan']

Review # 5


'(CNN) -- The 54 men and 14 boys rescued after being found chained this week at an Islamic religious school in Pakistan have been reunited with their families or placed in shelters, authorities said.\n\nThe group was discovered in an underground room with heavy chains linking them together.\n\nThe school, Al-Arabiya Aloom Jamia Masjid Zikirya, which also was a drug rehab clinic, is in Sohrab Goth, a suburb of Gadap in Karachi.\n\nAll 14 boys were returned to their families, senior police official Ahsanullah Marwat told CNN.\n\nOf the adults, 47 had been released to their families, and seven were handed over to a shelter for the homeless, he said.\n\nThree people who worked at the facility were arrested, but the four men who ran the place were still at large, Marwat said.\n\nOfficials said the facility was part madrassa and part drug-rehab facility, and the captives were chained at night apparently to prevent their escape.\n\n"The operation was successful, and we plan on continuing our 

['Captive boys and men were rescued from an Islamic religious school in Pakistan',
 'They were reunited with their families this week',
 'The facility was a school and drug rehab clinic',
 "Authorities say they're searching for the owners; three others arrested at the facility"]




In [84]:
contractions = {

"ain't": "am not",

"aren't": "are not",

"can't": "cannot",

"can't've": "cannot have",

"'cause": "because",

"could've": "could have",

"couldn't": "could not",

"couldn't've": "could not have",

"didn't": "did not",

"doesn't": "does not",

"don't": "do not",

"hadn't": "had not",

"hadn't've": "had not have",

"hasn't": "has not",

"haven't": "have not",

"he'd": "he would",

"he'd've": "he would have"}

In [85]:
def clean_text(text, remove_stopwords=True, is_list=False):

    # Convert words to lower case
    if not is_list:
        
        text = text.lower()

        text = text.split()

        new_text = []

        for word in text:

            if word in contractions:

                new_text.append(contractions[word])

        else:

            new_text.append(word)

        text = " ".join(new_text)

        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

        text = re.sub(r'\<a href', ' ', text)

        text = re.sub(r'&amp;', '', text)

        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)

        text = re.sub(r'<br />', ' ', text)

        text = re.sub(r'\'', ' ', text)

        if remove_stopwords:

            text = text.split()

            stops = set(stopwords.words("english"))

            text = [w for w in text if not w in stops]

            text = " ".join(text)
        
        return text
    
    else:
        
        summ = []
        
        for highlight in text:
            
            highlight = highlight.lower()

            highlight = highlight.split()

            new_text = []

            for word in highlight:

                if word in contractions:

                    new_text.append(contractions[word])

            else:

                new_text.append(word)

            highlight = " ".join(new_text)

            highlight = re.sub(r'https?:\/\/.*[\r\n]*', '', highlight, flags=re.MULTILINE)

            highlight = re.sub(r'\<a href', ' ', highlight)

            highlight = re.sub(r'&amp;', '', highlight)

            highlight = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', highlight)

            highlight = re.sub(r'<br />', ' ', highlight)

            highlight = re.sub(r'\'', ' ', highlight)

            if remove_stopwords:

                highlight = highlight.split()

                stops = set(stopwords.words("english"))

                highlight = [w for w in highlight if not w in stops]

                highlight = " ".join(highlight)
                
            summ.append(highlight)
        
        return summ

In [86]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\lenovo
[nltk_data]     pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [87]:
# Clean the summaries and texts

clean_summaries = []

for summary in reviews.highlights:

    clean_summaries.append(clean_text(summary, remove_stopwords=False, is_list=True))

print("Summaries are complete.")

clean_texts = []

for text in reviews.story:

    clean_texts.append(clean_text(text))

print("Texts are complete.")

Summaries are complete.
Texts are complete.


In [89]:
stories = list()

for i, text in enumerate(clean_texts):

    stories.append({'story': text, 'highlights': clean_summaries[i]})

# save to file

dump(stories, open('cnn_stories.pkl', 'wb'))

In [90]:
batch_size = 64

epochs = 110

latent_dim = 256

num_samples = 10000

In [92]:
stories = load(open('cnn_stories.pkl', 'rb'))

print('Loaded Stories %d' % len(stories))

print(type(stories))

Loaded Stories 20
<class 'list'>


In [93]:
# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
for story in stories:
    input_text = story['story']
    for highlight in story['highlights']:
        target_text = highlight

    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 20
Number of unique input tokens: 20
Number of unique output tokens: 23
Max sequence length for inputs: 31
Max sequence length for outputs: 13


In [98]:
def define_models(n_input, n_output, n_units):

    # define training encoder

    encoder_inputs = Input(shape=(None, n_input))

    encoder = LSTM(n_units, return_state=True)

    encoder_outputs, state_h, state_c = encoder(encoder_inputs)

    encoder_states = [state_h, state_c]

    # define training decoder

    decoder_inputs = Input(shape=(None, n_output))

    decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)

    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

    decoder_dense = Dense(n_output, activation='softmax')

    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # define inference encoder

    encoder_model = Model(encoder_inputs, encoder_states)

    # define inference decoder

    decoder_state_input_h = Input(shape=(n_units,))

    decoder_state_input_c = Input(shape=(n_units,))

    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,  initial_state=decoder_states_inputs)

    decoder_states = [state_h, state_c]

    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    # return all models

    return model, encoder_model, decoder_model

In [105]:
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
model.save('model.h5')

Train on 16 samples, validate on 4 samples
Epoch 1/110
Epoch 2/110
Epoch 3/110
Epoch 4/110
Epoch 5/110
Epoch 6/110
Epoch 7/110
Epoch 8/110
Epoch 9/110
Epoch 10/110
Epoch 11/110
Epoch 12/110
Epoch 13/110
Epoch 14/110
Epoch 15/110
Epoch 16/110
Epoch 17/110
Epoch 18/110
Epoch 19/110
Epoch 20/110
Epoch 21/110
Epoch 22/110
Epoch 23/110
Epoch 24/110
Epoch 25/110
Epoch 26/110
Epoch 27/110
Epoch 28/110
Epoch 29/110
Epoch 30/110
Epoch 31/110
Epoch 32/110
Epoch 33/110
Epoch 34/110
Epoch 35/110
Epoch 36/110
Epoch 37/110
Epoch 38/110
Epoch 39/110
Epoch 40/110
Epoch 41/110
Epoch 42/110
Epoch 43/110
Epoch 44/110
Epoch 45/110
Epoch 46/110
Epoch 47/110
Epoch 48/110
Epoch 49/110
Epoch 50/110
Epoch 51/110
Epoch 52/110
Epoch 53/110
Epoch 54/110
Epoch 55/110
Epoch 56/110
Epoch 57/110
Epoch 58/110
Epoch 59/110
Epoch 60/110
Epoch 61/110
Epoch 62/110
Epoch 63/110
Epoch 64/110
Epoch 65/110
Epoch 66/110
Epoch 67/110
Epoch 68/110
Epoch 69/110
Epoch 70/110
Epoch 71/110
Epoch 72/110
Epoch 73/110
Epoch 74/110
Epoc

Epoch 83/110
Epoch 84/110
Epoch 85/110
Epoch 86/110
Epoch 87/110
Epoch 88/110
Epoch 89/110
Epoch 90/110
Epoch 91/110
Epoch 92/110
Epoch 93/110
Epoch 94/110
Epoch 95/110
Epoch 96/110
Epoch 97/110
Epoch 98/110
Epoch 99/110
Epoch 100/110
Epoch 101/110
Epoch 102/110
Epoch 103/110
Epoch 104/110
Epoch 105/110
Epoch 106/110
Epoch 107/110
Epoch 108/110
Epoch 109/110
Epoch 110/110


  '. They will not be included '


In [106]:
# generate target given source sequence
def predict_sequence(infenc, infdec, source, n_steps, cardinality):
	# encode
	state = infenc.predict(source)
	# start of sequence input
	target_seq = array([0.0 for _ in range(cardinality)]).reshape(1, 1, cardinality)
	# collect predictions
	output = list()
	for t in range(n_steps):
		# predict next char
		yhat, h, c = infdec.predict([target_seq] + state)
		# store prediction
		output.append(yhat[0,0,:])
		# update state
		state = [h, c]
		# update target sequence
		target_seq = yhat
	return array(output)

In [108]:
_, infenc, infdec = define_models(51, 51, 128)

In [110]:
target = predict_sequence(infenc, infdec, stories[0]['story'], 3, 51)

AttributeError: 'str' object has no attribute 'ndim'