# Reading the Data
We'll start by reading in all the headlines from the articles. The articles are in CSV format, so we use *pandas* to read them in.

In [8]:
import os
import pandas as pd
nyt_dir = 'data/nyt_dataset/articles/'

# Gather the `headline` column of every article CSV into a single flat list.
all_headlines = []
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# all_headlines (and every index-based peek at it) reproducible across runs.
for filename in sorted(os.listdir(nyt_dir)):
    # Only files whose name contains 'Articles' are read; anything else is skipped.
    if 'Articles' in filename:
        # Read in all the data from csv
        headlines_df = pd.read_csv(os.path.join(nyt_dir, filename))
        # list.extend adds each headline as its own element, so the list stays
        # flat (append would nest one list per file instead).
        all_headlines.extend(list(headlines_df.headline.values))
len(all_headlines)

9335

In [12]:
# Peek at a slice of the raw headlines (indices 20 through 39).
all_headlines[20:40]

['Initial Description',
 'Rough Estimates',
 'El Pasatiempo Nacional',
 'Cooling Off on a Hot Day at Yankee Stadium',
 'Trump’s Staff Mixed Politics and Paydays',
 'A Virtuoso Rebuilding Act Requires Everyone in Tune',
 '‘Homeland,’ Season 6, Episode 11: Is Quinn Just a Natural Killer?',
 '‘Big Little Lies’ and the Art of Empathy',
 'Upending a Whodunit',
 '‘Feud: Bette and Joan’ Episode 5: Taking the Stage',
 '‘Billions’ Season 2, Episode 7: Greed Is Good. Except When It’s Not.',
 'Unknown',
 'What’s Going On in This Picture? | April 3, 2017',
 'Unknown',
 'Have You Ever Felt Pressured by Family or Others in Making an Important Decision About Your Future?',
 'Unknown',
 'A Cornerstone of Peace at Risk',
 'Trump Is  Wimping Out on Trade',
 'The Dwindling Odds of Coincidence',
 'What Was Lenin Thinking?']

# Cleaning the data

In [16]:
# Keep every headline except the literal placeholder 'Unknown'.
headlines_kept = []
for headline in all_headlines:
    if headline != 'Unknown':
        headlines_kept.append(headline)
all_headlines = headlines_kept
# Number of headlines left after filtering.
len(all_headlines)

8603

We also want to remove punctuation and make our sentences all lower case, because this will make our model easier to train. For our purposes, there is little or no difference between a line ending with "!" or "?", or between a capitalized word such as "The" and its lower-case form "the". With fewer unique tokens, our model will be easier to train. The Keras `Tokenizer` shown below does both by default, through its `filters` and `lower` arguments.
# Tokenization
```python
tensorflow.keras.preprocessing.text.Tokenizer(
    num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True,
    split=' ', char_level=False, oov_token=None, document_count=0, **kwargs
)
```

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the headlines with a default-configured Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_headlines)

# Vocabulary size is the number of word_index entries plus one
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)
print('Total words: ', total_words)

Total words:  11753


We can take a quick look at the `word_index` dictionary to see how the tokenizer stores the words.

In [21]:
# Show the word_index entries for a handful of sample words
words_of_interest = {'a', 'man', 'plan', 'canal', 'panama'}
subset_dict = {
    word: index
    for word, index in tokenizer.word_index.items()
    if word in words_of_interest
}
print(subset_dict)

{'a': 2, 'plan': 82, 'man': 138, 'panama': 3379, 'canal': 7144}


In [27]:
# Decode token id 1. Keras orders word_index by descending word frequency, so
# id 1 is the most common word in the corpus, not the first word encountered.
# Only the last expression of a cell is displayed, so print this lookup explicitly.
print(tokenizer.sequences_to_texts([[1]]))
# Each list element is treated as a separate text, so every word comes back as
# its own one-token sequence.
tokenizer.texts_to_sequences(['a','man','a','plan','a','canal','panama'])

['in']

# Creating Sequences


In [35]:
# Expand every headline into its growing prefixes of token ids
input_sequences = []
for token_list in tokenizer.texts_to_sequences(all_headlines):
    # Prefixes of length 2 up to the full headline; a headline with fewer
    # than two tokens contributes nothing.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )
# Show the first few sequences as text, then as raw token ids
print(tokenizer.sequences_to_texts(input_sequences[:9]))
input_sequences[:9]

['finding an', 'finding an expansive', 'finding an expansive view', 'finding an expansive view of', 'finding an expansive view of a', 'finding an expansive view of a forgotten', 'finding an expansive view of a forgotten people', 'finding an expansive view of a forgotten people in', 'finding an expansive view of a forgotten people in niger']


[[403, 17],
 [403, 17, 5242],
 [403, 17, 5242, 543],
 [403, 17, 5242, 543, 4],
 [403, 17, 5242, 543, 4, 2],
 [403, 17, 5242, 543, 4, 2, 1616],
 [403, 17, 5242, 543, 4, 2, 1616, 151],
 [403, 17, 5242, 543, 4, 2, 1616, 151, 5],
 [403, 17, 5242, 543, 4, 2, 1616, 151, 5, 1992]]

# Padding Sequences

In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# The longest sequence sets the common length for all of them
max_sequence_length = max(map(len, input_sequences))

# Left-pad ('pre') every sequence to that common length
padded = pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
    padding='pre',
)
input_sequences = np.array(padded)
# First padded row
input_sequences[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
       403,  17])