# Setup

In [2]:
# Google Colab only: mount Google Drive so the project files under
# ROOT_FOLDER are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Project root on the mounted Drive (note the trailing slash).
ROOT_FOLDER = '/content/drive/My Drive/Code/autocomplete_me/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
## Set Variables for Local and Cloud File Finding
import os
import sys
# Put the project root on the import path so the `src` package can be imported.
sys.path.append(ROOT_FOLDER)

In [4]:
# Sanity check: list the project's src/ directory on the mounted Drive.
!ls -l '/content/drive/My Drive/Code/autocomplete_me/src'

total 27
-rw------- 1 root root 2516 Jul 14 22:20 predict_utils.py
drwx------ 2 root root 4096 Jul 11 13:20 __pycache__
-rw------- 1 root root 3340 Jul 15 12:47 reader.py
-rw------- 1 root root 3580 Jul 14 22:20 train_model_baseline.py
-rw------- 1 root root 3203 Jul 14 22:20 train_utils.py
-rw------- 1 root root 9341 Jul 12 14:24 utils.py


In [5]:
from src import utils, reader, predict_utils, train_utils
from importlib import reload
# Reload each project module so edits made to the .py files are picked up
# without restarting the kernel.
reload(utils)
reload(reader)
reload(predict_utils)
reload(train_utils)

Using TensorFlow backend.


<module 'src.train_utils' from '/content/drive/My Drive/Code/autocomplete_me/src/train_utils.py'>

## Load Text Data

In [6]:
# Load the corpus through the project reader. `content_type` is used further
# down to build the file names of the saved models.
text = reader.read_bbc_politics()
content_type = 'BBC-Politics'

In [7]:
# Peek at the first element of the loaded text.
text[0]



# Modelling

## Process Text Data

In [8]:
# Tokenise the corpus. `num_words` is later used as the Embedding input_dim and
# as the width of the softmax output layer.
# NOTE(review): presumably `word_idx` / `idx_word` are the word->index and
# index->word lookups — confirm against train_utils.preprocess_text.
sequences, num_words, word_idx, idx_word = train_utils.preprocess_text(text)

In [9]:
# Build (features, labels) training pairs with a window of 10 tokens.
# NOTE(review): sequence_len=10 duplicates TRAINING_LENGTH = 10 used as the seed
# length at inference time — presumably the two should be kept in sync.
features, labels = train_utils.pass_sliding_window(sequences, sequence_len=10)

There are 186099 sequences.


In [9]:
# Encode the labels as a matrix (the helper prints the resulting shape).
# NOTE(review): this rebinds `labels` to its own transformed value, so the cell
# is not idempotent — re-run the windowing cell before re-running this one.
labels = train_utils.one_hot_labels_and_improve_efficiency(labels)

Labels matrix shape:  (186099, 11963)
Labels matrix shape:  (186099, 11963)


In [10]:
# Hold out 20% of the windows for validation; the seed is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)

In [11]:
import gc
gc.enable()
# Drop the notebook's reference to the full label matrix; the training cells
# only use the y_train / y_test splits from here on.
del labels
gc.collect()

0

In [12]:
# Report the shape of every split in a single print call (one line per split).
print(
    f'X_train shape:  {X_train.shape}',
    f'X_test shape:  {X_test.shape}',
    f'y_train shape:  {y_train.shape}',
    f'y_test shape:  {y_test.shape}',
    sep='\n',
)

X_train shape:  (148879, 10)
X_test shape:  (37220, 10)
y_train shape:  (148879, 11963)
y_test shape:  (37220, 11963)


In [13]:
import sys
def sizeof_fmt(num, suffix='B'):
    '''Format a byte count as a human-readable string using binary prefixes.

    Based on Fred Cirera's answer, https://stackoverflow.com/a/1094933/1870254, modified.

    num is divided by 1024 until its magnitude drops below 1024, and is then
    rendered with one decimal place followed by the matching prefix (Ki, Mi, ...)
    and `suffix`. Values too large for 'Zi' are reported in 'Yi'.
    '''
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        num = num / 1024.0
        position += 1
    # Ran out of prefixes: everything beyond 'Zi' is expressed in 'Yi'.
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Print the ten largest names in the notebook namespace, biggest first.
# sys.getsizeof measures only the object itself, not the objects it refers to,
# so containers of Python objects are under-reported.
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                       y_train:  1.7 GiB
                        y_test: 424.6 MiB
                      features: 14.2 MiB
                       X_train: 11.4 MiB
                        X_test:  2.8 MiB
                      word_idx: 576.1 KiB
                      idx_word: 576.1 KiB
                          text:  3.7 KiB
                     sequences:  3.5 KiB
                            __:  985.0 B


In [14]:
# Embedding Matrix
# Build the embedding matrix for the vocabulary from the 100-d GloVe vectors.
# For a local run, point glove_path at a local copy of the file instead,
# e.g. '/Users/jaipancholi/data/glove.6B.100d.txt'.
glove_path = '/content/drive/My Drive/Code/autocomplete_me/data/glove.6B.100d.txt'
embedding_matrix = utils.create_embedding_matrix(word_idx, num_words, glove_path)
embedding_matrix

Glove Vectors loading with dimension 100
There were 986 words without pre-trained embeddings.



invalid value encountered in true_divide



array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00656124, -0.04206555,  0.12508174, ..., -0.02506376,
         0.14220549,  0.04648907],
       [-0.02940788,  0.00775488,  0.02958461, ..., -0.0617054 ,
         0.07386386, -0.02477734],
       ...,
       [-0.00428263,  0.25175653,  0.0238415 , ...,  0.0984367 ,
        -0.01810912, -0.17835365],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.04594895,  0.09532217, -0.11963347, ...,  0.12868619,
        -0.04211046,  0.03951213]])

# Design Model

In [15]:
from tensorflow.keras.models import load_model

In [16]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [17]:
def train_model(filepath, X_train, y_train, X_test, y_test, use_pretrained_model=False, model=False, epochs=100):
  """Fit a Keras model with early stopping and best-model checkpointing.

  Exactly one model source must be supplied: either an already compiled
  `model`, or `use_pretrained_model=True` to continue from the model saved
  at `filepath`.

  Args:
    filepath: Path of the model file. The best model seen during training is
      written here; with `use_pretrained_model=True` the starting model is
      also loaded from here.
    X_train: Training features passed to `model.fit`.
    y_train: Training labels passed to `model.fit`.
    X_test: Held-out features, used as validation data.
    y_test: Held-out labels, used as validation data.
    use_pretrained_model: If true, load the model to train from `filepath`.
    model: The model to train. Leave as False (or None) when
      `use_pretrained_model` is true.
    epochs: Maximum number of epochs; early stopping may end training sooner.

  Returns:
    The History object returned by `model.fit`.

  Raises:
    ValueError: If neither, or both, of `model` and `use_pretrained_model`
      are provided.
  """
  # Identity checks rather than truthiness, so a model object that happens to
  # define __len__/__bool__ is never mistaken for "no model".
  has_model = model is not None and model is not False
  if has_model == bool(use_pretrained_model):
    # Fail fast instead of printing and then crashing later on `model.fit`.
    raise ValueError('Provide one of either model or use_pretrained_model.')
  if use_pretrained_model:
    model = load_model(filepath)

  # Use the `filepath` argument (not a notebook-level global) so the checkpoint
  # always lands where the caller asked for it.
  callbacks = [
      EarlyStopping(monitor='val_accuracy', patience=25),
      ModelCheckpoint(f'{filepath}', save_best_only=True, save_weights_only=False, monitor='val_accuracy')
  ]

  history = model.fit(
      X_train,
      y_train,
      epochs=epochs,
      batch_size=2048,
      validation_data=(X_test, y_test),
      verbose=1,
      callbacks=callbacks
  )

  return history

## V1

In [None]:
# V1: one 64-unit LSTM followed by dropout 0.2.
# The embedding layer starts from the pre-trained matrix and stays trainable.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(64),
    Dropout(0.2),
    Dense(128, activation='relu'),
    # Output layer: one softmax unit per vocabulary entry.
    Dense(num_words, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train V1 for up to 500 epochs; the best checkpoint goes to models/<content_type>-custom-1.h5.
model_filepath = os.path.join(
    ROOT_FOLDER,
    'models',
    f'{content_type}-custom-1.h5',
)
train_model(
    model_filepath,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_pretrained_model=False,
    model=model,
    epochs=500,
)

## V2

In [18]:
# V2: one 256-unit LSTM followed by dropout 0.2.
# The embedding layer starts from the pre-trained matrix and stays trainable.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(256),
    Dropout(0.2),
    Dense(128, activation='relu'),
    # Output layer: one softmax unit per vocabulary entry.
    Dense(num_words, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Train V2 for up to 500 epochs; the best checkpoint goes to models/<content_type>-custom-2.h5.
model_filepath = os.path.join(
    ROOT_FOLDER,
    'models',
    f'{content_type}-custom-2.h5',
)
train_model(
    model_filepath,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_pretrained_model=False,
    model=model,
    epochs=500,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f66ffdb8240>

## V3

In [None]:
# V3: one 256-unit LSTM followed by dropout 0.5.
# The embedding layer starts from the pre-trained matrix and stays trainable.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(256),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Output layer: one softmax unit per vocabulary entry.
    Dense(num_words, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train V3 for up to 500 epochs; the best checkpoint goes to models/<content_type>-custom-3.h5.
model_filepath = os.path.join(
    ROOT_FOLDER,
    'models',
    f'{content_type}-custom-3.h5',
)
train_model(
    model_filepath,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_pretrained_model=False,
    model=model,
    epochs=500,
)

## V4

In [None]:
# V4: two stacked 64-unit LSTMs; dropout 0.2 between them and 0.5 after the second.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Output layer: one softmax unit per vocabulary entry.
    Dense(num_words, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# Train V4 for up to 500 epochs; the best checkpoint goes to models/<content_type>-custom-4.h5.
model_filepath = os.path.join(
    ROOT_FOLDER,
    'models',
    f'{content_type}-custom-4.h5',
)
train_model(
    model_filepath,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_pretrained_model=False,
    model=model,
    epochs=500,
)

## V5

In [22]:
# V5: two stacked 256-unit LSTMs; dropout 0.2 between them and 0.5 after the second.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Output layer: one softmax unit per vocabulary entry.
    Dense(num_words, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [23]:
# Train V5 for up to 500 epochs; the best checkpoint goes to models/<content_type>-custom-5.h5.
model_filepath = os.path.join(
    ROOT_FOLDER,
    'models',
    f'{content_type}-custom-5.h5',
)
train_model(
    model_filepath,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_pretrained_model=False,
    model=model,
    epochs=500,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f671da050f0>

## V6

In [21]:
# V6: two stacked 256-unit LSTMs with dropout 0.5 after each.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    Dropout(0.5),
    LSTM(256),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Output layer: one softmax unit per vocabulary entry.
    Dense(num_words, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train V6 for up to 500 epochs; the best checkpoint goes to models/<content_type>-custom-6.h5.
model_filepath = os.path.join(ROOT_FOLDER, 'models', f'{content_type}-custom-6.h5')
# Keep the returned History object: the plotting cell that follows reads
# `history`, which was previously never assigned anywhere in the notebook.
history = train_model(model_filepath, X_train, y_train, X_test, y_test, use_pretrained_model=False, model=model, epochs=500)

In [None]:
# Plot the training curves. `history` must hold the History object returned by
# train_model, so the training call's return value has to be assigned to
# `history` before this cell is run.
train_utils.plot_history(history)

# Comparison

### 100 Epochs

<table>
<tr>
    <th>LSTM Layers</th>
    <th>LSTM Cells per Layer</th>
    <th>Dropout %</th>
    <th>Validation Loss</th>
    <th>Validation Accuracy</th>
</tr>
<tr>
    <td>1</td>
    <td>64</td>
    <td>0.2</td>
    <td>8.9918</td>
    <td>0.2271</td>
</tr>
<tr>
    <td>1</td>
    <td>256</td>
    <td>0.2</td>
    <td>10.7950</td>
    <td>0.2354</td>
</tr>
<tr>
    <td>1</td>
    <td>256</td>
    <td>0.5</td>
    <td>8.9682</td>
    <td>0.2153</td>
</tr>
<tr>
    <td>2</td>
    <td>64</td>
    <td>0.2, 0.5</td>
    <td>6.9549</td>
    <td>0.1490</td>
</tr>
<tr>
    <td>2</td>
    <td>256</td>
    <td>0.2, 0.5</td>
    <td>7.4581</td>
    <td>0.1683</td>
</tr>
<tr>
    <td>2</td>
    <td>256</td>
    <td>0.5, 0.5</td>
    <td>7.3286</td>
    <td>0.1650</td>
</tr>
</table>

### 500 Epochs

<table>
<tr>
    <th>LSTM Layers</th>
    <th>LSTM Cells per Layer</th>
    <th>Dropout %</th>
    <th>Validation Loss</th>
    <th>Validation Accuracy</th>
</tr>
<tr>
    <td>1</td>
    <td>64</td>
    <td>0.2</td>
    <td>16.4351</td>
    <td>0.3229</td>
</tr>
<tr>
    <td>1</td>
    <td>256</td>
    <td>0.2</td>
    <td>22.5922</td>
    <td>0.3373</td>
</tr>
<tr>
    <td>1</td>
    <td>256</td>
    <td>0.5</td>
    <td>18.8511</td>
    <td>0.3329</td>
</tr>
<tr>
    <td>2</td>
    <td>64</td>
    <td>0.2, 0.5</td>
    <td>8.8127</td>
    <td>0.2230</td>
</tr>
<tr>
    <td>2</td>
    <td>256</td>
    <td>0.2, 0.5</td>
    <td>10.8240</td>
    <td>0.2854</td>
</tr>
<tr>
    <td>2</td>
    <td>256</td>
    <td>0.5, 0.5</td>
    <td>10.4388</td>
    <td>0.2666</td>
</tr>
</table>

# Generate Text Data

### Load Objects To Infer

In [10]:
from tensorflow.keras.models import load_model
# Reload the checkpoint saved by the V2 training run (custom-2) for inference.
model_filepath = os.path.join(ROOT_FOLDER, 'models', f'{content_type}-custom-2.h5')
model = load_model(model_filepath)
# Seed length passed to the generator; same value as the sequence_len=10 used
# when windowing the training data.
TRAINING_LENGTH = 10

## Existing Sentences

In [23]:
# Generate 20 new words after a TRAINING_LENGTH-word seed, with diversity 1.
# NOTE(review): judging by the recorded outputs, `gen_list` holds the model's
# continuation(s) and `a` the actual continuation from the corpus — confirm
# against predict_utils.generate_output.
original_sequence, gen_list, a = predict_utils.generate_output(
    model,
    sequences,
    idx_word,
    seed_length=TRAINING_LENGTH,
    new_words=20,
    diversity=1,
    n_gen=1
)


divide by zero encountered in log



In [24]:
# Seed text as a single string.
' '.join(original_sequence)

'executive faces more than 1 000 similar claims for damages'

In [25]:
# First generated continuation as a single string.
' '.join(gen_list[0])

'< --- > from their final crisis with senior police officers and more with half their people across custody and then in many'

In [14]:
# Third value returned by generate_output, as a single string.
' '.join(a)

'< --- > law firm tods murray where he is a partner mr mcletchie said he has taken advice from holyrood officials about'

In [19]:
# Same generation as before, but with diversity lowered to 0.9.
# NOTE(review): judging by the recorded outputs, `gen_list` holds the model's
# continuation(s) and `a` the actual continuation from the corpus — confirm
# against predict_utils.generate_output.
original_sequence, gen_list, a = predict_utils.generate_output(
    model,
    sequences,
    idx_word,
    seed_length=TRAINING_LENGTH,
    new_words=20,
    diversity=0.9,
    n_gen=1
)


divide by zero encountered in log



In [20]:
# Seed text as a single string.
' '.join(original_sequence)

'the new year in the meantime we will be studying'

In [21]:
# First generated continuation as a single string.
' '.join(gen_list[0])

"< --- > the announcement on his spending plans on the same after a meeting on labour's media media lord woolf for labour's"

In [22]:
# Third value returned by generate_output, as a single string.
' '.join(a)

'< --- > the judgment carefully to see whether it is possible to modify our legislation to address the concerns raised by the'

## Custom Sentences

In [None]:
# Continue a hand-written sentence with 20 generated words.
# NOTE(review): the recorded run printed a token list whose first entry is None
# and then raised ValueError. Presumably the first word ('Stocks') is missing
# from word_idx (capitalisation?) — confirm in predict_utils.generate_custom_sentence
# and normalise the sentence or handle unknown words before relying on this cell.
sentence = 'Stocks of major large technology firms are becoming even more fragile even though'
predict_utils.generate_custom_sentence(sentence, word_idx, idx_word, model, new_words=20)

[None, 3, 546, 490, 45, 126, 13, 518, 150, 24, 9544, 150, 456]


ValueError: ignored

In [None]:
# Continue a hand-written sentence with 50 generated words.
# NOTE(review): the recorded run printed a token list whose first entry is None
# and then raised ValueError. Presumably the first token ('However,') is missing
# from word_idx (capitalisation and/or the trailing comma?) — confirm in
# predict_utils.generate_custom_sentence before relying on this cell.
sentence = 'However, there have been many instances of'
predict_utils.generate_custom_sentence(sentence, word_idx, idx_word, model, new_words=50)

[None, 56, 18, 46, 67, 7424, 3]


ValueError: ignored