In [2]:
import csv

import numpy as np

try:
    from gensim import models
except ModuleNotFoundError as e:
    !pip install gensim==3.8.0
    from gensim import models
try:
    import pandas as pd
except ModuleNotFoundError as e:
    !pip install pandas
    import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# This line prevents TF crashing when using convolutional networks
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [4]:
def prepare_data(model_path, data_prefix, seq_len, embedding_size):
    """
    
    Args:
        model_path (str): path to embedding model
        data_prefix (str): path to data prefix
    
    """
    
    def load_data(path):
        output = []
        
        with open(path, 'r', encoding='latin-1') as data:
        #with open(path, 'r', encoding='utf-8') as data:
            for row in csv.reader(data):
                output.append(row)
                
        return output
    
    # Opens embedding model
    model_ = models.Word2Vec.load(model_path)
    
    # Open dataset
    data_train = load_data(data_prefix + "X_train.csv")
    data_val = load_data(data_prefix + "X_val.csv")
    label_train = np.loadtxt(data_prefix + "y_train.csv")
    label_val = np.loadtxt(data_prefix + "y_val.csv")
    
    # Gets embeddings from model
    dt = []
    lt = []
    omissions_ = 0
    
    for i, seq in enumerate(data_train):
        
        try:        
            embedding = model_.wv[seq]
            dt.append(embedding)
            lt.append(label_train[i])
        
        except KeyError as ke:
            for word in seq:
                if word not in model_.wv.vocab.keys():
                    seq.remove(word)
            
        except ValueError as ve:
            omissions_ += 1
            
#     print(omissions_)
    
    # Gets embeddings from model
    dv = []
    lv = []
    omissions_ = 0
    
    for i, seq in enumerate(data_val):
        
        try:
            embedding = model_.wv[seq]
            dv.append(embedding)
            lv.append(label_val[i])
        
        except KeyError as ke:
            for word in seq:
                if word not in model_.wv.vocab.keys():
                    seq.remove(word)
            
        except ValueError as ve:
            omissions_ += 1
    
#     print(omissions_)
    
    # Pads sequences
    dt = pad_sequences(dt, padding='post', dtype='float64', maxlen=seq_len)
    dv = pad_sequences(dv, padding='post', dtype='float64', maxlen=seq_len)
    
    # Converts lists to numpy arrays
#     dt = np.asarray(dt).reshape((len(dt), seq_len * embedding_size))
#     dv = np.asarray(dv).reshape((len(dv), seq_len * embedding_size))
    
    lt = np.asarray(lt)
    lv = np.asarray(lv)
    
    return dt, dv, lt, lv

In [5]:
simpson_model_5_path = "./resources/embeddings/Simpsons_5_7.model"
simpson_model_25_path = "./resources/embeddings/Simpsons_25_7.model"
simpson_model_125_path = "./resources/embeddings/Simpsons_125_7.model"
simpson_prefix = "./data/simpsons/"

In [6]:
def create_checkpoint_callback(filepath):
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=filepath,
        save_weights_only=False,
        monitor='accuracy',
        mode='max',
        save_best_only=True)
    return model_checkpoint_callback

## Baseline

In [7]:
def create_base_model(embedding_size, seq_len, classes):
    model = keras.Sequential([
        layers.Dense((embedding_size * seq_len)/2, activation="relu", input_shape = (X_train.shape)),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    model.summary()
    return model

In [10]:
SEQ_LEN_ = [8, 11, 16]
epochs = 20
embedding_size = 5
metrics_df = pd.DataFrame()
for seq_len in SEQ_LEN_:
    X_train, X_val, y_train, y_val = prepare_data(simpson_model_5_path, simpson_prefix, seq_len, embedding_size)
    X_train = np.asarray(X_train).reshape((len(X_train), seq_len * embedding_size))
    X_val = np.asarray(X_val).reshape((len(X_val), seq_len * embedding_size))
    callback = create_checkpoint_callback('./resources/checkpoints/baseline_' + 
                                          str(embedding_size) + '_' +str(seq_len))
    model = create_base_model(embedding_size, seq_len, 4)
    model.fit(X_train, y_train, epochs = epochs, verbose=0, callbacks=[callback])
    model_metrics = model.evaluate(x=X_val, y=y_val)
    data = {'model_name': 'baseline', 'embedding_size': [embedding_size], 'seq_len': [seq_len],
           'accuracy': [model_metrics[1]], 'precision': [model_metrics[2]], 'recall': [model_metrics[3]]}
    metrics_df = pd.concat([metrics_df, pd.DataFrame(data=data)])
    print('----------------------Model trained--------------------------------')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 45297, 20)         820       
_________________________________________________________________
dense_3 (Dense)              (None, 45297, 4)          84        
Total params: 904
Trainable params: 904
Non-trainable params: 0
_________________________________________________________________
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_5_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_5_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_5_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_5_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_5_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_5_8/assets
----------------------Model trained---------

----------------------Model trained--------------------------------


In [17]:
embedding_size = 25
for seq_len in SEQ_LEN_:
    X_train, X_val, y_train, y_val = prepare_data(simpson_model_25_path, simpson_prefix, seq_len, embedding_size)
    X_train = np.asarray(X_train).reshape((len(X_train), seq_len * embedding_size))
    X_val = np.asarray(X_val).reshape((len(X_val), seq_len * embedding_size))
    callback = create_checkpoint_callback('./resources/checkpoints/baseline_' + 
                                          str(embedding_size) + '_' +str(seq_len))
    model = create_base_model(embedding_size, seq_len, 4)
    model.fit(X_train, y_train, epochs = epochs, verbose=0, callbacks=[callback])
    model_metrics = model.evaluate(x=X_val, y=y_val)
    data = {'model_name': 'baseline', 'embedding_size': [embedding_size], 'seq_len': [seq_len],
           'accuracy': [model_metrics[1]], 'precision': [model_metrics[2]], 'recall': [model_metrics[3]]}
    metrics_df = pd.concat([metrics_df, pd.DataFrame(data=data)])
    print('----------------------Model trained--------------------------------')

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 45297, 100)        20100     
_________________________________________________________________
dense_13 (Dense)             (None, 45297, 4)          404       
Total params: 20,504
Trainable params: 20,504
Non-trainable params: 0
_________________________________________________________________
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_8/assets
INFO:tensorflow:Assets written t

INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_16/assets
INFO:tensorflow:Assets writt

In [18]:
embedding_size = 125
for seq_len in SEQ_LEN_:
    X_train, X_val, y_train, y_val = prepare_data(simpson_model_125_path, simpson_prefix, seq_len, embedding_size)
    X_train = np.asarray(X_train).reshape((len(X_train), seq_len * embedding_size))
    X_val = np.asarray(X_val).reshape((len(X_val), seq_len * embedding_size))
    model_name = 'baseline_' + str(embedding_size) + '_' + str(seq_len)
    callback = create_checkpoint_callback('./resources/checkpoints/baseline_' + 
                                          str(embedding_size) + '_' +str(seq_len))
    model = create_base_model(embedding_size, seq_len, 4)
    model.fit(X_train, y_train, epochs = epochs, verbose=0, callbacks=[callback])
    model_metrics = model.evaluate(x=X_val, y=y_val)
    data = {'model_name': 'baseline', 'embedding_size': [embedding_size], 'seq_len': [seq_len],
           'accuracy': [model_metrics[1]], 'precision': [model_metrics[2]], 'recall': [model_metrics[3]]}
    metrics_df = pd.concat([metrics_df, pd.DataFrame(data=data)])
    print('----------------------Model trained--------------------------------')

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 45297, 500)        500500    
_________________________________________________________________
dense_19 (Dense)             (None, 45297, 4)          2004      
Total params: 502,504
Trainable params: 502,504
Non-trainable params: 0
_________________________________________________________________
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_8/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_8/assets
INFO:tensorflow:Assets w

INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_125_16/assets
INFO:tensorflow:

In [33]:
metrics_df.groupby('seq_len').mean()

Unnamed: 0_level_0,embedding_size,accuracy,precision,recall
seq_len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8,51.666667,0.404364,0.451705,0.216547
11,51.666667,0.402861,0.436883,0.241944
16,51.666667,0.400748,0.426946,0.265513


## Fully conected

## Simple RNN

In [36]:
def create_RNN_model(embedding_size, seq_len, classes):
    model = keras.Sequential([
        layers.SimpleRNN(units = 20, input_shape=(seq_len, embedding_size)),
        layers.Flatten(),
        layers.Dense((embedding_size * seq_len)/2, activation="relu"),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    model.summary()
    return model

In [38]:
epochs = 100
embedding_size = 25
seq_len = 11
X_train, X_val, y_train, y_val = prepare_data(simpson_model_25_path, simpson_prefix, seq_len, embedding_size)
callback = create_checkpoint_callback('./resources/checkpoints/baseline_' + 
                                      str(embedding_size) + '_' +str(seq_len))
model = create_RNN_model(embedding_size, seq_len, 4)
model.fit(X_train, y_train, epochs = epochs, verbose=1, callbacks=[callback])
model_metrics = model.evaluate(x=X_val, y=y_val)
data = {'model_name': 'simpleRNN', 'embedding_size': [embedding_size], 'seq_len': [seq_len],
       'accuracy': [model_metrics[1]], 'precision': [model_metrics[2]], 'recall': [model_metrics[3]]}
metrics_df = pd.concat([metrics_df, pd.DataFrame(data=data)])
print('----------------------Model trained--------------------------------')

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_4 (SimpleRNN)     (None, 20)                920       
_________________________________________________________________
flatten_4 (Flatten)          (None, 20)                0         
_________________________________________________________________
dense_32 (Dense)             (None, 137)               2877      
_________________________________________________________________
dense_33 (Dense)             (None, 4)                 552       
Total params: 4,349
Trainable params: 4,349
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 2/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 3/100
INFO:tensorflow:Assets written to: ./resources/chec

Epoch 36/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 37/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 38/100
Epoch 39/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 40/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 41/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 42/100
Epoch 43/100
Epoch 44/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 45/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 52/100
Epoch 53/100
Epoch 54/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 55/100
Epoch 56/10

Epoch 84/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 91/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
INFO:tensorflow:Assets written to: ./resources/checkpoints/baseline_25_11/assets
Epoch 98/100
Epoch 99/100
Epoch 100/100
----------------------Model trained--------------------------------


## LSTM

In [22]:
def create_LSTM_model(embedding_size, padding):
    model = keras.Sequential([
        layers.LSTM(units = 20, return_sequences = True, input_shape=(padding, embedding_size)),
        layers.LSTM(units = 20, return_sequences = True),
        layers.LSTM(units = 20, return_sequences = True),
        layers.Flatten(),
        layers.Dense(125, activation="relu"),
        layers.Dense(4, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )model.fit(simpson_data_train, simpson_label_train, epochs=100, verbose=1)
    model.summary()
    return model

In [15]:
model.fit(simpson_data_train, simpson_label_train, epochs=100, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3611f039b0>

In [16]:
simpson_data_train.shape

(45297, 1000)

In [30]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 500)               500500    
_________________________________________________________________
dense_19 (Dense)             (None, 4)                 2004      
Total params: 502,504
Trainable params: 502,504
Non-trainable params: 0
_________________________________________________________________


In [48]:
simpson_data_train[0]

array([ 2.04025239e-01, -8.18579018e-01,  8.46859634e-01, -6.90131664e-01,
       -1.79423022e+00, -1.27281499e+00, -1.73045766e+00,  1.83969393e-01,
       -1.69298339e+00, -2.11381465e-01, -6.47310257e-01, -1.32479417e+00,
       -1.69165060e-01,  2.88933337e-01,  8.84831846e-01, -6.91309452e-01,
        1.97555518e+00,  3.74160439e-01,  6.75695390e-02, -1.33795285e+00,
       -4.36639637e-01, -5.68941474e-01,  3.85461152e-01,  1.07632530e+00,
        4.19354588e-01, -8.91717672e-02,  1.15629165e-02, -1.20293081e+00,
       -8.77407789e-01,  4.17461038e-01, -4.70195040e-02,  1.23368061e+00,
       -8.37320089e-01, -4.98897098e-02, -1.11370429e-01,  9.31479968e-03,
       -1.11085927e+00, -1.68201238e-01, -8.46841276e-01, -7.21324980e-01,
        4.90954936e-01, -4.27353591e-01,  6.75653517e-01, -5.59205174e-01,
       -3.87647212e-01,  1.61865270e+00,  1.35027838e+00, -1.17099094e+00,
        7.92436838e-01,  1.27537251e+00, -3.05841178e-01,  1.34385586e-01,
       -3.87544423e-01,  