# Deep models

In [None]:
import csv

import numpy as np

try:
    from gensim import models
except ModuleNotFoundError as e:
    !pip install gensim==3.8.0
    from gensim import models
try:
    import pandas as pd
except ModuleNotFoundError as e:
    !pip install pandas
    import pandas as pd
    
try:
    import matplotlib.pyplot as plt
except ModuleNotFoundError as e:
    !pip install matplitlib
    import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Turn on memory growth for every GPU that TensorFlow lists.
# Per the original author, this prevents TF crashing when using
# convolutional networks.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

## Necessary functions

In [None]:
def _embed_sequences(word_vectors, sequences, labels):
    """Look up the embeddings of each sequence, dropping unknown words.

    Words missing from the embedding vocabulary are filtered out of a
    sequence before the lookup. A sequence (and its label) is skipped only
    when no in-vocabulary word is left, e.g. for an empty row.

    Args:
        word_vectors: keyed vectors of the embedding model (``model.wv``)
        sequences (list): sequences to embed, each one a list of words
        labels: labels, indexed in the same order as ``sequences``

    Returns:
        tuple: list with one embedding array per kept sequence, and the
            list of labels that belong to those sequences
    """
    embedded = []
    kept_labels = []

    for i, seq in enumerate(sequences):
        # Build a filtered copy rather than calling seq.remove() while
        # iterating over seq: removing in place skips the word that follows
        # every removed one and leaves unknown words behind.
        known_words = [word for word in seq if word in word_vectors]

        # Nothing left to embed; skip the observation and its label.
        if not known_words:
            continue

        embedded.append(word_vectors[known_words])
        kept_labels.append(labels[i])

    return embedded, kept_labels


def prepare_data(model_path, data_prefix, seq_len, embedding_size):
    """Function to read specified data and organize it in the desired way

    Reads ``X_train.csv``, ``X_val.csv``, ``y_train.csv`` and ``y_val.csv``
    from ``data_prefix``, replaces every word by its embedding and pads (or
    truncates) every observation to ``seq_len`` steps.

    Args:
        model_path (str): path to embedding model
        data_prefix (str): path to data prefix
        seq_len (int): length of each training observation
        embedding_size (int): size of the embedding. Not used by this
            function; kept so existing callers keep working.

    Returns:
        tuple: ``(dt, dv, lt, lv)`` - padded training embeddings, padded
            validation embeddings, training labels and validation labels.
            Labels of skipped observations are not included.
    """

    def load_data(path):
        # newline='' is what the csv module expects from the file object.
        with open(path, 'r', encoding='latin-1', newline='') as data:
            return list(csv.reader(data))

    # Opens embedding model
    word_vectors = models.Word2Vec.load(model_path).wv

    # Open dataset
    data_train = load_data(data_prefix + "X_train.csv")
    data_val = load_data(data_prefix + "X_val.csv")
    label_train = np.loadtxt(data_prefix + "y_train.csv")
    label_val = np.loadtxt(data_prefix + "y_val.csv")

    # Gets embeddings from model
    dt, lt = _embed_sequences(word_vectors, data_train, label_train)
    dv, lv = _embed_sequences(word_vectors, data_val, label_val)

    # Pads sequences
    dt = pad_sequences(dt, padding='post', dtype='float64', maxlen=seq_len)
    dv = pad_sequences(dv, padding='post', dtype='float64', maxlen=seq_len)

    # Converts label lists to numpy arrays
    lt = np.asarray(lt)
    lv = np.asarray(lv)

    return dt, dv, lt, lv

## Setting variables and creating functions

In [None]:
# Dictionaries to organize code: for each dataset, the integer keys map an
# embedding size to the path of its embedding model, 'prefix' is the data
# folder and 'classes' is the number of classes.
simpson_dict = {15: "./resources/embeddings/Simpsons_15_7.model",
                75: "./resources/embeddings/Simpsons_75_7.model",
                150: "./resources/embeddings/Simpsons_150_7.model",
                'prefix': "./data/simpsons/",
                'classes': 4}

friends_dict = {5: "./resources/embeddings/Friends_5_7.model",
                25: "./resources/embeddings/Friends_25_7.model",
                125: "./resources/embeddings/Friends_125_7.model",
                'prefix': "./data/friends/",
                'classes': 6}

In [None]:
def create_checkpoint_callback(filepath):
    """
    Function to create instance of keras model checkpoint callback

    The callback monitors 'val_loss' in 'min' mode, saves only the best
    model seen so far and stores the whole model, not just the weights.

    Args:
        filepath (str): path to save the model

    Returns:
        tf.keras.callbacks.ModelCheckpoint: the configured callback
    """
    return tf.keras.callbacks.ModelCheckpoint(
        filepath=filepath,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only=False,
    )

In [None]:
def create_early_stop_callback():
    """
    Function to create instance of keras early stop callback

    The callback monitors 'val_loss' in 'min' mode with a patience of 30
    epochs and does not restore the best weights when it stops training.

    Returns:
        tf.keras.callbacks.EarlyStopping: the configured callback
    """
    callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', min_delta=0, patience=30, verbose=0,
        mode='min', baseline=None, restore_best_weights=False)
    # Return the callback built above (the previous name was undefined and
    # raised NameError on every call).
    return callback

## Model creators

In [None]:
def create_base_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create baseline model

    One hidden fully connected layer followed by a softmax output layer.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: base model
    """
    # Dense expects an integer number of units; floor division keeps the
    # value that truncating the former float result produced.
    hidden_units = (embedding_size * seq_len) // 2

    model = keras.Sequential([
        layers.Dense(hidden_units, activation="relu", input_shape=input_shape),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    return model

In [None]:
def create_deepFC_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create deepFC model

    Four hidden fully connected layers of equal width, with dropout (0.2)
    after each of the first three, followed by a softmax output layer.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: deep fully connected model
    """
    # Dense expects an integer number of units; floor division keeps the
    # value that truncating the former float result produced.
    hidden_units = (embedding_size * seq_len) // 2

    model = keras.Sequential([
        layers.Dense(hidden_units, activation="relu", input_shape=input_shape),
        layers.Dropout(0.2),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    return model

In [None]:
def create_RNN_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create RNN model

    A SimpleRNN layer with 75 units, followed by a fully connected hidden
    layer and a softmax output layer.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: RNN model
    """
    # Dense expects an integer number of units; floor division keeps the
    # value that truncating the former float result produced.
    hidden_units = (embedding_size * seq_len) // 2

    model = keras.Sequential([
        layers.SimpleRNN(units=75, input_shape=input_shape),
        layers.Flatten(),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    return model

In [None]:
def create_LSTM_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create LSTM model

    An LSTM layer with 75 units, followed by a fully connected hidden layer
    and a softmax output layer. The model summary is printed before the
    model is returned.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: LSTM model
    """
    # Dense expects an integer number of units; floor division keeps the
    # value that truncating the former float result produced.
    hidden_units = (embedding_size * seq_len) // 2

    model = keras.Sequential([
        layers.LSTM(units=75, input_shape=input_shape),
        layers.Flatten(),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    model.summary()
    return model

In [None]:
# Lookup table: model type name -> function that builds and compiles it
model_creators = {
    'baseline': create_base_model,
    'DeepFC': create_deepFC_model,
    'SimpleRNN': create_RNN_model,
    'LSTM': create_LSTM_model,
}

## Training method

In [None]:
def train_val_model(dataset, em_size, seq_len, model_type, epochs):
    """
    Function to train the given model

    Loads the data of the chosen dataset, builds the requested model, fits
    it with checkpoint and early stop callbacks and evaluates it on the
    training and validation data.

    Args:
        dataset (str): Name of the dataset to use ('simpson' or 'friends')
        em_size (int): size of the embedding to load
        seq_len (int): length of each observation
        model_type (str): model to train (a key of ``model_creators``)
        epochs (int): total epochs to train the model for

    Returns:
        tuple: a one-row ``pandas.DataFrame`` with the model name, embedding
            size, sequence length and the train/validation accuracy,
            precision and recall, and the object returned by ``model.fit``.

    Raises:
        ValueError: if ``dataset`` is neither 'simpson' nor 'friends'
    """

    if dataset == 'simpson':
        data_path = simpson_dict
    elif dataset == 'friends':
        data_path = friends_dict
    else:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError('Not valid dataset')

    X_train, X_val, y_train, y_val = prepare_data(data_path[em_size], data_path['prefix'],
                                                  seq_len, em_size)

    if model_type in ['baseline', 'DeepFC']:
        # Fully connected models take one flat vector per observation.
        flat_size = seq_len * em_size
        X_train = np.asarray(X_train).reshape((len(X_train), flat_size))
        X_val = np.asarray(X_val).reshape((len(X_val), flat_size))
        # input_shape describes a single observation and must not include
        # the number of samples.
        input_shape = (flat_size,)
    else:
        input_shape = (seq_len, em_size)

    checkpoint_callback = create_checkpoint_callback(
        f'./resources/checkpoints/{model_type}_{em_size}_{seq_len}')

    early_stop_callback = create_early_stop_callback()

    model = model_creators[model_type](em_size, seq_len, data_path['classes'],
                                       input_shape=input_shape)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=epochs, verbose=0,
                        callbacks=[checkpoint_callback, early_stop_callback])

    # evaluate() returns the loss first, then the compiled metrics in order:
    # accuracy, precision, recall.
    model_metrics_train = model.evaluate(x=X_train, y=y_train)
    model_metrics_val = model.evaluate(x=X_val, y=y_val)
    data = {'model_name': [model_type], 'embedding_size': [em_size], 'seq_len': [seq_len],
            'train_accuracy': [model_metrics_train[1]],
            'train_precision': [model_metrics_train[2]],
            'train_recall': [model_metrics_train[3]],
            'val_accuracy': [model_metrics_val[1]],
            'val_precision': [model_metrics_val[2]],
            'val_recall': [model_metrics_val[3]]}

    return pd.DataFrame(data=data), history

## Model Training

In [None]:
# Grid for the Simpsons dataset: every combination of sequence length,
# embedding size and model type below is trained once.
SEQ_LEN_ = [8, 15, 25, 30, 50]
# These sizes are the integer keys of simpson_dict
EM_SIZE_ = [15,75,150]
MODEL_TYPES_ = ['baseline','DeepFC', 'SimpleRNN', 'LSTM']
# Grows by the rows returned for each trained configuration
metrics_df = pd.DataFrame()
# Collects the second value returned by train_val_model for each run
histories = []
for seq_len in SEQ_LEN_:
    for em_size in EM_SIZE_:
        for model_type in MODEL_TYPES_:
            # 1000 is passed as the epochs argument
            model_results, history = train_val_model('simpson', em_size, seq_len, model_type, 1000)
            metrics_df = pd.concat([metrics_df, model_results])
            histories.append(history)

In [None]:
# Grid for the Friends dataset: every combination of sequence length,
# embedding size and model type below is trained once.
SEQ_LEN_ = [8, 15, 25, 30, 50]
# These sizes are the integer keys of friends_dict
EM_SIZE_ = [5, 25, 125]
MODEL_TYPES_ = ['baseline','DeepFC', 'SimpleRNN', 'LSTM']
# Grows by the rows returned for each trained configuration
metrics_df1 = pd.DataFrame()
for seq_len in SEQ_LEN_:
    for em_size in EM_SIZE_:
        for model_type in MODEL_TYPES_:
            # 1000 is passed as the epochs argument
            model_results, history = train_val_model('friends', em_size, seq_len, model_type, 1000)
            metrics_df1 = pd.concat([metrics_df1, model_results])
            # histories is the list created in the Simpsons training cell
            histories.append(history)