# Deep models

In [1]:
import csv

import numpy as np
!pip install silence_tensorflow
from silence_tensorflow import silence_tensorflow
silence_tensorflow()

try:
    from gensim import models
except ModuleNotFoundError as e:
    !pip install gensim==3.8.0
    from gensim import models
try:
    import pandas as pd
except ModuleNotFoundError as e:
    !pip install pandas
    import pandas as pd
    
try:
    import matplotlib.pyplot as plt
except ModuleNotFoundError as e:
    !pip install matplitlib
    import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import os
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [2]:
# Create necessary folders
# os.makedirs also creates any missing parent (e.g. './results'), which a
# plain os.mkdir cannot do on a fresh checkout. An already existing folder
# is still reported with the same "File exists" message as before.
for results_dir in ('./results/friends',
                    './results/simpson',
                    './results/friends/deepModels/',
                    './results/simpson/deepModels/'):
    try:
        os.makedirs(results_dir)
    except FileExistsError:
        print("File exists")

File exists
File exists
File exists
File exists


In [3]:
# Turn on memory growth for every GPU TensorFlow can see; this setting
# prevents TF crashing when using convolutional networks
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

## Necessary functions

In [4]:
def _load_token_rows(path):
    """Read a CSV file and return its rows as lists of string tokens."""
    with open(path, 'r', encoding='latin-1', newline='') as handle:
        return list(csv.reader(handle))


def _embed_sequences(word_vectors, sequences, labels):
    """Embed every token sequence and keep it aligned with its label.

    Words missing from the embedding vocabulary are dropped from the
    sequence before the lookup. A sequence left with no known words is
    skipped together with its label.

    Args:
        word_vectors: keyed vectors supporting `word in word_vectors` and
            `word_vectors[list_of_words]`
        sequences (list): token lists, one per observation
        labels: indexable collection with one label per observation

    Returns:
        tuple: (list of embedded sequences, list of matching labels)
    """
    embedded = []
    kept_labels = []

    for i, seq in enumerate(sequences):
        # Filter into a new list instead of removing while iterating,
        # which would skip the element following each removed word.
        known_words = [word for word in seq if word in word_vectors]
        if not known_words:
            continue

        embedded.append(word_vectors[known_words])
        kept_labels.append(labels[i])

    return embedded, kept_labels


def prepare_data(model_path, data_prefix, seq_len, embedding_size):
    """Function to read specified data and organize it in the desired way

    Loads the Word2Vec model, reads the train/validation token files and
    labels found under `data_prefix`, replaces every token by its embedding
    and pads/truncates each observation to `seq_len` steps.

    Args:
        model_path (str): path to embedding model
        data_prefix (str): path to data prefix; the files "X_train.csv",
            "X_val.csv", "y_train.csv" and "y_val.csv" are appended to it
        seq_len (int): length of each training observation
        embedding_size (int): size of the embedding (currently unused, kept
            for interface compatibility)

    Returns:
        tuple: (train data, validation data, train labels, validation
        labels); the data entries are the padded arrays, the label entries
        are numpy arrays holding the labels of the kept observations
    """
    # Opens embedding model
    word_vectors = models.Word2Vec.load(model_path).wv

    # Open dataset
    data_train = _load_token_rows(data_prefix + "X_train.csv")
    data_val = _load_token_rows(data_prefix + "X_val.csv")
    label_train = np.loadtxt(data_prefix + "y_train.csv")
    label_val = np.loadtxt(data_prefix + "y_val.csv")

    # Gets embeddings from model
    dt, lt = _embed_sequences(word_vectors, data_train, label_train)
    dv, lv = _embed_sequences(word_vectors, data_val, label_val)

    # Pads sequences
    dt = pad_sequences(dt, padding='post', dtype='float64', maxlen=seq_len)
    dv = pad_sequences(dv, padding='post', dtype='float64', maxlen=seq_len)

    return dt, dv, np.asarray(lt), np.asarray(lv)

## Setting variables and creating functions

In [5]:
# Settings for the Simpsons dataset: integer keys give the embedding model
# path for that embedding size; the string keys hold the data prefix, the
# number of classes and the per-class weights.
simpson_dict = {
    **{size: "./resources/embeddings/Simpsons_{}_7.model".format(size)
       for size in (15, 75, 150)},
    'prefix': "./data/simpsons/",
    'classes': 4,
    'weights': {0: 2.5, 1: 1, 2: 3, 3: 2.5},
}

# Settings for the Friends dataset: integer keys give the embedding model
# path for that embedding size; the string keys hold the data prefix, the
# number of classes and the per-class weights (all classes weighted equally).
friends_dict = {
    **{size: "./resources/embeddings/Friends_{}_7.model".format(size)
       for size in (15, 75, 150)},
    'prefix': "./data/friends/",
    'classes': 6,
    'weights': dict.fromkeys(range(6), 1),
}

In [6]:
def create_checkpoint_callback(filepath):
    """
    Function to create instance of keras model checkpoint callback

    The callback is configured to save the whole model (not only its
    weights) and to keep only the best one according to the maximum
    'val_accuracy'.

    Args:
        filepath (str): path to save the model

    Returns:
        tf.keras.callbacks.ModelCheckpoint: the configured callback
    """
    checkpoint_options = dict(
        filepath=filepath,
        save_weights_only=False,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
    )
    return tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [7]:
def create_early_stop_callback():
    """
    Function to create instance of keras early stop callback

    The callback monitors 'val_loss' (mode 'min') with a patience of 5 and
    does not restore the best weights.

    Returns:
        tf.keras.callbacks.EarlyStopping: the configured callback
    """
    early_stop_options = dict(
        monitor='val_loss',
        min_delta=0,
        patience=5,
        verbose=0,
        mode='min',
        baseline=None,
        restore_best_weights=False,
    )
    return tf.keras.callbacks.EarlyStopping(**early_stop_options)

## Model creators

In [8]:
def create_base_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create baseline model

    One hidden fully connected layer followed by a softmax output layer,
    compiled with adam and categorical cross-entropy.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: base model
    """
    # Floor division keeps the layer width an int; true division would hand
    # a float to Dense.
    flat_size = embedding_size * seq_len

    model = keras.Sequential([
        layers.Dense(flat_size // 2, activation="relu", input_shape=input_shape),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    return model

In [9]:
def create_deepFC_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create deepFC model

    Four fully connected layers of decreasing width (1/2, 1/4, 1/8 and 1/16
    of embedding_size * seq_len), with dropout of 0.5 after each of the
    first three, followed by a softmax output layer.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: deep fully connected model
    """
    # Floor division keeps every layer width an int; true division would
    # hand floats to Dense.
    flat_size = embedding_size * seq_len

    model = keras.Sequential([
        layers.Dense(flat_size // 2, activation="relu", input_shape=input_shape),
        layers.Dropout(0.5),
        layers.Dense(flat_size // 4, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(flat_size // 8, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(flat_size // 16, activation="relu"),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    return model

In [10]:
def create_RNN_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create RNN model

    A SimpleRNN layer with `embedding_size` units, followed by a flatten
    step, one fully connected layer and a softmax output layer.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: RNN model
    """
    # Floor division keeps the layer width an int; true division would hand
    # a float to Dense.
    hidden_units = (embedding_size * seq_len) // 8

    model = keras.Sequential([
        layers.SimpleRNN(units=embedding_size, input_shape=input_shape),
        layers.Flatten(),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    return model

In [11]:
def create_LSTM_model(embedding_size, seq_len, classes, input_shape):
    """
    Function to create LSTM model

    An LSTM layer with `embedding_size` units, followed by a flatten step,
    one fully connected layer and a softmax output layer.

    Args:
        embedding_size (int): Size of the embedding to be used
        seq_len (int): length of each training observation
        classes (int): number of possible classes
        input_shape (tuple): Shape in which the input will be provided

    Returns:
        tensorflow.python.keras.engine.sequential.Sequential: LSTM model
    """
    # Floor division keeps the layer width an int; true division would hand
    # a float to Dense.
    hidden_units = (embedding_size * seq_len) // 8

    model = keras.Sequential([
        layers.LSTM(units=embedding_size, input_shape=input_shape),
        layers.Flatten(),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dense(classes, activation='softmax')
    ])
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        # Use the "accuracy" shortcut like the other model creators:
        # keras.metrics.Accuracy() checks predictions for exact equality
        # with the labels, which is not meaningful for softmax outputs.
        metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()]
    )
    return model

In [12]:
# Dictionary to store model creators
model_creators = {'baseline': create_base_model,
                  'DeepFC': create_deepFC_model,
                  'SimpleRNN': create_RNN_model,
                  'LSTM': create_LSTM_model}

## Training method

In [13]:
def _weighted_scores(split, y_true, y_pred, class_weights):
    """Compute the class-weighted accuracy, precision, recall and F1 columns of one split."""
    sample_weight = [class_weights[label] for label in y_true]
    averaged = dict(sample_weight=sample_weight, average='weighted', zero_division=0)
    return {
        split + '_accuracy': [accuracy_score(y_true, y_pred, sample_weight=sample_weight)],
        split + '_precision': [precision_score(y_true, y_pred, **averaged)],
        split + '_recall': [recall_score(y_true, y_pred, **averaged)],
        split + '_f1': [f1_score(y_true, y_pred, **averaged)],
    }


def train_val_model(dataset, em_size, seq_len, model_type, epochs):
    """
    Function to train the given model

    Args:
        dataset (str): Name of the dataset to use ('simpson' or 'friends')
        em_size (int): size of the embedding to load
        seq_len (int): length of each observation
        model_type (str): model to train (a key of `model_creators`)
        epochs (int): total epochs to train the model for

    Returns:
        tuple: a one-row pandas.DataFrame with the model description and its
        weighted train/validation accuracy, precision, recall and F1, and
        the object returned by `model.fit`.

    Raises:
        ValueError: if `dataset` is not one of the supported names
    """

    if dataset == 'simpson':
        data_path = simpson_dict
    elif dataset == 'friends':
        data_path = friends_dict
    else:
        # Raising a plain string is itself a TypeError in Python 3
        raise ValueError('Not valid dataset')

    X_train, X_val, y_train, y_val = prepare_data(data_path[em_size], data_path['prefix'],
                                                  seq_len, em_size)

    # The fully connected models consume each observation as one flat vector
    is_flat_model = model_type in ['baseline', 'DeepFC']
    if is_flat_model:
        X_train = np.asarray(X_train).reshape((len(X_train), seq_len * em_size))
        X_val = np.asarray(X_val).reshape((len(X_val), seq_len * em_size))

    checkpoint_callback = create_checkpoint_callback('./results/' + dataset + '/deepModels/checkpoints/' +
                                                     model_type + '_' + str(em_size) + '_' + str(seq_len))
    early_stop_callback = create_early_stop_callback()

    # input_shape describes a single observation, so the number of samples
    # must not be part of it
    input_shape = (seq_len * em_size,) if is_flat_model else (seq_len, em_size)

    model = model_creators[model_type](em_size, seq_len, data_path['classes'],
                                       input_shape=input_shape)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=epochs, verbose=0,
                        callbacks=[checkpoint_callback, early_stop_callback],
                        class_weight=data_path['weights'])

    # Metrics are computed with the model as it is after fit() returns
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_val = np.argmax(model.predict(X_val), axis=1)

    # Turn the per-class label rows into class indices
    y_train = np.argmax(y_train, axis=1)
    y_val = np.argmax(y_val, axis=1)

    data = {'model_name': [model_type], 'embedding_size': [em_size], 'seq_len': [seq_len]}
    data.update(_weighted_scores('train', y_train, y_pred_train, data_path['weights']))
    data.update(_weighted_scores('val', y_val, y_pred_val, data_path['weights']))

    return pd.DataFrame(data=data), history

## Model Training

In [14]:
# Hyper-parameter grid explored for the Simpsons dataset
SEQ_LEN_ = [15, 35, 50]
EM_SIZE_ = [15, 75, 150]
MODEL_TYPES_ = ['baseline','DeepFC', 'SimpleRNN', 'LSTM']

# Every (sequence length, embedding size, model type) combination, in the
# same order as three nested loops would visit them
simpson_grid = [(seq_len_option, em_size_option, model_option)
                for seq_len_option in SEQ_LEN_
                for em_size_option in EM_SIZE_
                for model_option in MODEL_TYPES_]

metrics_df = pd.DataFrame()
histories = []
for seq_len, em_size, model_type in simpson_grid:
    # Train each configuration for 20 epochs and accumulate its metrics row
    model_results, history = train_val_model('simpson', em_size, seq_len, model_type, 20)
    metrics_df = pd.concat([metrics_df, model_results])
    histories.append(history)

metrics_df.to_csv('./results/simpson/deepModels.csv')



In [None]:
# Hyper-parameter grid explored for the Friends dataset
SEQ_LEN_ = [15, 35, 50]
EM_SIZE_ = [15, 75, 150]
MODEL_TYPES_ = ['baseline','DeepFC', 'SimpleRNN', 'LSTM']

# Every (sequence length, embedding size, model type) combination, in the
# same order as three nested loops would visit them
friends_grid = [(seq_len_option, em_size_option, model_option)
                for seq_len_option in SEQ_LEN_
                for em_size_option in EM_SIZE_
                for model_option in MODEL_TYPES_]

metrics_df = pd.DataFrame()
for seq_len, em_size, model_type in friends_grid:
    # Train each configuration for 20 epochs and accumulate its metrics row
    model_results, history = train_val_model('friends', em_size, seq_len, model_type, 20)
    metrics_df = pd.concat([metrics_df, model_results])
    # NOTE: `histories` is not created in this cell; it must already exist
    histories.append(history)

metrics_df.to_csv('./results/friends/deepModels.csv')

