# Result analysis

In [18]:
import csv

import numpy as np

try:
    from gensim import models
except ModuleNotFoundError as e:
    !pip install gensim==3.8.0
    from gensim import models
try:
    import pandas as pd
except ModuleNotFoundError as e:
    !pip install pandas
    import pandas as pd
    
try:
    import matplotlib.pyplot as plt
except ModuleNotFoundError as e:
    !pip install matplitlib
    import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
def prepare_data(model_path, data_prefix, seq_len, embedding_size):
    """Function to read specified data and organize it in the desired way
    
    Args:
        model_path (str): path to embedding model
        data_prefix (str): path to data prefix
        seq_len (int): length of each training observation
        embedding_size (int): size of the embedding
    
    """
    
    def load_data(path):
        output = []
        
        with open(path, 'r', encoding='latin-1') as data:
        #with open(path, 'r', encoding='utf-8') as data:
            for row in csv.reader(data):
                output.append(row)
                
        return output
    
    # Opens embedding model
    model_ = models.Word2Vec.load(model_path)
    
    # Open dataset
    data_train = load_data(data_prefix + "X_train.csv")
    data_val = load_data(data_prefix + "X_val.csv")
    data_test = load_data(data_prefix + "X_test.csv")
    label_train = np.loadtxt(data_prefix + "y_train.csv")
    label_val = np.loadtxt(data_prefix + "y_val.csv")
    label_test = np.loadtxt(data_prefix + "y_test.csv")
    
    # Gets embeddings from model
    dt = []
    lt = []
    omissions_ = 0
    
    for i, seq in enumerate(data_train):
        
        try:        
            embedding = model_.wv[seq]
            dt.append(embedding)
            lt.append(label_train[i])
        
        except KeyError as ke:
            for word in seq:
                if word not in model_.wv.vocab.keys():
                    seq.remove(word)
            
        except ValueError as ve:
            omissions_ += 1
            
#     print(omissions_)
    
    # Gets embeddings from model
    dv = []
    lv = []
    omissions_ = 0
    
    for i, seq in enumerate(data_val):
        
        try:
            embedding = model_.wv[seq]
            dv.append(embedding)
            lv.append(label_val[i])
        
        except KeyError as ke:
            for word in seq:
                if word not in model_.wv.vocab.keys():
                    seq.remove(word)
            
        except ValueError as ve:
            omissions_ += 1
            
    # Gets embeddings from model
    dtest = []
    ltest = []
    omissions_ = 0
    
    for i, seq in enumerate(data_test):
        X_train
        try:
            embedding = model_.wv[seq]
            dtest.append(embedding)
            ltest.append(label_test[i])
        
        except KeyError as ke:
            for word in seq:
                if word not in model_.wv.vocab.keys():
                    seq.remove(word)
            
        except ValueError as ve:pad_sequences(dt, padding='post', dtype='float64', maxlen=seq_len)
            omissions_ += 1
    
#     print(omissions_)
    
    # Pads sequences
    dt = pad_sequences(dt, padding='post', dtype='float64', maxlen=seq_len)
    dv = pad_sequences(dv, padding='post', dtype='float64', maxlen=seq_len)
    dtest = pad_sequences(dt, padding='post', dtype='float64', maxlen=seq_len)
    
    # Converts lists to numpy arrays
#     dt = np.asarray(dt).reshape((len(dt), seq_len * embedding_size))
#     dv = np.asarray(dv).reshape((len(dv), seq_len * embedding_size))
    
    lt = np.asarray(lt)
    lv = np.asarray(lv)
    ltest = np.asarray(ltest)
    
    return dt, dv, lt, lv, dtest, ltest

In [20]:
dt, dv, lt, lv, dtest, ltest = prepare_data('./resources/embeddings/Simpsons_150_7.model', './data/simpsons/', 15, 150)
simpson_dict = {'X_train': dt, 'X_val': dv, 'X_test': dtest,
               'y_train': lt, 'y_val': lv, 'y_test':ltest}

In [23]:
model = tf.keras.models.load_model('./resources/checkpoints/LSTM_150_15/')

In [24]:
model.predict(dtest)

array([[0.31510678, 0.5870791 , 0.05746957, 0.04034454],
       [0.13761112, 0.36451074, 0.11768039, 0.3801978 ],
       [0.36142012, 0.5062217 , 0.07948212, 0.05287609],
       ...,
       [0.18346974, 0.44215328, 0.15196383, 0.22241315],
       [0.16833088, 0.37829557, 0.16084674, 0.2925268 ],
       [0.03106422, 0.9211353 , 0.01047721, 0.03732322]], dtype=float32)