In [14]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

from keras.models import Sequential
from keras.layers import Embedding, Reshape, Activation, Input, Flatten, Dense, GRU
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical

In [23]:
# HELPER FUNCTIONS
# function to get unique values 
def unique(list1):
    """Return the distinct elements of `list1`, keeping first-seen order.

    Membership is tested with `==` against the elements collected so far, so
    unhashable elements (e.g. nested lists) are supported as well.

    Args:
        list1: any iterable of values.

    Returns:
        A new list holding each distinct value once, in order of first
        appearance. An empty iterable yields an empty list.
    """
    unique_list = []

    for x in list1:
        # Keep only the first occurrence of every value.
        if x not in unique_list:
            unique_list.append(x)

    # The original version built the list but never handed it back, so every
    # caller received None.
    return unique_list

In [24]:
# Load the raw data.
# The context manager guarantees the file handle is closed once the rows have
# been read (the original left it open).
with open("./data.csv", encoding="utf8") as data_file:
    lines = data_file.readlines()

# Format the data and drop the unneeded fields: every row is split on double
# quotes and only fields 3 and 5 are kept. Later cells compare column 0 with
# speaker names and feed column 1 to the tokenizer as text.
# A row with fewer than six quote-separated fields raises IndexError.
lines = np.array([[parts[3], parts[5]] for parts in (raw.split('"') for raw in lines)])
# print(lines)

In [25]:
# TOKENIZER SETUP
# The text to tokenize is the second column of every row.
data_text = [row[1] for row in lines]

num_words = 10000
# The vocabulary is fitted only on sentences containing at least two spaces.
corpus = list(filter(lambda sentence: sentence.count(' ') >= 2, data_text))
# The tokenizer converts words to integer ids.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(corpus)

# word -> id mapping learned by the tokenizer, and its id -> word inverse.
idx = tokenizer.word_index
inverse_map = {token_id: word for word, token_id in idx.items()}

def tokens_to_string(tokens):
    """Turn a sequence of token ids back into a space-separated string.

    Ids equal to 0 are skipped; every other id is looked up in the
    module-level `inverse_map` (a KeyError is raised for an unknown id).
    """
    return " ".join(inverse_map[token_id] for token_id in tokens if token_id != 0)

# Size handed to the Embedding layer: number of known words plus one
# (presumably to leave room for id 0, which tokens_to_string skips as a
# non-word -- confirm against the Keras Tokenizer docs).
vocab_size = 1 + len(idx)
print(vocab_size)

1328


In [26]:
# Mode passed as both `padding` and `truncating` to pad_sequences in getXPad:
# 'pre' adds zeros at the beginning and also truncates from the beginning.
pad = 'pre'

In [27]:
def getXPad(x_train, x_test):
    """Tokenize two collections of texts and pad them to one shared length.

    The shared length is the mean token count plus two standard deviations,
    truncated to an int, computed over the train AND test sequences together.
    Relies on the module-level `tokenizer` and on `pad`, which is used as both
    the padding and the truncating mode.

    Args:
        x_train: texts used for training.
        x_test: texts used for evaluation.

    Returns:
        Tuple `(x_train_pad, x_test_pad)` as produced by `pad_sequences`.

    Raises:
        ValueError: if both collections are empty, since no length can be
            derived (the original failed here too, with an obscure
            "cannot convert float NaN to integer" message).
    """
    x_train_tokens = tokenizer.texts_to_sequences(x_train)
    x_test_tokens = tokenizer.texts_to_sequences(x_test)

    # Debug aid kept from the original: show the second tokenized training
    # sample. Guarded so a training set with fewer than two texts no longer
    # raises IndexError.
    if len(x_train_tokens) > 1:
        print(x_train_tokens[1])

    # Token count of every sequence, train and test combined.
    num_tokens = np.array([len(tokens) for tokens in x_train_tokens + x_test_tokens])
    if num_tokens.size == 0:
        raise ValueError("getXPad needs at least one text in x_train or x_test")

    # Maximum number of tokens allowed: the average plus 2 standard deviations.
    max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))

    x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
    x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
    return x_train_pad, x_test_pad

In [28]:
embedding_size = 8

# Binary classifier over token-id sequences:
#   Embedding - turns positive integer ids into dense vectors of size `embedding_size`
#   GRU(16)   - reduces each embedded sequence to a 16-dimensional vector
#   Dense(1)  - sigmoid output
model = Sequential([
    Embedding(vocab_size, embedding_size),
    GRU(16),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

# Every distinct speaker found in column 0 of the dataset (informational only;
# not used by the training loop below).
actors = unique([row[0] for row in lines])
# Speakers the model is trained on, one after the other.
actors_filtered = ['VADER', 'ANAKIN']

# Rows before this index are used for training, the remaining rows for testing.
split_index = 337

# NOTE(review): the same `model` instance keeps training across iterations, yet
# the labels change with `actor` -- confirm this is intended.
for actor in actors_filtered:
    print("----------------------------------------------------")
    print("---------   Training with " + actor + " :       ----")
    print("----------------------------------------------------")

    # Label convention kept from the original `(1, 0)[cond]` indexing:
    # 0 = line spoken by `actor`, 1 = line spoken by anyone else.
    # Labels are NumPy arrays because Keras expects array-like targets; mixing
    # NumPy inputs with a plain Python list of labels is rejected by TF2 Keras.

    # First part of the data - TRAIN
    train_data = lines[:split_index]
    x_train = [row[1] for row in train_data]
    y_train = np.array([0 if row[0] == actor else 1 for row in train_data])

    # Remaining data - TEST
    test_data = lines[split_index:]
    x_test = [row[1] for row in test_data]
    y_test = np.array([0 if row[0] == actor else 1 for row in test_data])

    # Tokenize and pad; the pad length is derived from train and test together.
    x_train_pad, x_test_pad = getXPad(x_train, x_test)

    # Fit, then evaluate on the held-out rows.
    model.fit(x_train_pad, y_train, epochs=25, batch_size=15)
    accuracy = model.evaluate(x_test_pad, y_test)
    print("Accuracy =", accuracy[1]*100,"%")

# FINAL TEST - pad the text of every row (the same texts are passed as both
# the "train" and the "test" argument; only the second result is evaluated).
all_text = [row[1] for row in lines]
x_train_pad, x_test_pad = getXPad(all_text, all_text)

# Label convention kept from the original `(1, 0)[cond]` indexing:
# 0 = line spoken by VADER or ANAKIN, 1 = anyone else.
# A NumPy array is used because Keras expects array-like targets.
# NOTE(review): the training loop labels one actor at a time, whereas this
# combined label treats both as the same class -- confirm this is intended.
y_test = np.array([0 if row[0] in ('VADER', 'ANAKIN') else 1 for row in lines])

# FINAL TEST - EVALUATE
accuracy = model.evaluate(x_test_pad, y_test)
print("Accuracy =", accuracy[1]*100,"%")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 8)           10624     
_________________________________________________________________
gru_7 (GRU)                  (None, 16)                1200      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 17        
Total params: 11,841
Trainable params: 11,841
Non-trainable params: 0
_________________________________________________________________
----------------------------------------------------
---------   Training with VADER :       ----
----------------------------------------------------
[2, 420, 253, 65, 11, 15, 421, 81, 22, 17, 636, 6, 5, 162, 637, 254, 82, 1, 20, 255, 4, 422]
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25