In [261]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, TensorBoard
from keras import layers

from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

# import texthero as hero
# tf.random.set_seed(42)

<img src='../emotion_diagramme.svg' alt='emotions' style='background-color: white' />

In [262]:
dataset_used = 'kaggle clean'

In [263]:
df = pd.read_csv('..\data\kaggle.csv')

In [264]:
# Nettoyage

import string
punctuation = string.punctuation
for p in punctuation:
    df['Text'] = df['Text'].str.replace(p,'')

df['Text'] = df['Text'].replace('(http://|https://|ftp://|ssh://)\S*','',regex=True)
df['Text'] = df['Text'].replace('@\S*','',regex=True)

stops_r = list(map(lambda x: x.replace("'",""), stopwords.words('english')))
stops = set(stopwords.words('english') + stops_r)
for word in stops:
    df['Text'] = df['Text'].str.replace(' ' + word + ' ', ' ')
df['Text'] = df['Text'].str.replace('^i ', '', regex=True).replace('^im ', '', regex=True).replace('^I ', '', regex=True).replace('^Im ', '', regex=True)

  df['Text'] = df['Text'].str.replace(p,'')


In [265]:
X = df['Text']
y = df['Emotion']

In [266]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state=42)

In [267]:
X_train.shape

(17167,)

In [268]:
vocab_size = 20000
oov_token = "<OOV>"
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

In [269]:
word_index = tokenizer.word_index

In [270]:
len(word_index)

17165

In [271]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [272]:
embeddings_index = {}
f = open('../data/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [273]:
embedding_matrix = np.zeros((len(word_index) + 1, 100))#max_length
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [274]:
X_train_avg = []
for i in range(len(X_train_sequences)):
    avg = np.zeros(100)
    for w in X_train_sequences[i]:
        avg += embedding_matrix[w]
    avg = avg/len(X_train_sequences[i])
    X_train_avg.append(avg)
X_train_avg = np.asarray(X_train_avg)

In [275]:
X_test_avg = []
for i in range(len(X_test_sequences)):
    avg = np.zeros(100)
    for w in X_test_sequences[i]:
        avg += embedding_matrix[w]
    avg = avg/len(X_test_sequences[i])
    X_test_avg.append(avg)
X_test_avg = np.asarray(X_test_avg)

In [281]:
layer_activation = 'relu'
output_activation = 'softmax'
# model_description = 'adding_glove_embedding_vectors ; Dense:128:{} ; Dense:64:{} ;Dense:6:{}'.format(layer_activation,output_activation)

input_dim = len(X_train_avg[0])
model = Sequential([
    Dense(128, input_dim=input_dim, activation=layer_activation),
    Dense(128, input_dim=input_dim, activation=layer_activation),
    Dense(6, activation=output_activation)
])

In [282]:
from keras import backend as K
def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [283]:
emotions = {'happy':0, 'sadness':1, 'anger':2, 'fear':3, 'love':4, 'surprise':5}
y_train = y_train.replace(emotions)
y_test = y_test.replace(emotions)

In [284]:
from keras import metrics
# import tensorflow_addons as tfa 
# f1 = tfa.metrics.F1Score(6)
loss = 'sparse_categorical_crossentropy'
optimizer = 'adam'
metric = metrics.SparseCategoricalAccuracy()
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=[metrics.SparseCategoricalAccuracy()])#metrics.AUC(multi_label=True, num_classes=6)
model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_48 (Dense)            (None, 256)               25856     
                                                                 
 dense_49 (Dense)            (None, 256)               65792     
                                                                 
 dense_50 (Dense)            (None, 128)               32896     
                                                                 
 dense_51 (Dense)            (None, 6)                 774       
                                                                 
Total params: 125,318
Trainable params: 125,318
Non-trainable params: 0
_________________________________________________________________


In [285]:
# %load_ext tensorboard

# log_folder = 'logs'
callbacks = [
            EarlyStopping(patience = 10)
            # TensorBoard(log_dir=log_folder)
            ]
num_epochs = 200
history = model.fit(X_train_avg, y_train, epochs=num_epochs, batch_size=8, validation_data=(X_test_avg, y_test),callbacks=callbacks)#,callbacks=callbacks)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200


In [176]:
from keras.models import load_model
model_load = load_model('../models/neural_lstm_kaggle_clean.h5')

In [177]:
test = np.zeros(100,dtype=np.int64)
test[0] = word_index['god']
test[1] = word_index['is']
test[2] = word_index['love']
# test = list(test)
test = np.array([test])

In [178]:
model_load.predict(test)

array([[0.2319767 , 0.2617607 , 0.31832987, 0.10729696, 0.01612998,
        0.06450569]], dtype=float32)

In [164]:
import pickle
with open('../models/neural_lstm_kaggle_clean.pkl', 'r') as file:
    pickle.load(file)

TypeError: a bytes-like object is required, not 'str'

In [161]:
test = embeddings_index['i']+embeddings_index['feel']+embeddings_index['good']
test = test.reshape(1,100)
print(model.predict(test))
print(emotions)

[[9.9998891e-01 1.1125703e-05 1.9218303e-09 7.9951297e-12 1.2341030e-08
  5.5463226e-15]]
{'happy': 0, 'sadness': 1, 'anger': 2, 'fear': 3, 'love': 4, 'surprise': 5}


In [148]:
import csv

field_names = ['dataset', 'model_name', 'model_description', 'encoding', 'loss', 'optimizer',
            'metric', 'layer_activation', 'output_activation', 'epoch', 'loss_train',
            'accuracy_train', 'loss_val', 'accuracy_val']
data = [{'dataset':dataset_used,
            'model_name':type(model).__name__,
            'model_description':model_description,
            'encoding':type(tokenizer).__name__,
            'loss':loss,
            'optimizer':optimizer,
            'metric':type(metric).__name__,
            'layer_activation':layer_activation,
            'output_activation':output_activation,
            'epoch':history.epoch[-1],
            'loss_train':history.history[loss][-1],
            'accuracy_train':history.history['sparse_categorical_accuracy'][-1],
            'loss_val':history.history['val_loss'][-1],
            'accuracy_val':history.history['val_sparse_categorical_accuracy'][-1],
           }]
with open('../results/results.csv', 'a') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames = field_names)
    # writer.writeheader()
    writer.writerows(data)

In [None]:
import mlflow
import mlflow.keras
from urllib.parse import urlparse

import warnings
warnings.filterwarnings('ignore')

# On crée une nouvelle expérimentation
# experiment_id = mlflow.create_experiment("NLP_projet")

with mlflow.start_run(experiment_id = 1):

    tags = {'dataset':dataset_used,
            'model_name':type(model).__name__,
            'model_description':model_description,
            'encoding':type(tokenizer).__name__,
            'loss':loss,
            'optimizer':optimizer,
            'metric':metric,
            'layer_activation':layer_activation,
            'output_activation':output_activation,
            'epoch':history.epoch[-1],
            'loss_train':history.history[loss][-1],
            'accuracy_train':history.history['sparse_categorical_accuracy'][-1],
            'loss_val':history.history['val_loss'][-1],
            'accuracy_val':history.history['val_sparse_categorical_accuracy'][-1],
           }
    # tags.update({'words_rooting':words_rooting})
    mlflow.set_tags(tags)

(4292, 100)