## Imports

In [1]:
import numpy as np
import pandas as pd
import os
import sys
sys.path.append('..')
from preprocessing.preprocessing import Preprocessor
from preprocessing.dataset_balancing import Balance

In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM, Input
from tensorflow.keras.metrics import Precision

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, fbeta_score

## Data preprocessing

In [4]:
# Read the training and test splits from their JSON files, then report the
# (rows, columns) shape of each frame.
df_train, df_test = (
    pd.read_json(json_path)
    for json_path in ('../datas/training_set.json', '../datas/testing_set.json')
)

print(f"Train shape : {df_train.shape}", f"Test shape : {df_test.shape}", sep="\n")

Train shape : (6035, 2)
Test shape : (1065, 2)


In [5]:
# Balance the train set.
# Only the training frame is handed to Balance; df_test is not balanced.
# The processed training data used later has 15996 rows versus the 6035 rows
# loaded above, so rows are added somewhere between here and preprocessing
# (presumably oversampling in Balance — TODO confirm).
# NOTE(review): the path argument looks like an output location for the
# balanced set — confirm in preprocessing/dataset_balancing.py.
train_balance = Balance(df_train)
train_balance.process_balance('../preprocessed_data/train_balanced')

In [6]:
# Preprocess the data.
# The training Preprocessor receives the balanced frame
# (train_balance.df_dataset); the test Preprocessor receives the raw test frame.
# Later cells read the 'sentence' and 'intent' columns of each df_dataset.
# NOTE(review): the path argument looks like an output location for the
# preprocessed data — confirm in preprocessing/preprocessing.py.
train_preprocessor = Preprocessor(train_balance.df_dataset)
train_preprocessor.preprocess('../preprocessed_data/train_nlp_preprocessed')
test_preprocessor = Preprocessor(df_test)
test_preprocessor.preprocess('../preprocessed_data/test_nlp_preprocessed')

  data = np.array(data)


In [7]:
# Preview the first five preprocessed training rows.
train_preprocessor.df_dataset.head(n=5)

Unnamed: 0,sentence,intent
0,"[1.9918094, -2.0211177, -3.1467276, -0.7597667...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
1,"[0.37102285, -0.25975516, -2.5644374, 1.239701...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
2,"[0.5238122, -1.4262346, -0.8189889, 0.9439444,...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
3,"[2.6711895, -3.8386414, -3.3684235, 0.06278875...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,"[0.16812666, -1.4507266, -3.699288, -0.9399957...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"


In [8]:
def _column_as_array(frame, column):
    """Collect every cell of `frame[column]` into a single NumPy array."""
    return np.array(list(frame[column]))


# Features come from the 'sentence' column, targets from the 'intent' column.
train_frame = train_preprocessor.df_dataset
test_frame = test_preprocessor.df_dataset

x_train = _column_as_array(train_frame, 'sentence')
y_train = _column_as_array(train_frame, 'intent')
x_test = _column_as_array(test_frame, 'sentence')
y_test = _column_as_array(test_frame, 'intent')

print("x_train.shape = ", x_train.shape)
print("y_train.shape = ", y_train.shape)
print("x_test.shape = ", x_test.shape)
print("y_test.shape = ", y_test.shape)

x_train.shape =  (15996, 300)
y_train.shape =  (15996, 8)
x_test.shape =  (1065, 300)
y_test.shape =  (1065, 8)


In [9]:
def _with_timestep_axis(features):
    """Return `features` shaped (samples, 1, n_features).

    A 2-D (samples, n_features) array gets a length-1 middle axis inserted.
    An array that is already 3-D is returned unchanged, so re-running this
    cell is a no-op instead of failing: the unguarded reshape would read
    shape[1] == 1 on the second run and raise ValueError.
    """
    if features.ndim == 3:
        return features
    return np.reshape(features, (features.shape[0], 1, features.shape[1]))


# The recurrent layer of the model consumes 3-D input; each sample becomes a
# sequence containing a single timestep.
x_train = _with_timestep_axis(x_train)
x_test = _with_timestep_axis(x_test)

print("x_train.shape = ", x_train.shape)
print("x_test.shape = ", x_test.shape)

x_train.shape =  (15996, 1, 300)
x_test.shape =  (1065, 1, 300)


## Model creation and training

In [10]:
# Fix the NumPy and TensorFlow global seeds before any weights are created so
# that weight initialisation, dropout masks and batch shuffling are repeatable
# across "Restart & Run All". (Some GPU kernels may still be non-deterministic.)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

model = Sequential()

# Declare the per-sample input shape (timesteps, features) with an explicit
# Input instead of passing `input_shape=` to the Bidirectional wrapper.
model.add(Input(shape=x_train.shape[1:]))
model.add(Bidirectional(LSTM(64)))
# Two dense blocks, each followed by dropout.
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per one-hot position of the targets (y_train has 8 columns).
model.add(Dense(8, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 128)               186880    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 520       
Total params: 199,816
Trainable params: 199,816
Non-trainable params: 0
__________________________________________________

In [11]:
# Categorical cross-entropy matches the softmax output over one-hot targets;
# precision is the metric tracked during training and evaluation.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[Precision()],
)

# Train on the full training set; the returned History object keeps the
# per-epoch loss and metric values.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
# Persist the trained model under ../models so it can be reloaded later.
model.save(filepath='../models/model_v1')



INFO:tensorflow:Assets written to: ../models/model_v1\assets


INFO:tensorflow:Assets written to: ../models/model_v1\assets


In [13]:
# Evaluate on the test split; `score` holds the loss followed by the compiled
# metric (precision), in that order.
score = model.evaluate(x=x_test, y=y_test)



In [14]:
# `score` is [loss, precision], matching the loss/metrics passed to compile().
test_loss, test_precision = score
print("Test loss =", test_loss)
print("Test precision =", test_precision)

Test loss = 1.1689561605453491
Test precision = 0.8145315647125244


In [15]:
# Class-probability predictions for every test sample: one row per sample,
# one column per output unit of the softmax layer.
y_pred = model.predict(x=x_test)
print(y_pred.shape)

(1065, 8)


In [16]:
# Turn one-hot targets and softmax outputs into integer class indices.
# A single vectorised argmax along the class axis replaces the per-row Python
# map/lambda; .tolist() keeps the results as plain lists so the length checks,
# slicing and printing in the following cells behave as before.
y_test_true = np.argmax(y_test, axis=1).tolist()
y_test_pred = np.argmax(y_pred, axis=1).tolist()
print(len(y_test_true))
print(len(y_test_pred))

1065
1065


In [17]:
# Eyeball the first ten true labels (first line) against the first ten
# predicted labels (second line).
print(y_test_true[:10], y_test_pred[:10], sep="\n")

[1, 1, 1, 4, 3, 1, 0, 1, 1, 1]
[4, 1, 1, 4, 3, 1, 0, 1, 1, 1]


In [18]:
# Support-weighted F-beta over the 8 classes on the test set; beta=0.5 weights
# precision more heavily than recall.
fbeta = fbeta_score(
    y_true=y_test_true,
    y_pred=y_test_pred,
    beta=0.5,
    labels=list(range(8)),
    average='weighted',
)
print(fbeta)

0.8263126529465825


In [19]:
# Test-set scores: per-class precision / recall / F1, computed from the test
# labels (y_test_true) and the model's test predictions (y_test_pred).
print(classification_report(y_true=y_test_true, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.69      0.86      0.77        21
           1       0.93      0.82      0.87       677
           2       0.84      0.88      0.86        24
           3       0.75      0.85      0.79        93
           4       0.64      0.84      0.72       114
           5       0.62      0.61      0.62        67
           6       0.64      1.00      0.78        14
           7       0.56      0.78      0.65        55

    accuracy                           0.81      1065
   macro avg       0.71      0.83      0.76      1065
weighted avg       0.83      0.81      0.82      1065

