## Imports

In [1]:
import numpy as np
import pandas as pd
import os
from preprocessing import Preprocessor
from dataset_balancing import Balance

In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM, Input
from tensorflow.keras.metrics import Precision

In [3]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, log_loss, roc_curve, fbeta_score

## Data preprocessing

In [4]:
# Read the raw training and testing splits from their JSON files, then report
# the (rows, columns) shape of each resulting frame.
df_train, df_test = (
    pd.read_json(json_path)
    for json_path in ('datas/training_set.json', 'datas/testing_set.json')
)

print(
    f"Train shape : {df_train.shape}",
    f"Test shape : {df_test.shape}",
    sep="\n",
)

Train shape : (6035, 2)
Test shape : (1065, 2)


In [5]:
# Balance the train set
# NOTE(review): Balance comes from the project-local dataset_balancing module, whose
# code is not visible in this notebook. The training data grows from 6035 rows at load
# time to 15995 samples once converted to arrays, so it presumably oversamples the
# under-represented intents — confirm against the module.
train_balance = Balance(df_train)
# The string argument looks like an output/cache location for the balanced data — TODO confirm.
train_balance.process_balance('./preprocessed_data/train_balanced')

In [6]:
# Preprocess the data
# The balanced training frame and the raw test frame each go through their own
# Preprocessor instance; the results are read back later from `.df_dataset`.
# Judging by the array shapes printed further down, each sentence appears to become a
# padded sequence of 300-dimensional vectors and each intent an 8-element one-hot vector.
# NOTE(review): the two runs yield different sequence lengths (93 steps for train,
# 92 for test), which suggests the padding length is derived per dataset. Confirm
# whether Preprocessor can be given a fixed maximum length shared by both splits.
train_preprocessor = Preprocessor(train_balance.df_dataset)
train_preprocessor.preprocess('./preprocessed_data/train_nlp_preprocessed')
test_preprocessor = Preprocessor(df_test)
test_preprocessor.preprocess('./preprocessed_data/test_nlp_preprocessed')

In [7]:
# Preview the first rows of the preprocessed training frame: 'sentence' holds nested
# float vectors and 'intent' holds the one-hot encoded label.
train_preprocessor.df_dataset.head()

Unnamed: 0,sentence,intent
0,"[[-0.0033358, -8.6066, -7.4396, -3.3738, 12.51...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
1,"[[-3.398, 3.652, -11.154, -0.76505, 0.59124, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
2,"[[-0.78445, -3.7161, 1.5587, 3.4742, -2.844, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
3,"[[-0.0033358, -8.6066, -7.4396, -3.3738, 12.51...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,"[[-1.0319, 4.7613, -7.8448, -1.0902, 1.3204, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"


In [8]:
# Turn the per-row Python lists into dense NumPy arrays: model inputs come from
# the 'sentence' column and one-hot targets from the 'intent' column.
x_train, y_train = (
    np.array(train_preprocessor.df_dataset[column].tolist())
    for column in ('sentence', 'intent')
)
x_test, y_test = (
    np.array(test_preprocessor.df_dataset[column].tolist())
    for column in ('sentence', 'intent')
)

# Report (samples, timesteps, features) for inputs and (samples, classes) for targets.
print("x_train.shape = ", x_train.shape)
print("y_train.shape = ", y_train.shape)
print("x_test.shape = ", x_test.shape)
print("y_test.shape = ", y_test.shape)

x_train.shape =  (15995, 93, 300)
y_train.shape =  (15995, 8)
x_test.shape =  (1065, 92, 300)
y_test.shape =  (1065, 8)


## Model creation and training

In [9]:
# Seed TensorFlow's global random generator before any layer is created so that
# re-running the notebook starts from the same random state (some GPU kernels may
# still introduce small run-to-run differences).
tf.random.set_seed(42)

# Intent classifier: a bidirectional LSTM encodes the sequence of word vectors into
# a single 128-d vector, followed by two regularised dense layers and an 8-way softmax.
model = Sequential()
# The time axis is declared as None rather than copied from x_train: the train and
# test arrays are padded to different lengths (93 vs 92 steps), and an LSTM only
# needs the per-step feature size to be fixed. Hard-coding 93 here makes
# evaluate()/predict() on the 92-step test array an input-shape mismatch.
model.add(Input(shape=(None, x_train.shape[-1])))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# One output unit per intent class, matching the 8-element one-hot targets.
model.add(Dense(8, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 128)               186880    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 520       
Total params: 199,816
Trainable params: 199,816
Non-trainable params: 0
__________________________________________________

In [10]:
# Training setup: Adam optimiser, categorical cross-entropy against the one-hot
# targets, and Keras' Precision metric reported next to the loss.
model.compile(
    loss='categorical_crossentropy',
    metrics=[Precision()],
    optimizer='adam',
)

# Fit on the whole balanced training set; no validation data is supplied, so the
# returned history only contains training-set loss and precision.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    batch_size=128,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
# Persist the trained model under ./models/model_v1 (TensorFlow logs the assets
# directory it writes there).
model.save('./models/model_v1')



INFO:tensorflow:Assets written to: ./models/model_v1\assets


INFO:tensorflow:Assets written to: ./models/model_v1\assets


In [12]:
# Evaluate on the held-out test arrays. The result is indexed by later cells as
# score[0] (loss) and score[1] (the Precision metric passed to compile()).
score = model.evaluate(x_test, y_test)







In [13]:
# `score` follows the order configured in compile(): the loss first, then the
# single Precision metric.
print("Test loss =", score[0])
print("Test precision =", score[1])

Test loss = 1.2395265102386475
Test precision = 0.8431001901626587


In [14]:
# Predict class probabilities for every test sample: one row per sample and one
# column per intent class (the final layer is an 8-unit softmax).
y_pred = model.predict(x_test)
print(y_pred.shape)
# Show the first predicted distribution next to its one-hot target for a quick sanity check.
print(y_pred[0])
print(y_test[0])





(1065, 8)
[1.7130101e-06 1.8774951e-04 7.8208183e-07 9.8333874e-07 9.9948847e-01
 3.1800562e-04 6.1540169e-07 1.6887410e-06]
[0. 1. 0. 0. 0. 0. 0. 0.]


In [15]:
# Collapse the one-hot targets and the softmax outputs into integer class ids.
# A single vectorised argmax along the class axis replaces the per-row
# map/lambda; .tolist() keeps both results as plain Python lists of ints so the
# len() calls, slicing and printing in the following cells behave as before.
y_test_true = np.argmax(y_test, axis=1).tolist()
y_test_pred = np.argmax(y_pred, axis=1).tolist()
print(len(y_test_true))
print(len(y_test_pred))

1065
1065


In [16]:
# Eyeball the first ten true class ids against the corresponding predictions.
print(y_test_true[:10])
print(y_test_pred[:10])

[1, 1, 1, 4, 3, 1, 0, 1, 1, 1]
[4, 1, 1, 4, 3, 1, 0, 1, 1, 1]


In [17]:
# Support-weighted F-beta over all 8 intent classes, computed from the integer
# class ids derived from the one-hot targets and the model predictions.
fbeta = fbeta_score(
    y_true=y_test_true,
    y_pred=y_test_pred,
    beta=0.5,
    average='weighted',
    labels=list(range(8)),
)
print(fbeta)

0.8560412208044338
