# Data section

## Imports

In [1]:
import pandas as pd
import requests as rq
import os
import time
from sklearn.metrics import classification_report

## Data Loading

In [2]:
# Load training and test sets

df_train = pd.read_json('datas/training_set.json')
df_test = pd.read_json('datas/testing_set.json')

print(f"Train shape : {df_train.shape}")
print(f"Test shape : {df_test.shape}")

Train shape : (6035, 2)
Test shape : (1065, 2)


## Informations and visualisations of datasets

In [3]:
# Stats on the training set
df_train.describe()

Unnamed: 0,intent,sentence
count,6035,6035
unique,8,6035
top,irrelevant,Peux tu me dire combien coûterait un bac en bo...
freq,3852,1


### Comments
We see here there is indeed a total of 8 dfferent intents.
Moreover, the *irrelevant* intent in highly represented in the dataset (3852/6035).

This can be good since *irrelevant* is the intent for every sentence that doesn't fit one of the 7 others. It is less specific than the others so it may need more examples to be well-recognized. Nevertheless, it can involve weak detections for the other intents by the model, because of a too small amount of examples.

In [4]:
# Informations about colums
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6035 entries, 0 to 6034
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   intent    6035 non-null   object
 1   sentence  6035 non-null   object
dtypes: object(2)
memory usage: 94.4+ KB


### Comments
The dataset is made of 2 columns:
- `intent` is our target
- `sentence` is the input, it's what the user will give to the model

Both columns are categorical, we have no numerical values here.
In addition, there aren't any missing values (*non-null*).

In [5]:
# Show 10 first elements
df_train.head(n=10)

Unnamed: 0,intent,sentence
0,irrelevant,"850€ maximum pour le loyer, à partir de janvie..."
1,irrelevant,D'imprimer
2,purchase,Le meilleur cabriolet hybrid moins de 5m10 min...
3,find-hotel,en ce moment je cher un location pour les vaca...
4,irrelevant,c'est possible de t'utiliser la nuit ?
5,irrelevant,J'ai besoin d'acheter un fusil
6,irrelevant,Vous pouvez réserver pour 09h oui
7,irrelevant,Du 20 au 22 novembre pour 100-150 euros la nuit
8,purchase,Mon docteur m'a suggéré de porter des bandes p...
9,purchase,Commande à effectuer : 30 bloc note petits car...


### Comments
This gives us some examples of rows in the dataset. We notice there are both short and long sentences, well-written or not, which is good to train the model on various writting styles (to work well on the different users' styles in production).

In [6]:
# Show the 8 different intents
df_train["intent"].value_counts()

irrelevant           3852
purchase              613
find-restaurant       469
find-around-me        383
find-hotel            316
find-train            143
find-flight           142
provide-showtimes     117
Name: intent, dtype: int64

### Comments
We get here a more detailed count of rows for each intent. As we said before, there are mostly *irrelevant* rows (but maybe for a good reason). 

Ohterwize, it's also kind of unbalanced between the 7 'specific' intents. It would have been nice to have an explonation for that. Maybe it's due to the use of the app made by the clients, asking more often for purchase matters than for showtimes ones.

# Model section

In [7]:
# Launch application to have access to the model
# This may take some time
os.system('docker run -p 8080:8080 3eec8ccf7aec &')

0

## Split datasets in inputs and labels

In [8]:
df_x_train = df_train['sentence']
df_y_train = df_train['intent']
df_x_test = df_test['sentence']
df_y_test = df_test['intent']

print(f"Train data shape : {df_x_train.shape}")
print(f"Train labels shape : {df_y_train.shape}")
print(f"Test data shape : {df_x_test.shape}")
print(f"Test labels shape : {df_y_test.shape}")

Train data shape : (6035,)
Train labels shape : (6035,)
Test data shape : (1065,)
Test labels shape : (1065,)


## Get model predictions for both datasets

In [9]:
route = 'http://localhost:8080/api/intent?'

# Function to get the model's predictions for a given dataset
def predict(datas):
    
    # List of predicted intents
    predicted_labels = []
    # List of probabilities for the predicted intents
    prediction_probabilities = []
    
    # Request the model for each data
    for data in datas:
        
        try:
            res = rq.get(route, {'sentence':data}).json()
        except:
            print("Request Error: Service not available")
            return [], []
        
        predicted_class = max(res, key=res.get)
        predicted_value = max(res.values())
 
        predicted_labels.append(predicted_class)
        prediction_probabilities.append(predicted_value)
        
    return predicted_labels, prediction_probabilities   

In [10]:
# Get both datasets predictions from the model
train_predicted_labels, train_predicted_probabilities = predict(df_x_train)
test_predicted_labels, test_predicted_probabilities = predict(df_x_test)

assert len(train_predicted_labels) == df_x_train.shape[0]
assert len(train_predicted_labels) == df_x_train.shape[0]
assert len(test_predicted_labels) == df_x_test.shape[0]
assert len(test_predicted_labels) == df_x_test.shape[0]

Service not available
Service not available


AssertionError: 

## Compute model's various scores

In [None]:
# Training scores
print(classification_report(df_y_train, train_predicted_labels))

In [None]:
# Test scores
print(classification_report(df_y_test, test_predicted_labels))

# Application section