In [None]:
# Optional installations (uncomment if the packages are missing)
#!pip install transformers
#!pip install -U sentence-transformers

In [None]:
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import multilabel_confusion_matrix

import numpy as np
import pandas as pd
import time

### 0. Loading the data and transforming it into vectors/tensors

In [None]:
# Directory prefix of the dataset (empty string = current working directory)
PATH = ''
df = pd.read_csv(PATH + 'intent-detection-train.csv')

# Raw sentences and their raw labels, in file order
X = df['text'].tolist()
Y = df['label'].tolist()

# Human-readable labels: underscores replaced by spaces
Y_splitted = [raw_label.replace('_', ' ') for raw_label in Y]

# Distinct readable labels, in order of first appearance
labels = list(dict.fromkeys(Y_splitted))

### 1. Try zero-shot classification

In [None]:
# Zero-shot classification pipeline backed by the mDeBERTa-v3 MNLI/XNLI checkpoint
classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
)



In [None]:
# classifier = pipeline("zero-shot-classification",
#                        model="vicgalle/xlm-roberta-large-xnli-anli")

In [None]:
# Second zero-shot pipeline: DistilCamemBERT NLI checkpoint, with its own tokenizer
classifier_distilC = pipeline(
    task="zero-shot-classification",
    tokenizer="cmarkea/distilcamembert-base-nli",
    model="cmarkea/distilcamembert-base-nli",
)

In [None]:
# Intent labels written in French.
# NOTE(review): no cell visible in this notebook reads `labels_french`, and its
# order is not guaranteed to match `labels` — confirm before using it as a
# drop-in replacement for the English candidate labels.
labels_french = [
    'alerte de voyage',
    'statut de vol',
    'perte de bagage',  # spelling fixed: French "bagage" takes a single "g"
    'recommandation de voyage',
    'informations sur les bagages à main',
    'réserver un hôtel',
    'réserver un vol',
    'autre',
    'traduction',
]

In [None]:
# Classify every sentence with the zero-shot pipeline and print the top
# prediction next to the ground-truth label.
results = []
# zip() has no length, so give tqdm the total explicitly; otherwise the
# progress bar cannot show a percentage or an ETA.
for x,y in tqdm(zip(X,Y), total=len(X)):
    results.append(classifier(x, labels))
    # The first entry of 'labels' in the pipeline output is taken as the prediction.
    # Note: predictions use spaces ('travel alert') while `y` keeps underscores.
    print(results[-1]['labels'][0], ' , real = ', y)

1it [00:05,  5.72s/it]

translate  , real =  translate


2it [00:09,  4.82s/it]

translate  , real =  translate


3it [00:12,  3.79s/it]

travel suggestion  , real =  translate


4it [00:14,  3.26s/it]

travel suggestion  , real =  translate


5it [00:18,  3.39s/it]

translate  , real =  translate


6it [00:21,  3.13s/it]

travel suggestion  , real =  translate


7it [00:23,  2.84s/it]

carry on  , real =  translate


8it [00:25,  2.75s/it]

travel alert  , real =  travel_alert


9it [00:28,  2.62s/it]

travel alert  , real =  travel_alert


10it [00:31,  2.65s/it]

travel alert  , real =  travel_alert


11it [00:34,  2.83s/it]

travel suggestion  , real =  travel_alert


12it [00:36,  2.74s/it]

travel alert  , real =  travel_alert


13it [00:39,  2.65s/it]

travel alert  , real =  flight_status


14it [00:41,  2.66s/it]

travel suggestion  , real =  flight_status


15it [00:44,  2.56s/it]

flight status  , real =  flight_status


16it [00:47,  2.86s/it]

flight status  , real =  flight_status


17it [00:51,  3.20s/it]

flight status  , real =  flight_status


18it [00:54,  3.19s/it]

carry on  , real =  flight_status


19it [00:58,  3.25s/it]

lost luggage  , real =  lost_luggage


20it [01:03,  3.74s/it]

lost luggage  , real =  lost_luggage


21it [01:05,  3.44s/it]

lost luggage  , real =  lost_luggage


21it [01:07,  3.21s/it]


KeyboardInterrupt: ignored

In [None]:
# Classify every sentence with the DistilCamemBERT zero-shot pipeline, using a
# French hypothesis template, and print the top prediction next to the truth.
# NOTE(review): the French template is filled with the English `labels`;
# `labels_french` is not used here — confirm this is intended.
results = []
# zip() has no length, so give tqdm the total explicitly to get a real progress bar.
for x,y in tqdm(zip(X,Y), total=len(X)):
    results.append(
        classifier_distilC(
            sequences=x,
            candidate_labels=labels,
            # Typo fixed ("utilissteur" -> "utilisateur"): this text is fed to the model.
            hypothesis_template="L'utilisateur veut en savoir plus sur {}.",
        )
    )
    # The first entry of 'labels' in the pipeline output is taken as the prediction.
    print(results[-1]['labels'][0], ' , real = ', y)

1it [00:00,  1.15it/s]

travel suggestion  , real =  translate


2it [00:01,  1.19it/s]

travel suggestion  , real =  translate


3it [00:02,  1.20it/s]

travel suggestion  , real =  translate


4it [00:03,  1.27it/s]

travel alert  , real =  translate


5it [00:04,  1.22it/s]

travel suggestion  , real =  translate


6it [00:04,  1.24it/s]

travel suggestion  , real =  translate


7it [00:05,  1.27it/s]

travel suggestion  , real =  translate


8it [00:06,  1.19it/s]

travel suggestion  , real =  travel_alert


9it [00:07,  1.23it/s]

travel alert  , real =  travel_alert


10it [00:07,  1.33it/s]

travel suggestion  , real =  travel_alert


11it [00:08,  1.39it/s]

travel alert  , real =  travel_alert


12it [00:09,  1.37it/s]

travel alert  , real =  travel_alert


13it [00:10,  1.35it/s]

travel alert  , real =  flight_status


14it [00:10,  1.34it/s]

travel alert  , real =  flight_status


15it [00:11,  1.34it/s]

flight status  , real =  flight_status


16it [00:12,  1.34it/s]

travel alert  , real =  flight_status


17it [00:13,  1.30it/s]

flight status  , real =  flight_status


18it [00:13,  1.39it/s]

travel suggestion  , real =  flight_status


19it [00:14,  1.45it/s]

travel alert  , real =  lost_luggage


20it [00:15,  1.52it/s]

travel alert  , real =  lost_luggage


21it [00:15,  1.60it/s]

travel suggestion  , real =  lost_luggage


22it [00:16,  1.58it/s]

travel alert  , real =  lost_luggage


23it [00:16,  1.64it/s]

travel suggestion  , real =  lost_luggage


24it [00:17,  1.65it/s]

travel suggestion  , real =  lost_luggage


25it [00:17,  1.64it/s]

travel suggestion  , real =  lost_luggage


26it [00:18,  1.70it/s]

travel alert  , real =  travel_suggestion


27it [00:19,  1.71it/s]

travel suggestion  , real =  travel_suggestion


27it [00:19,  1.38it/s]


KeyboardInterrupt: ignored

As the outputs above show, zero-shot classification performs poorly on this dataset: both models often predict the wrong intent. Let's try another approach.

### 2. Sentence similarity (our best method so far)



In [None]:
# Load the pretrained LaBSE sentence-transformer used to embed the sentences
model = SentenceTransformer('sentence-transformers/LaBSE')

In [None]:
# Start a wall-clock timer to check how fast the similarity method is
start_s = time.time()

# Embed every training sentence with the sentence-transformer model
X_embedded = model.encode(X)

In [None]:
# Compute the results: cosine similarity between every pair of training sentences.
# A single vectorised call returns the whole matrix (row i = similarities of
# sentence i to every sentence) instead of one call per pair in Python loops.
results = util.pytorch_cos_sim(X_embedded, X_embedded).numpy()

# A sentence must not be matched with itself, so mask the diagonal (i == j).
np.fill_diagonal(results, -np.inf)

In [None]:
# For each input, position of the most similar sentence in the training dataset
indexes = [np.argmax(r) for r in results]

# Fetch the matching rows of the dataframe. argmax yields *positions*, so use
# iloc: loc would look them up as index labels and only agrees with positions
# while `df` keeps its default 0..n-1 index.
res = df.iloc[indexes]

In [None]:
# Accuracy in percent: share of sentences whose nearest neighbour has the same label
predicted_labels = res['label'].to_numpy()
true_labels = df['label'].to_numpy()
accuracy = (predicted_labels == true_labels).sum()/len(df)*100

# Stop the timer started before the embedding step and report both figures
end_s = time.time()
elapsed_s = end_s-start_s
print('Accuracy : ', round(accuracy, 2), ' %')
print('Computed ', len(X), ' results in ', round(elapsed_s, 2), 's ('+str(round(elapsed_s/len(X), 4))+'s/sentence)')

Accuracy :  89.33  %
computed  75  results in  12.85 s (0.1714s/sentence)


In [None]:
# Confusion matrix restricted to the 'lost_luggage' label.
# scikit-learn lays each per-label matrix out as [[TN, FP], [FN, TP]],
# with 'lost_luggage' as the positive class.
conf_matrix = multilabel_confusion_matrix(y_true = df['label'].to_numpy(), y_pred = res['label'].to_numpy(), labels = ['lost_luggage'])[0]
print('Confusion matrix for lost luggage label :')
print(conf_matrix)

# Sentences predicted as 'lost_luggage' that were not are false positives,
# i.e. cell [0, 1] (cell [1, 0] holds the false negatives). Multiply by 100
# so the printed value really is a percentage of all sentences.
false_positive_pct = conf_matrix[0,1]/conf_matrix.sum()*100
print('Percentage of Sentences classified as "lost luggage" intent which weren\'t \n (false positives): ', round(false_positive_pct, 2), '%')

Confusion matrix for lost luggage label :
[[68  0]
 [ 0  7]]
Percentage of Sentences classified as "lost luggage" intent which weren't 
 (false positives since here lost luggage is labelled as 0):  0.0 %


Limitations of this method:
- "Out of scope" sentences happen to be well classified on this training dataset, but the method could perform poorly on unseen test examples.

### Annex: testing our function on a "test set"-like dataset

In [None]:
def test(test_csv=None):
    """Evaluate the nearest-neighbour intent classifier on a labelled CSV file.

    Every sentence of the evaluation file is embedded with the global ``model``
    and receives the label of its most similar training sentence (cosine
    similarity against the global ``X_embedded``; labels come from the global
    ``df``). Accuracy, the lost-luggage false-positive rate and the run time
    are printed.

    Parameters
    ----------
    test_csv : str, optional
        Path to a CSV file with ``text`` and ``label`` columns. Defaults to
        ``PATH + 'intent-detection-train.csv'``, the file this function
        originally read. With that default the evaluation data is the training
        data and no self-match is excluded, so each sentence can match itself
        and the accuracy is expected to be close to 100 %.

    Returns
    -------
    None
    """
    s_start = time.time()

    if test_csv is None:
        test_csv = PATH+'intent-detection-train.csv'
    df_test = pd.read_csv(test_csv)
    X_test = df_test['text'].tolist()

    # Embed the input to the vectorized representation
    X_embedded_test = model.encode(X_test)

    # Cosine similarity of every test sentence (rows) with every training
    # sentence (columns), computed in one vectorised call
    similarities = util.pytorch_cos_sim(X_embedded_test, X_embedded).numpy()

    # Position of the most similar training sentence for each input
    indexes = similarities.argmax(axis=1)

    # argmax yields positions, so select rows with iloc (loc would treat
    # them as index labels)
    predicted_similar_sentences = df.iloc[indexes]

    y_true = df_test['label'].to_numpy()
    y_pred = predicted_similar_sentences['label'].to_numpy()

    # Accuracy of the model on the evaluation set, in percent
    accuracy = (y_pred==y_true).sum()/len(df_test)*100

    # Per-label matrix laid out as [[TN, FP], [FN, TP]] for 'lost_luggage'
    conf_matrix = multilabel_confusion_matrix(
        y_true = y_true,
        y_pred = y_pred,
        labels = ['lost_luggage'],
        )[0]

    # Sentences predicted as 'lost_luggage' that were not: false positives,
    # cell [0, 1], expressed as a percentage of all sentences
    false_positive_pct = conf_matrix[0,1]/conf_matrix.sum()*100

    s_end = time.time()
    elapsed_s = s_end-s_start

    # Print the results of our model
    # Accuracy
    print('Accuracy for this test set is :', round(accuracy, 2), '%')
    # False positives on 'lost_luggage' are the special case we want to avoid
    print('Percentage of sentences classified as "lost luggage" which weren\'t :', round(false_positive_pct, 2) , '%')
    # Speed of the model; per-sentence time uses the evaluation set size
    print('computed ', len(X_test), ' results in ', round(elapsed_s, 2), 's ('+str(round(elapsed_s/len(X_test), 4))+'s/sentence)')



In [None]:
# Run the evaluation and print its report
test()

Accuracy for this test set is : 100.0 %
Percent of sentences classified as "lost luggage" which weren't : 0.0 %
computed  75  results in  8.14 s (0.1085s/sentence)
