## In this notebook we try classification with two techniques:

- Zero shot classification
- Sentence similarity

In [None]:
# Eventual installations
!pip install transformers
!pip install -U sentence-transformers

In [3]:
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import multilabel_confusion_matrix

import numpy as np
import pandas as pd
import time

### 0. Load the data and transform it into vectors/tensors

In [4]:
# Folder containing the CSV file (empty string = current working directory)
PATH = ''
df = pd.read_csv(PATH+'intent-detection-train.csv')

# Input sentences and their raw labels
X = df['text'].tolist()
Y = df['label'].tolist()

# Same labels with underscores turned into spaces, so they read as natural language
Y_splitted = [raw_label.replace('_', ' ') for raw_label in Y]

# Distinct labels, kept in order of first appearance in the file
labels = list(dict.fromkeys(Y_splitted))

### 1. Try zero-shot classification

In [None]:
# Zero-shot classification pipeline built on the mDeBERTa-v3 MNLI/XNLI checkpoint
classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
)



In [None]:
# classifier = pipeline("zero-shot-classification",
#                        model="vicgalle/xlm-roberta-large-xnli-anli")

In [None]:
# Zero-shot classification pipeline; model and tokenizer are loaded from the
# same checkpoint name.
classifier_distilC = pipeline(
    'zero-shot-classification',
    model="cmarkea/distilcamembert-base-nli",
    tokenizer="cmarkea/distilcamembert-base-nli",
)

In [6]:
# Candidate labels written in French.
# They are paired position by position with `labels` in the mapping cell,
# so the order here matters.
labels_french = [
    'traduction',
    'alerte de voyage',
    'statut de vol',
    'perte de baggage',
    'recommandation de voyage',
    'informations sur les bagages à main',
    'réserver un hôtel',
    'réserver un vol',
    'autre',
]

In [24]:
# Lookup table from each French label to the English label at the same position.
# Used later to translate French predictions back before scoring.
french_to_english_label_map = dict(zip(labels_french, labels))

In [None]:
# Evaluate the MoritzLaurer/mDeBERTa-v3-base-mnli-xnli model from Hugging Face
results = []
# zip() has no length, so tqdm needs an explicit total to display progress and ETA
for x, y in tqdm(zip(X, Y), total=len(X)):
    results.append(classifier(x, labels))
    # First label returned by the pipeline (used as the prediction) next to the true label
    print(results[-1]['labels'][0], ' , real = ', y)

In [43]:
# Evaluate the cmarkea/distilcamembert-base-nli model from Hugging Face

# Candidate labels given in English
results = [
    classifier_distilC(sequences=sentence, candidate_labels=labels)
    for sentence in tqdm(X)
]

100%|██████████| 75/75 [00:46<00:00,  1.62it/s]


In [44]:
# Prediction for each sentence = first label in the pipeline output
results_labels = np.array([output['labels'][0] for output in results])

# Percentage of predictions equal to the space-separated true label
accuracy = (results_labels == np.array(Y_splitted)).sum() / len(results_labels) * 100

# Report the score
print('Accuracy is : ', round(accuracy, 2), '%')

Accuracy is :  13.33 %


In [41]:
# Same evaluation, this time with the candidate labels given in French
results = [
    classifier_distilC(sequences=sentence, candidate_labels=labels_french)
    for sentence in tqdm(X)
]

100%|██████████| 75/75 [00:47<00:00,  1.59it/s]


In [42]:
# Take the first French label returned for each sentence and translate it back
# to English so it can be compared with the ground truth
results_labels = np.array(
    [french_to_english_label_map[output['labels'][0]] for output in results]
)

# Percentage of predictions equal to the space-separated true label
accuracy = (results_labels == np.array(Y_splitted)).sum() / len(results_labels) * 100

# Report the score
print('Accuracy is : ', round(accuracy, 2), '%')

Accuracy is :  17.33 %


We can see that this classification method doesn't give good results (a random guess would score 11.11%). Let's try another one.

### 2. Sentence similarity (our best method so far)



In [None]:
# Sentence-embedding model (LaBSE checkpoint) used for the similarity approach
model = SentenceTransformer(model_name_or_path='sentence-transformers/LaBSE')

In [None]:
#model.save('model_saved')

In [None]:
#model_bis = SentenceTransformer('model_saved')

In [None]:
# Start the timer used later to report how fast the method is
start_s = time.time()

# Encode every training sentence into its embedding
X_embedded = model.encode(sentences=X)

In [None]:
# Compute the cosine similarity between every pair of training sentences.
# A single batched call replaces len(X)**2 separate Python-level calls;
# row i holds the similarities of sentence i to all sentences.
results = util.pytorch_cos_sim(X_embedded, X_embedded).numpy()

# A sentence is trivially most similar to itself: mask the diagonal with -inf
# so that the argmax picks the closest *other* sentence.
np.fill_diagonal(results, -np.inf)

In [None]:
# Position of the most similar training sentence for each input
indexes = [np.argmax(r) for r in results]

# argmax yields positions, so the rows are selected by position (.iloc);
# .loc selects by index label and only matches when the index is 0..n-1.
res = df.iloc[indexes]

In [None]:
# Percentage of sentences whose most similar sentence carries the same label
accuracy = np.sum(res['label'].to_numpy() == df['label'].to_numpy()) / len(df) * 100

# Stop the timer, then report accuracy and speed
end_s = time.time()
print('Accuracy : ', round(accuracy, 2), ' %')
print(
    'Computed ', len(X), ' results in ', round(end_s-start_s, 2),
    's ('+str(round((end_s-start_s)/len(X), 4))+'s/sentence)'
)

Accuracy :  89.33  %
computed  75  results in  12.85 s (0.1714s/sentence)


In [None]:
# Confusion matrix restricted to the lost_luggage label.
# scikit-learn lays it out as [[TN, FP], [FN, TP]], where "positive" means lost_luggage.
conf_matrix = multilabel_confusion_matrix(y_true = df['label'].to_numpy(), y_pred = res['label'].to_numpy(), labels = ['lost_luggage'])[0]
print('Confusion matrix for lost luggage label :')
print(conf_matrix)
# [0, 1] is the false-positive cell: predicted lost_luggage while the true label is another intent.
# The ratio is multiplied by 100 so that the printed value really is a percentage.
print('Percentage of Sentences classified as "lost luggage" intent which weren\'t \n (false positives for the lost luggage label): ', round(conf_matrix[0,1]/conf_matrix.sum()*100, 2), '%')

Confusion matrix for lost luggage label :
[[68  0]
 [ 0  7]]
Percentage of Sentences classified as "lost luggage" intent which weren't 
 (false positives since here lost luggage is labelled as 0):  0.0 %


Limits of this method:
- "Out of scope" sentences happen to be well classified on this training dataset, but the method could perform badly on them for new test examples.

### Annex: we test our function on a dataset shaped like a test set

In [None]:
def test(test_file='intent-detection-train.csv'):
    """Evaluate the sentence-similarity classifier on a labelled CSV file.

    Each sentence of the file is embedded and given the label of the most
    similar training sentence. Accuracy, the share of lost_luggage false
    positives and the run time are printed.

    Relies on the notebook-level names ``PATH``, ``model``, ``df`` and
    ``X_embedded`` defined in earlier cells.

    Parameters
    ----------
    test_file : str
        CSV file name, appended to ``PATH``; must contain ``text`` and
        ``label`` columns. Defaults to the training file.

    Returns
    -------
    None
    """
    s_start = time.time()

    df_test = pd.read_csv(PATH+test_file)
    X_test = df_test['text'].tolist()

    # Embed the input to the vectorized representation
    X_embedded_test = model.encode(X_test)

    # Cosine similarity with the training dataset, computed in one batched call:
    # one row per test sentence, one column per training sentence
    similarities = util.pytorch_cos_sim(X_embedded_test, X_embedded).numpy()

    # Position of the most similar training sentence for each input
    indexes = similarities.argmax(axis=1)

    # argmax yields positions, so the rows are selected by position (.iloc), not by label (.loc)
    predicted_similar_sentences = df.iloc[indexes]

    # Accuracy of the model on the test set, in percent
    accuracy = (predicted_similar_sentences['label'].to_numpy()==df_test['label'].to_numpy()).sum()/len(df_test)*100

    # scikit-learn layout: [[TN, FP], [FN, TP]], where "positive" means lost_luggage
    conf_matrix = multilabel_confusion_matrix(
        y_true = df_test['label'].to_numpy(),
        y_pred = predicted_similar_sentences['label'].to_numpy(),
        labels = ['lost_luggage'],
        )[0]

    s_end = time.time()

    # Print the results of our model
    # Accuracy
    print('Accuracy for this test set is :', round(accuracy, 2), '%')
    # [0, 1] is the false-positive cell (predicted lost_luggage, true label is another intent);
    # the ratio is multiplied by 100 so that it is a real percentage
    print('Percentage of sentences classified as "lost luggage" which weren\'t :', round(conf_matrix[0,1]/conf_matrix.sum()*100, 2) , '%')
    # Speed of the model: the per-sentence time is relative to the test set size
    print('computed ', len(X_test), ' results in ', round(s_end-s_start, 2), 's ('+str(round((s_end-s_start)/len(X_test), 4))+'s/sentence)')



In [None]:
# Run the evaluation helper.
# 100% accuracy is expected here: test() reads the training CSV and does not mask
# identical sentences, so each sentence is compared with itself.
test()

Accuracy for this test set is : 100.0 %
Percent of sentences classified as "lost luggage" which weren't : 0.0 %
computed  75  results in  8.14 s (0.1085s/sentence)
