Projet 5 | Catégorisez automatiquement des questions

Partie 4.3 | Analyse Supervisée - USE

# Présentation du projet
Nous cherchons a aider la communauté de Stack Overflow, site célèbre de questions-réponses liées au développement informatique, en réalisant un API de prédiction de tags.<br/>
L'analyse se basera sur le NLP (Natural language processing) et nous testerons différentes méthodes pour ne sélectionner que la plus efficace et pertinente.

# Import des packages, fonctions et paramétrage initial

## Librairies

In [1]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Modules classiques d'analyse exploratoire:
import pandas as pd
import re    #module for regular expression operations
import string
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from nltk import FreqDist

# Algo supervisés
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss, jaccard_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings('ignore')

import pickle
import time
from sklearn import cluster, metrics
from sklearn import manifold, decomposition
import logging
import numpy as np

logging.disable(logging.WARNING) # disable WARNING, INFO and DEBUG logging everywhere

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## MLFlow

In [3]:
!pip install mlflow --quiet
!pip install pyngrok --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.0/25.0 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.0/107.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [4]:
# check wether mlflow is installed
!mlflow

Usage: mlflow [OPTIONS] COMMAND [ARGS]...

Options:
  --version  Show the version and exit.
  --help     Show this message and exit.

Commands:
  artifacts    Upload, list, and download artifacts from an MLflow...
  db           Commands for managing an MLflow tracking database.
  deployments  Deploy MLflow models to custom targets.
  doctor       Prints out useful information for debugging issues with MLflow.
  experiments  Manage experiments.
  gc           Permanently delete runs in the `deleted` lifecycle stage.
  models       Deploy MLflow models locally.
  recipes      Run MLflow Recipes and inspect recipe results.
  run          Run an MLflow project from the given URI.
  runs         Manage runs.
  sagemaker    Serve models on SageMaker.
  server       Run the MLflow tracking server.


In [5]:
import mlflow
from mlflow.tracking import MlflowClient
import mlflow.sklearn

mlflow.set_experiment('IMLP5-Supervisé')

<Experiment: artifact_location='file:///content/mlruns/721016419642630710', creation_time=1717488808172, experiment_id='721016419642630710', last_update_time=1717488808172, lifecycle_stage='active', name='IMLP5-Supervisé', tags={}>

In [6]:
print(mlflow.__version__)

2.13.1


# Chargement du datasets

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Création des DataFrame
df_stack = pd.read_csv("/content/drive/MyDrive/2_Formations/2.0_OpenClassRooms/2.0.0_IML/IML P5/5.1_Data/QueryResults.csv")
df_stack.head(1)

Unnamed: 0,Title,Body,Tags,CreationDate,AnswerCount,ViewCount,Score
0,Python kernel dies for second run of PyQt5 GUI,<ul>\n<li>Using Spyder in Python 3.5.2 |Anacon...,<python><ipython><anaconda><pyqt5><spyder>,2016-10-17 19:21:55,3,10077,17


In [9]:
df_stack.head(1)

Unnamed: 0,Title,Body,Tags,CreationDate,AnswerCount,ViewCount,Score
0,Python kernel dies for second run of PyQt5 GUI,<ul>\n<li>Using Spyder in Python 3.5.2 |Anacon...,<python><ipython><anaconda><pyqt5><spyder>,2016-10-17 19:21:55,3,10077,17


In [10]:
df_stack['txt'] = df_stack['Title'] + '-' + df_stack['Body']

In [11]:
# Création des DataFrame
df_cleaned = pd.read_csv("/content/drive/MyDrive/2_Formations/2.0_OpenClassRooms/2.0.0_IML/IML P5/5.1_Data/df_cleaned.csv",
                 sep=';')
df_cleaned.sample(1)

Unnamed: 0,Title,Body,Tags,lemmatized_corpus,lemmatized_tags,stemmed_corpus,stemmed_tags
18265,Getting cURL to work with Visual Studios 2017,<p>*Edit: I got CURL working in VS 2017 on a 6...,<c++><curl><linker><visual-studio-2017><vcpkg>,"['getting', 'curl', 'work', 'visual', 'studio'...","['curl', 'linker', 'visual', 'studio', 'vcpkg']","['get', 'curl', 'work', 'visual', 'studio', 'e...","['curl', 'linker', 'visual', 'studio', 'vcpkg']"


In [12]:
import ast
df_cleaned['lemmatized_corpus'] = df_cleaned['lemmatized_corpus'].map(
    lambda x: ast.literal_eval(x)
)
df_cleaned['lemmatized_tags'] = df_cleaned['lemmatized_tags'].map(
    lambda x: ast.literal_eval(x)
)
df_cleaned['stemmed_corpus'] = df_cleaned['stemmed_corpus'].map(
    lambda x: ast.literal_eval(x)
)
df_cleaned['stemmed_tags'] = df_cleaned['stemmed_tags'].map(
    lambda x: ast.literal_eval(x)
)

In [13]:
df_cleaned.sample(1)

Unnamed: 0,Title,Body,Tags,lemmatized_corpus,lemmatized_tags,stemmed_corpus,stemmed_tags
15360,Install Bundler gem using Ansible,<p>I am trying to install Bundler on my VPS us...,<ruby><ubuntu><rubygems><bundler><ansible>,"[install, bundler, using, ansiblei, trying, in...","[ruby, ubuntu, rubygems, bundler, ansible]","[instal, bundler, use, ansiblei, tri, instal, ...","[rubi, ubuntu, rubygem, bundler, ansibl]"


In [14]:
df_cleaned['Text_complet'] = df_cleaned['Title'] + df_cleaned['Body']
df_cleaned['Text_complet'].head(1)

0    Python kernel dies for second run of PyQt5 GUI...
Name: Text_complet, dtype: object

# Nettoyage bis

In [15]:
# Tokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenizer_fct(sentence) :
    # print(sentence)
    sentence_clean = sentence.replace('-', ' ').replace('+', ' ').replace('/', ' ').replace('#', ' ')
    word_tokens = word_tokenize(sentence_clean)
    return word_tokens

# Stop words
from nltk.corpus import stopwords
stop_w = list(set(stopwords.words('english'))) + ['[', ']', ',', '.', ':', '?', '(', ')']

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    filtered_w2 = [w for w in filtered_w if len(w) > 2]
    return filtered_w2

# lower case et alpha
def lower_start_fct(list_words) :
    lw = [w.lower() for w in list_words if (not w.startswith("@"))
    #                                   and (not w.startswith("#"))
                                       and (not w.startswith("http"))]
    return lw

# Lemmatizer (base d'un mot)
from nltk.stem import WordNetLemmatizer

def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w

# Fonction de préparation du texte pour le bag of words (Countvectorizer et Tf_idf, Word2Vec)
def transform_bow_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    # lem_w = lemma_fct(lw)
    transf_desc_text = ' '.join(lw)
    return transf_desc_text

# Fonction de préparation du texte pour le bag of words avec lemmatization
def transform_bow_lem_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    lem_w = lemma_fct(lw)
    transf_desc_text = ' '.join(lem_w)
    return transf_desc_text

# Fonction de préparation du texte pour le Deep learning (USE et BERT)
def transform_dl_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
#    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(word_tokens)
    # lem_w = lemma_fct(lw)
    transf_desc_text = ' '.join(lw)
    return transf_desc_text

df_stack['sentence_bow'] = df_stack['txt'].apply(lambda x : transform_bow_fct(x))
df_stack['sentence_bow_lem'] = df_stack['txt'].apply(lambda x : transform_bow_lem_fct(x))
df_stack['tags_bow_lem'] = df_stack['Tags'].apply(lambda x : transform_bow_lem_fct(x))
df_stack['sentence_dl'] = df_stack['txt'].apply(lambda x : transform_dl_fct(x))
df_stack.shape

(35018, 12)

In [16]:
from nltk.tokenize import sent_tokenize, word_tokenize

df_stack['length_bow'] = df_stack['sentence_bow'].apply(lambda x : len(word_tokenize(x)))
print("max length bow : ", df_stack['length_bow'].max())
df_stack['length_dl'] = df_stack['sentence_dl'].apply(lambda x : len(word_tokenize(x)))
print("max length dl : ", df_stack['length_dl'].max())

max length bow :  3286
max length dl :  9758


In [17]:
df_stack.to_csv("df_cleaned.csv")

In [18]:
df = df_stack.sample(5000)

In [19]:
import time

# Calcul Tsne, détermination des clusters et calcul ARI entre vrais catégorie et n° de clusters
def ARI_fct(features) :
    time1 = time.time()
    num_labels=len(l_cat)
    tsne = manifold.TSNE(n_components=2, perplexity=30, n_iter=2000,
                                 init='random', learning_rate=200, random_state=42)
    X_tsne = tsne.fit_transform(features)

    # Détermination des clusters à partir des données après Tsne
    cls = cluster.KMeans(n_clusters=num_labels, n_init=100, random_state=42)
    cls.fit(X_tsne)
    ARI = np.round(metrics.adjusted_rand_score(y_cat_num, cls.labels_),4)
    time2 = np.round(time.time() - time1,0)
    print("ARI : ", ARI, "time : ", time2)

    return ARI, X_tsne, cls.labels_


# visualisation du Tsne selon les vraies catégories et selon les clusters
def TSNE_visu_fct(X_tsne, y_cat_num, labels, ARI) :
    fig = plt.figure(figsize=(15,6))

    ax = fig.add_subplot(121)
    scatter = ax.scatter(X_tsne[:,0],X_tsne[:,1], c=y_cat_num, cmap='Set1')
    ax.legend(handles=scatter.legend_elements()[0], labels=l_cat, loc="best", title="Categorie")
    plt.title('Représentation des tweets par catégories réelles')

    ax = fig.add_subplot(122)
    scatter = ax.scatter(X_tsne[:,0],X_tsne[:,1], c=labels, cmap='Set1')
    ax.legend(handles=scatter.legend_elements()[0], labels=set(labels), loc="best", title="Clusters")
    plt.title('Représentation des tweets par clusters')

    plt.show()
    print("ARI : ", ARI)

In [20]:
models_performance = {}

def metrics_report(model_name, test_labels, predictions, performances):
    """
    Compute performance metrics of a model and store them in a dictionary

    Args:
        model_name(string): name of the evaluated model
        test_labels(array): labels related to predictors
        preductions(array): predicted results
        performances(dict): used dictionary to store metrics
    Returns:
        performances(dict): used dictionary to store metrics filed with models ones
    """
    accuracy = accuracy_score(test_labels, predictions)
    jaccard = jaccard_score(test_labels, predictions, average='macro')
    hamLoss = hamming_loss(test_labels, predictions)

    weighted_precision = precision_score(test_labels, predictions, average='weighted')
    weighted_recall = recall_score(test_labels, predictions, average='weighted')
    weighted_f1 = f1_score(test_labels, predictions, average='weighted')

    macro_precision = precision_score(test_labels, predictions, average='macro')
    macro_recall = recall_score(test_labels, predictions, average='macro')
    macro_f1 = f1_score(test_labels, predictions, average='macro')

    micro_precision = precision_score(test_labels, predictions, average='micro')
    micro_recall = recall_score(test_labels, predictions, average='micro')
    micro_f1 = f1_score(test_labels, predictions, average='micro')


    with mlflow.start_run():

      print("------" + model_name + " Model Metrics-----")
    #print("Balanced Accuracy: {:.4f}\nAccuracy: {:.4f}\nJaccard: {:.4f}\nHamming Loss: {:.4f}\nPrecision:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nRecall:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nF1-measure:\n  - Macro: {:.4f}\n  - Micro: {:.4f}"\
    #      .format(balanced_accuracy, accuracy, jaccard, hamLoss, macro_precision, micro_precision, macro_recall, micro_recall, macro_f1, micro_f1))

      performances[model_name] = {}
      performances[model_name]["Accuracy"] =  round(accuracy,2)
      performances[model_name]["Precision"] =  round(weighted_precision,2)
      performances[model_name]["Recall"] = round(weighted_recall,2)
      performances[model_name]["F1"] = round(weighted_f1,2)
      performances[model_name]["Haming loss"] = round(hamLoss,2)
      performances[model_name]["Jaccard score"] = round(jaccard,2)

      mlflow.log_metric("Accuracy", round(accuracy,2))
      mlflow.log_param(model_name, "model")

      return performances

In [21]:
def build_word_distribution(corpus):
    """
    Build corpus word distribution

    Args:
        Corpus(List of lists): Original corpus

    Returns:

        word_dist_df(DataFrame): Word distribution of the corpus
    """
    word_corpus = [token for token_list in corpus for token in token_list]
    word_dist = FreqDist(word_corpus)
    word_dist_df = pd.DataFrame(word_dist.items(), columns=['Word', 'Frequency']).set_index('Word')
    word_dist_df.sort_values("Frequency", ascending=False, inplace=True)

    return word_dist_df

# Distribution des tags

In [22]:
tags = df_cleaned['lemmatized_tags'].to_list()

tag_dist = build_word_distribution(tags)

print(f"{tag_dist.shape[0]} tags différents")
print("10 tags les plus utilisés :")
display(tag_dist.head(10))

6558 tags différents
10 tags les plus utilisés :


Unnamed: 0_level_0,Frequency
Word,Unnamed: 1_level_1
python,7927
android,7149
java,5038
javascript,4809
spring,4119
node,2024
core,1969
angular,1901
google,1854
swift,1793


In [23]:
tag_dist.describe()

Unnamed: 0,Frequency
count,6558.0
mean,26.153095
std,195.189879
min,1.0
25%,1.0
50%,2.0
75%,8.0
max,7927.0


In [24]:
m = 50
first_m_tags = tag_dist[0:m].index.to_list()

# Distribution du corpus

In [25]:
corpus = df_cleaned['lemmatized_corpus'].to_list()

word_dist = build_word_distribution(corpus)

print(f"{word_dist.shape[0]} tokens différents dans le corpus")
print("10 tokens les plus utilisés :")
display(word_dist.head(10))

142364 tokens différents dans le corpus
10 tokens les plus utilisés :


Unnamed: 0_level_0,Frequency
Word,Unnamed: 1_level_1
java,36672
error,31224
file,29539
android,26709
class,25055
using,24584
name,22201
http,20954
user,20841
data,20325


In [26]:
df_stack.columns

Index(['Title', 'Body', 'Tags', 'CreationDate', 'AnswerCount', 'ViewCount',
       'Score', 'txt', 'sentence_bow', 'sentence_bow_lem', 'tags_bow_lem',
       'sentence_dl', 'length_bow', 'length_dl'],
      dtype='object')

In [27]:
df_stack.head(2)

Unnamed: 0,Title,Body,Tags,CreationDate,AnswerCount,ViewCount,Score,txt,sentence_bow,sentence_bow_lem,tags_bow_lem,sentence_dl,length_bow,length_dl
0,Python kernel dies for second run of PyQt5 GUI,<ul>\n<li>Using Spyder in Python 3.5.2 |Anacon...,<python><ipython><anaconda><pyqt5><spyder>,2016-10-17 19:21:55,3,10077,17,Python kernel dies for second run of PyQt5 GUI...,python kernel dies second run pyqt5 gui using ...,python kernel dy second run pyqt5 gui using sp...,python ipython anaconda pyqt5 spyder,python kernel dies for second run of pyqt5 gui...,117,278
1,How can I use optional chaining with arrays an...,<p>I'm trying to use optional chaining with an...,<javascript><arrays><typescript><function><opt...,2020-01-07 07:05:02,5,137461,296,How can I use optional chaining with arrays an...,how use optional chaining arrays functions try...,how use optional chaining array function tryin...,javascript array typescript function optional ...,how can i use optional chaining with arrays an...,43,149


In [28]:
df_stack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35018 entries, 0 to 35017
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             35018 non-null  object
 1   Body              35018 non-null  object
 2   Tags              35018 non-null  object
 3   CreationDate      35018 non-null  object
 4   AnswerCount       35018 non-null  int64 
 5   ViewCount         35018 non-null  int64 
 6   Score             35018 non-null  int64 
 7   txt               35018 non-null  object
 8   sentence_bow      35018 non-null  object
 9   sentence_bow_lem  35018 non-null  object
 10  tags_bow_lem      35018 non-null  object
 11  sentence_dl       35018 non-null  object
 12  length_bow        35018 non-null  int64 
 13  length_dl         35018 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 3.7+ MB


In [29]:
df_stack['sentence_bow_lem'].head(2)

0    python kernel dy second run pyqt5 gui using sp...
1    how use optional chaining array function tryin...
Name: sentence_bow_lem, dtype: object

In [30]:
df_cleaned.columns

Index(['Title', 'Body', 'Tags', 'lemmatized_corpus', 'lemmatized_tags',
       'stemmed_corpus', 'stemmed_tags', 'Text_complet'],
      dtype='object')

In [31]:
df_cleaned.head(2)

Unnamed: 0,Title,Body,Tags,lemmatized_corpus,lemmatized_tags,stemmed_corpus,stemmed_tags,Text_complet
0,Python kernel dies for second run of PyQt5 GUI,<ul>\n<li>Using Spyder in Python 3.5.2 |Anacon...,<python><ipython><anaconda><pyqt5><spyder>,"[python, kernel, dy, second, pyqt, using, spyd...","[python, ipython, anaconda, pyqt, spyder]","[python, kernel, die, second, pyqt, use, spyde...","[python, ipython, anaconda, pyqt, spyder]",Python kernel dies for second run of PyQt5 GUI...
1,How can I use optional chaining with arrays an...,<p>I'm trying to use optional chaining with an...,<javascript><arrays><typescript><function><opt...,"[optional, chaining, array, function, trying, ...","[javascript, array, typescript, function, opti...","[option, chain, array, function, tri, option, ...","[javascript, array, typescript, function, opti...",How can I use optional chaining with arrays an...


In [32]:
#df = df_cleaned.sample(1000)

In [33]:
df_cleaned['lemmatized_corpus'].head(2)

0    [python, kernel, dy, second, pyqt, using, spyd...
1    [optional, chaining, array, function, trying, ...
Name: lemmatized_corpus, dtype: object

In [34]:
df_cleaned['lemmatized_corpus_string'] = ','.join(str(v) for v in df_cleaned['lemmatized_corpus'])

In [35]:
df_cleaned['lemmatized_corpus_string'].head(2)

0    ['python', 'kernel', 'dy', 'second', 'pyqt', '...
1    ['python', 'kernel', 'dy', 'second', 'pyqt', '...
Name: lemmatized_corpus_string, dtype: object

# USE (Universal Sentence Encoder)
https://analyticsindiamag.com/guide-to-universal-sentence-encoder-with-tensorflow/ <br/>
/!\ The embeddings vector is 512 length, irrespective of the length of the input.

In [36]:
models_performance = {}

def metrics_report(model_name, test_labels, predictions, performances):
    """
    Compute performance metrics of a model and store them in a dictionary

    Args:
        model_name(string): name of the evaluated model
        test_labels(array): labels related to predictors
        preductions(array): predicted results
        performances(dict): used dictionary to store metrics
    Returns:
        performances(dict): used dictionary to store metrics filed with models ones
    """
    accuracy = accuracy_score(test_labels, predictions)
    jaccard = jaccard_score(test_labels, predictions, average='macro')
    hamLoss = hamming_loss(test_labels, predictions)
    weighted_precision = precision_score(test_labels, predictions, average='weighted')
    weighted_recall = recall_score(test_labels, predictions, average='weighted')
    weighted_f1 = f1_score(test_labels, predictions, average='weighted')

    print("------" + model_name + " Model Metrics-----")

    performances[model_name] = {}
    performances[model_name]["Accuracy"] =  round(accuracy,2)
    performances[model_name]["Precision"] =  round(weighted_precision,2)
    performances[model_name]["Recall"] = round(weighted_recall,2)
    performances[model_name]["F1"] = round(weighted_f1,2)
    performances[model_name]["Haming loss"] = round(hamLoss,2)
    performances[model_name]["Jaccard score"] = round(jaccard,2)

    return performances

In [37]:
df = df_cleaned.sample(5000)

### Training

In [38]:
from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
import os, sys
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

In [39]:
#Load the TF Hub module of the universal sentence encoder
url = "https://tfhub.dev/google/universal-sentence-encoder/4"

In [40]:
model = hub.load(url) #Load the module from selected URL

In [41]:
# Define a function for computing sentence embedding of input string
def embed(input):
  return model(input)

In [42]:
# Reduce logging output
logging.set_verbosity(logging.ERROR)

In [43]:
# Embed the defined word, sentence and paragraph using the embed() method defined in step (3).
text = ['python', 'java']
message_emb = embed(text)
message_emb

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[-0.0190932 , -0.00982735, -0.01016157, ...,  0.06900886,
         0.00897889, -0.07027519],
       [ 0.00775662, -0.02107682,  0.03759729, ..., -0.05968777,
        -0.01158069, -0.05798421]], dtype=float32)>

In [44]:
messages = df['Text_complet'].sample(3).to_list()

In [45]:
import numpy as np

message_embeddings = embed(messages)

for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
  print(f"Message: {i}")
  print("Embedding size: {}".format(len(message_embedding)))
  message_embedding_snippet = ", ".join(
      (str(x) for x in message_embedding[:3]))
  print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

Message: 0
Embedding size: 512
Embedding: [-0.013179359026253223, -0.07463549077510834, 0.02318587526679039, ...]

Message: 1
Embedding size: 512
Embedding: [-0.05215676873922348, -0.055304158478975296, 0.05364486202597618, ...]

Message: 2
Embedding size: 512
Embedding: [-0.016533682122826576, 0.026625465601682663, 0.016116123646497726, ...]



In [46]:
df_vector_use = pd.DataFrame(message_embeddings)
df_vector_use

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.013179,-0.074635,0.023186,0.011953,-0.016607,0.039946,0.020213,-0.060206,-0.033197,0.054621,...,-0.033851,-0.072923,0.043469,-0.015644,-0.034159,-0.062903,0.054999,0.076574,-0.007135,-0.042261
1,-0.052157,-0.055304,0.053645,0.048861,0.052492,0.020158,-0.033431,-0.055264,-0.054378,0.053015,...,-0.027313,-0.055306,0.014961,-0.047952,-0.055306,-0.046731,0.054334,0.055306,-0.026365,-0.055159
2,-0.016534,0.026625,0.016116,-0.022137,0.026853,0.046306,-0.012975,0.04919,-0.002472,0.05454,...,-0.014556,-0.071926,0.066172,-0.068563,0.037643,-0.041516,0.036122,0.072809,0.053537,-0.055297


In [47]:
%%time
tot_embeddings = model(df['Text_complet'].to_list())

CPU times: user 18.1 s, sys: 7.05 s, total: 25.1 s
Wall time: 22.5 s


In [48]:
df_vector_use = pd.DataFrame(tot_embeddings)
df_vector_use.shape

(5000, 512)

### Evaluation

In [49]:
col_to_drop = []
X = df_vector_use.copy().drop(columns = col_to_drop)
y = df['stemmed_tags']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [51]:
mlb = MultiLabelBinarizer(classes=first_m_tags)
train_labels = mlb.fit_transform(y_train)
test_labels = mlb.transform(y_test)

print("Affichage des classes du modèle de vectorisation :")
print("-"*60)
display(mlb.classes_)

Affichage des classes du modèle de vectorisation :
------------------------------------------------------------


array(['python', 'android', 'java', 'javascript', 'spring', 'node',
       'core', 'angular', 'google', 'swift', 'studio', 'html', 'reactjs',
       'amazon', 'xcode', 'laravel', 'testing', 'react', 'json', 'django',
       'typescript', 'docker', 'visual', 'apache', 'boot', 'panda',
       'service', 'gradle', 'linux', 'window', 'framework', 'jquery',
       'data', 'firebase', 'http', 'azure', 'spark', 'server', 'learning',
       'selenium', 'flutter', 'ruby', 'unit', 'bootstrap', 'angularjs',
       'rest', 'array', 'database', 'cloud', 'dataframe'], dtype=object)

In [52]:
%%time
knn_clf = OneVsRestClassifier(KNeighborsClassifier(), n_jobs=-1)
knn_clf.fit(X_train, train_labels)
knn_predictions = knn_clf.predict(X_test)
metrics_report("knn", test_labels, knn_predictions, models_performance)

------knn Model Metrics-----
CPU times: user 11.6 s, sys: 887 ms, total: 12.5 s
Wall time: 13.6 s


{'knn': {'Accuracy': 0.35,
  'Precision': 0.75,
  'Recall': 0.52,
  'F1': 0.59,
  'Haming loss': 0.02,
  'Jaccard score': 0.26}}

In [53]:
%%time
svm_clf = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
svm_clf.fit(X_train, train_labels)

svm_preds = svm_clf.predict(X_test)
metrics_report("svm", test_labels, svm_preds, models_performance)

------svm Model Metrics-----
CPU times: user 1.02 s, sys: 578 ms, total: 1.59 s
Wall time: 7.16 s


{'knn': {'Accuracy': 0.35,
  'Precision': 0.75,
  'Recall': 0.52,
  'F1': 0.59,
  'Haming loss': 0.02,
  'Jaccard score': 0.26},
 'svm': {'Accuracy': 0.39,
  'Precision': 0.81,
  'Recall': 0.56,
  'F1': 0.64,
  'Haming loss': 0.02,
  'Jaccard score': 0.3}}

In [54]:
%%time
rf_clf = OneVsRestClassifier(RandomForestClassifier(), n_jobs=-1)
rf_clf.fit(X_train, train_labels)
rf_preds = rf_clf.predict(X_test)
metrics_report("Random Forest", test_labels, rf_preds, models_performance)

------Random Forest Model Metrics-----
CPU times: user 4.73 s, sys: 549 ms, total: 5.28 s
Wall time: 4min 40s


{'knn': {'Accuracy': 0.35,
  'Precision': 0.75,
  'Recall': 0.52,
  'F1': 0.59,
  'Haming loss': 0.02,
  'Jaccard score': 0.26},
 'svm': {'Accuracy': 0.39,
  'Precision': 0.81,
  'Recall': 0.56,
  'F1': 0.64,
  'Haming loss': 0.02,
  'Jaccard score': 0.3},
 'Random Forest': {'Accuracy': 0.24,
  'Precision': 0.77,
  'Recall': 0.29,
  'F1': 0.37,
  'Haming loss': 0.02,
  'Jaccard score': 0.1}}

In [55]:
%%time
gb_clf = OneVsRestClassifier(GradientBoostingClassifier(), n_jobs=-1)
gb_clf.fit(X_train, train_labels)
gb_preds = gb_clf.predict(X_test)
metrics_report("Gradient Boosting", test_labels, gb_preds, models_performance)

------Gradient Boosting Model Metrics-----
CPU times: user 25.6 s, sys: 4.23 s, total: 29.8 s
Wall time: 43min 55s


{'knn': {'Accuracy': 0.35,
  'Precision': 0.75,
  'Recall': 0.52,
  'F1': 0.59,
  'Haming loss': 0.02,
  'Jaccard score': 0.26},
 'svm': {'Accuracy': 0.39,
  'Precision': 0.81,
  'Recall': 0.56,
  'F1': 0.64,
  'Haming loss': 0.02,
  'Jaccard score': 0.3},
 'Random Forest': {'Accuracy': 0.24,
  'Precision': 0.77,
  'Recall': 0.29,
  'F1': 0.37,
  'Haming loss': 0.02,
  'Jaccard score': 0.1},
 'Gradient Boosting': {'Accuracy': 0.3,
  'Precision': 0.72,
  'Recall': 0.46,
  'F1': 0.54,
  'Haming loss': 0.02,
  'Jaccard score': 0.2}}

In [56]:
result_df = pd.DataFrame.from_dict(models_performance, orient="index")
result_df

Unnamed: 0,Accuracy,Precision,Recall,F1,Haming loss,Jaccard score
knn,0.35,0.75,0.52,0.59,0.02,0.26
svm,0.39,0.81,0.56,0.64,0.02,0.3
Random Forest,0.24,0.77,0.29,0.37,0.02,0.1
Gradient Boosting,0.3,0.72,0.46,0.54,0.02,0.2
