In [1]:
# manipulation des données
import numpy as np
import pandas as pd

# matplotlib et seaborn pour les représentations graphiques
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# sklearn preprocessing pour le traiter les variables catégorielles
from sklearn.preprocessing import LabelEncoder

# Gestion du système de fichiers
import os

# Suppression des alertes
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../Data/2.sample_dataset.csv")
df = df.dropna(subset=['words'])

In [3]:
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import gensim

2024-02-01 11:56:57.507109: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Création et entraînement du modèle Word2Vec
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100

sentences = df['words'].to_list()

print("Build & train Word2Vec model ...")
w2v_model = gensim.models.Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                                vector_size=w2v_size,
                                                seed=42,
                                                workers=1)
#                                                workers=multiprocessing.cpu_count())
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

Build & train Word2Vec model ...
Vocabulary size: 61
Word2Vec trained


In [5]:
# Préparation des sentences (tokenization)
maxlen = 24 # adapt to length of sentences

print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
x_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences),
                                                     maxlen=maxlen,
                                                     padding='post') 
                                                   
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)

Fit Tokenizer ...
Number of unique words: 292434


In [6]:
print("Create Embedding matrix ...")
w2v_size = 300
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))
i=0
j=0
    
for word, idx in word_index.items():
    i +=1
    if word in w2v_words:
        j +=1
        embedding_vector = model_vectors[word]
        if embedding_vector is not None:
            embedding_matrix[idx] = model_vectors[word]
            
word_rate = np.round(j/i,4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

Create Embedding matrix ...
Word embedding rate :  0.0001
Embedding matrix: (292434, 300)


In [7]:
from sklearn.model_selection import train_test_split

labels = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(x_sentences, labels, test_size=0.2, random_state=42)

In [19]:
import mlflow
from xgboost import XGBClassifier

In [20]:
param_sets = [
    {'learning_rate': 0.01},
    {'learning_rate': 0.1},
    {'learning_rate': 0.2},
    {'n_estimators': 100},
    {'n_estimators': 200},
    {'n_estimators': 300},
    {'max_depth': 3},
    {'max_depth': 4},
    {'max_depth': 5},
    {'subsample': 0.8},
    {'subsample': 0.9},
    {'subsample': 1},
    {'colsample_bytree': 0.8},
    {'colsample_bytree': 0.9},
    {'colsample_bytree': 1},
]

In [21]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix

artifact_path = './artifacts/'

mlflow.set_experiment("W2V_XGBoostClassifier")

for i, params in enumerate(param_sets):
    name_experience = f'{list(params.keys())[0]}_{list(params.values())[0]}' # héhéhéhé ça marche
    with mlflow.start_run(run_name=f"W2V_XGBoostClf{name_experience}"):
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train, verbose=False)

        y_pred = clf.predict(X_test)
        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test, y_test))
        mlflow.log_metric("Precision", precision_score(y_test, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test, y_pred))
        mlflow.log_metric("F1_Score", f1_score(y_test, y_pred))

        fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        conf_matrix = confusion_matrix(y_test, y_pred)
        conf_matrix_path = f"{artifact_path}confMat_W2V_XGBoostClf_{name_experience}.csv"
        pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
        mlflow.log_artifact(conf_matrix_path, "metrics")

        plt.figure(figsize=(8, 8))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.grid(True)
        roc_curve_path = f"{artifact_path}roc_W2V_XGBoostClf_{name_experience}.png"
        plt.savefig(roc_curve_path)
        plt.close()
        mlflow.log_artifact(roc_curve_path, "plots")

2024/02/01 13:04:29 INFO mlflow.tracking.fluent: Experiment with name 'W2V_XGBoostClassifier' does not exist. Creating a new experiment.
