In [1]:
# manipulation des données
import numpy as np
import pandas as pd

# matplotlib et seaborn pour les représentations graphiques
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# sklearn preprocessing pour traiter les variables catégorielles
from sklearn.preprocessing import LabelEncoder

# Gestion du système de fichiers
import os

# Silence warnings.
# NOTE(review): no category or module filter is given, so this hides every
# warning raised after this point, deprecation notices included.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the sample dataset and, in the same expression, discard every row
# whose 'words' column is missing.
df = pd.read_csv("../Data/2.sample_dataset.csv").dropna(subset=['words'])

In [3]:
import tensorflow as tf
# import tensorflow_hub as hub
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

# Set both TensorFlow-related environment variables in a single update.
os.environ.update({
    "TF_KERAS": '1',
    "TFHUB_MODEL_LOAD_FORMAT": "UNCOMPRESSED",
})

2024-02-01 12:45:27.390166: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import shutil

# Load the Universal Sentence Encoder module from TF-Hub; `embed` is the
# callable that feature_USE_fct applies to each batch of sentences.
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(use_model_url)

2024-02-01 12:45:33.522654: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


In [5]:
def feature_USE_fct(sentences, b_size) :
    """Embed ``sentences`` batch by batch and stack the results in input order.

    The module-level ``embed`` callable is applied to consecutive slices of
    at most ``b_size`` sentences. Every sentence is embedded, including a
    final batch that is shorter than ``b_size``.

    Parameters
    ----------
    sentences : sequence of str
        Texts to embed. Must support ``len()`` and slicing (e.g. a list).
    b_size : int
        Maximum number of sentences passed to ``embed`` per call.

    Returns
    -------
    numpy.ndarray or None
        The per-batch outputs of ``embed`` concatenated along the first axis,
        or ``None`` when ``sentences`` is empty.

    Raises
    ------
    ValueError
        If ``b_size`` is not strictly positive.
    """
    if b_size <= 0:
        raise ValueError(f"b_size must be a positive integer, got {b_size!r}")

    # Walk over the start offsets directly: a floor-division step count
    # (len // b_size) would skip the last, possibly shorter, batch.
    chunks = [
        # np.asarray gives a uniform ndarray whatever array type embed returns.
        np.asarray(embed(sentences[start:start + b_size]))
        for start in range(0, len(sentences), b_size)
    ]

    if not chunks:
        # Nothing to embed: keep the "no features" sentinel callers already get.
        return None

    # Single concatenation at the end instead of re-copying the growing
    # feature matrix after every batch.
    return np.concatenate(chunks)

In [6]:
# Number of sentences handed to the encoder per call.
batch_size = 10

# Seeded draw of 100000 entries from the 'words' column, as a plain list.
sentences = df['words'].sample(100000, random_state=42).to_list()

In [7]:
# Embed the sampled sentences with feature_USE_fct, batch_size sentences per encoder call.
x_sentences = feature_USE_fct(sentences, batch_size)

In [8]:
from sklearn.model_selection import train_test_split

# Draw the targets with the same sample size and seed as the sentences.
# NOTE(review): presumably this is what keeps `labels` row-aligned with
# `x_sentences` - confirm both draws select the same rows.
labels = df['target'].sample(100000, random_state=42).values

# Seeded hold-out split: 20% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    x_sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [9]:
import mlflow
from xgboost import XGBClassifier

In [10]:
# Candidate values for each hyper-parameter, in the order they are tried.
_param_grid = {
    'learning_rate': (0.01, 0.1, 0.2),
    'n_estimators': (100, 200, 300),
    'max_depth': (3, 4, 5),
    'subsample': (0.8, 0.9, 1),
    'colsample_bytree': (0.8, 0.9, 1),
}

# One single-key dict per candidate value: each configuration overrides
# exactly one hyper-parameter.
param_sets = [
    {param_name: candidate}
    for param_name, candidates in _param_grid.items()
    for candidate in candidates
]

In [11]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix

artifact_path = './artifacts/'
# to_csv and savefig do not create missing folders, so make sure the local
# artifact directory exists before the first run writes into it.
os.makedirs(artifact_path, exist_ok=True)

mlflow.set_experiment("USE_XGBoostClassifier")

# One MLflow run per entry of param_sets: fit an XGBoost classifier on the
# training split, score it on the test split, then log the metrics, the
# confusion matrix (CSV) and the ROC curve (PNG).
# NOTE(review): precision/recall/F1 use scikit-learn's default averaging and
# the ROC uses column 1 of predict_proba, so this presumably expects a
# binary target - confirm.
for params in param_sets:
    # The first hyper-parameter name and value of the entry label the run
    # and its artifact files.
    param_name, param_value = next(iter(params.items()))
    name_experience = f'{param_name}_{param_value}'

    with mlflow.start_run(run_name=f"USE_XGBoostClf{name_experience}"):
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train, verbose=False)

        y_pred = clf.predict(X_test)
        # Scores of the second class column, used for the ROC curve.
        y_score = clf.predict_proba(X_test)[:, 1]

        # Keep the original combined entry, and also log each
        # hyper-parameter under its own key so runs can be filtered and
        # compared per parameter.
        mlflow.log_param("params", params)
        mlflow.log_params(params)

        mlflow.log_metric("accuracy", clf.score(X_test, y_test))
        mlflow.log_metric("Precision", precision_score(y_test, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test, y_pred))
        mlflow.log_metric("F1_Score", f1_score(y_test, y_pred))

        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Confusion matrix: written locally first, then attached to the run.
        conf_matrix = confusion_matrix(y_test, y_pred)
        conf_matrix_path = f"{artifact_path}confMat_USE_XGBoostClf_{name_experience}.csv"
        pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
        mlflow.log_artifact(conf_matrix_path, "metrics")

        # ROC curve on an explicit figure so exactly this figure is saved
        # and closed, whatever the pyplot global state is.
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc="lower right")
        ax.grid(True)
        roc_curve_path = f"{artifact_path}roc_USE_XGBoostClf_{name_experience}.png"
        fig.savefig(roc_curve_path)
        # Close the figure so the loop does not accumulate open figures.
        plt.close(fig)
        mlflow.log_artifact(roc_curve_path, "plots")

2024/02/01 13:10:24 INFO mlflow.tracking.fluent: Experiment with name 'USE_XGBoostClassifier' does not exist. Creating a new experiment.
