In [1]:
# manipulation des données
import numpy as np
import pandas as pd

# matplotlib et seaborn pour les représentations graphiques
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training set and drop every row whose 'words' column is missing,
# since that column is the text fed to the sentence encoder further down.
df = pd.read_csv("../Data/2.train_df.csv").dropna(subset=['words'])

In [3]:
import tensorflow as tf
import os
# import tensorflow_hub as hub
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

# Environment switches set before the hub model is loaded.
# - TF_KERAS: presumably read by Keras add-on packages to select tf.keras — TODO confirm.
# - TFHUB_MODEL_LOAD_FORMAT: asks tensorflow_hub to read the model uncompressed
#   from remote storage instead of downloading and unpacking an archive.
os.environ.update({
    "TF_KERAS": '1',
    "TFHUB_MODEL_LOAD_FORMAT": "UNCOMPRESSED",
})

2024-02-15 12:58:26.325580: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import shutil

# Universal Sentence Encoder (version 4) from TF Hub; `embed` is the callable
# used by feature_USE_fct to turn a batch of sentences into embedding vectors.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODEL_URL)

2024-02-15 12:58:31.159670: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


In [5]:
def feature_USE_fct(sentences, b_size, embed_fn=None) :
    """Encode sentences into embedding vectors, one batch at a time.

    Every sentence is encoded, including the last partial batch when
    ``len(sentences)`` is not a multiple of ``b_size`` (the previous
    implementation silently dropped that tail and returned ``None`` when
    there were fewer sentences than one batch).

    Parameters
    ----------
    sentences : sequence of str
        Texts to encode. Must support ``len()`` and positional slicing
        (e.g. a list).
    b_size : int
        Number of sentences passed to the encoder per call. Must be > 0.
    embed_fn : callable, optional
        Encoder called as ``embed_fn(batch)``; its result must be convertible
        with ``np.asarray`` to an array whose first axis indexes sentences.
        Defaults to the module-level ``embed`` model.

    Returns
    -------
    numpy.ndarray or None
        Embeddings stacked along axis 0 in the same order as ``sentences``;
        ``None`` when ``sentences`` is empty.

    Raises
    ------
    ValueError
        If ``b_size`` is not a positive integer.
    """
    if b_size <= 0:
        raise ValueError(f"b_size must be a positive integer, got {b_size!r}")

    # Resolve the global lazily so an injected encoder works without `embed`.
    encoder = embed if embed_fn is None else embed_fn

    # Collect per-batch arrays and concatenate once at the end: this avoids
    # re-copying the growing result on every iteration and always yields a
    # NumPy array, whatever tensor type the encoder returns.
    batches = [
        np.asarray(encoder(sentences[start:start + b_size]))
        for start in range(0, len(sentences), b_size)
    ]

    if not batches:
        return None
    return np.concatenate(batches)

In [6]:
# Number of sentences sent to the encoder per call.
batch_size = 10

# Draw a reproducible 100000-row sample of the text column as a plain list.
# The same sample size and seed are reused later to pull the matching labels.
sentences = df['words'].sample(100000, random_state=42).to_list()

In [7]:
# Encode the sampled sentences batch by batch; the result is the feature matrix
# used for the train/test split.
x_sentences = feature_USE_fct(sentences=sentences, b_size=batch_size)

In [9]:
from sklearn.model_selection import train_test_split

# Pull the targets with the same sample size and seed as the sentence sample,
# so the drawn rows line up with the rows of x_sentences.
# NOTE(review): this relies on `df` being unchanged since the sentences were sampled.
labels = df['target'].sample(100000, random_state=42).values

# Hold out 20% of the embedded sentences for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    x_sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
import mlflow
import lightgbm as lgb

In [11]:
# One single-parameter override per experiment: each dict is passed as keyword
# arguments to the LightGBM classifier and also names the corresponding run.
param_sets = [
    dict(max_depth=10),
    dict(max_depth=15),
    dict(num_leaves=12),
    dict(num_leaves=32),
    dict(feature_fraction=0.6),
    dict(feature_fraction=0.8),
    dict(boosting='gbdt'),
    dict(boosting='dart'),
]

In [12]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix

# Local directory where the confusion-matrix CSV and ROC PNG are written before
# being uploaded to MLflow. Create it up front: without this, the first
# `to_csv` / `savefig` fails on a fresh checkout where the folder is missing.
artifact_path = './artifacts/'
os.makedirs(artifact_path, exist_ok=True)

# Best F1 seen so far; starts below any reachable F1 so the first run always wins.
best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("DeepLearningTweet")

# Train one LightGBM classifier per parameter override, each in its own MLflow run.
for i, params in enumerate(param_sets):
    # Each entry holds exactly one override; name the run "<param>_<value>".
    param_name, param_value = next(iter(params.items()))
    name_experience = f'{param_name}_{param_value}'

    with mlflow.start_run(run_name=f"USE_LightGBM_{name_experience}"):
        clf = lgb.LGBMClassifier(**params)
        clf.fit(X_train, y_train)

        # Compute predictions and scores once and reuse them for every metric.
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)[:, 1]
        f1 = f1_score(y_test, y_pred)
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)

        # Keep the historical stringified "params" entry, and also log each
        # hyper-parameter under its own key so runs can be filtered/compared.
        mlflow.log_param("params", params)
        mlflow.log_params(params)

        mlflow.log_metric("accuracy", clf.score(X_test, y_test))
        mlflow.log_metric("Precision", precision_score(y_test, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test, y_pred))
        mlflow.log_metric("F1_Score", f1)
        mlflow.log_metric("AUC", roc_auc)

        # Only runs that beat the best F1 so far get their model registered and
        # their confusion matrix / ROC curve stored as artifacts.
        # NOTE(review): the winner is picked on the same test split that is
        # reported, so the logged scores of the best run are optimistic.
        if f1 > best_metric_value:
            best_metric_value = f1
            mlflow.set_tag("tag1", "LightGBM with USE")
            mlflow.set_tags({"tag2":f'{name_experience}'})
            mlflow.sklearn.log_model(clf, "model", registered_model_name="LightGBM_USE")

            conf_matrix = confusion_matrix(y_test, y_pred)
            conf_matrix_path = f"{artifact_path}confMat_USE_LightGBM_{name_experience}.csv"
            pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
            mlflow.log_artifact(conf_matrix_path, "metrics")

            # Explicit figure/axes so the figure can be closed deterministically
            # after saving and does not leak across loop iterations.
            fig, ax = plt.subplots(figsize=(8, 8))
            ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
            ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('Receiver Operating Characteristic (ROC) Curve')
            ax.legend(loc="lower right")
            ax.grid(True)
            roc_curve_path = f"{artifact_path}roc_USE_LightGBM_{name_experience}.png"
            fig.savefig(roc_curve_path)
            plt.close(fig)
            mlflow.log_artifact(roc_curve_path, "plots")

[LightGBM] [Info] Number of positive: 39985, number of negative: 40015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.191810 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 512
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499812 -> initscore=-0.000750
[LightGBM] [Info] Start training from score -0.000750


Successfully registered model 'LightGBM_USE'.
Created version '1' of model 'LightGBM_USE'.


[LightGBM] [Info] Number of positive: 39985, number of negative: 40015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.186439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 512
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499812 -> initscore=-0.000750
[LightGBM] [Info] Start training from score -0.000750
[LightGBM] [Info] Number of positive: 39985, number of negative: 40015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.236010 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 512
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499812 -> initscore=-0.000750
[LightGBM] [Info] Start training from score -0.000750
[Light

Registered model 'LightGBM_USE' already exists. Creating a new version of this model...
Created version '2' of model 'LightGBM_USE'.


[LightGBM] [Info] Number of positive: 39985, number of negative: 40015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.204746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 512
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499812 -> initscore=-0.000750
[LightGBM] [Info] Start training from score -0.000750
[LightGBM] [Info] Number of positive: 39985, number of negative: 40015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.198434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 512
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499812 -> initscore=-0.000750
[LightGBM] [Info] Start training from score -0.000750
