## **Modules**

In [36]:
!pip install mlflow pyngrok --quiet
!pip install --upgrade tensorflow tensorflow-hub tensorflow_text
!pip install transformers huggingface_hub
!pip install 'transformers[torch]'



In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
import multiprocessing
from sklearn.metrics import multilabel_confusion_matrix
import mlflow
import time
import pickle
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from gensim.models import Word2Vec
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import logging
from transformers import BertTokenizer, BertModel
import torch

## **Fonctions**

In [38]:
def tracker():
  """Start the MLflow tracking UI and expose it through an ngrok tunnel.

  Launches ``mlflow ui`` on port 5000 as a background shell command, kills
  any ngrok process left running, opens a tunnel to port 5000 and prints
  its public URL.

  The ngrok auth token is read from the ``NGROK_AUTH_TOKEN`` environment
  variable instead of being stored in the notebook. When the variable is
  unset or empty, no token is configured and the tunnel is opened as-is.
  NOTE(review): a token was previously committed in this cell; it should
  be revoked in the ngrok dashboard.

  Returns:
      None
  """
  import os
  from pyngrok import ngrok

  # Run the tracking UI in the background.
  get_ipython().system_raw("mlflow ui --port 5000 &")

  # Terminate open tunnels if any exist.
  ngrok.kill()

  # Setting the auth token is optional; never hard-code it.
  auth_token = os.environ.get("NGROK_AUTH_TOKEN")
  if auth_token:
    ngrok.set_auth_token(auth_token)

  # Open an HTTPS tunnel on port 5000 for http://localhost:5000
  ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
  print("MLflow Tracking UI:", ngrok_tunnel.public_url)

In [39]:

def evaluate_model(pipeline, X_test, y_test, mlb):
    """
    Evaluate a fitted model on the test set and plot its confusion matrix.

    Prints accuracy and weighted precision / recall / F1, then plots:

    * for 1-D (or single-column) targets, one confusion matrix whose ticks
      are ``pipeline.classes_``;
    * for 2-D label-indicator targets with several columns, one 2x2 matrix
      per label, titled with the matching entry of ``mlb.classes_``.
      ``confusion_matrix`` does not accept that target format, so
      ``multilabel_confusion_matrix`` is used instead.

    Parameters:
        pipeline: Fitted scikit-learn pipeline containing the model.
        X_test: Test features.
        y_test: True labels for the test set.
        mlb: Fitted label encoder used for transforming labels; its
            ``classes_`` name the per-label plots in the multilabel case.

    Returns:
        None
    """
    # Predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Print evaluation metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

    target_shape = np.shape(y_test)
    is_multilabel = len(target_shape) == 2 and target_shape[1] > 1

    if not is_multilabel:
        # Single-label case: one matrix over all classes.
        conf_matrix = confusion_matrix(y_test, y_pred)

        plt.figure(figsize=(10,8))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=pipeline.classes_, yticklabels=pipeline.classes_)
        plt.xlabel('Predicted Labels')
        plt.ylabel('True Labels')
        plt.title('Confusion Matrix')
        plt.show()
        return

    # Multilabel case: one binary (absent / present) matrix per label,
    # returned in the column order of the indicator matrix.
    per_label_matrices = multilabel_confusion_matrix(y_test, y_pred)
    label_names = list(mlb.classes_)

    n_cols = max(1, min(4, len(label_names)))
    n_rows = max(1, -(-len(label_names) // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3.5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, label_name, matrix in zip(flat_axes, label_names, per_label_matrices):
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=['0', '1'], yticklabels=['0', '1'], ax=ax)
        ax.set_xlabel('Predicted Labels')
        ax.set_ylabel('True Labels')
        ax.set_title(str(label_name))

    # Hide the grid cells that have no label to show.
    for ax in flat_axes[len(label_names):]:
        ax.axis('off')

    fig.suptitle('Confusion Matrix')
    fig.tight_layout()
    plt.show()

## **Importation des données**

In [40]:
# Mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# Load the dataset from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Projet 5/my_data.csv")

# Bare expression: rendered as the cell output.
df

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount
0,minification failed returning unminified content,made first website using mvc work fine local m...,|asp.net|asp.net-mvc|bundling-and-minification|,24395412,40,49177,0.0,12
1,ajax request made flux app,creating react application flux architecture t...,|javascript|reactjs|reactjs-flux|,26632415,199,40698,0.0,6
2,gem install debugger error,running rail mac everything smoothly sudden ge...,|ruby-on-rails|ruby|rubygems|rvm|,24395453,25,28278,0.0,6
3,change body parser limit firebase,trying proxy file upload firebase cloud functi...,|node.js|express|firebase|google-cloud-functio...,45489810,7,946,,1
4,io android material design hierarchical timing...,want animation introduced android material des...,|ios|uicollectionview|calayer|material-design|,26632893,11,895,0.0,3
...,...,...,...,...,...,...,...,...
49995,make toolbar grid layout,want make qtoolbar column button docked left s...,|c++|qt|qt5|,23638595,7,2137,,1
49996,using jmslistener multiple payload type destin...,write instance multiple type given destination...,|java|spring|jms|spring-jms|,42656519,6,6453,0.0,3
49997,testing resolved data test ngoninit,working testing guide wish write test ngoninit...,|javascript|angular|jasmine|karma-runner|resol...,42656045,34,67797,0.0,2
49998,unit test project reference mvc project,perhaps easier let visual studio create unit t...,|unit-testing|visual-studio-2013|asp.net-mvc-5|,23638602,10,7610,0.0,4


In [42]:
# Count the missing values in each column.
df.isnull().sum()

Title              26
Body                0
Tags                0
Id                  0
Score               0
ViewCount           0
FavoriteCount    7645
AnswerCount         0
dtype: int64

In [43]:
# Replace missing text with empty strings so Title/Body/Tags can be
# concatenated and split later. Only the text columns are filled: a
# frame-wide fillna('') would also write strings into the numeric
# FavoriteCount column. Reassigning (rather than inplace=True) keeps the
# cell safe to re-run.
df = df.fillna({"Title": "", "Body": "", "Tags": ""})

# Independent working copy for the bag-of-words experiments.
data = df.copy()

In [44]:
# Display the working copy.
data

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount
0,minification failed returning unminified content,made first website using mvc work fine local m...,|asp.net|asp.net-mvc|bundling-and-minification|,24395412,40,49177,0.0,12
1,ajax request made flux app,creating react application flux architecture t...,|javascript|reactjs|reactjs-flux|,26632415,199,40698,0.0,6
2,gem install debugger error,running rail mac everything smoothly sudden ge...,|ruby-on-rails|ruby|rubygems|rvm|,24395453,25,28278,0.0,6
3,change body parser limit firebase,trying proxy file upload firebase cloud functi...,|node.js|express|firebase|google-cloud-functio...,45489810,7,946,,1
4,io android material design hierarchical timing...,want animation introduced android material des...,|ios|uicollectionview|calayer|material-design|,26632893,11,895,0.0,3
...,...,...,...,...,...,...,...,...
49995,make toolbar grid layout,want make qtoolbar column button docked left s...,|c++|qt|qt5|,23638595,7,2137,,1
49996,using jmslistener multiple payload type destin...,write instance multiple type given destination...,|java|spring|jms|spring-jms|,42656519,6,6453,0.0,3
49997,testing resolved data test ngoninit,working testing guide wish write test ngoninit...,|javascript|angular|jasmine|karma-runner|resol...,42656045,34,67797,0.0,2
49998,unit test project reference mvc project,perhaps easier let visual studio create unit t...,|unit-testing|visual-studio-2013|asp.net-mvc-5|,23638602,10,7610,0.0,4


### **Préparation de la variable cible**

In [45]:
# Collect the 10 most frequent tags across all questions.
# Each Tags value looks like "|a|b|", so splitting on "|" leaves empty
# strings at both ends; they are dropped when flattening.
tags = data["Tags"].str.split("|").tolist()
nested_tags = [subtag for tag in tags for subtag in tag if subtag != ""]

# value_counts() already sorts by descending frequency. Naming the index and
# the count column explicitly avoids depending on the default column names
# of reset_index(), which differ between pandas versions.
df_tags = (
    pd.Series(nested_tags, name="tags")
    .value_counts()
    .rename_axis("tags")
    .reset_index(name="count")
)
real_tags = df_tags.head(10)["tags"].tolist()

In [46]:
# Turn the "|"-delimited Tags string into a list of individual tags.
data['filtered_tags'] = data['Tags'].str.split("|")

In [47]:
# Strip commas around each tag and drop tags that end up empty.
def strip_and_filter_commas(tag_list):
    """Return the tags of ``tag_list`` without leading/trailing commas.

    Tags that are empty once the commas are removed are skipped.
    """
    cleaned = []
    for raw_tag in tag_list:
        stripped = raw_tag.strip(',')
        if stripped:
            cleaned.append(stripped)
    return cleaned
# Clean every row's tag list (this also removes the empty strings left by the split).
data['filtered_tags'] = data['filtered_tags'].apply(strip_and_filter_commas)

In [48]:
# Flag each row whose 'filtered_tags' shares at least one tag with 'real_tags'.
data['contains_real_tag'] = data['filtered_tags'].apply(
    lambda row_tags: not set(real_tags).isdisjoint(row_tags)
)

In [50]:
# Keep the relevant tags; rows with none of them are labelled "Others" below.
allowed_tags = real_tags

def filter_tags(tag_list):
    """Keep only the tags of ``tag_list`` found in the global ``allowed_tags``.

    Returns ``["Others"]`` when none of the tags is allowed.
    """
    kept = [tag for tag in tag_list if tag in allowed_tags]
    if kept:
        return kept
    return ["Others"]


# Apply the filter to every row, then display the resulting column.
data['filtered_tags'] = data['filtered_tags'].apply(filter_tags)
data['filtered_tags']

0            [Others]
1        [javascript]
2            [Others]
3            [Others]
4               [ios]
             ...     
49995           [c++]
49996          [java]
49997    [javascript]
49998        [Others]
49999        [Others]
Name: filtered_tags, Length: 50000, dtype: object

## **Encodage et séparation des données en trainset et testset**

In [51]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the label set and binarise the tag lists in a single step.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['filtered_tags'])

In [52]:
# One document per question: title and body separated by a single space.
corpus = data['Title'] + ' ' + data['Body']

# Bag-of-words counts.
# NOTE(review): the vocabulary is fitted on every row, before the
# train/test split performed in the next cell.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [53]:
# Split the data into training and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Expériences MLflow avec Logistic Classifier**

In [None]:
mlflow.set_experiment("logistiC Classifier")

def track_experiment(X_train, y_train, X_test, y_test, param_grid, message, month):
    """
    Track a logistic-regression hyperparameter sweep with MLflow.

    One parent run named ``message`` is opened and receives the ``month``
    parameter. Inside it, every combination of ``param_grid`` gets its own
    nested run in which a one-vs-rest ``LogisticRegression`` is fitted,
    logged, registered as "Logistic Classifier model" and scored on the
    test set.

    Parameters:
        X_train: training features.
        y_train: training labels.
        X_test: test features.
        y_test: test labels.
        param_grid: dict mapping each hyperparameter name to the values to try.
        message: name given to the parent MLflow run.
        month: value logged as the "month" parameter of the parent run.

    Returns:
        None
    """
    # Close any run left open by an earlier cell so that the parent run
    # opened below is the one that receives the "month" parameter.
    if mlflow.active_run():
        mlflow.end_run()

    # The context manager closes the parent run on exit, even if a fit fails.
    with mlflow.start_run(run_name=message):
        # Log the month for which the experiment is conducted
        mlflow.log_param("month", month)

        # Loop over parameter grid
        for params in ParameterGrid(param_grid):
            # One nested run per hyperparameter combination
            with mlflow.start_run(nested=True):
                mlflow.log_params(params)

                # Instantiate model with current set of hyperparameters
                base_model = LogisticRegression(**params)
                model_logistic = OneVsRestClassifier(base_model)

                # Train the model and time the fit
                start_time = time.time()
                model_logistic.fit(X_train, y_train)
                end_time = time.time()

                # Log the model, then register it in the Model Registry
                mlflow.sklearn.log_model(model_logistic, "model_logistic")
                model_uri = f"runs:/{mlflow.active_run().info.run_id}/model_logistic"
                mlflow.register_model(model_uri, "Logistic Classifier model")

                # Make predictions
                y_pred = model_logistic.predict(X_test)

                # Calculate metrics
                jaccard = jaccard_score(y_test, y_pred, average='weighted')
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                # Log metrics
                mlflow.log_metric("Jaccard", round(jaccard, 2))
                mlflow.log_metric("accuracy", round(accuracy, 2))
                mlflow.log_metric("precision", round(precision, 2))
                mlflow.log_metric("recall", round(recall, 2))
                mlflow.log_metric("f1", round(f1, 2))
                mlflow.log_metric("training_time", end_time - start_time)

# Example usage:
param_grid_logistic = {
    "max_iter": [500],
    "solver" :['liblinear'],
}

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

for month in months:
    track_experiment(X_train, y_train, X_test, y_test, param_grid=param_grid_logistic, message="logistiC Classifier", month=month)

Successfully registered model 'Logistic Classifier model'.
Created version '1' of model 'Logistic Classifier model'.
Registered model 'Logistic Classifier model' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic Classifier model'.
Registered model 'Logistic Classifier model' already exists. Creating a new version of this model...
Created version '3' of model 'Logistic Classifier model'.
Registered model 'Logistic Classifier model' already exists. Creating a new version of this model...
Created version '4' of model 'Logistic Classifier model'.
Registered model 'Logistic Classifier model' already exists. Creating a new version of this model...
Created version '5' of model 'Logistic Classifier model'.
Registered model 'Logistic Classifier model' already exists. Creating a new version of this model...
Created version '6' of model 'Logistic Classifier model'.
Registered model 'Logistic Classifier model' already exists. Creating a new version of t

## **Entraînement du modèle SGD Classifier**

---



In [None]:
mlflow.set_experiment("SGD Classifier")

def track_experiment(X_train, y_train, X_test, y_test, param_grid, message, month):
    """
    Track an SGD-classifier hyperparameter sweep with MLflow.

    One parent run named ``message`` is opened and receives the ``month``
    parameter. Inside it, every combination of ``param_grid`` gets its own
    nested run in which a one-vs-rest ``SGDClassifier`` is fitted, logged,
    registered as "SGD Classifier Model" and scored on the test set.

    Parameters:
        X_train: training features.
        y_train: training labels.
        X_test: test features.
        y_test: test labels.
        param_grid: dict mapping each hyperparameter name to the values to try.
        message: name given to the parent MLflow run.
        month: value logged as the "month" parameter of the parent run.

    Returns:
        None
    """
    # Close any run left open by an earlier cell so that the parent run
    # opened below is the one that receives the "month" parameter.
    if mlflow.active_run():
        mlflow.end_run()

    # The context manager closes the parent run on exit, even if a fit fails.
    with mlflow.start_run(run_name=message):
        # Log the month for which the experiment is conducted
        mlflow.log_param("month", month)

        # Loop over parameter grid
        for params in ParameterGrid(param_grid):
            # One nested run per hyperparameter combination
            with mlflow.start_run(nested=True):
                mlflow.log_params(params)

                # Instantiate model with current set of hyperparameters
                base_model = SGDClassifier(**params)
                model_sgd = OneVsRestClassifier(base_model)

                # Train the model and time the fit
                start_time = time.time()
                model_sgd.fit(X_train, y_train)
                end_time = time.time()

                # Log the model, then register it in the Model Registry
                mlflow.sklearn.log_model(model_sgd, "model_sgd")
                model_uri = f"runs:/{mlflow.active_run().info.run_id}/model_sgd"
                mlflow.register_model(model_uri, "SGD Classifier Model")

                # Make predictions
                y_pred = model_sgd.predict(X_test)

                # Calculate metrics
                jaccard = jaccard_score(y_test, y_pred, average='weighted')
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                # Log metrics
                mlflow.log_metric("Jaccard", round(jaccard, 2))
                mlflow.log_metric("accuracy", round(accuracy, 2))
                mlflow.log_metric("precision", round(precision, 2))
                mlflow.log_metric("recall", round(recall, 2))
                mlflow.log_metric("f1", round(f1, 2))
                mlflow.log_metric("training_time", end_time - start_time)

# Example usage:
param_grid = {
    "loss" : ["hinge"],
    "max_iter": [500]
}
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

for month in months:
    track_experiment(X_train, y_train, X_test, y_test, param_grid, message="SGD Classifier", month=month)

## **Entraînement du modèle KNN Classifier**

In [None]:
mlflow.set_experiment("KNN Classifier")

def track_experiment(X_train, y_train, X_test, y_test, param_grid, message, month):
    """
    Track a k-nearest-neighbours hyperparameter sweep with MLflow.

    One parent run named ``message`` is opened and receives the ``month``
    parameter. Inside it, every combination of ``param_grid`` gets its own
    nested run in which a ``KNeighborsClassifier`` is fitted, logged,
    registered as "KNN Model" and scored on the test set.

    Parameters:
        X_train: training features.
        y_train: training labels.
        X_test: test features.
        y_test: test labels.
        param_grid: dict mapping each hyperparameter name to the values to try.
        message: name given to the parent MLflow run.
        month: value logged as the "month" parameter of the parent run.

    Returns:
        None
    """
    # Close any run left open by an earlier cell so that the parent run
    # opened below is the one that receives the "month" parameter.
    if mlflow.active_run():
        mlflow.end_run()

    # The context manager closes the parent run on exit, even if a fit fails.
    with mlflow.start_run(run_name=message):
        # Log the month for which the experiment is conducted
        mlflow.log_param("month", month)

        # Loop over parameter grid
        for params in ParameterGrid(param_grid):
            # One nested run per hyperparameter combination
            with mlflow.start_run(nested=True):
                mlflow.log_params(params)

                # Instantiate model with current set of hyperparameters
                model_knn = KNeighborsClassifier(**params)

                # Train the model and time the fit
                start_time = time.time()
                model_knn.fit(X_train, y_train)
                end_time = time.time()

                # Log the model, then register it in the Model Registry
                mlflow.sklearn.log_model(model_knn, "model_knn")
                model_uri = f"runs:/{mlflow.active_run().info.run_id}/model_knn"
                mlflow.register_model(model_uri, "KNN Model")

                # Make predictions
                y_pred = model_knn.predict(X_test)

                # Calculate metrics
                jaccard = jaccard_score(y_test, y_pred, average='weighted')
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                # Log metrics
                mlflow.log_metric("Jaccard", round(jaccard, 2))
                mlflow.log_metric("accuracy", round(accuracy, 2))
                mlflow.log_metric("precision", round(precision, 2))
                mlflow.log_metric("recall", round(recall, 2))
                mlflow.log_metric("f1", round(f1, 2))
                mlflow.log_metric("training_time", end_time - start_time)

# Example usage:
param_grid_knn = {
    "n_neighbors": [3, 5, 7, 10]
}
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

for month in months:
    track_experiment(X_train, y_train, X_test, y_test, param_grid=param_grid_knn, message="KNN Classifier", month=month)


In [None]:
mlflow.set_experiment("RDF Classifier")

def track_experiment(X_train, y_train, X_test, y_test, param_grid, message, month):
    """
    Track a random-forest classifier-chain hyperparameter sweep with MLflow.

    One parent run named ``message`` is opened and receives the ``month``
    parameter. Inside it, every combination of ``param_grid`` gets its own
    nested run in which a ``ClassifierChain`` over a
    ``RandomForestClassifier`` is fitted, logged, registered as
    "RDF Classifier model" and scored on the test set.

    Parameters:
        X_train: training features.
        y_train: training labels (1-D input is reshaped to one column).
        X_test: test features.
        y_test: test labels (1-D input is reshaped to one column).
        param_grid: dict mapping each hyperparameter name to the values to try.
        message: name given to the parent MLflow run.
        month: value logged as the "month" parameter of the parent run.

    Returns:
        None
    """
    # Artifact path shared by log_model and the registry URI. The two used
    # to differ ("model_RDF" vs "chain"), so the registered URI pointed at
    # an artifact that was never logged.
    artifact_path = "model_RDF"

    # Make sure the label arrays are 2-D matrices.
    if len(y_train.shape) == 1:
        y_train = y_train.reshape(-1, 1)
    if len(y_test.shape) == 1:
        y_test = y_test.reshape(-1, 1)

    # Close any run left open by an earlier cell so that the parent run
    # opened below is the one that receives the "month" parameter.
    if mlflow.active_run():
        mlflow.end_run()

    # The context manager closes the parent run on exit, even if a fit fails.
    with mlflow.start_run(run_name=message):
        # Log the month for which the experiment is conducted
        mlflow.log_param("month", month)

        # Loop over parameter grid
        for params in ParameterGrid(param_grid):
            # One nested run per hyperparameter combination
            with mlflow.start_run(nested=True):
                # Log hyperparameters (previously announced but never done)
                mlflow.log_params(params)

                # Instantiate model with current set of hyperparameters
                base_rf = RandomForestClassifier(**params)

                # Chain the per-label random forests in a random, seeded order
                chain = ClassifierChain(base_rf, order='random', random_state=42)

                # Train the model and time the fit
                start_time = time.time()
                chain.fit(X_train, y_train)
                end_time = time.time()

                # Log the model, then register it in the Model Registry
                mlflow.sklearn.log_model(chain, artifact_path)
                model_uri = f"runs:/{mlflow.active_run().info.run_id}/{artifact_path}"
                mlflow.register_model(model_uri, "RDF Classifier model")

                # Make predictions
                y_pred = chain.predict(X_test)

                # Calculate metrics
                jaccard = jaccard_score(y_test, y_pred, average='weighted')
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                # Log metrics
                mlflow.log_metric("Jaccard", round(jaccard, 2))
                mlflow.log_metric("accuracy", round(accuracy, 2))
                mlflow.log_metric("precision", round(precision, 2))
                mlflow.log_metric("recall", round(recall, 2))
                mlflow.log_metric("f1", round(f1, 2))
                mlflow.log_metric("training_time", end_time - start_time)

# Example usage:
param_grid_rf = {
    "n_estimators": [50]
}
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

for month in months:
    track_experiment(X_train, y_train, X_test, y_test, param_grid=param_grid_rf, message="RDF Classifier", month=month)


In [None]:
# Start the MLflow UI and print the public URL of its ngrok tunnel.
tracker()

## **Word2Vec Embedding**

In [None]:
 # Separate working copy of df for the embedding experiments below.
 DATA = df.copy()

In [None]:
# CPU count reported by multiprocessing; passed to Word2Vec as `workers` below.
num_workers = multiprocessing.cpu_count()
num_workers

In [None]:
# Build one text per question by joining its title and body.
sentences = DATA['Title'] + ' ' + DATA['Body']

import nltk
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it
# up under "punkt_tab", older ones under "punkt", so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# One list of tokens per question.
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

In [None]:
# Train the Word2Vec model on the tokenised questions.
word2vec_model = Word2Vec(tokenized_sentences, vector_size=50, window=5, min_count=2, workers=num_workers)

In [None]:
def sentence_embedding(sentence, model):
    """Average the word vectors of the tokens of ``sentence`` known to ``model``.

    Parameters:
        sentence: iterable of tokens.
        model: object exposing ``wv`` (token -> vector lookup supporting
            ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length ``model.vector_size`` when no token is known.
    """
    known_vectors = []
    for token in sentence:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # No token of the sentence is in the model: fall back to a null vector.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Build one sentence embedding per entry of tokenized_sentences.
sentence_embeddings = [sentence_embedding(sentence, word2vec_model) for sentence in tokenized_sentences]


In [None]:
# Stack the per-sentence vectors into a single array.
X = np.array(sentence_embeddings)

In [None]:
# Collect the 30 most frequent tags across all questions in DATA.
# The row loop used to be bounded by len(data) while reading from DATA;
# everything now comes from DATA only.
# Each Tags value looks like "|a|b|", so splitting on "|" leaves empty
# strings at both ends; they are dropped when flattening.
tags = DATA["Tags"].str.split("|").tolist()
nested_tags = [subtag for tag in tags for subtag in tag if subtag != ""]

# value_counts() already sorts by descending frequency. Naming the index and
# the count column explicitly avoids depending on the default column names
# of reset_index(), which differ between pandas versions.
df_tags = (
    pd.Series(nested_tags, name="tags")
    .value_counts()
    .rename_axis("tags")
    .reset_index(name="count")
)
real_tags = df_tags.head(30)["tags"].tolist()

In [None]:
# Turn the "|"-delimited Tags string into a list of individual tags.
DATA['filtered_tags'] = DATA['Tags'].str.split("|")

In [None]:
# Strip commas around each tag and drop tags that end up empty.
def strip_and_filter_commas(tag_list):
    """Return the comma-stripped tags of ``tag_list``, skipping empty results."""
    stripped_tags = (tag.strip(',') for tag in tag_list)
    return [tag for tag in stripped_tags if tag]
# Clean every row's tag list (this also removes the empty strings left by the split).
DATA['filtered_tags'] = DATA['filtered_tags'].apply(strip_and_filter_commas)

In [None]:
# Flag each row whose 'filtered_tags' shares at least one tag with 'real_tags'.
DATA['contains_real_tag'] = DATA['filtered_tags'].apply(
    lambda row_tags: not set(real_tags).isdisjoint(row_tags)
)

In [None]:
# Keep the relevant tags; rows with none of them are labelled "Others" below.
allowed_tags = real_tags

def filter_tags(tag_list):
    """Return the tags of ``tag_list`` present in the global ``allowed_tags``.

    Falls back to ``["Others"]`` when no tag is allowed.
    """
    matching = [tag for tag in tag_list if tag in allowed_tags]
    return matching if matching else ["Others"]


# Apply the filter to every row of DATA.
DATA['filtered_tags'] = DATA['filtered_tags'].apply(filter_tags)

In [None]:
# Fit a MultiLabelBinarizer on the tag lists, then encode them.
mlb = MultiLabelBinarizer()
mlb.fit(DATA['filtered_tags'])
y_encoded = mlb.transform(DATA['filtered_tags'])

In [None]:
# Split the data into a training set and a test set (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
# 5. Train a one-vs-rest logistic regression and time the fit.
classifier = OneVsRestClassifier(LogisticRegression())
start_time = time.time()
classifier.fit(X_train, y_train)
end_time = time.time()

In [None]:
# 6. Evaluate the model on the test set.
y_pred = classifier.predict(X_test)

# Every averaged score uses the same 'weighted' averaging.
weighted = {"average": "weighted"}
jaccard = jaccard_score(y_test, y_pred, **weighted)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
report = classification_report(y_test, y_pred)

# Print the scores rounded to two decimals, then the timing and the report.
for metric_name, metric_value in (
    ("Jaccard", jaccard),
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
):
    print(metric_name, round(metric_value, 2))
print("training_time", end_time - start_time)
print("Classification Report:\n", report)

## **USE Embedding**

In [None]:
# Load the Universal Sentence Encoder from TF-Hub.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


class USEModel(tf.keras.Model):
    """Keras model wrapping the loaded encoder in a non-trainable ``hub.KerasLayer``."""

    def __init__(self, use_model):
        super().__init__()
        # trainable=False: the layer is created as non-trainable.
        self.use_layer = hub.KerasLayer(use_model, trainable=False)

    def call(self, inputs):
        """Return the wrapped layer's output for ``inputs``."""
        embeddings = self.use_layer(inputs)
        return embeddings


# Instantiate the custom model around the loaded encoder.
use_custom_model = USEModel(use_model)

In [None]:
# Work on a reproducible 50% sample of the data.
sampled_data = DATA.sample(frac=0.5, random_state=42)

# One text per sampled question: title and body separated by a space.
sentences = (sampled_data['Title'] + ' ' + sampled_data['Body']).tolist()

import nltk
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it
# up under "punkt_tab", older ones under "punkt", so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Join each token list back into a single string.
input_text = [" ".join(tokens) for tokens in tokenized_sentences]

# Encode in fixed-size batches instead of one forward pass over the whole
# sample, which keeps peak memory bounded. Each batch is passed as a 1-D
# string tensor; the per-batch outputs are concatenated so that
# `sentence_embeddings` is still a single tensor with one row per text.
USE_BATCH_SIZE = 256
embedding_batches = [
    use_custom_model(inputs=tf.constant(input_text[start:start + USE_BATCH_SIZE]))
    for start in range(0, len(input_text), USE_BATCH_SIZE)
]
sentence_embeddings = tf.concat(embedding_batches, axis=0)



In [None]:
# Convert the embeddings to a NumPy array.
X = sentence_embeddings.numpy()

In [None]:
# Fit a MultiLabelBinarizer on the sampled tag lists, then encode them.
# NOTE(review): despite its name, y_pred_bert holds the encoded target labels
# for this USE experiment (it is passed as `y` to train_test_split), not
# predictions.
mlb = MultiLabelBinarizer()
mlb.fit(sampled_data['filtered_tags'])
y_pred_bert = mlb.transform(sampled_data['filtered_tags'])

In [None]:
# Split the data into a training set and a test set (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y_pred_bert, test_size=0.2, random_state=42)

In [None]:
# 5. Train a one-vs-rest logistic regression and time the fit.
classifier = OneVsRestClassifier(LogisticRegression())
start_time = time.time()
classifier.fit(X_train, y_train)
end_time = time.time()

In [None]:
# 6. Evaluate the model on the test set.
from sklearn.metrics import classification_report
y_pred_use = classifier.predict(X_test)

# Every averaged score uses the same 'weighted' averaging.
weighted = {"average": "weighted"}
jaccard = jaccard_score(y_test, y_pred_use, **weighted)
accuracy = accuracy_score(y_test, y_pred_use)
precision = precision_score(y_test, y_pred_use, **weighted)
recall = recall_score(y_test, y_pred_use, **weighted)
f1 = f1_score(y_test, y_pred_use, **weighted)
report = classification_report(y_test, y_pred_use)

# Print the scores rounded to two decimals, then the timing and the report.
for metric_name, metric_value in (
    ("Jaccard", jaccard),
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
):
    print(metric_name, round(metric_value, 2))
print("training_time", end_time - start_time)
print("Classification Report:\n", report)

## **BERT Embedding**

In [None]:
# Log in to the Hugging Face Hub from the notebook.
from huggingface_hub import notebook_login
notebook_login()

# Import the required libraries.
from transformers import AutoTokenizer, AutoModel
import gc

# Run a garbage-collection pass to free memory.
gc.collect()

# Clear this cell's displayed output.
# NOTE: clear_output() only clears what is displayed; it does not restart the runtime.
from IPython.display import clear_output
clear_output()

In [None]:
# Load a BERT preprocessing layer and a BERT model from TF-Hub.
# NOTE(review): neither bert_preprocess_model nor bert_model is referenced
# again in the cells below, which build their embeddings with DistilBERT from
# transformers. Confirm whether this cell is still needed.
bert_model_path = "https://tfhub.dev/google/experts/bert/wiki_books/sst2/2"
bert_preprocess_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_model = hub.KerasLayer(bert_model_path)

In [None]:
# Work on a reproducible 10% sample of DATA (fixed random_state).
sampled_data = DATA.sample(frac=0.1, random_state=42)

In [None]:
# One text per sampled question: title and body separated by a space.
texts = sampled_data['Title'] + ' ' + sampled_data['Body']

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
# Tokenise all sampled texts in one call, with padding and truncation enabled,
# returning PyTorch tensors.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenized_texts = tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt')

In [None]:
from torch.utils.data import DataLoader, TensorDataset
batch_size = 16  # Adjust this size to your needs
# Pair each row of input ids with its attention mask and iterate over them in batches.
dataset = TensorDataset(tokenized_texts['input_ids'], tokenized_texts['attention_mask'])
dataloader = DataLoader(dataset, batch_size=batch_size)

In [None]:
import logging
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertModel

# Set up logging at INFO level and create the logger used by the embedding cell below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
all_embeddings = []

logger.info("Creating DistilBERT embeddings in batches...")
# Run on the GPU when one is available; results are moved back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
model.eval()

for batch in dataloader:
    input_ids, attention_mask = (tensor.to(device) for tensor in batch)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        token_states = outputs.last_hidden_state

        # Mean-pool over real tokens only. The texts were tokenised with
        # padding, so a plain mean over dim=1 would also average the padding
        # positions; the attention mask zeroes them out and provides the
        # per-text token count used as the divisor.
        token_mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
        summed_states = (token_states * token_mask).sum(dim=1)
        token_counts = token_mask.sum(dim=1).clamp(min=1.0)
        batch_embeddings = summed_states / token_counts

        all_embeddings.append(batch_embeddings.cpu())

embeddings = torch.cat(all_embeddings, dim=0)
logger.info("Embeddings created successfully.")

In [None]:
# Move the embeddings to the CPU and convert them to a NumPy array.
X = embeddings.cpu().numpy()

In [None]:
# Fit a MultiLabelBinarizer on the sampled tag lists, then encode them.
# NOTE(review): despite its name, y_pred_bert holds the encoded target labels
# (it is passed as `y` to train_test_split), not predictions.
mlb = MultiLabelBinarizer()
mlb.fit(sampled_data['filtered_tags'])
y_pred_bert = mlb.transform(sampled_data['filtered_tags'])

In [None]:
# Split the data into a training set and a test set (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y_pred_bert, test_size=0.2, random_state=42)

In [None]:
# 5. Train a one-vs-rest logistic regression and time the fit.
classifier = OneVsRestClassifier(LogisticRegression())
start_time = time.time()
classifier.fit(X_train, y_train)
end_time = time.time()

In [None]:
# 6. Evaluate the model on the test set.
y_pred = classifier.predict(X_test)

# Every averaged score uses the same 'weighted' averaging.
weighted = {"average": "weighted"}
jaccard = jaccard_score(y_test, y_pred, **weighted)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
report = classification_report(y_test, y_pred)

# Print the scores rounded to two decimals, then the timing and the report.
for metric_name, metric_value in (
    ("Jaccard", jaccard),
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
):
    print(metric_name, round(metric_value, 2))
print("training_time", end_time - start_time)
print("Classification Report:\n", report)