In [1]:
# manipulation des données
import numpy as np
import pandas as pd

# matplotlib et seaborn pour les représentations graphiques
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset and discard rows that have no text in the 'words' column,
# since the sentiment analyzer needs a string for every row.
df = pd.read_csv("../Data/2.sample_dataset.csv").dropna(subset=['words'])

# Draw a fixed-size evaluation sample. The seed makes the sample (and therefore
# every metric computed from it later) reproducible across kernel restarts.
df_sample = df.sample(10000, random_state=42)

# Class balance of the sampled rows.
df_sample['target'].value_counts()

target
1    5037
0    4963
Name: count, dtype: int64

In [3]:
from sklearn.model_selection import train_test_split

# Features are the text column, labels are the binary 'target' column.
X, y = df_sample['words'], df_sample['target']

# Hold out 20% of the sample; the fixed seed keeps the split stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
import mlflow
from pysentimiento import create_analyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix
import joblib

# Evaluate a pre-trained pysentimiento sentiment analyzer on `df_sample` and
# record metrics, a confusion matrix, a ROC plot and a model checkpoint in MLflow.
artifact_path = './artifacts/'

best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("BERTweet")

with mlflow.start_run(run_name="roberta-targeted-sentiment-analysis"):
    # This analyzer is the classifier that is actually scored below.
    clf = create_analyzer(task="sentiment", lang="en")

    # NOTE(review): the checkpoint loaded here is the one saved and logged as
    # the run's model artifact, but it is loaded independently of `clf`, so it
    # is presumably not the model that produced the metrics of this run.
    # Confirm which model is meant to be persisted.
    model_name = "pysentimiento/roberta-targeted-sentiment-analysis"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    y_true = df_sample['target']

    # The analyzer is three-way (POS / NEG / NEU) while the target is binary:
    # only POS counts as the positive class, NEU is folded into the negative class.
    label_to_target = {'POS': 1, 'NEG': 0, 'NEU': 0}

    y_pred = []   # hard 0/1 predictions, used for the thresholded metrics
    y_score = []  # probability of POS, used for the ROC curve / AUC
    for text in df_sample['words']:
        prediction = clf.predict(text)
        if prediction.output not in label_to_target:
            # Fail here with a clear message instead of passing a non-numeric
            # value on to the metric functions.
            raise ValueError(f"Unexpected sentiment label: {prediction.output!r}")
        y_pred.append(label_to_target[prediction.output])
        y_score.append(prediction.probas['POS'])

    mlflow.log_metric("accuracy", accuracy_score(y_true, y_pred))
    mlflow.log_metric("Precision", precision_score(y_true, y_pred))
    mlflow.log_metric("Recall", recall_score(y_true, y_pred))
    mlflow.log_metric("F1_Score", f1_score(y_true, y_pred))
    mlflow.log_param("task", "sentiment")
    mlflow.log_param("lang", "en")
    mlflow.log_param("model_name", model_name)

    # A ROC curve needs continuous scores. With hard 0/1 labels it collapses to
    # a single operating point and the AUC becomes uninformative.
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    mlflow.log_metric("AUC", roc_auc)

    # Persist the checkpoint locally, then attach the directory to the run.
    best_model_path = f"{artifact_path}roberta-targeted-sentiment-analysis"
    tokenizer.save_pretrained(best_model_path)
    model.save_pretrained(best_model_path)
    mlflow.log_artifact(best_model_path, "roberta-targeted-sentiment-analysis")

    # Confusion matrix of the hard predictions, stored as a bare CSV grid.
    conf_matrix = confusion_matrix(y_true, y_pred)
    conf_matrix_path = f"{artifact_path}roberta-targeted-sentiment-analysis.csv"
    pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
    mlflow.log_artifact(conf_matrix_path, "metrics")

    # ROC plot: saved to disk and closed without being displayed inline.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    ax.grid(True)
    roc_curve_path = f"{artifact_path}roc_roberta-targeted-sentiment-analysis.png"
    fig.savefig(roc_curve_path)
    plt.close(fig)
    mlflow.log_artifact(roc_curve_path, "plots")

In [18]:
# Register an artifact of a specific, already finished run in the MLflow Model Registry.
# NOTE(review): the run id is hard-coded and the artifact sub-path ends in "-model";
# confirm that both still match the run and the artifact path that should be registered.
result = mlflow.register_model(
    model_uri="runs:/2611a276d34e47889790945b2783644d/roberta-targeted-sentiment-analysis-model",
    name="roberta-targeted-sentiment-analysis-model",
)

Successfully registered model 'roberta-targeted-sentiment-analysis-model'.
Created version '1' of model 'roberta-targeted-sentiment-analysis-model'.
