# Imports

In [2]:
import pandas as pd
import re
import os


import nltk
# Download the NLTK resources used below into the default nltk_data directory (no-op when already up to date)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/laetitiataddei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/laetitiataddei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/laetitiataddei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Strategies

This notebook aims to test several basic models for binary classification. We will start by preprocessing tweets like we did during the EDA (URLs, mentions, hashtags,...), then tokens will be vectorized using either BoW or TF-IDF. Then, we will compare basic machine learning models using MLflow to measure the performance of each model and ease the comparison.

**Models to test:**
- Logistic regression - Simple yet effective linear model for binary classification. Works great with bag-of-words or TF-IDF features. Outputs probabilities, which are useful for threshold tuning.
- Random Forest Classifier - Can capture non-linear word interactions without manual feature engineering. Easy to interpret (at least for shallow trees).
- SVM (linear) - Finds the maximum-margin hyperplane separating the two classes. Scales well to sparse, high-dimensional text features (BoW / TF-IDF) and is usually a strong baseline. Outputs decision scores rather than probabilities.
- Naive bayes - Very fast and efficient for text (especially with sparse, high-dimensional data). Based on word occurrence probabilities, which often work surprisingly well in text classification. Robust even with relatively small datasets.

**Metrics :**
- Accuracy for an overall performance check
- Precision (true positives / all predicted positives)
- Recall 
- F1-score

# Data and preparation

In [2]:
# Notebook-wide pandas display settings: show every column and allow wide
# frames to wrap over several lines instead of being truncated.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', True),
):
    pd.set_option(option_name, option_value)

In [3]:
path = "../data/training.1600000.processed.noemoticon.csv"

# The CSV has no header row, so column names are supplied here. "flag" is
# named only so that it can be excluded through `usecols`.
column_names = ["target", "ids", "date", "flag", "user", "text"]
kept_columns = [name for name in column_names if name != "flag"]

# Load the data from the CSV file into a pandas DataFrame
data = pd.read_csv(
    path,
    header=None,
    names=column_names,
    usecols=kept_columns,
    parse_dates=["date"],
    encoding="utf-8",
    encoding_errors="replace",  # undecodable bytes become U+FFFD instead of raising
)

# Display the first few rows of the DataFrame and its shape
display(data.head())
print(f'Shape of the DataFrame: {data.shape}')

# Class distribution before and after recoding the positive label 4 -> 1
counts_before = data['target'].value_counts()
print(f"Number of unique target values: {counts_before}")
data['target'] = data['target'].replace({4: 1})
counts_after = data['target'].value_counts()
print(f"Number of unique target values after replacement: {counts_after}")

  data = pd.read_csv(path,


Unnamed: 0,target,ids,date,user,text
0,0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all...."


Shape of the DataFrame: (1600000, 5)
Number of unique target values: target
0    800000
4    800000
Name: count, dtype: int64
Number of unique target values after replacement: target
0    800000
1    800000
Name: count, dtype: int64


## Preprocessing

In [4]:
# Negation words that must survive stop-word removal because they carry sentiment.
# NOTE(review): apostrophes are stripped by the character filter in
# `process_text` before tokenisation, so the contracted forms listed here
# (e.g. "don't") probably never match a token as written - confirm before
# relying on them.
_NEGATIVE_WORDS = frozenset({
    'no', 'not', 'nor', "don't", "aren't", "couldn't", "didn't", "doesn't",
    "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
    "needn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't",
    "never", "none", "nobody", "nothing", "nowhere", "neither"
})

# Filled on first use, so that defining this cell does not read the NLTK
# stop-word corpus and the set is built once rather than on every call.
_STOP_WORDS_TO_REMOVE = None


def _get_stop_words_to_remove():
    """Return the English stop words minus the negation words (built once, then cached)."""
    global _STOP_WORDS_TO_REMOVE
    if _STOP_WORDS_TO_REMOVE is None:
        _STOP_WORDS_TO_REMOVE = frozenset(stopwords.words('english')) - _NEGATIVE_WORDS
    return _STOP_WORDS_TO_REMOVE


def process_text(text, lemmatize=False, stemming=False):
    """
    Cleans and preprocesses a single text string by replacing URLs, mentions, and hashtags,
    converting to lowercase, removing special characters, optionally lemmatizing or
    stemming the tokens, and removing stopwords (negation words are kept).

    Args:
        text (str): The text string to process. Missing values (None / NaN) yield "".
        lemmatize (bool): If True, lemmatize each token with WordNetLemmatizer.
        stemming (bool): If True, stem each token with PorterStemmer.

    Returns:
        str: The processed tokens joined by single spaces.

    Raises:
        ValueError: If both `lemmatize` and `stemming` are True.
    """
    # Validate the options first so that a bad combination is reported even
    # when the text itself is missing.
    if stemming and lemmatize:
        raise ValueError("Cannot use both stemming and lemmatization at the same time. Choose one.")

    if pd.isna(text):
        return ""

    # Replace URLs with <URL>
    processed = re.sub(r'https?://\S+', '<URL>', text)
    # Replace mentions with <MENTION>
    processed = re.sub(r'@[A-Za-z0-9_]+', '<MENTION>', processed)
    # Keep the '#' and replace the tag word itself: "#word" -> "#<HASHTAG>"
    processed = re.sub(r'#([A-Za-z0-9_]+)', r'#<HASHTAG>', processed)

    # Convert text to lowercase (the placeholders become lowercase as well)
    processed = processed.lower()

    # Keep only letters, digits, whitespace and the characters . ! ? < > #
    # (so the placeholder brackets survive); everything else, including
    # apostrophes and commas, is dropped.
    processed = re.sub(r'[^a-z0-9\s.!?<>#]', '', processed)

    # Tokenize the text. The tokenizer splits the placeholder brackets into
    # separate tokens, e.g. "<mention>" ends up as "< mention >" in the result.
    tokens = word_tokenize(processed)

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    # Remove stopwords (negations excluded) and join back to a string
    stop_words_to_remove = _get_stop_words_to_remove()
    filtered_tokens = [word for word in tokens if word not in stop_words_to_remove]

    return ' '.join(filtered_tokens)


In [5]:
# Tokenizer resource required by word_tokenize inside process_text
nltk.download('punkt_tab')

# Draw a balanced sample: 25,000 tweets per class with a fixed seed
negative_sample, positive_sample = (
    data[data['target'] == class_label].sample(n=25000, random_state=42)
    for class_label in (0, 1)
)
# Stack the two samples (negatives first) under a fresh 0..N-1 index
sample_df = pd.concat([negative_sample, positive_sample], ignore_index=True)

# Build one processed-text column per normalisation strategy
for column_name, text_options in (
    ('processed_text_lem', {'lemmatize': True}),
    ('processed_text_stem', {'stemming': True}),
):
    sample_df[column_name] = sample_df['text'].apply(process_text, **text_options)

# Display the last few rows of the processed DataFrame
display(sample_df.tail())
print(f'Shape of the processed DataFrame: {sample_df.shape}')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/laetitiataddei/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,target,ids,date,user,text,processed_text_lem,processed_text_stem
49995,1,2185646565,2009-06-15 17:41:41,nazbear,@abcdude I'll bet your plane takes off to the ...,< mention > ill bet plane take tune quotill al...,< mention > ill bet plane take tune quotil alr...
49996,1,1556812170,2009-04-19 00:28:02,digmyshine,Gaaah just got some good news about day 26!!!!,gaaah got good news day 26 ! ! ! !,gaaah got good news day 26 ! ! ! !
49997,1,1994848543,2009-06-01 12:33:13,TerezBaskin,"Salad: chicken, craisins, carrots, mushrooms, ...",salad chicken craisins carrot mushroom cheese ...,salad chicken craisin carrot mushroom chees ba...
49998,1,2068458420,2009-06-07 14:06:50,TidalWaves7,Chillin on a raft in Devin's pool.,chillin raft devins pool .,chillin raft devin pool .
49999,1,2056840796,2009-06-06 11:53:42,krpearce,I think it's about time for a spontaneous trip...,think time spontaneous trip home visit amazing...,think time spontan trip home visit amaz famili...


Shape of the processed DataFrame: (50000, 7)


## Split data to train and test sets

In [6]:
from sklearn.model_selection import train_test_split

# Features (X) and target (y) for the lemmatized and the stemmed text variants
X_lem, y_lem = sample_df['processed_text_lem'], sample_df['target']
X_stem, y_stem = sample_df['processed_text_stem'], sample_df['target']

# Both variants are split with the same settings: 80/20, fixed seed, stratified on the label
split_options = {"test_size": 0.2, "random_state": 42}

# Lemmatized variant
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(
    X_lem, y_lem, stratify=y_lem, **split_options
)
for part_name, part in (
    ("X_train_lem", X_train_lem),
    ("X_test_lem", X_test_lem),
    ("y_train_lem", y_train_lem),
    ("y_test_lem", y_test_lem),
):
    print(f"{part_name} shape: {part.shape}")

# Stemmed variant
X_train_stem, X_test_stem, y_train_stem, y_test_stem = train_test_split(
    X_stem, y_stem, stratify=y_stem, **split_options
)
for part_name, part in (
    ("X_train_stem", X_train_stem),
    ("X_test_stem", X_test_stem),
    ("y_train_stem", y_train_stem),
    ("y_test_stem", y_test_stem),
):
    print(f"{part_name} shape: {part.shape}")

# Split sizes reused later when logging dataset parameters
X_train_shape, X_test_shape = X_train_lem.shape, X_test_lem.shape

X_train_lem shape: (40000,)
X_test_lem shape: (10000,)
y_train_lem shape: (40000,)
y_test_lem shape: (10000,)
X_train_stem shape: (40000,)
X_test_stem shape: (10000,)
y_train_stem shape: (40000,)
y_test_stem shape: (10000,)


# Configuring MLflow

In [7]:
import os
from dotenv import load_dotenv
import time
import pickle

import mlflow
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient


from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix,fbeta_score, roc_curve, classification_report, make_scorer, matthews_corrcoef, balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm


# Turn on MLflow autologging for scikit-learn estimators
mlflow.sklearn.autolog()

# Point MLflow at the tracking server named in the environment
# (variables from a local .env file are loaded first)
load_dotenv()
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(tracking_uri)
print(f"MLflow Tracking URI: {tracking_uri}")

MLflow Tracking URI: http://127.0.0.1:8080


In [8]:
# Select "P7-Sentiments_Analysis" as the active MLflow experiment for the runs
# started later in this notebook. As the log output of this cell shows, MLflow
# creates the experiment when it does not exist yet.
mlflow.set_experiment("P7-Sentiments_Analysis")

2025/08/26 13:02:15 INFO mlflow.tracking.fluent: Experiment with name 'P7-Sentiments_Analysis' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/459239379042760612', creation_time=1756206135431, experiment_id='459239379042760612', last_update_time=1756206135431, lifecycle_stage='active', name='P7-Sentiments_Analysis', tags={}>

## Evaluation function

In [9]:
def evaluate_model(model, X_test, y_test, model_name, vectorizer_name):
    """
    Score a fitted binary classifier on a test set, print a report and build two figures.

    Args:
        model: The trained machine learning model to evaluate.
        X_test: The test features.
        y_test: The true labels for the test set.
        model_name: Model name, used in the printed report and figure titles.
        vectorizer_name: Vectorizer name, used in the printed report and figure titles.
    Returns:
        A 9-tuple ``(accuracy, precision, recall, f1, f2, roc_auc, fig_cm, fig_roc, y_pred)``:
        the six metrics, the confusion-matrix figure, the ROC-curve figure and the
        predicted labels. The figures are left open; closing them is up to the caller.
    """
    y_pred = model.predict(X_test)

    # Continuous scores for the ROC computations: positive-class probability
    # when available, otherwise the decision function, otherwise the hard labels.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    # Metrics, keyed by the label used in the printed report (insertion order
    # is also the order of the returned values)
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "F2 Score": fbeta_score(y_test, y_pred, beta=2),
        "ROC AUC": roc_auc_score(y_test, y_score),
    }
    roc_auc = metrics["ROC AUC"]

    # Confusion matrix heatmap
    cm = confusion_matrix(y_test, y_pred)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
    ax_cm.set_xlabel('Prediction')
    ax_cm.set_ylabel('Real label')
    ax_cm.set_title(f'Confusion matrix - {model_name} with {vectorizer_name}')

    # ROC curve, with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, y_score)
    fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
    ax_roc.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.3f})')
    ax_roc.plot([0, 1], [0, 1], 'k--')
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'ROC curve - {model_name} with {vectorizer_name}')
    ax_roc.legend(loc="lower right")
    ax_roc.grid(True)

    # Print the results
    print(f"\nResults for {model_name} with {vectorizer_name}:")
    for metric_label, metric_value in metrics.items():
        print(f"{metric_label}: {metric_value:.4f}")
    print("\nConfusion matrix:")
    print(cm)
    print("\nClassificatin report:")
    print(classification_report(y_test, y_pred))

    accuracy, precision, recall, f1, f2, roc_auc = metrics.values()
    return accuracy, precision, recall, f1, f2, roc_auc, fig_cm, fig_roc, y_pred

## Instantiate the vectorizers

In [10]:
# Instantiate the vectorizers: unigrams + bigrams, vocabulary capped at 10,000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
bow_vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 2))


# Vectorize texts into TF-IDF vectors
# NOTE: the same vectorizer object is fitted twice (lemmatized corpus, then
# stemmed corpus). The statement order matters: the lemmatized test set is
# transformed BEFORE the refit, so all four matrices are consistent with
# their own training corpus. After this cell, however, `tfidf_vectorizer`
# itself holds the vocabulary learned from the stemmed corpus only.
X_train_tfidf_lem = tfidf_vectorizer.fit_transform(X_train_lem)
X_test_tfidf_lem = tfidf_vectorizer.transform(X_test_lem)
X_train_tfidf_stem = tfidf_vectorizer.fit_transform(X_train_stem)
X_test_tfidf_stem = tfidf_vectorizer.transform(X_test_stem)

# Vectorize texts into Bag-of-Words vectors
# (same double-fit pattern: `bow_vectorizer` ends up fitted on the stemmed corpus)
X_train_bow_lem = bow_vectorizer.fit_transform(X_train_lem)
X_test_bow_lem = bow_vectorizer.transform(X_test_lem)
X_train_bow_stem = bow_vectorizer.fit_transform(X_train_stem)
X_test_bow_stem = bow_vectorizer.transform(X_test_stem)

## Instantiate models

In [11]:
# Instantiate the models
base_models = {
    "Logistic_Regression": LogisticRegression(random_state=42),
    "SVM_Lineaire": LinearSVC(random_state=42),
    "Random_Forest": RandomForestClassifier(random_state=42),
    "Naive_Bayes": MultinomialNB()
}

# Hyperparameter grids explored by GridSearchCV, keyed like `base_models`
param_grids = {
    "Logistic_Regression": {
        'C': [0.01, 0.1, 1.0, 10.0],
        'max_iter': [1000],
        'solver': ['liblinear', 'saga']
    },
    "SVM_Lineaire": {
        'C': [0.01, 0.1, 1.0, 10.0],
        'max_iter': [1000],
        'dual': [True, False]
    },
    "Random_Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },
    "Naive_Bayes": {
        'alpha': [0.1, 0.5, 1.0, 2.0]
    }
}


def _fit_like(vectorizer, corpus):
    """Return a new vectorizer with the same settings as `vectorizer`, fitted on `corpus`."""
    return type(vectorizer)(**vectorizer.get_params()).fit(corpus)


# Vectorizers
# Each entry is (fitted vectorizer, X_train, X_test, y_train, y_test).
# `tfidf_vectorizer` and `bow_vectorizer` are each fitted on two different
# corpora when the matrices are built, so the shared objects only remember the
# last vocabulary. Every entry therefore gets its own vectorizer, refitted on
# the matching training corpus with the same settings, so that the object
# stored next to a matrix is the one whose vocabulary describes that matrix's
# columns. This costs four extra fits.
vectorizers = {
    "TF-IDF_lem": (_fit_like(tfidf_vectorizer, X_train_lem), X_train_tfidf_lem, X_test_tfidf_lem, y_train_lem, y_test_lem),
    "BoW_lem": (_fit_like(bow_vectorizer, X_train_lem), X_train_bow_lem, X_test_bow_lem, y_train_lem, y_test_lem),
    "TF-IDF_stem": (_fit_like(tfidf_vectorizer, X_train_stem), X_train_tfidf_stem, X_test_tfidf_stem, y_train_stem, y_test_stem),
    "BoW_stem": (_fit_like(bow_vectorizer, X_train_stem), X_train_bow_stem, X_test_bow_stem, y_train_stem, y_test_stem),
}

# Scorers evaluated during cross-validation (all computed on predicted labels)
scorers = {
    'f2': make_scorer(fbeta_score, beta=2),
    'f1': make_scorer(f1_score),
    'mcc': make_scorer(matthews_corrcoef),
    'balanced_acc': make_scorer(balanced_accuracy_score)
}

## Training

In [12]:
train_models = True  # Set to True to train models
if train_models:

    # Total iterations for the progress bar
    total_iterations = len(base_models) * len(vectorizers)
    progress_bar = tqdm(total=total_iterations, desc="Overall Progress")

    # One summary dict per (model, vectorization) pair
    results = []

    # Test each model with every vectorization variant
    for model_name, base_model in base_models.items():
        for vectorizer_name, (vectorizer, X_train_vec, X_test_vec, y_train_vec, y_test_vec) in vectorizers.items():
            print(f"\n{'='*80}")
            print(f"Starting GridSearchCV for {model_name} with {vectorizer_name}...")

            # Define the parameter grid and create GridSearchCV
            param_grid = param_grids[model_name]

            # All scorers are tracked; the final model is refit on the best F2 candidate
            grid_search = GridSearchCV(
                base_model,
                param_grid,
                cv=5,
                scoring=scorers,
                refit='f2',
                n_jobs=-1,
                verbose=1,
                return_train_score=True
            )

            # Start MLflow run
            with mlflow.start_run(run_name=f"Basic_Model_{model_name}_{vectorizer_name}"):
                # Log parameters
                mlflow.log_param("model_type", model_name)
                mlflow.log_param("vectorizer_type", vectorizer_name)
                mlflow.log_param("dataset_size", X_train_shape[0] + X_test_shape[0])
                mlflow.log_param("train_size", X_train_shape[0])
                mlflow.log_param("test_size", X_test_shape[0])
                mlflow.log_param("max_features", 10000)
                mlflow.log_param("ngram_range", "(1, 2)")
                mlflow.log_param("scoring_metric", "f2_score")

                # Initialize the timer
                start_time = time.time()

                # Grid search with cross-validation
                grid_search.fit(X_train_vec, y_train_vec)

                # Training time (wall clock, covers the whole grid search)
                training_time = time.time() - start_time

                # Log the training time
                mlflow.log_metric("training_time", training_time)

                # Get and log the best parameters
                best_params = grid_search.best_params_
                for param, value in best_params.items():
                    mlflow.log_param(f"best_{param}", value)

                # Log the best cross-validation F2 score
                mlflow.log_metric("best_cv_f2_score", grid_search.best_score_)

                # Get the best model
                best_model = grid_search.best_estimator_

                # Evaluate the best model on the test set
                acc, prec, rec, f1, f2, roc_auc, fig_cm, fig_roc, y_pred = evaluate_model(
                    best_model, X_test_vec, y_test_vec, model_name, vectorizer_name
                )

                # Log metrics
                mlflow.log_metric("accuracy", acc)
                mlflow.log_metric("precision", prec)
                mlflow.log_metric("recall", rec)
                mlflow.log_metric("f1", f1)
                mlflow.log_metric("f2", f2)
                mlflow.log_metric("roc_auc", roc_auc)

                # Log confusion matrix and ROC curve figures, then release them
                mlflow.log_figure(fig_cm, "confusion_matrix.png")
                mlflow.log_figure(fig_roc, "roc_curve.png")
                plt.close(fig_cm)
                plt.close(fig_roc)

                # Log the model
                signature = infer_signature(X_train_vec, y_pred)
                mlflow.sklearn.log_model(best_model, "model", signature=signature)

                # Save the model artifacts into a specific directory
                os.makedirs("./content/basic-model", exist_ok=True)

                # Save and log the vectorizer
                vectorizer_path = f"./content/basic-model/vectorizer_{vectorizer_name}.pkl"
                with open(vectorizer_path, "wb") as f:
                    pickle.dump(vectorizer, f)
                mlflow.log_artifact(vectorizer_path)

                # Log the GridSearchCV results
                cv_results = pd.DataFrame(grid_search.cv_results_)
                cv_results_path = "./content/basic-model/cv_results.csv"
                cv_results.to_csv(cv_results_path, index=False)
                mlflow.log_artifact(cv_results_path)

                # Plot the cross-validated F2 score against each tuned hyperparameter
                for param in param_grid.keys():
                    if len(param_grid[param]) > 1:  # Only if the parameter has multiple values
                        param_name = f"param_{param}"
                        if param_name in cv_results.columns:
                            # Use the columns of the f2 metric (the refit metric)
                            scores_df = cv_results[[param_name, "mean_test_f2", "std_test_f2"]]
                            scores_df = scores_df.sort_values(param_name)

                            plt.figure(figsize=(10, 6))
                            plt.errorbar(
                                scores_df[param_name].astype(str),
                                scores_df["mean_test_f2"],
                                yerr=scores_df["std_test_f2"],
                                fmt='-o'
                            )
                            plt.title(f'Cross-validation score {param}')
                            plt.xlabel(param)
                            plt.ylabel('Average F2 Score')
                            plt.grid(True)
                            mlflow.log_figure(plt.gcf(), f"cv_results_{param}.png")
                            plt.close()

                # Log the most influential features of the linear models
                if isinstance(best_model, (LogisticRegression, LinearSVC)) and hasattr(best_model, 'coef_'):
                    coefs = best_model.coef_[0]
                    # Take the feature names from the vectorizer paired with this
                    # run's matrices. Comparing `vectorizer_name` with "TF-IDF" can
                    # never match, because the keys carry a "_lem" / "_stem" suffix.
                    feature_names = vectorizer.get_feature_names_out()

                    # Create a DataFrame for the coefficients
                    coefs_df = pd.DataFrame({
                        'feature': feature_names,
                        'importance': coefs
                    })
                    coefs_df['abs_importance'] = coefs_df['importance'].abs()

                    # Log the 20 features with the largest absolute weight
                    top_features_df = coefs_df.sort_values('abs_importance', ascending=False).head(20)
                    top_features_path = "./content/basic-model/top_features.csv"
                    top_features_df.to_csv(top_features_path, index=False)
                    mlflow.log_artifact(top_features_path)

                    # Plot the top positive and negative features. Both rankings are
                    # taken from ALL coefficients, not from the 20 largest-|weight|
                    # rows, so each plot really shows its own end of the scale.
                    top_positive_df = coefs_df.nlargest(20, 'importance')
                    fig_pos, ax_pos = plt.subplots(figsize=(10, 8))
                    sns.barplot(x='importance', y='feature', data=top_positive_df, ax=ax_pos)
                    ax_pos.set_title('Top 20 positive features')
                    fig_pos.tight_layout()
                    mlflow.log_figure(fig_pos, "top_positive_features.png")
                    plt.close(fig_pos)

                    top_negative_df = coefs_df.nsmallest(20, 'importance')
                    fig_neg, ax_neg = plt.subplots(figsize=(10, 8))
                    sns.barplot(x='importance', y='feature', data=top_negative_df, ax=ax_neg)
                    ax_neg.set_title('Top 20 negative features')
                    fig_neg.tight_layout()
                    mlflow.log_figure(fig_neg, "top_negative_features.png")
                    plt.close(fig_neg)

                # Save the results in a list
                results.append({
                    "Model": model_name.replace("_", " "),
                    "Vectorization": vectorizer_name,
                    "Best params": str(best_params),
                    "Accuracy": acc,
                    "Precision": prec,
                    "Recall": rec,
                    "F1 Score": f1,
                    "F2 Score": f2,
                    "ROC AUC": roc_auc,
                    "Training time (s)": training_time
                })

                # Display the results
                print(f"Best params: {best_params}")
                print(f"Best CV score (F2): {grid_search.best_score_:.4f}")
                print(f"F1 Score test: {f1:.4f}")
                print(f"F2 Score test: {f2:.4f}")
                print(f"ROC AUC test: {roc_auc:.4f}")
                print(f"Training time: {training_time:.2f} secondes")


            # Update the progress bar
            progress_bar.update(1)
            progress_bar.set_description(f"Last model: {model_name} with {vectorizer_name}")

    progress_bar.close()

Overall Progress:   0%|          | 0/16 [00:00<?, ?it/s]


Starting GridSearchCV for Logistic_Regression with TF-IDF_lem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:03:39 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:03:39 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for Logistic_Regression with TF-IDF_lem:
Accuracy: 0.7721
Precision: 0.7692
Recall: 0.7774
F1 Score: 0.7733
F2 Score: 0.7758
ROC AUC: 0.8535

Confusion matrix:
[[3834 1166]
 [1113 3887]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.78      0.77      0.77      5000
           1       0.77      0.78      0.77      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: Logistic_Regression with TF-IDF_lem:   6%|▋         | 1/16 [00:14<03:39, 14.60s/it]

Best params: {'C': 1.0, 'max_iter': 1000, 'solver': 'liblinear'}
Best CV score (F2): 0.7786
F1 Score test: 0.7733
F2 Score test: 0.7758
ROC AUC test: 0.8535
Training time: 12.43 secondes
🏃 View run Basic_Model_Logistic_Regression_TF-IDF_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/4f254f2920c34b2986e4d4fcadd87194
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Logistic_Regression with BoW_lem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:04:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:04:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for Logistic_Regression with BoW_lem:
Accuracy: 0.7682
Precision: 0.7609
Recall: 0.7822
F1 Score: 0.7714
F2 Score: 0.7778
ROC AUC: 0.8431

Confusion matrix:
[[3771 1229]
 [1089 3911]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      5000
           1       0.76      0.78      0.77      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: Logistic_Regression with BoW_lem:  12%|█▎        | 2/16 [00:35<04:19, 18.54s/it]   

Best params: {'C': 0.1, 'max_iter': 1000, 'solver': 'liblinear'}
Best CV score (F2): 0.7815
F1 Score test: 0.7714
F2 Score test: 0.7778
ROC AUC test: 0.8431
Training time: 19.46 secondes
🏃 View run Basic_Model_Logistic_Regression_BoW_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/ae48ee9638d94f519a2192a8852601d6
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Logistic_Regression with TF-IDF_stem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:04:10 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:04:10 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for Logistic_Regression with TF-IDF_stem:
Accuracy: 0.7724
Precision: 0.7722
Recall: 0.7728
F1 Score: 0.7725
F2 Score: 0.7727
ROC AUC: 0.8546

Confusion matrix:
[[3860 1140]
 [1136 3864]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      5000
           1       0.77      0.77      0.77      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: Logistic_Regression with TF-IDF_stem:  19%|█▉        | 3/16 [00:45<03:08, 14.49s/it]

Best params: {'C': 1.0, 'max_iter': 1000, 'solver': 'liblinear'}
Best CV score (F2): 0.7810
F1 Score test: 0.7725
F2 Score test: 0.7727
ROC AUC test: 0.8546
Training time: 7.80 secondes
🏃 View run Basic_Model_Logistic_Regression_TF-IDF_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/faa8a94fb8154c23b67d884c9f59087c
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Logistic_Regression with BoW_stem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:04:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:04:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for Logistic_Regression with BoW_stem:
Accuracy: 0.7721
Precision: 0.7648
Recall: 0.7858
F1 Score: 0.7752
F2 Score: 0.7815
ROC AUC: 0.8452

Confusion matrix:
[[3792 1208]
 [1071 3929]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      5000
           1       0.76      0.79      0.78      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: Logistic_Regression with BoW_stem:  25%|██▌       | 4/16 [01:07<03:27, 17.28s/it]   

Best params: {'C': 0.1, 'max_iter': 1000, 'solver': 'liblinear'}
Best CV score (F2): 0.7834
F1 Score test: 0.7752
F2 Score test: 0.7815
ROC AUC test: 0.8452
Training time: 19.77 secondes
🏃 View run Basic_Model_Logistic_Regression_BoW_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/f3d1111704dd444bb8963d1abf2b2485
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for SVM_Lineaire with TF-IDF_lem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:04:43 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:04:43 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for SVM_Lineaire with TF-IDF_lem:
Accuracy: 0.7724
Precision: 0.7692
Recall: 0.7784
F1 Score: 0.7738
F2 Score: 0.7765
ROC AUC: 0.8536

Confusion matrix:
[[3832 1168]
 [1108 3892]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.78      0.77      0.77      5000
           1       0.77      0.78      0.77      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: SVM_Lineaire with TF-IDF_lem:  31%|███▏      | 5/16 [01:18<02:45, 15.03s/it]     

Best params: {'C': 0.1, 'dual': True, 'max_iter': 1000}
Best CV score (F2): 0.7801
F1 Score test: 0.7738
F2 Score test: 0.7765
ROC AUC test: 0.8536
Training time: 9.29 secondes
🏃 View run Basic_Model_SVM_Lineaire_TF-IDF_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/96d421267a874107bf8d28e9269725b7
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for SVM_Lineaire with BoW_lem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:05:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:05:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for SVM_Lineaire with BoW_lem:
Accuracy: 0.7691
Precision: 0.7593
Recall: 0.7880
F1 Score: 0.7734
F2 Score: 0.7821
ROC AUC: 0.8434

Confusion matrix:
[[3751 1249]
 [1060 3940]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      5000
           1       0.76      0.79      0.77      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: SVM_Lineaire with BoW_lem:  38%|███▊      | 6/16 [01:35<02:38, 15.88s/it]   

Best params: {'C': 0.01, 'dual': True, 'max_iter': 1000}
Best CV score (F2): 0.7843
F1 Score test: 0.7734
F2 Score test: 0.7821
ROC AUC test: 0.8434
Training time: 15.71 secondes
🏃 View run Basic_Model_SVM_Lineaire_BoW_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/7a21d916af654d1085a3f09fbbabd9a0
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for SVM_Lineaire with TF-IDF_stem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:05:11 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:05:11 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for SVM_Lineaire with TF-IDF_stem:
Accuracy: 0.7726
Precision: 0.7714
Recall: 0.7748
F1 Score: 0.7731
F2 Score: 0.7741
ROC AUC: 0.8547

Confusion matrix:
[[3852 1148]
 [1126 3874]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      5000
           1       0.77      0.77      0.77      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: SVM_Lineaire with TF-IDF_stem:  44%|████▍     | 7/16 [01:46<02:08, 14.25s/it]

Best params: {'C': 0.1, 'dual': False, 'max_iter': 1000}
Best CV score (F2): 0.7811
F1 Score test: 0.7731
F2 Score test: 0.7741
ROC AUC test: 0.8547
Training time: 9.08 secondes
🏃 View run Basic_Model_SVM_Lineaire_TF-IDF_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/ce95f61337534530afc87d9b8348613b
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for SVM_Lineaire with BoW_stem...




Fitting 5 folds for each of 8 candidates, totalling 40 fits


2025/08/26 13:05:29 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
2025/08/26 13:05:29 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Results for SVM_Lineaire with BoW_stem:
Accuracy: 0.7725
Precision: 0.7631
Recall: 0.7904
F1 Score: 0.7765
F2 Score: 0.7848
ROC AUC: 0.8454

Confusion matrix:
[[3773 1227]
 [1048 3952]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.77      5000
           1       0.76      0.79      0.78      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



Last model: SVM_Lineaire with BoW_stem:  50%|█████     | 8/16 [02:04<02:04, 15.54s/it]   

Best params: {'C': 0.01, 'dual': True, 'max_iter': 1000}
Best CV score (F2): 0.7858
F1 Score test: 0.7765
F2 Score test: 0.7848
ROC AUC test: 0.8454
Training time: 16.31 secondes
🏃 View run Basic_Model_SVM_Lineaire_BoW_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/0605230b682e479c85442a2036e342ea
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Random_Forest with TF-IDF_lem...




Fitting 5 folds for each of 18 candidates, totalling 90 fits


2025/08/26 13:09:24 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.
2025/08/26 13:09:24 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.



Results for Random_Forest with TF-IDF_lem:
Accuracy: 0.7501
Precision: 0.7489
Recall: 0.7526
F1 Score: 0.7507
F2 Score: 0.7518
ROC AUC: 0.8289

Confusion matrix:
[[3738 1262]
 [1237 3763]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75      5000
           1       0.75      0.75      0.75      5000

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



Last model: Random_Forest with TF-IDF_lem:  56%|█████▋    | 9/16 [06:04<10:00, 85.73s/it]

Best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score (F2): 0.7606
F1 Score test: 0.7507
F2 Score test: 0.7518
ROC AUC test: 0.8289
Training time: 232.46 secondes
🏃 View run Basic_Model_Random_Forest_TF-IDF_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/3f6d4ee523184eda87876b89a65a59e4
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Random_Forest with BoW_lem...




Fitting 5 folds for each of 18 candidates, totalling 90 fits


2025/08/26 13:12:35 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.
2025/08/26 13:12:35 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.



Results for Random_Forest with BoW_lem:
Accuracy: 0.7520
Precision: 0.7597
Recall: 0.7372
F1 Score: 0.7483
F2 Score: 0.7416
ROC AUC: 0.8329

Confusion matrix:
[[3834 1166]
 [1314 3686]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.74      0.77      0.76      5000
           1       0.76      0.74      0.75      5000

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



Last model: Random_Forest with BoW_lem:  62%|██████▎   | 10/16 [09:11<11:42, 117.01s/it]   

Best params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score (F2): 0.7528
F1 Score test: 0.7483
F2 Score test: 0.7416
ROC AUC test: 0.8329
Training time: 183.64 secondes
🏃 View run Basic_Model_Random_Forest_BoW_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/64d45425797043d382fd38a45b3450c2
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Random_Forest with TF-IDF_stem...




Fitting 5 folds for each of 18 candidates, totalling 90 fits


2025/08/26 13:15:31 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.
2025/08/26 13:15:31 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.



Results for Random_Forest with TF-IDF_stem:
Accuracy: 0.7274
Precision: 0.7037
Recall: 0.7856
F1 Score: 0.7424
F2 Score: 0.7677
ROC AUC: 0.8048

Confusion matrix:
[[3346 1654]
 [1072 3928]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.76      0.67      0.71      5000
           1       0.70      0.79      0.74      5000

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000



Last model: Random_Forest with TF-IDF_stem:  69%|██████▉   | 11/16 [12:06<11:12, 134.54s/it]

Best params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score (F2): 0.7668
F1 Score test: 0.7424
F2 Score test: 0.7677
ROC AUC test: 0.8048
Training time: 172.47 secondes
🏃 View run Basic_Model_Random_Forest_TF-IDF_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/afcf262de5334ff48d0380db0671c71b
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Random_Forest with BoW_stem...




Fitting 5 folds for each of 18 candidates, totalling 90 fits


2025/08/26 13:18:26 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.
2025/08/26 13:18:26 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.



Results for Random_Forest with BoW_stem:
Accuracy: 0.7142
Precision: 0.6911
Recall: 0.7746
F1 Score: 0.7305
F2 Score: 0.7563
ROC AUC: 0.7844

Confusion matrix:
[[3269 1731]
 [1127 3873]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.74      0.65      0.70      5000
           1       0.69      0.77      0.73      5000

    accuracy                           0.71     10000
   macro avg       0.72      0.71      0.71     10000
weighted avg       0.72      0.71      0.71     10000



Last model: Random_Forest with BoW_stem:  75%|███████▌  | 12/16 [15:01<09:48, 147.02s/it]   

Best params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score (F2): 0.7794
F1 Score test: 0.7305
F2 Score test: 0.7563
ROC AUC test: 0.7844
Training time: 173.84 secondes
🏃 View run Basic_Model_Random_Forest_BoW_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/dd3be3b430ec456f97988f9c4fccaddf
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Naive_Bayes with TF-IDF_lem...




Fitting 5 folds for each of 4 candidates, totalling 20 fits


2025/08/26 13:18:34 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2025/08/26 13:18:34 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.



Results for Naive_Bayes with TF-IDF_lem:
Accuracy: 0.7646
Precision: 0.7769
Recall: 0.7424
F1 Score: 0.7593
F2 Score: 0.7491
ROC AUC: 0.8451

Confusion matrix:
[[3934 1066]
 [1288 3712]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.75      0.79      0.77      5000
           1       0.78      0.74      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.77      0.76      0.76     10000
weighted avg       0.77      0.76      0.76     10000



Last model: Naive_Bayes with TF-IDF_lem:  81%|████████▏ | 13/16 [15:09<05:14, 104.90s/it]

Best params: {'alpha': 2.0}
Best CV score (F2): 0.7531
F1 Score test: 0.7593
F2 Score test: 0.7491
ROC AUC test: 0.8451
Training time: 6.33 secondes
🏃 View run Basic_Model_Naive_Bayes_TF-IDF_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/da46af3630784beda44690ea50247433
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Naive_Bayes with BoW_lem...




Fitting 5 folds for each of 4 candidates, totalling 20 fits


2025/08/26 13:18:42 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2025/08/26 13:18:42 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.



Results for Naive_Bayes with BoW_lem:
Accuracy: 0.7630
Precision: 0.7700
Recall: 0.7500
F1 Score: 0.7599
F2 Score: 0.7539
ROC AUC: 0.8361

Confusion matrix:
[[3880 1120]
 [1250 3750]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      5000
           1       0.77      0.75      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000



Last model: Naive_Bayes with BoW_lem:  88%|████████▊ | 14/16 [15:17<02:31, 75.53s/it]    

Best params: {'alpha': 2.0}
Best CV score (F2): 0.7567
F1 Score test: 0.7599
F2 Score test: 0.7539
ROC AUC test: 0.8361
Training time: 6.08 secondes
🏃 View run Basic_Model_Naive_Bayes_BoW_lem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/baef66d4d47646bb81550855400af89f
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Naive_Bayes with TF-IDF_stem...




Fitting 5 folds for each of 4 candidates, totalling 20 fits


2025/08/26 13:18:51 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2025/08/26 13:18:51 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.



Results for Naive_Bayes with TF-IDF_stem:
Accuracy: 0.7612
Precision: 0.7735
Recall: 0.7388
F1 Score: 0.7557
F2 Score: 0.7455
ROC AUC: 0.8456

Confusion matrix:
[[3918 1082]
 [1306 3694]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.75      0.78      0.77      5000
           1       0.77      0.74      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000



Last model: Naive_Bayes with TF-IDF_stem:  94%|█████████▍| 15/16 [15:26<00:55, 55.35s/it]

Best params: {'alpha': 2.0}
Best CV score (F2): 0.7553
F1 Score test: 0.7557
F2 Score test: 0.7455
ROC AUC test: 0.8456
Training time: 6.95 secondes
🏃 View run Basic_Model_Naive_Bayes_TF-IDF_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/b2a80ec0296c40f0a6976e9988a2bdc5
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612

Starting GridSearchCV for Naive_Bayes with BoW_stem...




Fitting 5 folds for each of 4 candidates, totalling 20 fits


2025/08/26 13:18:59 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2025/08/26 13:18:59 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.



Results for Naive_Bayes with BoW_stem:
Accuracy: 0.7629
Precision: 0.7709
Recall: 0.7482
F1 Score: 0.7594
F2 Score: 0.7526
ROC AUC: 0.8373

Confusion matrix:
[[3888 1112]
 [1259 3741]]

Classificatin report:
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      5000
           1       0.77      0.75      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000



Last model: Naive_Bayes with BoW_stem: 100%|██████████| 16/16 [15:34<00:00, 58.41s/it]   

Best params: {'alpha': 2.0}
Best CV score (F2): 0.7593
F1 Score test: 0.7594
F2 Score test: 0.7526
ROC AUC test: 0.8373
Training time: 6.79 secondes
🏃 View run Basic_Model_Naive_Bayes_BoW_stem at: http://127.0.0.1:8080/#/experiments/459239379042760612/runs/7f322d6aa5294496b863cd0dcb7147bd
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/459239379042760612





<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

## Evaluation

### F2 Score as the Primary Metric

The **F-beta score** is a metric that combines Precision and Recall into a single number through a weighted harmonic mean. The **F2 score** is the specific version where `beta = 2`, which treats **Recall as twice as important as Precision**.

*   **Precision**: Of all the tweets your model labeled as "positive," how many were actually positive? (Minimizes False Positives).
*   **Recall**: Of all the tweets that were *actually* positive, how many did your model correctly identify? (Minimizes False Negatives).

In this context, choosing the F2 score means we are prioritizing **Recall**. This is a strategic choice because for many sentiment analysis applications (like identifying happy customers or positive feedback), **it is more costly to miss a positive tweet (a False Negative) than to incorrectly classify a negative one as positive (a False Positive).**

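By setting `refit='f2'` in our `GridSearchCV`, we instruct it to select the hyperparameter combination with the best cross-validated F2 score and to refit that model on the full training set. The selected model is therefore the one that best identifies *actual* positive tweets while still being penalized (to a lesser extent) for False Positives.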

In [14]:
# Get mlflow results
def get_mlflow_results(experiment_name:str ="P7-Sentiments_Analysis", run_name: str = "Basic_Model", uri: str = None):
    """
    Retrieves the results of the MLflow runs for the basic models.

    Runs are looked up in the given experiment, filtered on their run name
    (substring match), and flattened into one row per run holding the model
    type, the vectorizer type, the logged ``best_*`` parameters and the
    evaluation metrics.

    Args:
        experiment_name (str): The name of the MLflow experiment to search in.
        run_name (str): Substring that the ``mlflow.runName`` tag of a run must contain.
        uri (str): Optional; the URI of the MLflow tracking server. If None, uses the default tracking URI.
    Returns:
        pd.DataFrame: One row per matching run, sorted by "F2 Score" in
        descending order. Empty if the experiment cannot be retrieved or
        if no run matches.
    """
    # Connect to the MLflow tracking server
    if uri:
        mlflow.set_tracking_uri(uri)

    # Create mlflow client
    client = MlflowClient()

    # Resolve the experiment. Lookup failures are reported with the actual
    # exception; a missing experiment is reported explicitly instead of
    # printing "None".
    try:
        experiment = client.get_experiment_by_name(experiment_name)
    except Exception as e:
        print(f"Error retrieving experiment '{experiment_name}': {e}")
        return pd.DataFrame()

    if not experiment:
        print(f"Error retrieving experiment '{experiment_name}': Experiment '{experiment_name}' not found.")
        return pd.DataFrame()

    experiment_id = experiment.experiment_id

    # Load the MLflow runs, best F2 first
    runs = client.search_runs(experiment_ids=[experiment_id],
                            filter_string=f"tags.mlflow.runName LIKE '%{run_name}%'",
                            order_by=["metrics.f2 DESC"])

    # Store results in a list
    results = []

    for run in runs:
        params = run.data.params
        metrics = run.data.metrics

        # Use a dedicated local name so the `run_name` argument is not shadowed
        current_run_name = run.data.tags.get("mlflow.runName", "")

        # Model type is logged with underscores; make it human readable
        model_type = params.get("model_type", "").replace("_", " ")
        vectorization_type = params.get("vectorizer_type", "")

        # Keep only the tuned hyperparameters (logged with a "best_" prefix)
        best_params = {k: v for k, v in params.items() if k.startswith("best_")}

        results.append({
            "Run Name": current_run_name,
            "Model": model_type,
            "Vectorization": vectorization_type,
            "Best Params": str(best_params),
            "Accuracy": metrics.get("accuracy", None),
            "Precision": metrics.get("precision", None),
            "Recall": metrics.get("recall", None),
            "F1 Score": metrics.get("f1", None),
            "F2 Score": metrics.get("f2", None),
            "ROC AUC": metrics.get("roc_auc", None),
            "Training Time (s)": metrics.get("training_time", None)
        })

    results_df = pd.DataFrame(results)
    if not results_df.empty:
        results_df = (
            results_df
            .sort_values(by="F2 Score", ascending=False)
            .reset_index(drop=True)
        )

    return results_df

In [15]:
# Collect every run whose name contains "Basic_Model_" into one DataFrame (one row per run).
# NOTE(review): `tracking_uri` is not defined in this cell; presumably set earlier in the notebook — confirm.
results_df = get_mlflow_results(experiment_name="P7-Sentiments_Analysis", run_name="Basic_Model_", uri=tracking_uri)

In [16]:
# Render the collected run metrics as a rich table.
display(results_df)

Unnamed: 0,Run Name,Model,Vectorization,Best Params,Accuracy,Precision,Recall,F1 Score,F2 Score,ROC AUC,Training Time (s)
0,Basic_Model_SVM_Lineaire_BoW_stem,SVM Lineaire,BoW_stem,"{'best_C': '0.01', 'best_max_iter': '1000', 'b...",0.7725,0.763082,0.7904,0.776501,0.784781,0.845413,16.3113
1,Basic_Model_SVM_Lineaire_BoW_lem,SVM Lineaire,BoW_lem,"{'best_C': '0.01', 'best_max_iter': '1000', 'b...",0.7691,0.759299,0.788,0.773383,0.782087,0.843445,15.705644
2,Basic_Model_Logistic_Regression_BoW_stem,Logistic Regression,BoW_stem,"{'best_C': '0.1', 'best_max_iter': '1000', 'be...",0.7721,0.764843,0.7858,0.77518,0.781517,0.845166,19.77463
3,Basic_Model_Logistic_Regression_BoW_lem,Logistic Regression,BoW_lem,"{'best_C': '0.1', 'best_max_iter': '1000', 'be...",0.7682,0.760895,0.7822,0.7714,0.777844,0.843118,19.458396
4,Basic_Model_SVM_Lineaire_TF-IDF_lem,SVM Lineaire,TF-IDF_lem,"{'best_C': '0.1', 'best_max_iter': '1000', 'be...",0.7724,0.76917,0.7784,0.773757,0.776536,0.853643,9.287599
5,Basic_Model_Logistic_Regression_TF-IDF_lem,Logistic Regression,TF-IDF_lem,"{'best_C': '1.0', 'best_max_iter': '1000', 'be...",0.7721,0.769246,0.7774,0.773302,0.775755,0.853455,12.429723
6,Basic_Model_SVM_Lineaire_TF-IDF_stem,SVM Lineaire,TF-IDF_stem,"{'best_C': '0.1', 'best_max_iter': '1000', 'be...",0.7726,0.771406,0.7748,0.773099,0.774119,0.854659,9.081667
7,Basic_Model_Logistic_Regression_TF-IDF_stem,Logistic Regression,TF-IDF_stem,"{'best_C': '1.0', 'best_max_iter': '1000', 'be...",0.7724,0.772182,0.7728,0.772491,0.772676,0.854628,7.804892
8,Basic_Model_Random_Forest_TF-IDF_stem,Random Forest,TF-IDF_stem,"{'best_max_depth': '20', 'best_n_estimators': ...",0.7274,0.70369,0.7856,0.742393,0.767727,0.804815,172.471045
9,Basic_Model_Random_Forest_BoW_stem,Random Forest,BoW_stem,"{'best_max_depth': '10', 'best_n_estimators': ...",0.7142,0.691113,0.7746,0.730479,0.756327,0.78445,173.838465


In [17]:
# Compare the results of the models
# Display the results in a table
def compare_models(results_df):
    """
    Compares the results of different machine learning models and vectorization methods.

    Prints a formatted comparison table followed by the best model, i.e. the
    row with the highest numeric "F2 Score". The input DataFrame is never
    modified, so the function can be called repeatedly on the same object.

    Args:
        results_df (pd.DataFrame): Results DataFrame containing model performance metrics.
            Must contain "Model", "Vectorization", "Best Params" and "F2 Score";
            the other metric columns are displayed when present.
    Returns:
        None. Everything is written to standard output.
    """
    if results_df.empty:
        print("No results to compare.")
        return

    # Columns to display
    columns_to_display = [
        "Model",
        "Vectorization",
        "Accuracy",
        "Precision",
        "Recall",
        "F1 Score",
        "F2 Score",
        "ROC AUC",
        "Training Time (s)"
    ]
    score_columns = ["Accuracy", "Precision", "Recall", "F1 Score", "F2 Score", "ROC AUC"]

    def _format_numeric(column, decimals):
        # Coerce first so missing values (NaN/None) and non-numeric entries
        # become "N/A" instead of "nan" or a formatting error.
        numeric = pd.to_numeric(column, errors="coerce")
        return numeric.map(lambda v: f"{v:.{decimals}f}" if pd.notna(v) else "N/A")

    # Format a copy only: the caller's DataFrame keeps its numeric dtypes
    display_df = results_df.copy()
    for col in score_columns:
        if col in display_df.columns:
            display_df[col] = _format_numeric(display_df[col], 4)

    if 'Training Time (s)' in display_df.columns:
        display_df['Training Time (s)'] = _format_numeric(display_df['Training Time (s)'], 2)

    # Display the results (skip optional columns that are absent)
    visible_columns = [col for col in columns_to_display if col in display_df.columns]
    print("\nComparison of Models and Vectorization Methods:")
    print(display_df[visible_columns].to_string(index=False))

    # Get the best model based on the numeric F2 Score, not on formatted strings
    f2_scores = pd.to_numeric(results_df["F2 Score"], errors="coerce").reset_index(drop=True)
    if f2_scores.isna().all():
        print("\nBest Model: N/A (no valid F2 Score available)")
        return

    # Positional lookup so a non-unique index cannot return several rows
    best_position = f2_scores.idxmax()
    best_model = results_df.iloc[best_position]
    print("\nBest Model:")
    print(f"Model: {best_model['Model']}")
    print(f"Vectorization: {best_model['Vectorization']}")
    print(f"F2 Score: {f2_scores.iloc[best_position]:.4f}")
    print(f"Best params: {best_model['Best Params']}")

In [18]:
# Print the metrics comparison table and the best model according to the F2 score.
compare_models(results_df)


Comparison of Models and Vectorization Methods:
              Model Vectorization Accuracy Precision Recall F1 Score F2 Score ROC AUC Training Time (s)
       SVM Lineaire      BoW_stem   0.7725    0.7631 0.7904   0.7765   0.7848  0.8454             16.31
       SVM Lineaire       BoW_lem   0.7691    0.7593 0.7880   0.7734   0.7821  0.8434             15.71
Logistic Regression      BoW_stem   0.7721    0.7648 0.7858   0.7752   0.7815  0.8452             19.77
Logistic Regression       BoW_lem   0.7682    0.7609 0.7822   0.7714   0.7778  0.8431             19.46
       SVM Lineaire    TF-IDF_lem   0.7724    0.7692 0.7784   0.7738   0.7765  0.8536              9.29
Logistic Regression    TF-IDF_lem   0.7721    0.7692 0.7774   0.7733   0.7758  0.8535             12.43
       SVM Lineaire   TF-IDF_stem   0.7726    0.7714 0.7748   0.7731   0.7741  0.8547              9.08
Logistic Regression   TF-IDF_stem   0.7724    0.7722 0.7728   0.7725   0.7727  0.8546              7.80
      Random Fo