In [18]:
# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
import numpy as np
import pickle
import warnings
import mlflow
import mlflow.sklearn
from tqdm import tqdm
from utils import split_data, load_splits_from_parquet
from ml import create_ml_model
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [19]:
# Seed used for NumPy's global random generator
SEED = 314

# Address of the MLflow tracking server
URI = "http://localhost:5000"

# Input artefacts: the pickled columns and the preprocessed dataset
PATH_COLS = "../data/processed/columns.pkl"
PATH_PARQUET = "../data/processed/df_preprocessed.parquet"

In [20]:
# Silence FutureWarning messages (kept first so it also covers the calls below)
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Initialise tqdm's pandas integration
tqdm.pandas()

# Seed NumPy's global random generator and report the value used
np.random.seed(SEED)
print("Random seed set to", SEED)

Random seed set to 314


In [21]:
# Load the pickled columns object (sliced and iterated as column names later on).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project's own preprocessing step.
with open(PATH_COLS, "rb") as f:
    cols = pickle.load(f)

# Load the preprocessed dataset.
# `use_nullable_dtypes` is deprecated in pandas 2.x (it triggers a
# FutureWarning); False was its default, so omitting it keeps the same dtypes.
df = pd.read_parquet(
    PATH_PARQUET,
    engine="pyarrow",
)

# **Séparation des données**

In [22]:
# Settings forwarded to split_data
test_split = 0.2
sampling = True
proportion = 0.015

# Split the full frame into train/test features and targets
split_options = {
    "test_split": test_split,
    "sampling": sampling,
    "proportion": proportion,
}
X_train_full, X_test_full, y_train, y_test = split_data(df, **split_options)

In [23]:
def cross_score(X_train, y_train, model, col, cv=10):
    """
    Cross-validate a model and summarise the outcome as a single named Series.

    :param X_train: Training data
    :param y_train: Training target
    :param model: Model handed to ``cross_validate``
    :param col: Name given to the returned Series
    :param cv: Number of folds (forwarded to ``cross_validate``)
    :return: Series with the mean of every entry returned by ``cross_validate``
    """
    # Score each fold on accuracy and F1
    metrics = ["accuracy", "f1"]
    cv_results = cross_validate(model, X_train, y_train, cv=cv, scoring=metrics)
    # Average every column over the folds, then label the Series
    fold_means = pd.DataFrame(cv_results).mean()
    return fold_means.rename(col)

# Evaluate the models

Pour débuter les évaluations des modèles, nous allons d'abord tester les différents jeux de données sans les features d'ingénierie. Nous allons ensuite ajouter les features d'ingénierie pour voir si cela améliore les performances des modèles.

In [24]:
# Name of the MLflow experiment and the columns evaluated in this notebook
experiment = "ml_tfidf"
cols_tracked = cols[3:]
val_split = 0.2

# Point the MLflow client at the tracking server
mlflow.set_tracking_uri(URI)

# Select the experiment; set_experiment creates it when it does not exist yet,
# so no separate create_experiment call (and no blanket `except`) is needed.
# This is also the first call that actually talks to the server: reading the
# tracking URI back only returns the configured string and cannot fail.
try:
    active_experiment = mlflow.set_experiment(experiment)
except Exception:
    print(f"Cannot connect to the server : {URI}. Check the server status.")
    raise

# Display the selected experiment as the cell output
active_experiment

<Experiment: artifact_location='/home/hedredo/github/p7/mlruns/1', creation_time=1735481304954, experiment_id='1', last_update_time=1735481304954, lifecycle_stage='active', name='ml_tfidf', tags={}>

In [25]:
# Enable scikit-learn autologging once: it does not depend on the column,
# so it does not need to be re-enabled on every iteration.
mlflow.sklearn.autolog()

for col_name in cols_tracked:
    # Load the train/test splits for the current column (cols=[col_name])
    X_train, X_test = load_splits_from_parquet(
        X_train_full, X_test_full, cols=[col_name], path=PATH_PARQUET
    )
    with mlflow.start_run() as active_run:
        # Model built from Tf-Idf (unigrams, no stop-word list) and
        # logistic regression
        model = create_ml_model(
            col_name,
            TfidfVectorizer(
                ngram_range=(1, 1),
                min_df=5,
                strip_accents="unicode",
                stop_words=None,
            ),
            LogisticRegression(max_iter=1000),
        )
        # Fit the model
        model.fit(X_train, y_train)
        # Score on the test split and record it under an explicit, stable key
        # so runs can be compared directly.
        # NOTE(review): assumes model.score returns the accuracy as a number
        # (true for scikit-learn classifiers) - confirm for create_ml_model.
        accuracy = model.score(X_test, y_test)
        mlflow.log_metric("test_accuracy", accuracy)
        # Predict the target
        y_pred = model.predict(X_test)
        # Record which column was used as input
        mlflow.log_param("data_preparation", col_name)

2025/01/05 02:39:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run melodic-jay-85 at: http://localhost:5000/#/experiments/1/runs/759c0a54c1f543979c1f42fd13a285aa.
2025/01/05 02:39:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:39:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run delicate-rat-457 at: http://localhost:5000/#/experiments/1/runs/c5240bb6b6f34eb3b87646dc6d78c17e.
2025/01/05 02:39:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:39:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run vaunted-cub-728 at: http://localhost:5000/#/experiments/1/runs/a87dee9f5a1e482fbaab2d660ab31162.
2025/01/05 02:39:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:39:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run gifted-skunk-

Suite : même évaluation, cette fois en retirant les stop words anglais (`stop_words="english"`).

In [26]:
# Enable scikit-learn autologging once: it does not depend on the column,
# so it does not need to be re-enabled on every iteration.
mlflow.sklearn.autolog()

for col_name in cols_tracked:
    # Load the train/test splits for the current column (cols=[col_name])
    X_train, X_test = load_splits_from_parquet(
        X_train_full, X_test_full, cols=[col_name], path=PATH_PARQUET
    )
    with mlflow.start_run() as active_run:
        # Model built from Tf-Idf (unigrams, English stop-word list) and
        # logistic regression
        model = create_ml_model(
            col_name,
            TfidfVectorizer(
                ngram_range=(1, 1),
                min_df=5,
                strip_accents="unicode",
                stop_words="english",
            ),
            LogisticRegression(max_iter=1000),
        )
        # Fit the model
        model.fit(X_train, y_train)
        # Score on the test split and record it under an explicit, stable key
        # so runs can be compared directly.
        # NOTE(review): assumes model.score returns the accuracy as a number
        # (true for scikit-learn classifiers) - confirm for create_ml_model.
        accuracy = model.score(X_test, y_test)
        mlflow.log_metric("test_accuracy", accuracy)
        # Predict the target
        y_pred = model.predict(X_test)
        # Record which column was used as input
        mlflow.log_param("data_preparation", col_name)

2025/01/05 02:40:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-shark-236 at: http://localhost:5000/#/experiments/1/runs/1cb452a609a84f5e8052b2b2c64d83ad.
2025/01/05 02:40:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:40:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run luxuriant-robin-182 at: http://localhost:5000/#/experiments/1/runs/1cd8360b40864ae28f34c852b44f244a.
2025/01/05 02:40:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:40:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-gull-3 at: http://localhost:5000/#/experiments/1/runs/306a86cb23d8481aae209fb72cbb1c08.
2025/01/05 02:40:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:40:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run bold-zeb

In [27]:
# Enable scikit-learn autologging once: it does not depend on the column,
# so it does not need to be re-enabled on every iteration.
mlflow.sklearn.autolog()

for col_name in cols_tracked:
    # Load the train/test splits for the current column (cols=[col_name])
    X_train, X_test = load_splits_from_parquet(
        X_train_full, X_test_full, cols=[col_name], path=PATH_PARQUET
    )
    with mlflow.start_run() as active_run:
        # Model built from Tf-Idf (1- to 3-grams, no stop-word list) and
        # logistic regression
        model = create_ml_model(
            col_name,
            TfidfVectorizer(
                ngram_range=(1, 3),
                min_df=5,
                strip_accents="unicode",
                stop_words=None,
            ),
            LogisticRegression(max_iter=1000),
        )
        # Fit the model
        model.fit(X_train, y_train)
        # Score on the test split and record it under an explicit, stable key
        # so runs can be compared directly.
        # NOTE(review): assumes model.score returns the accuracy as a number
        # (true for scikit-learn classifiers) - confirm for create_ml_model.
        accuracy = model.score(X_test, y_test)
        mlflow.log_metric("test_accuracy", accuracy)
        # Predict the target
        y_pred = model.predict(X_test)
        # Record which column was used as input
        mlflow.log_param("data_preparation", col_name)

2025/01/05 02:40:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-sheep-326 at: http://localhost:5000/#/experiments/1/runs/2a5e41c0d7b14d5dbc9054c52c4b1d0e.
2025/01/05 02:40:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:40:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run upbeat-shrike-991 at: http://localhost:5000/#/experiments/1/runs/401f8a3f9bc64bff82f9aaeebc1c7b28.
2025/01/05 02:40:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:41:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run bright-sloth-437 at: http://localhost:5000/#/experiments/1/runs/347eee717b484d0f92ce1d61d102281d.
2025/01/05 02:41:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:41:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run mercu

In [28]:
# Enable scikit-learn autologging once: it does not depend on the column,
# so it does not need to be re-enabled on every iteration.
mlflow.sklearn.autolog()

for col_name in cols_tracked:
    # Load the train/test splits for the current column (cols=[col_name])
    X_train, X_test = load_splits_from_parquet(
        X_train_full, X_test_full, cols=[col_name], path=PATH_PARQUET
    )
    with mlflow.start_run() as active_run:
        # Model built from Tf-Idf (1- to 3-grams, English stop-word list) and
        # logistic regression
        model = create_ml_model(
            col_name,
            TfidfVectorizer(
                ngram_range=(1, 3),
                min_df=5,
                strip_accents="unicode",
                stop_words="english",
            ),
            LogisticRegression(max_iter=1000),
        )
        # Fit the model
        model.fit(X_train, y_train)
        # Score on the test split and record it under an explicit, stable key
        # so runs can be compared directly.
        # NOTE(review): assumes model.score returns the accuracy as a number
        # (true for scikit-learn classifiers) - confirm for create_ml_model.
        accuracy = model.score(X_test, y_test)
        mlflow.log_metric("test_accuracy", accuracy)
        # Predict the target
        y_pred = model.predict(X_test)
        # Record which column was used as input
        mlflow.log_param("data_preparation", col_name)

2025/01/05 02:41:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-seal-220 at: http://localhost:5000/#/experiments/1/runs/c8e16fccd17e4abdaffa72bc78443160.
2025/01/05 02:41:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:41:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run intrigued-hound-420 at: http://localhost:5000/#/experiments/1/runs/0f5a7768218c418ba6b506b4c8d2472d.
2025/01/05 02:41:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:41:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run rogue-cub-483 at: http://localhost:5000/#/experiments/1/runs/71f56ba689704a20b3bc7013b4e2640d.
2025/01/05 02:41:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2025/01/05 02:41:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run dazzling

# **Séparation des données**

In [14]:
# Exploratory check: align the cyclical hour features with the training rows.
# align() returns a tuple of both frames reindexed on a common row index, so
# rows present on only one side appear as NaN in the other.
# NOTE(review): this cell's execution count is out of sequence and it relies
# on whichever X_train was last assigned by the training loops - confirm it
# is still needed before sharing the notebook.
hour_features = df.filter(["hour_cos", "hour_sin"])
hour_features.align(X_train, axis=0)

(         hour_cos  hour_sin
 0        0.866025 -0.500000
 1        0.866025 -0.500000
 2        0.866025 -0.500000
 3        0.866025 -0.500000
 4        0.866025 -0.500000
 ...           ...       ...
 1599995 -0.500000  0.866025
 1599996 -0.500000  0.866025
 1599997 -0.500000  0.866025
 1599998 -0.500000  0.866025
 1599999 -0.500000  0.866025
 
 [1596630 rows x 2 columns],
          hour_cos  hour_sin                                               text
 0        0.866025 -0.500000  @switchfoot http://twitpic.com/2y1zl - Awww, t...
 1        0.866025 -0.500000  is upset that he can't update his Facebook by ...
 2        0.866025 -0.500000  @Kenichan I dived many times for the ball. Man...
 3             NaN       NaN                                                NaN
 4        0.866025 -0.500000  @nationwideclass no, it's not behaving at all....
 ...           ...       ...                                                ...
 1599995       NaN       NaN                                 