In [19]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from feature_engine.selection import SelectByShuffling, SelectBySingleFeaturePerformance, RecursiveFeatureElimination
import json

In [3]:
# Load processed_data/dataset_after_feature_building.csv into `df`
# and preview its first five rows.
df = pd.read_csv("processed_data/dataset_after_feature_building.csv")
df.head(5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Parental_influence,Course_Dropout_prob,Course_Enrolled_prob,Course_Graduate_prob,Application mode_Dropout_prob,Application mode_Enrolled_prob,Application mode_Graduate_prob,Previous qualification_Dropout_prob,Previous qualification_Enrolled_prob,Previous qualification_Graduate_prob
0,1,8,5,2,1,1,1,13,10,6,...,-4.824723,0.381395,0.172093,0.446512,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
1,1,6,1,11,1,1,1,1,3,4,...,-18.023059,0.380952,0.162698,0.456349,0.166667,0.333333,0.5,0.290019,0.187786,0.522195
2,1,1,5,5,1,1,1,22,27,10,...,14.511525,0.225664,0.185841,0.588496,0.201991,0.175644,0.622365,0.290019,0.187786,0.522195
3,1,8,2,15,1,1,1,23,27,6,...,14.122941,0.305136,0.102719,0.592145,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
4,2,12,1,3,0,1,1,22,28,10,...,15.327614,0.330233,0.097674,0.572093,0.55414,0.15414,0.29172,0.290019,0.187786,0.522195


In [4]:
# Count the missing values in each column of `df`.
df.isna().sum()

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrollment                                 0
International                                     0
Curricular u

In [5]:
# Name of the column that holds the prediction target.
target_variable = "Target"

In [6]:
# Separate the label column from the predictors of `df`.
y = df[target_variable]
X = df.drop(target_variable, axis=1)

In [7]:
# Encoder that is later fitted on the training labels to turn them into integer codes.
label_enc = LabelEncoder()

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Fit the encoder on the training labels only, then reuse the learned mapping
# for the test labels so both arrays share the same integer codes.
y_train = label_enc.fit_transform(y_train)
y_test = label_enc.transform(y_test)

In [10]:
# Show the (rows, columns) of the training and test feature frames.
X_train.shape, X_test.shape

((3539, 51), (885, 51))

In [11]:
# Inspect the encoded training labels.
y_train

array([0, 1, 2, ..., 2, 2, 0])

In [12]:
# Inspect the encoded test labels.
y_test

array([0, 2, 2, 2, 0, 2, 1, 0, 2, 2, 2, 2, 0, 2, 0, 1, 1, 2, 1, 2, 2, 0,
       0, 2, 2, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 2, 2, 2, 1, 0, 0, 2, 0,
       0, 2, 2, 2, 2, 0, 1, 2, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 1, 0, 0, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 0, 0, 2, 0, 2, 1, 2, 2,
       2, 2, 0, 0, 2, 0, 0, 2, 1, 0, 0, 0, 2, 2, 2, 0, 1, 1, 2, 0, 0, 1,
       0, 2, 2, 2, 2, 2, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 2, 0, 2, 0, 2, 1,
       1, 1, 2, 2, 0, 2, 0, 1, 0, 0, 2, 2, 2, 2, 1, 2, 0, 1, 1, 0, 2, 1,
       0, 0, 2, 2, 0, 2, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 0, 2,
       1, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 1, 2, 0, 2, 0, 2, 2,
       0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 1, 1, 2, 0, 0, 1,
       2, 2, 0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 0,
       0, 1, 2, 1, 0, 0, 2, 1, 0, 2, 2, 2, 2, 2, 0, 0, 2, 1, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 0, 1, 1, 0, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 0, 0,
       2, 0, 0, 1, 0, 1, 0, 2, 2, 2, 1, 2, 1, 2, 2,

In [13]:
# Install a catch-all "ignore" filter so warnings raised by later cells are not shown.
import warnings
warnings.simplefilter("ignore")

## Using Select By Single Feature Performance

In [17]:
# Send all runs of this cell to the local MLflow server, under one experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Feature_Selection_Tracking")

# Search grid: every threshold is combined with every scoring method.
thresholds = [0.1, 0.2, 0.3]
scoring_methods = [
    "accuracy",
    "precision_macro",
    "recall_macro",
    "f1_macro",
    "roc_auc_ovr",
    "top_k_accuracy",
]
# Flattened in the same order a nested "thresholds -> scoring_methods" loop would visit.
search_grid = [(t, s) for t in thresholds for s in scoring_methods]

for threshold, scoring in search_grid:
    # Fit the selector on the training split only.
    selector = SelectBySingleFeaturePerformance(
        estimator=RandomForestClassifier(random_state=42),
        scoring=scoring,
        threshold=threshold,
    )
    selector.fit(X_train, y_train)

    # Names of the retained columns, plus the reduced train/test frames.
    support_mask = selector.get_support()
    selected_features = X_train.columns[support_mask].tolist()
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    # One MLflow run per (threshold, scoring) pair.
    run_label = f"Threshold_{threshold}_Scoring_{scoring}"
    with mlflow.start_run(run_name=run_label):
        model_fs = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train)
        y_pred_fs = model_fs.predict(X_test_selected)

        # Accuracy plus class-weighted precision / recall / F1 on the test split.
        metrics_after = {"accuracy": accuracy_score(y_test, y_pred_fs)}
        for metric_name, metric_fn in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1_score", f1_score),
        ):
            metrics_after[metric_name] = metric_fn(y_test, y_pred_fs, average="weighted")

        # Record the configuration and the scores of this run.
        run_params = {
            "feature_selection_applied": True,
            "threshold": threshold,
            "scoring": scoring,
            "num_selected_features": len(selected_features),
        }
        mlflow.log_params(run_params)
        mlflow.log_metrics(metrics_after)

        # Store the retained feature names as a JSON artifact.
        mlflow.log_dict(
            {"selected_features": selected_features},
            f"selected_features_threshold_{threshold}_scoring_{scoring}.json",
        )

        # Store the fitted model itself.
        mlflow.sklearn.log_model(model_fs, f"model_threshold_{threshold}_scoring_{scoring}")

print("✅ MLflow logging complete! Compare different thresholds & scoring methods, including selected features (JSON), in the MLflow UI.")



🏃 View run Threshold_0.1_Scoring_accuracy at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/0d5def2354d044f6a5179f8f37cc905e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.1_Scoring_precision_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/d9f7ba31e10f4c3f9aaa633bc7d8393d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.1_Scoring_recall_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/0801796801ed4a3ab306c6bf46c064f6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.1_Scoring_f1_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/b82a1a21db3b4ef2b21fa3aee772d269
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.1_Scoring_roc_auc_ovr at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/ff92db33ded2491ab5d0645d8507a6f5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.1_Scoring_top_k_accuracy at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/05d58d0a65784041979b3a6fb54362a8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.2_Scoring_accuracy at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/47b8ebef91ab4bf484b9fd727aca37f3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.2_Scoring_precision_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/409f9886798f4ae1acdd2af234de669e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.2_Scoring_recall_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/2f7ad5d1603a46cfbff2f8d642fc785e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.2_Scoring_f1_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/80bad654f4a84062bf86e70878909474
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.2_Scoring_roc_auc_ovr at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/43997ad54fdc4d92a714755e861243a2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.2_Scoring_top_k_accuracy at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/abdab1ce0d7f4f8dbae0b8ee0d8e6fa4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.3_Scoring_accuracy at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/ccbbfc7b86da4a8ba65387e028444e42
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.3_Scoring_precision_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/e4498d86cc2f4dd58e0e44b98facd9c1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.3_Scoring_recall_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/bfb641976f084b21994d2e39bc13b214
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.3_Scoring_f1_macro at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/ea8ea978b19a41c19139faefcbf22a6a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.3_Scoring_roc_auc_ovr at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/0e008f4455e94834a29be96ed3a122ee
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889




🏃 View run Threshold_0.3_Scoring_top_k_accuracy at: http://127.0.0.1:5000/#/experiments/793534556724788889/runs/84d2f86ee3664b52a5b8b3a6112a36f8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/793534556724788889
✅ MLflow logging complete! Compare different thresholds & scoring methods, including selected features (JSON), in the MLflow UI.


## Using Select By Shuffling

In [None]:
# Send all runs of this cell to the local MLflow server, under the
# "SelectByShuffling" experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("SelectByShuffling")

# Scoring methods to compare; one selector fit and one MLflow run per entry.
scoring_methods = ["accuracy", "precision_macro", "recall_macro", "f1_macro", "roc_auc_ovr",
                   "top_k_accuracy"]

for scoring in scoring_methods:
    # Feature selection, fitted on the training split only.
    # random_state pins the feature shuffling: without it the set of selected
    # features can change between executions even though the estimator is seeded.
    selector = SelectByShuffling(
        estimator=RandomForestClassifier(random_state=42),
        scoring=scoring,
        cv=3,
        random_state=42
    )
    selector.fit(X_train, y_train)

    # Names of the retained columns, plus the reduced train/test frames.
    selected_features = X_train.columns[selector.get_support()].tolist()
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    # Train & evaluate a model on the selected features; one run per scoring method.
    with mlflow.start_run(run_name=f"Scoring_{scoring}"):
        model_fs = RandomForestClassifier(random_state=42)
        model_fs.fit(X_train_selected, y_train)
        y_pred_fs = model_fs.predict(X_test_selected)

        # Accuracy plus class-weighted precision / recall / F1 on the test split.
        metrics_after = {
            "accuracy": accuracy_score(y_test, y_pred_fs),
            "precision": precision_score(y_test, y_pred_fs, average="weighted"),
            "recall": recall_score(y_test, y_pred_fs, average="weighted"),
            "f1_score": f1_score(y_test, y_pred_fs, average="weighted"),
        }

        # Log parameters and metrics.
        mlflow.log_params({
            "feature_selection_applied": True,
            "scoring": scoring,
            "num_selected_features": len(selected_features)
        })
        mlflow.log_metrics(metrics_after)

        # Log the names of the selected features as a JSON artifact.
        mlflow.log_dict({"selected_features": selected_features},
                        f"selected_features_scoring_{scoring}.json")

        # Log the trained model.
        mlflow.sklearn.log_model(model_fs, f"model_scoring_{scoring}")

print("✅ MLflow logging complete! Compare different scoring methods, including selected features (JSON), in the MLflow UI.")

2025/02/03 17:52:28 INFO mlflow.tracking.fluent: Experiment with name 'RFE' does not exist. Creating a new experiment.


## Using Recursive Feature Elimination

In [21]:
# Send all runs of this cell to the local MLflow server, under the "RFE" experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("RFE")

# Scoring methods to compare; one selector fit and one MLflow run per entry.
scoring_methods = [
    "accuracy",
    "precision_macro",
    "recall_macro",
    "f1_macro",
    "roc_auc_ovr",
    "top_k_accuracy",
]

for scoring in scoring_methods:
    # Recursive feature elimination, fitted on the training split only.
    selector = RecursiveFeatureElimination(
        estimator=RandomForestClassifier(random_state=42),
        scoring=scoring,
        cv=3,
    )
    selector.fit(X_train, y_train)

    # Names of the retained columns, plus the reduced train/test frames.
    support_mask = selector.get_support()
    selected_features = X_train.columns[support_mask].tolist()
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    run_label = f"Scoring_{scoring}"
    with mlflow.start_run(run_name=run_label):
        model_fs = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train)
        y_pred_fs = model_fs.predict(X_test_selected)

        # Accuracy plus class-weighted precision / recall / F1 on the test split.
        metrics_after = {"accuracy": accuracy_score(y_test, y_pred_fs)}
        for metric_name, metric_fn in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1_score", f1_score),
        ):
            metrics_after[metric_name] = metric_fn(y_test, y_pred_fs, average="weighted")

        # Record the configuration and the scores of this run.
        run_params = {
            "feature_selection_applied": True,
            "scoring": scoring,
            "num_selected_features": len(selected_features),
        }
        mlflow.log_params(run_params)
        mlflow.log_metrics(metrics_after)

        # Store the retained feature names as a JSON artifact.
        mlflow.log_dict(
            {"selected_features": selected_features},
            f"selected_features_scoring_{scoring}.json",
        )

        # Store the fitted model itself.
        mlflow.sklearn.log_model(model_fs, f"model_scoring_{scoring}")

print("✅ MLflow logging complete! Compare different scoring methods, including selected features (JSON), in the MLflow UI.")

2025/02/03 17:52:28 INFO mlflow.tracking.fluent: Experiment with name 'RFE' does not exist. Creating a new experiment.


🏃 View run Scoring_accuracy at: http://127.0.0.1:5000/#/experiments/113969755590251012/runs/0d39a1769a7445c3ad952438aa925eef
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/113969755590251012




🏃 View run Scoring_precision_macro at: http://127.0.0.1:5000/#/experiments/113969755590251012/runs/8d6437df60754a7cb0cf0115eb7dbbfd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/113969755590251012




🏃 View run Scoring_recall_macro at: http://127.0.0.1:5000/#/experiments/113969755590251012/runs/93aac5841710469dabdd2b2147003f4b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/113969755590251012




🏃 View run Scoring_f1_macro at: http://127.0.0.1:5000/#/experiments/113969755590251012/runs/10e73aacf3664b2488b3830d95d23206
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/113969755590251012




🏃 View run Scoring_roc_auc_ovr at: http://127.0.0.1:5000/#/experiments/113969755590251012/runs/8667f9a849af4ef5bbbeef60189c7047
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/113969755590251012




🏃 View run Scoring_top_k_accuracy at: http://127.0.0.1:5000/#/experiments/113969755590251012/runs/f81c2e87c65942708106018aeb59e662
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/113969755590251012
✅ MLflow logging complete! Compare different scoring methods, including selected features (JSON), in the MLflow UI.


In [22]:
# Load processed_data/log_transformed_dataset.csv into `transformed_df`
# and preview its first five rows.
transformed_df = pd.read_csv("processed_data/log_transformed_dataset.csv")
transformed_df.head(5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Parental_influence,Course_Dropout_prob,Course_Enrolled_prob,Course_Graduate_prob,Application mode_Dropout_prob,Application mode_Enrolled_prob,Application mode_Graduate_prob,Previous qualification_Dropout_prob,Previous qualification_Enrolled_prob,Previous qualification_Graduate_prob
0,1,8,1.791759,2,1,1,1,13,10,6,...,-4.824723,0.381395,0.172093,0.446512,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
1,1,6,0.693147,11,1,1,1,1,3,4,...,-18.023059,0.380952,0.162698,0.456349,0.166667,0.333333,0.5,0.290019,0.187786,0.522195
2,1,1,1.791759,5,1,1,1,22,27,10,...,14.511525,0.225664,0.185841,0.588496,0.201991,0.175644,0.622365,0.290019,0.187786,0.522195
3,1,8,1.098612,15,1,1,1,23,27,6,...,14.122941,0.305136,0.102719,0.592145,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
4,2,12,0.693147,3,0,1,1,22,28,10,...,15.327614,0.330233,0.097674,0.572093,0.55414,0.15414,0.29172,0.290019,0.187786,0.522195


In [24]:
# Point MLflow at the local tracking server and select the experiment that
# holds the feature-transformation comparison runs.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Feature_Transformation_Tracking")

# Baseline run: uses whatever X_train / X_test / y_train / y_test are
# currently defined in the kernel when this cell executes.
with mlflow.start_run(run_name="Before_Log1p"):
    model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy plus class-weighted precision / recall / F1 on the test split.
    metrics_before = dict(
        accuracy=accuracy_score(y_test, y_pred),
        precision=precision_score(y_test, y_pred, average="weighted"),
        recall=recall_score(y_test, y_pred, average="weighted"),
        f1_score=f1_score(y_test, y_pred, average="weighted"),
    )

    # Record the run flag, the scores and the fitted model.
    mlflow.log_params({"log1p_applied": False})
    mlflow.log_metrics(metrics_before)
    mlflow.sklearn.log_model(model, "model_before")

2025/02/03 18:20:03 INFO mlflow.tracking.fluent: Experiment with name 'Feature_Transformation_Tracking' does not exist. Creating a new experiment.


🏃 View run Before_Log1p at: http://127.0.0.1:5000/#/experiments/252510383797126547/runs/871d8c7504e7431fa2fe7e21991b6078
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/252510383797126547


## Tracking Before VS After Log Transformations

In [25]:
# Separate the label column from the predictors of the log-transformed frame.
y = transformed_df[target_variable]
X = transformed_df.drop(target_variable, axis=1)

In [26]:
# Use seed 42, the same seed as the split the "Before_Log1p" baseline was
# trained and scored on. With a different seed the before/after runs are
# evaluated on different held-out rows and their metrics are not comparable.
# NOTE(review): this yields identical row partitions only if both CSV files
# list the rows in the same order - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Log into the same experiment as the "Before_Log1p" baseline run.
mlflow.set_experiment("Feature_Transformation_Tracking")

with mlflow.start_run(run_name="After_Log1p"):
    model_log1p = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred_log1p = model_log1p.predict(X_test)

    # Accuracy plus class-weighted precision / recall / F1 on the test split.
    metrics_after = dict(
        accuracy=accuracy_score(y_test, y_pred_log1p),
        precision=precision_score(y_test, y_pred_log1p, average="weighted"),
        recall=recall_score(y_test, y_pred_log1p, average="weighted"),
        f1_score=f1_score(y_test, y_pred_log1p, average="weighted"),
    )

    # Record the run flag, the scores and the fitted model.
    mlflow.log_params({"log1p_applied": True})
    mlflow.log_metrics(metrics_after)
    mlflow.sklearn.log_model(model_log1p, "model_after")

print("✅ MLflow logging complete! Compare runs in MLflow UI.")



🏃 View run After_Log1p at: http://127.0.0.1:5000/#/experiments/252510383797126547/runs/9b29d9887b7a4c33b563eaac2f5e0c47
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/252510383797126547
✅ MLflow logging complete! Compare runs in MLflow UI.


## Tracking Before VS After Winsorization

In [28]:
# Load processed_data/winsorized_dataset.csv into `winsorized_df`
# and preview its first five rows.
winsorized_df = pd.read_csv("processed_data/winsorized_dataset.csv")
winsorized_df.head(5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Parental_influence,Course_Dropout_prob,Course_Enrolled_prob,Course_Graduate_prob,Application mode_Dropout_prob,Application mode_Enrolled_prob,Application mode_Graduate_prob,Previous qualification_Dropout_prob,Previous qualification_Enrolled_prob,Previous qualification_Graduate_prob
0,1,8,1.791759,2,1,1,1,13,10,6,...,-4.824723,0.381395,0.172093,0.446512,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
1,1,6,0.693147,11,1,1,1,1,3,4,...,-18.023059,0.380952,0.162698,0.456349,0.166667,0.333333,0.5,0.290019,0.187786,0.522195
2,1,1,1.791759,5,1,1,1,22,27,10,...,14.511525,0.225664,0.185841,0.588496,0.201991,0.175644,0.622365,0.290019,0.187786,0.522195
3,1,8,1.098612,15,1,1,1,23,27,6,...,14.122941,0.305136,0.102719,0.592145,0.293578,0.182339,0.524083,0.290019,0.187786,0.522195
4,2,12,0.693147,3,0,1,1,22,28,10,...,15.327614,0.330233,0.097674,0.572093,0.55414,0.15414,0.29172,0.290019,0.187786,0.522195


In [29]:
# Separate the label column from the predictors of the winsorized frame.
y = winsorized_df[target_variable]
X = winsorized_df.drop(target_variable, axis=1)

In [30]:
# Hold out 20% of the winsorized rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Log into the experiment that holds the feature-transformation comparison runs.
mlflow.set_experiment("Feature_Transformation_Tracking")

with mlflow.start_run(run_name="After Winsorization"):
    model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy plus class-weighted precision / recall / F1 on the test split.
    metrics_after = dict(
        accuracy=accuracy_score(y_test, y_pred),
        precision=precision_score(y_test, y_pred, average="weighted"),
        recall=recall_score(y_test, y_pred, average="weighted"),
        f1_score=f1_score(y_test, y_pred, average="weighted"),
    )

    # Record the run flag, the scores and the fitted model.
    mlflow.log_params({"Winsorization": True})
    mlflow.log_metrics(metrics_after)
    mlflow.sklearn.log_model(model, "model_after")

print("✅ MLflow logging complete! Compare runs in MLflow UI.")



🏃 View run After Winsorization at: http://127.0.0.1:5000/#/experiments/252510383797126547/runs/3aa3c16c873a413a81f508f35e3ae0fb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/252510383797126547
✅ MLflow logging complete! Compare runs in MLflow UI.


## Applying RFE and log transformations simultaneously

In [32]:
# Separate the label column from the predictors of the log-transformed frame.
y = transformed_df[target_variable]
X = transformed_df.drop(target_variable, axis=1)

In [33]:
# Hold out 20% of the log-transformed rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Fresh encoder for the labels of the log-transformed split.
label_enc = LabelEncoder()

In [36]:
# Fit the encoder on the training labels only, then reuse the learned mapping
# for the test labels so both arrays share the same integer codes.
y_train = label_enc.fit_transform(y_train)
y_test = label_enc.transform(y_test)

In [37]:
# Send all runs of this cell to the local MLflow server, under the
# "RFE And Log transformations" experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("RFE And Log transformations")

# Scoring methods to compare; one selector fit and one MLflow run per entry.
scoring_methods = [
    "accuracy",
    "precision_macro",
    "recall_macro",
    "f1_macro",
    "roc_auc_ovr",
    "top_k_accuracy",
]

for scoring in scoring_methods:
    # Recursive feature elimination, fitted on the training split only.
    selector = RecursiveFeatureElimination(
        estimator=RandomForestClassifier(random_state=42),
        scoring=scoring,
        cv=3,
    )
    selector.fit(X_train, y_train)

    # Names of the retained columns, plus the reduced train/test frames.
    support_mask = selector.get_support()
    selected_features = X_train.columns[support_mask].tolist()
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    run_label = f"Scoring_{scoring}"
    with mlflow.start_run(run_name=run_label):
        model_fs = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train)
        y_pred_fs = model_fs.predict(X_test_selected)

        # Accuracy plus class-weighted precision / recall / F1 on the test split.
        metrics_after = {"accuracy": accuracy_score(y_test, y_pred_fs)}
        for metric_name, metric_fn in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1_score", f1_score),
        ):
            metrics_after[metric_name] = metric_fn(y_test, y_pred_fs, average="weighted")

        # Record the configuration and the scores of this run.
        run_params = {
            "feature_selection_applied": True,
            "scoring": scoring,
            "num_selected_features": len(selected_features),
        }
        mlflow.log_params(run_params)
        mlflow.log_metrics(metrics_after)

        # Store the retained feature names as a JSON artifact.
        mlflow.log_dict(
            {"selected_features": selected_features},
            f"selected_features_scoring_{scoring}.json",
        )

        # Store the fitted model itself.
        mlflow.sklearn.log_model(model_fs, f"model_scoring_{scoring}")

print("✅ MLflow logging complete! Compare different scoring methods, including selected features (JSON), in the MLflow UI.")



🏃 View run Scoring_accuracy at: http://127.0.0.1:5000/#/experiments/195820902230537456/runs/02cc55a115b0482e9e6892be904922a9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/195820902230537456




🏃 View run Scoring_precision_macro at: http://127.0.0.1:5000/#/experiments/195820902230537456/runs/fdc2b0380d76417ca737a87495f72322
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/195820902230537456




🏃 View run Scoring_recall_macro at: http://127.0.0.1:5000/#/experiments/195820902230537456/runs/904dd4eb83f74449b6976de953585fa5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/195820902230537456




🏃 View run Scoring_f1_macro at: http://127.0.0.1:5000/#/experiments/195820902230537456/runs/bf16e0767201416cb99bb215e943736a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/195820902230537456




🏃 View run Scoring_roc_auc_ovr at: http://127.0.0.1:5000/#/experiments/195820902230537456/runs/21e5ccffe99742b5a8bda679c2cae6b1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/195820902230537456




🏃 View run Scoring_top_k_accuracy at: http://127.0.0.1:5000/#/experiments/195820902230537456/runs/1378aaaf3ed44e619ffdbd5bc71fb5e7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/195820902230537456
✅ MLflow logging complete! Compare different scoring methods, including selected features (JSON), in the MLflow UI.
