In [2]:
import pandas as pd
pd.options.display.max_columns = None
import seaborn as sns
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import warnings
import os
warnings.filterwarnings("ignore")

In [3]:
from dotenv import load_dotenv
import os
# Read the DagsHub credentials from the backend's .env file so that they are
# never hard-coded in the notebook.
load_dotenv("../backend/src/.env")
# os.getenv returns None when the variable is not defined.
DagsHub_username = os.getenv("DagsHub_username")
DagsHub_token=os.getenv("DagsHub_token") 


In [4]:
# Export the DagsHub credentials under the MLFLOW_TRACKING_* variable names
# (presumably the ones the MLflow client uses to authenticate; verify against
# the MLflow docs).
# os.environ only accepts strings, so a missing credential (None) would fail
# with an opaque TypeError; fail early with an actionable message instead.
if DagsHub_username is None or DagsHub_token is None:
    raise RuntimeError(
        "DagsHub credentials not found: define DagsHub_username and "
        "DagsHub_token in ../backend/src/.env or in the environment."
    )
os.environ['MLFLOW_TRACKING_USERNAME'] = DagsHub_username
os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token


In [5]:

# Set up MLflow: point the client at the remote tracking server and select the
# experiment under which the runs below are recorded.
mlflow.set_tracking_uri('https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow') # your MLflow tracking URI
mlflow.set_experiment("depression-detection-experiment")



<Experiment: artifact_location='mlflow-artifacts:/2770073eb03f43e99e9f9ae224f726e1', creation_time=1733241503098, experiment_id='0', last_update_time=1733241503098, lifecycle_stage='active', name='depression-detection-experiment', tags={}>

In [6]:
# Load the training and test sets from their CSV files.
data_train = pd.read_csv("../data/train_data.csv")
data_test = pd.read_csv("../data/test_data.csv")

In [7]:
# Separate the features from the target column 'History of Mental Illness'.
x_train = data_train.drop(['History of Mental Illness'],axis = 1)  # every column except the target
y_train = data_train['History of Mental Illness']   # the target column only
x_test = data_test.drop(['History of Mental Illness'],axis = 1)   # every column except the target
y_test = data_test['History of Mental Illness'] 
# Display the training features.
x_train

Unnamed: 0,Age,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Widowed,Education Level_Associate Degree,Education Level_Bachelor's Degree,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,47,1,2,1,0,318.019103,2,1,0,0,0,0,False,True,False,False,False,False,False,True,False
1,21,0,0,0,0,236.338169,1,1,0,0,0,1,False,False,True,False,True,False,False,False,False
2,66,0,2,2,0,373.673066,0,1,0,0,1,1,False,False,False,True,False,False,False,True,False
3,59,3,2,2,1,88.284087,1,2,2,1,0,1,False,True,False,False,False,True,False,False,False
4,71,0,2,1,1,158.385006,0,1,0,1,0,1,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331009,76,2,2,1,0,241.632738,2,2,2,0,0,0,False,True,False,False,True,False,False,False,False
331010,27,1,2,2,0,311.621485,1,2,2,0,0,0,False,True,False,False,False,True,False,False,False
331011,31,0,2,2,0,358.538171,2,1,0,0,0,0,False,False,True,False,False,False,False,True,False
331012,22,0,2,1,0,166.246895,2,2,1,0,0,0,False,False,True,False,False,False,True,False,False


In [9]:
# Display the training labels.
y_train

0         1
1         0
2         0
3         0
4         1
         ..
331009    0
331010    1
331011    0
331012    1
331013    1
Name: History of Mental Illness, Length: 331014, dtype: int64

In [10]:
# Display the test features.
x_test

Unnamed: 0,Age,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Widowed,Education Level_Associate Degree,Education Level_Bachelor's Degree,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,42,3,1,1,0,282.392741,1,0,1,0,1,0,True,False,False,False,False,True,False,False,False
1,66,1,2,2,0,158.991100,1,1,0,0,0,0,False,True,False,False,False,False,True,False,False
2,45,0,1,2,0,176.341487,1,1,1,0,1,0,False,True,False,False,False,False,True,False,False
3,79,1,1,2,0,271.742985,0,2,0,0,1,0,False,True,False,False,False,True,False,False,False
4,67,1,2,1,1,119.467945,1,1,2,1,0,0,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82749,23,0,2,1,1,138.430055,2,0,2,0,0,0,False,False,True,False,False,False,False,True,False
82750,47,1,2,1,1,103.292304,1,2,0,0,0,0,True,False,False,False,False,False,True,False,False
82751,68,3,2,1,0,238.824643,1,0,2,0,0,0,False,True,False,False,False,True,False,False,False
82752,56,1,2,1,0,181.973460,1,2,2,0,0,0,False,True,False,False,False,False,True,False,False


In [11]:
# Display the test labels.
y_test

0        1
1        0
2        0
3        0
4        0
        ..
82749    1
82750    0
82751    0
82752    0
82753    0
Name: History of Mental Illness, Length: 82754, dtype: int64

# 2. Machine Learning Modeling

In [12]:
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test):
    """
    Fit `model` on the training split and score its predictions on the test split.

    The model is fitted in place. The return value is the dictionary produced
    by `classification_report(..., output_dict=True)` for the test predictions.
    """
    model.fit(x_train, y_train)
    test_predictions = model.predict(x_test)
    return classification_report(y_test, test_predictions, output_dict=True)

In [13]:
# Candidate classifiers to benchmark, keyed by the name shown in the results table.
models = {
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is deprecated in XGBoost's scikit-learn wrapper, so it
    # is no longer passed.
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    # Seeded like the other stochastic models so that re-runs are reproducible.
    "LinearSVC": LinearSVC(random_state=42),
    "K-Nearest Neighbors (k=5)": KNeighborsClassifier(n_neighbors=5),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=200, random_state=42)
}

# Per-class F1 scores on the test set, keyed by model name.
results = {}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training model: {model_name}...")
    report = train_and_evaluate_model(model, x_train, y_train, x_test, y_test)
    # The report dictionary is indexed by the class label as a string ("0", "1").
    results[model_name] = {
        "F1-Score (0)": report["0"]["f1-score"],
        "F1-Score (1)": report["1"]["f1-score"]
    }

Training model: Naive Bayes...
Training model: Random Forest...
Training model: XGBoost...
Training model: LinearSVC...
Training model: K-Nearest Neighbors (k=5)...
Training model: MLP Classifier...


In [15]:
# Create a comparative table: one column per model, one row per per-class F1 score.
results_df = pd.DataFrame(results)
print("\nComparative Table of F1-Scores:")

# Display the table
results_df


Comparative Table of F1-Scores:


Unnamed: 0,Naive Bayes,Random Forest,XGBoost,LinearSVC,K-Nearest Neighbors (k=5),MLP Classifier
F1-Score (0),0.776828,0.78857,0.820679,0.822275,0.763508,0.822275
F1-Score (1),0.281702,0.203761,0.014754,0.0,0.260911,0.0


## Naive Bayes

In [20]:
with mlflow.start_run(run_name='Naive Bayes'):
    # Dataset provenance: log the files that were actually read into
    # data_train / data_test (train_data.csv / test_data.csv).
    mlflow.log_param("data_train", "../data/train_data.csv")
    mlflow.log_param("data_test", "../data/test_data.csv")
    mlflow.log_param("data_version", "v1.0")

    # Model definition and hyperparameters.
    nb = GaussianNB()
    params = nb.get_params()
    mlflow.set_tag(key="model", value="NaiveBayes")
    mlflow.log_params(params)
    nb.fit(x_train, y_train)

    # Names of the variables holding the training features and labels.
    # Plain literals replace f'{x_train=}'.split('=')[0], which formatted the
    # whole DataFrame only to recover these same two strings.
    train_features_name = "x_train"
    train_label_name = "y_train"
    mlflow.set_tag(key="train_features_name", value=train_features_name)
    mlflow.set_tag(key="train_label_name", value=train_label_name)

    # Macro-averaged metrics on the test set.
    predicted = nb.predict(x_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Store the fitted model as a run artifact.
    mlflow.sklearn.log_model(nb, artifact_path="ML_models")



🏃 View run Naive Bayes at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0/runs/9332def3ab774327bd9ecd8936fd69cc
🧪 View experiment at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0


## KNN

In [22]:
with mlflow.start_run(run_name='K-Nearest Neighbors'):
    # Dataset provenance: log the files that were actually read into
    # data_train / data_test (train_data.csv / test_data.csv).
    mlflow.log_param("data_train", "../data/train_data.csv")
    mlflow.log_param("data_test", "../data/test_data.csv")
    mlflow.log_param("data_version", "v1.0")

    # Initialize and fit the k-NN model
    knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')
    params = knn.get_params()
    mlflow.set_tag(key="model", value="K-Nearest Neighbors")
    mlflow.log_params(params)
    knn.fit(x_train, y_train)

    # Names of the variables holding the training features and labels.
    # Plain literals replace f'{x_train=}'.split('=')[0], which formatted the
    # whole DataFrame only to recover these same two strings.
    train_features_name = "x_train"
    train_label_name = "y_train"
    mlflow.set_tag(key="train_features_name", value=train_features_name)
    mlflow.set_tag(key="train_label_name", value=train_label_name)

    # Macro-averaged metrics on the test set.
    predicted = knn.predict(x_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Log the k-NN model
    mlflow.sklearn.log_model(knn, artifact_path="ML_models")



🏃 View run K-Nearest Neighbors at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0/runs/b9fb617569634b8b86a6bbc6e37686dc
🧪 View experiment at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0


## Fine-tuning

In [24]:
import json
import mlflow
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support as score

# Helper function to make data JSON serializable
def make_json_serializable(data):
    """
    Recursively convert NumPy values inside `data` into plain Python objects
    that `json.dump` accepts.

    - NumPy arrays become (nested) lists.
    - NumPy scalars (e.g. np.int64, np.float32, np.bool_) become the matching
      Python scalar.
    - dicts, lists and tuples are walked recursively; tuples come back as
      lists, which is how JSON would encode them anyway.
    - Anything else is returned unchanged.
    """
    if isinstance(data, np.ndarray):
        # Recurse into the result: an object-dtype array can still contain
        # NumPy values after tolist().
        return make_json_serializable(data.tolist())
    if isinstance(data, np.generic):
        # NumPy scalar types are not all accepted by json (np.int64 is not an int).
        return data.item()
    if isinstance(data, dict):
        return {key: make_json_serializable(value) for key, value in data.items()}
    if isinstance(data, (list, tuple)):
        return [make_json_serializable(item) for item in data]
    return data

# Hyperparameter search for k-NN, tracked as a single MLflow run.
with mlflow.start_run(run_name='K-Nearest Neighbors with GridSearch'):
    # Define parameter grid for GridSearchCV
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }

    # Initialize k-NN model and perform grid search (3-fold CV, macro F1).
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_grid, scoring='f1_macro', cv=3, verbose=1, n_jobs=-1)
    grid_search.fit(x_train, y_train)

    # Log best parameters
    best_params = grid_search.best_params_
    mlflow.log_params(best_params)

    # Serialize and save grid search results as JSON
    grid_search_results = make_json_serializable(grid_search.cv_results_)
    with open("grid_search_results.json", "w") as f:
        json.dump(grid_search_results, f, indent=4)
    mlflow.log_artifact("grid_search_results.json", artifact_path="grid_search_results")

    # GridSearchCV uses refit=True by default, so best_estimator_ has already
    # been refitted on the full training set; fitting it again here would only
    # repeat that work.
    best_knn = grid_search.best_estimator_

    # Evaluate the model
    predicted = best_knn.predict(x_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')

    # Log metrics
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Log the best k-NN model
    mlflow.sklearn.log_model(best_knn, artifact_path="ML_models")

    # Print the best parameters and scores
    print("Best Hyperparameters:", best_params)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {fscore:.4f}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits




Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Precision: 0.5211, Recall: 0.5179, F1-Score: 0.5162
🏃 View run K-Nearest Neighbors with GridSearch at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0/runs/849aceb781d94951b532b7930e80514d
🧪 View experiment at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import numpy as np 

from sklearn.ensemble import RandomForestClassifier  # For RFE

# Feature selection function
def select_features(estimator, x_train, y_train, n_features_to_select=None, step=1):
    """
    Perform Recursive Feature Elimination (RFE) using a compatible estimator.

    Parameters
    ----------
    estimator : estimator passed to RFE as its base estimator.
    x_train, y_train : training features and labels RFE is fitted on.
    n_features_to_select : forwarded to RFE unchanged (None keeps RFE's own default).
    step : forwarded to RFE unchanged; defaults to 1, the value this function
        always used before the parameter existed.

    Returns
    -------
    (support, rfe) : the fitted selector's `support_` attribute and the fitted
        RFE object itself.
    """
    rfe = RFE(estimator=estimator, n_features_to_select=n_features_to_select, step=step)
    rfe.fit(x_train, y_train)
    return rfe.support_, rfe

def run_experiment(model_name, model, x_train, y_train, x_test, y_test, selected_features=None):
    """
    Train `model`, evaluate it on the test set and record everything in one MLflow run.

    Parameters
    ----------
    model_name : used for the run name, the "model" tag and the printed summary.
    model : estimator exposing fit / predict / get_params; it is fitted in place.
    x_train, y_train, x_test, y_test : training and test features / labels.
    selected_features : optional boolean mask over the feature columns. When
        given, both feature sets are restricted to the selected columns before
        training, and the number and positions of the selected features are logged.

    Logs the dataset parameters, macro-averaged precision / recall / F1 on the
    test set, the model hyperparameters and the fitted model. Returns None.
    """
    with mlflow.start_run(run_name=f"{model_name} Experiment"):
        # Feature selection metadata
        if selected_features is not None:
            if isinstance(x_train, pd.DataFrame):
                # Use Pandas .loc if x_train is a DataFrame
                x_train = x_train.loc[:, selected_features]
                x_test = x_test.loc[:, selected_features]
            else:
                # Use NumPy slicing if x_train is a NumPy array
                x_train = x_train[:, selected_features]
                x_test = x_test[:, selected_features]

            # Log plain Python ints: a list of NumPy scalars is rendered as
            # "[np.int64(0), ...]" by NumPy >= 2, which makes the parameter unreadable.
            mlflow.log_param("selected_features_count", int(sum(selected_features)))
            mlflow.log_param("selected_features_indices", np.where(selected_features)[0].tolist())

        # Dataset provenance: the files the notebook reads its data from.
        mlflow.log_param("data_train", "../data/train_data.csv")
        mlflow.log_param("data_test", "../data/test_data.csv")
        mlflow.log_param("data_version", "v1.0")

        # Fit and evaluate the model
        model.fit(x_train, y_train)
        predicted = model.predict(x_test)
        precision, recall, fscore, support = score(y_test, predicted, average='macro')

        # Log metrics
        mlflow.log_metric("Precision_test", precision)
        mlflow.log_metric("Recall_test", recall)
        mlflow.log_metric("F1_score_test", fscore)

        # Log model parameters and save the model
        mlflow.set_tag("model", model_name)
        mlflow.log_params(model.get_params())
        mlflow.sklearn.log_model(model, artifact_path="ML_models")

        # Print metrics
        print(f"{model_name} Results:")
        print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {fscore:.4f}")


# Run feature selection once, then reuse the resulting mask for both experiments.
print("Running Feature Selection...")
# Use RandomForestClassifier as the base estimator for RFE; keep 10 features.
rfe_estimator = RandomForestClassifier(random_state=42)
selected_features, rfe_model = select_features(rfe_estimator, x_train, y_train, n_features_to_select=10)

# Naive Bayes experiment on the selected features.
print("Running Naive Bayes Experiment...")
run_experiment("Naive Bayes", GaussianNB(), x_train, y_train, x_test, y_test, selected_features)

# k-NN experiment with fixed hyperparameters (these match the "Best Hyperparameters"
# printed by the grid-search cell), on the selected features.
print("Running k-NN Experiment with Specified Parameters...")
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", metric="manhattan")
run_experiment("K-Nearest Neighbors", knn, x_train, y_train, x_test, y_test, selected_features)

Running Feature Selection...
Running Naive Bayes Experiment...




Naive Bayes Results:
Precision: 0.3491, Recall: 0.5000, F1-Score: 0.4111
🏃 View run Naive Bayes Experiment at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0/runs/ef58809a13144d5e9d2957286cba5480
🧪 View experiment at: https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow/#/experiments/0
Running k-NN Experiment with Specified Parameters...


