In [1]:
import pandas as pd
pd.options.display.max_columns = None
import seaborn as sns
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from xgboost import XGBClassifier
import mlflow
import datetime
import warnings
import os
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv
import os
# Load the .env file, then read the two DagsHub settings from the environment.
# Either value is None when the corresponding variable is not defined.
load_dotenv("../backend/src/.env")
DagsHub_username = os.environ.get("DagsHub_username")
DagsHub_token = os.environ.get("DagsHub_token")


In [3]:
# Export the DagsHub credentials under the variable names MLflow reads for
# authenticating against the tracking server.
# os.getenv() returns None for an undefined variable, and assigning None into
# os.environ fails with an opaque "str expected, not NoneType" TypeError, so
# check first and report exactly which setting is missing.
_missing_credentials = [
    name
    for name, value in (
        ("DagsHub_username", DagsHub_username),
        ("DagsHub_token", DagsHub_token),
    )
    if value is None
]
if _missing_credentials:
    raise RuntimeError(
        "Missing DagsHub credential(s): "
        + ", ".join(_missing_credentials)
        + ". Define them in ../backend/src/.env or in the environment."
    )

os.environ['MLFLOW_TRACKING_USERNAME'] = DagsHub_username
os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token


In [4]:

# Configure MLflow: set the tracking server URI (replace with your own),
# then select the experiment by name. The set_experiment call stays last so
# its return value is displayed as the cell output.
mlflow.set_tracking_uri(uri="https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow")
mlflow.set_experiment(experiment_name="depression-detection-experiment")



<Experiment: artifact_location='mlflow-artifacts:/2770073eb03f43e99e9f9ae224f726e1', creation_time=1733241503098, experiment_id='0', last_update_time=1733241503098, lifecycle_stage='active', name='depression-detection-experiment', tags={}>

In [5]:
# Read the training and test CSVs (in that order) into DataFrames.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_data.csv", "../data/test_data.csv")
)

In [6]:
# Split each frame into features (every column except the target) and the
# target column 'History of Mental Illness'.
# NOTE(review): if the CSVs were written with their index, an 'Unnamed: 0'
# column may be sitting among the features — confirm and drop it if so.
x_train = data_train.drop(columns=['History of Mental Illness'])  # features: all columns but the target
y_train = data_train['History of Mental Illness']                 # target labels
x_test = data_test.drop(columns=['History of Mental Illness'])    # features: all columns but the target
y_test = data_test['History of Mental Illness']                   # target labels
x_train

Unnamed: 0,Age,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Widowed,Education Level_Associate Degree,Education Level_Bachelor's Degree,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,47,1,2,1,0,318.019103,2,1,0,0,0,0,False,True,False,False,False,False,False,True,False
1,21,0,0,0,0,236.338169,1,1,0,0,0,1,False,False,True,False,True,False,False,False,False
2,66,0,2,2,0,373.673066,0,1,0,0,1,1,False,False,False,True,False,False,False,True,False
3,59,3,2,2,1,88.284087,1,2,2,1,0,1,False,True,False,False,False,True,False,False,False
4,71,0,2,1,1,158.385006,0,1,0,1,0,1,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331009,76,2,2,1,0,241.632738,2,2,2,0,0,0,False,True,False,False,True,False,False,False,False
331010,27,1,2,2,0,311.621485,1,2,2,0,0,0,False,True,False,False,False,True,False,False,False
331011,31,0,2,2,0,358.538171,2,1,0,0,0,0,False,False,True,False,False,False,False,True,False
331012,22,0,2,1,0,166.246895,2,2,1,0,0,0,False,False,True,False,False,False,True,False,False


In [7]:
y_train

0         1
1         0
2         0
3         0
4         1
         ..
331009    0
331010    1
331011    0
331012    1
331013    1
Name: History of Mental Illness, Length: 331014, dtype: int64

In [8]:
x_test

Unnamed: 0,Age,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Widowed,Education Level_Associate Degree,Education Level_Bachelor's Degree,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,42,3,1,1,0,282.392741,1,0,1,0,1,0,True,False,False,False,False,True,False,False,False
1,66,1,2,2,0,158.991100,1,1,0,0,0,0,False,True,False,False,False,False,True,False,False
2,45,0,1,2,0,176.341487,1,1,1,0,1,0,False,True,False,False,False,False,True,False,False
3,79,1,1,2,0,271.742985,0,2,0,0,1,0,False,True,False,False,False,True,False,False,False
4,67,1,2,1,1,119.467945,1,1,2,1,0,0,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82749,23,0,2,1,1,138.430055,2,0,2,0,0,0,False,False,True,False,False,False,False,True,False
82750,47,1,2,1,1,103.292304,1,2,0,0,0,0,True,False,False,False,False,False,True,False,False
82751,68,3,2,1,0,238.824643,1,0,2,0,0,0,False,True,False,False,False,True,False,False,False
82752,56,1,2,1,0,181.973460,1,2,2,0,0,0,False,True,False,False,False,False,True,False,False


In [9]:
y_test

0        1
1        0
2        0
3        0
4        0
        ..
82749    1
82750    0
82751    0
82752    0
82753    0
Name: History of Mental Illness, Length: 82754, dtype: int64

# 2. Machine Learning Modeling

In [10]:
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test):
    """
    Fit ``model`` on the training split and score it on the test split.

    The model is fitted in place with ``model.fit(x_train, y_train)``, asked
    to predict labels for ``x_test``, and those predictions are compared
    against ``y_test`` with ``classification_report``.

    Returns the classification report in dictionary form
    (``output_dict=True``).
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return classification_report(y_test, predictions, output_dict=True)

In [None]:
# Candidate classifiers to compare, keyed by the label used in the results table.
# `probability=True` is deliberately not set on the SVCs: this comparison only
# calls `predict()`, and enabling probability estimates makes `fit` run an extra
# internal 5-fold cross-validation without changing the predicted labels.
# Re-enable it only if `predict_proba` is needed on these estimators.
# NOTE(review): kernel SVC fit time grows at least quadratically with the number
# of samples, so on the full training frame these two may still be impractical —
# consider sklearn's LinearSVC / SGDClassifier or a subsample if this cell does
# not finish.
models = {
    "SVC (Linear)": SVC(kernel="linear"),
    "SVC (Non-Linear)": SVC(kernel="rbf"),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
}

# Per-model F1 scores for each class, filled in by the loop below.
results = {}

# Train and evaluate each model, printing the name first so progress is visible.
for model_name, model in models.items():
    print(f"Training model: {model_name}...")
    report = train_and_evaluate_model(model, x_train, y_train, x_test, y_test)
    # The report dictionary is keyed by the class label as a string.
    results[model_name] = {
        "F1-Score (0)": report["0"]["f1-score"],
        "F1-Score (1)": report["1"]["f1-score"]
    }

Training model: SVC (Linear)...


In [None]:
# Build the comparison table: one row per model, one column per F1 metric.
results_df = pd.DataFrame(results).transpose()

# Print the heading and the table, each followed by a newline.
print("\nComparative Table of F1-Scores:", results_df, sep="\n")