In [1]:
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import pickle
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from arize.pandas.logger import Client, Schema
import datetime as dt
from arize.utils.types import ModelTypes, Environments
warnings.filterwarnings("ignore")

In [2]:
# Location of the raw dataset CSV (relative to the notebook) and a version label.
data_url = "../data/depression_data.csv"
version = "v2.0"

In [3]:
import sys

# Give the backend source directory top priority on the module search path so the
# project modules imported in the next cell can be found.
sys.path[0:0] = ['../backend/src']

In [4]:
from data_preprocessing_monitoring import transform_data
from clean_data_csv import clean_data

In [5]:
import os

from dotenv import load_dotenv

# Populate the process environment from the backend's .env file, then read the
# DagsHub credentials from it (os.getenv yields None for a variable that is not set).
load_dotenv("../backend/src/.env")

DagsHub_token = os.getenv("DagsHub_token")
DagsHub_username = os.getenv("DagsHub_username")

In [6]:
import os

# Export the DagsHub credentials under the environment variable names MLflow uses
# for authenticating against the tracking server.
# os.environ only accepts strings, so a credential that was not found (None) would
# otherwise fail with an unhelpful "str expected, not NoneType" TypeError.
_dagshub_credentials = {
    "DagsHub_username": DagsHub_username,
    "DagsHub_token": DagsHub_token,
}
_missing_credentials = [name for name, value in _dagshub_credentials.items() if not value]
if _missing_credentials:
    raise EnvironmentError(
        "Missing DagsHub credential(s): "
        + ", ".join(_missing_credentials)
        + ". Define them in ../backend/src/.env or in the process environment."
    )

os.environ["MLFLOW_TRACKING_USERNAME"] = DagsHub_username
os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token

In [7]:
# Point MLflow at the project's DagsHub-hosted tracking server (replace with your own
# tracking URI if needed), then select the experiment used by this notebook.
mlflow.set_tracking_uri(uri='https://dagshub.com/KoubaaMahdi/MLOps_project.mlflow')
mlflow.set_experiment(experiment_name="depression-detection-experiment")

<Experiment: artifact_location='mlflow-artifacts:/2770073eb03f43e99e9f9ae224f726e1', creation_time=1733241503098, experiment_id='0', last_update_time=1733241503098, lifecycle_stage='active', name='depression-detection-experiment', tags={}>

In [8]:
# Load the raw dataset from the CSV configured in `data_url`
raw_train = pd.read_csv(filepath_or_buffer=data_url)

In [9]:
# Preview the first three raw rows
raw_train.head(n=3)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,Married,Bachelor's Degree,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,Married,High School,1,Non-smoker,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes
2,Shannon Church,78,Widowed,Master's Degree,1,Non-smoker,Sedentary,Employed,125332.79,Low,Unhealthy,Good,No,No,Yes,No


In [10]:
# Cleaning and preprocessing: transform_data (imported from data_preprocessing_monitoring)
# returns the model inputs X and a second value y that the visible cells never read again.
# NOTE(review): transform_data appears to modify raw_train in place — the Income shown by
# raw_train.head(3) before this call (26265.67) differs from the Income shown by
# baseline.head(3) afterwards (29.725522), and baseline is derived from raw_train.
# Confirm that logging the partially transformed frame is intended.
X,y = transform_data(raw_train)

In [11]:
import mlflow.pyfunc

# Collect every run (across all experiments) that logged an F1_score_test below 1
# and keep the run with the highest such score.
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(
    experiment_ids=all_experiments,
    filter_string="metrics.F1_score_test <1",
)
if df_mlflow.empty:
    # Without this guard the lookup below fails with an opaque pandas error
    # when no run matches the filter.
    raise RuntimeError(
        "No MLflow run with metrics.F1_score_test < 1 was found; "
        "train and log a model before running this notebook."
    )
run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax(), 'run_id']

# Load that run's "ML_models" artifact through a runs:/ URI. This reads the model
# straight from the run, not from a model-registry stage.
logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 11.99it/s]


mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.sklearn
  run_id: fbd82e561ae74c6f88fa2fb2fab6f9e7



In [12]:
# Baseline frame for monitoring: the loaded data without the Name column
baseline = raw_train.drop(columns=['Name'])


In [13]:
# Encode the Yes/No target as 1/0, then expose it under the column name that is
# later declared as the actual (ground-truth) label.
illness_flags = baseline['History of Mental Illness'].map({'Yes': 1, 'No': 0})
baseline['History of Mental Illness'] = illness_flags
baseline = baseline.rename(columns={'History of Mental Illness': 'actual_label'})

In [14]:
# Readable class names for the binary label (0 -> 'stable', 1 -> 'ill');
# the same mapping is reused for the model predictions.
transform_bin_str = {0: 'stable', 1: 'ill'}
baseline = baseline.assign(actual_label=baseline['actual_label'].map(transform_bin_str))

In [15]:
# Score the preprocessed inputs with the loaded model, store the raw predictions,
# then translate them to the same class names used for the actual label.
preds = loaded_model.predict(X)
baseline = (
    baseline
    .assign(prediction_label=preds)
    .assign(prediction_label=lambda frame: frame['prediction_label'].map(transform_bin_str))
)

In [16]:
import uuid


# Prediction ID is required for all datasets
def generate_prediction_ids(X):
    """Return a Series of random UUID4 strings, one per row of ``X``.

    The result carries ``X.index`` so it can be assigned back as a column of ``X``.
    """
    row_count = len(X)
    fresh_ids = [str(uuid.uuid4()) for _ in range(row_count)]
    return pd.Series(fresh_ids, index=X.index)

In [17]:
# Attach one freshly generated prediction id to every baseline row
baseline = baseline.assign(prediction_id=generate_prediction_ids(baseline))

In [18]:
# Preview the first three rows of the frame that will be logged
baseline.head(n=3)


Unnamed: 0,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,actual_label,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,prediction_label,prediction_id
0,31,Married,Bachelor's Degree,2,Non-smoker,Active,Unemployed,29.725522,Moderate,Moderate,Fair,ill,No,Yes,Yes,stable,ff553dab-dc28-41bd-9f64-95fa32ccd935
1,55,Married,High School,1,Non-smoker,Sedentary,Employed,34.955143,High,Unhealthy,Fair,ill,No,No,Yes,ill,2569e9ce-f4ad-4931-8189-5b4b1537a402
2,78,Widowed,Master's Degree,1,Non-smoker,Sedentary,Employed,50.044333,Low,Unhealthy,Good,stable,No,Yes,No,stable,042d2932-963b-456b-a737-483926680344


In [22]:
import os

# Arize credentials are secrets: read them from the environment (e.g. the .env file
# loaded earlier with load_dotenv) instead of hard-coding them in the notebook.
# NOTE(review): the keys that were previously committed in this cell should be rotated.
SPACE_KEY = os.getenv("ARIZE_SPACE_KEY")
API_KEY = os.getenv("ARIZE_API_KEY")

# Validate before building the client so a missing or placeholder key fails early
# with a clear message.
if not SPACE_KEY or not API_KEY or SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)

model_id = (
    "Illness-dector-model"  # This is the model name that will show up in Arize
)
model_version = "v2"  # Version of model - can be any string

print("✅ Arize setup complete!")

✅ Arize setup complete!


In [23]:
# Every column except the id/label bookkeeping columns is logged to Arize as a feature.
# The previous chained assignment (`features = feature_column_names = ...`) also created
# a stray global named after the Schema keyword; only `features` is used below.
# NOTE(review): this keeps the CSV index column 'Unnamed: 0' as a feature — confirm
# that is intended.
features = list(
    baseline.columns.drop(["prediction_id", "prediction_label", "actual_label"])
)

In [24]:
# Tell Arize which DataFrame columns hold the prediction id, the two labels and the features
training_schema = Schema(
    feature_column_names=features,
    actual_label_column_name="actual_label",
    prediction_label_column_name="prediction_label",
    prediction_id_column_name="prediction_id",
)

# Upload the baseline frame as the TRAINING environment of this model version
training_response = arize_client.log(
    dataframe=baseline,
    schema=training_schema,
    environment=Environments.TRAINING,
    model_type=ModelTypes.SCORE_CATEGORICAL,
    model_version=model_version,
    model_id=model_id,
)

# A status_code of 200 from the server means the upload succeeded
if training_response.status_code == 200:
    print(f"✅ You have successfully logged training set to Arize")
else:
    print(
        f"logging failed with response code {training_response.status_code}, {training_response.text}"
    )

[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjoxMjc0MTpTaVlN/spaces/U3BhY2U6MTMzNjA6SW5YVg==/models/modelName/Illness-dector-model?selectedTab=performance[0m
✅ You have successfully logged training set to Arize
