In [190]:
!pip install catboost
!pip install lightgbm



In [191]:
!pip install imblearn



In [192]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import pickle

In [193]:
data = pd.read_csv("distribution_data.csv")
data = data.drop(["User_ID","Country"], axis =1)

In [194]:
print(data.columns)

Index(['Age', 'Gender', 'Occupation', 'Mental_Health_Condition', 'Severity',
       'Consultation_History', 'Stress_Level', 'Sleep_Hours', 'Work_Hours',
       'Physical_Activity_Hours', 'Social_Media_Usage', 'Diet_Quality',
       'Smoking_Habit', 'Alcohol_Consumption', 'Medication_Usage'],
      dtype='object')


In [195]:
#Create a combined index.

data["Work_Sleep_Ratio"] = data["Work_Hours"] / (data["Sleep_Hours"] + 1)  # avoid divide by zero
data["Activity_Sleep_Balance"] = data["Physical_Activity_Hours"] / (data["Sleep_Hours"] + 1)

# Define custom mappings
stress_map = {"Low": 0, "Medium": 1, "High": 2}
diet_map = {"Healthy": 0, "Average": 1, "Unhealthy": 2}
alcohol_map = {"Non-Drinker": 0, "Social Drinker": 1, "Regular Drinker": 2, "Heavy Drinker": 3}
smoke_map = {"Non-Smoker": 0, "Occasional Smoker": 1, "Regular Smoker": 2, "Heavy Smoker": 3}

# Apply mappings
data["Stress_Score"] = data["Stress_Level"].map(stress_map)
data["Diet_Score"] = data["Diet_Quality"].map(diet_map)
data["Alcohol_Score"] = data["Alcohol_Consumption"].map(alcohol_map)
data["Smoke_Score"] = data["Smoking_Habit"].map(smoke_map)

# Total Risk Score
data["Lifestyle_Risk_Score"] = data["Stress_Score"] + data["Diet_Score"] + data["Alcohol_Score"] + data["Smoke_Score"]
data["Final_Risk_Index"] = data["Lifestyle_Risk_Score"] + data["Work_Sleep_Ratio"] + data["Activity_Sleep_Balance"]


In [196]:
# Initialize dictionary to store encoders
encoders = {}

# Encode categorical columns and save each encoder
categorical_cols = ['Gender', 'Occupation', 'Diet_Quality', 'Smoking_Habit', 'Alcohol_Consumption', 'Stress_Level','Consultation_History','Medication_Usage']
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le

In [197]:
#Define X and y
features = [
    'Age',
    'Gender',
    'Occupation',
    'Diet_Quality',
    'Smoking_Habit',
    'Alcohol_Consumption',
    'Sleep_Hours',
    'Physical_Activity_Hours',
    'Stress_Level',
    'Social_Media_Usage',
    'Consultation_History',
    'Medication_Usage',
    'Work_Hours'
]

X = data[features]
y_Class = data['Mental_Health_Condition']

y_severity= data['Severity']
sev_encoder = LabelEncoder()
y_severity_enc = sev_encoder.fit_transform(y_severity)


print(X.head())
print(type(X))

   Age  Gender  Occupation  Diet_Quality  Smoking_Habit  Alcohol_Consumption  \
0   36       1           0             1              3                    2   
1   48       1           1             2              0                    3   
2   18       3           6             1              0                    3   
3   30       2           1             0              3                    2   
4   58       1           4             2              3                    1   

   Sleep_Hours  Physical_Activity_Hours  Stress_Level  Social_Media_Usage  \
0          7.6                        8             1                 2.2   
1          6.8                        2             1                 3.4   
2          7.1                        9             2                 5.9   
3          6.9                        4             1                 5.4   
4          4.7                       10             0                 3.3   

   Consultation_History  Medication_Usage  Work_Hours  


In [198]:
print("Available encoders:", list(encoders.keys()))

Available encoders: ['Gender', 'Occupation', 'Diet_Quality', 'Smoking_Habit', 'Alcohol_Consumption', 'Stress_Level', 'Consultation_History', 'Medication_Usage']


In [199]:
#Split the data.
X_train_Class, X_test_Class, y_train_Class, y_test_Class = train_test_split(X, y_Class, test_size=0.2, random_state=42)
X_train_severity, X_test_severity, y_train_severity, y_test_severity = train_test_split(X, y_severity_enc, test_size=0.2, random_state=42)
print(type(X_train_Class))
print(type(X_train_severity))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [200]:
#Model Parameters for tuning.
catboost_params = {
    'iterations': [100, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

lgbm_params = {
    'n_estimators': [100, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [15, 31, 63],
    'min_child_samples': [10, 20]
}

lr_params = {
    'C': [0.1, 1.0, 10],
    'solver': ['liblinear', 'lbfgs']
}


In [201]:
#Catboost tuning
#Classifier.
cat_Class = CatBoostClassifier(verbose=0, random_state=42)
cat_search_Class = RandomizedSearchCV(cat_Class, catboost_params, cv=3, scoring='accuracy', n_iter=10, random_state=42)
cat_search_Class.fit(X_train_Class, y_train_Class)

best_cat_Class = cat_search_Class.best_estimator_

#Severity.
cat_sev = CatBoostClassifier(verbose=0, random_state=42)
cat_search_sev = RandomizedSearchCV(cat_sev, catboost_params, cv=3, scoring='accuracy', n_iter=10, random_state=42)
cat_search_sev.fit(X_train_severity, y_train_severity)

best_cat_sev = cat_search_sev.best_estimator_

In [202]:
#LGBM tuning
#Classifier
lgb_Class = LGBMClassifier(random_state=42)
lgb_search_Class = RandomizedSearchCV(lgb_Class, lgbm_params, cv=3, scoring='accuracy', n_iter=10, random_state=42)
lgb_search_Class.fit(X_train_Class, y_train_Class)

best_lgb_Class = lgb_search_Class.best_estimator_

#Severity
lgb_sev = LGBMClassifier(random_state=42)
lgb_search_sev = RandomizedSearchCV(lgb_sev, lgbm_params, cv=3, scoring='accuracy', n_iter=10, random_state=42)
lgb_search_sev.fit(X_train_severity, y_train_severity)

best_lgb_sev = lgb_search_sev.best_estimator_

[LightGBM] [Info] Number of positive: 13336, number of negative: 13330
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 26666, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500113 -> initscore=0.000450
[LightGBM] [Info] Start training from score 0.000450
[LightGBM] [Info] Number of positive: 13336, number of negative: 13331
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 26667, number of used features: 13
[LightGBM] [Info] [bin

In [203]:
#Logistic regression tuning
#Classifier
lr_Class = LogisticRegression(max_iter=1000)
lr_search_Class = RandomizedSearchCV(lr_Class, lr_params, cv=3, scoring='accuracy', n_iter=5, random_state=42)
lr_search_Class.fit(X_train_Class, y_train_Class)

best_lr_Class = lr_search_Class.best_estimator_

#Severity
lr_sev = LogisticRegression(max_iter=1000)
lr_search_sev = RandomizedSearchCV(lr_sev, lr_params, cv=3, scoring='accuracy', n_iter=5, random_state=42)
lr_search_sev.fit(X_train_severity, y_train_severity)

best_lr_sev = lr_search_sev.best_estimator_

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [204]:
# Base learners

base_learners = [
    ('catboost', CatBoostClassifier(verbose=0, random_state=42)),
    ('lightgbm', LGBMClassifier(random_state=42))
]

# Meta-learner: Logistic Regression
meta_learner = LogisticRegression()

# Stacking Classifier
stack_model_Class = StackingClassifier(
    estimators=[
        ('catboost', best_cat_Class),
        ('lightgbm', best_lgb_Class)
    ],
    final_estimator=best_lr_Class,
    passthrough=True,
    cv=5
)

#Stacking Severity.
stack_model_sev = StackingClassifier(
    estimators=[
        ('catboost', best_cat_sev),
        ('lightgbm', best_lgb_sev)
    ],
    final_estimator=best_lr_sev,
    passthrough=True,
    cv=5
)


In [205]:
#Fit Classifier
stack_model_Class.fit(X_train_Class, y_train_Class)
y_pred_Class = stack_model_Class.predict(X_test_Class)

# Evaluation
print(" Accuracy:", accuracy_score(y_test_Class, y_pred_Class))
print(classification_report(y_test_Class, y_pred_Class))


#Fit Severity.
stack_model_sev.fit(X_train_severity, y_train_severity)
y_pred_sev = stack_model_sev.predict(X_test_severity)

# Evaluation
print(" Accuracy:", accuracy_score(y_test_severity, y_pred_sev))
print(classification_report(y_test_severity, y_pred_sev))


[LightGBM] [Info] Number of positive: 20004, number of negative: 19996
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500100 -> initscore=0.000400
[LightGBM] [Info] Start training from score 0.000400
[LightGBM] [Info] Number of positive: 16004, number of negative: 15996
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 13
[LightGBM] [Info] [bin

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [206]:
def get_user_inputs_and_predict():
    print("*** Mental Health Risk & Severity Predictor\n")

    # Input collection
    try:
        age = int(input("Enter your Age (e.g., 24): "))
        gender = input("Enter Gender (Male/Female/Other): ").strip()
        occupation = input("Enter Occupation (e.g., IT, Education, Sales): ").strip()
        diet_quality = input("Diet Quality (Healthy/Average/Unhealthy): ").strip()
        smoking = input("Smoking Habit (Non-Smoker/Occasional Smoker/Regular Smoker/Heavy Smoker): ").strip()
        alcohol = input("Alcohol Consumption (Non-Drinker/Social Drinker/Regular Drinker/Heavy Drinker): ").strip()
        sleep_hours = float(input("Sleep Hours (e.g., 7.5): "))
        physical_activity = float(input("Physical Activity Hours per day (e.g., 1.5): "))
        stress_level = input("Stress Level (Low/Medium/High): ").strip()
        social_media_hours = float(input("Social Media Usage Hours (e.g., 4): "))
        consultation_history = input("Consultation History (Yes/No): ").strip()
        medication = input("Medication Usage (Yes/No): ").strip()
        work_hours = float(input("Work Hours (e.g., 8): ")) 

        # Create DataFrame for prediction
        sample = pd.DataFrame([{
            'Age': age,
            'Gender': encoders['Gender'].transform([gender])[0],
            'Occupation': encoders['Occupation'].transform([occupation])[0],
            'Consultation_History': encoders['Consultation_History'].transform([consultation_history])[0],
            'Stress_Level': stress_map[stress_level],
            'Sleep_Hours': sleep_hours,
            'Work_Hours': work_hours,
            'Physical_Activity_Hours': physical_activity,
            'Social_Media_Usage': social_media_hours,
            'Diet_Quality': diet_map[diet_quality],
            'Smoking_Habit': smoke_map[smoking],
            'Alcohol_Consumption': alcohol_map[alcohol],
            'Medication_Usage': encoders['Medication_Usage'].transform([medication])[0]
        }])

        # Predictions
        risk_pred = stack_model_Class.predict(sample)[0]
        risk_label = "Yes" if risk_pred == 1 else "No"

        print("\n** Prediction Results:")
        print("* Mental Health Risk:", risk_label)

        if risk_label == "Yes":
            severity_pred = stack_model_sev.predict(sample)[0]
            print("* Severity Level:", severity_pred)
        else:
            print("* No disorder detected at this time. Keep maintaining healthy habits. 🧘‍♂️")

    except Exception as e:
        print("** Input Error:", e)


In [207]:
# Save the model and encoders
structured_bundle = {
    "risk_model": stack_model_Class,
    "severity_model": stack_model_sev,
    "encoders": encoders,  # LabelEncoders or dicts for Gender, Diet, etc.
}

with open("model_structured.pkl", "wb") as f:
    pickle.dump(structured_bundle, f)
