In [64]:
# Packages needed for the project
import warnings

### Packages for graphs and reading files
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import yaml

#### Packages for data cleaning and preprocessing
#import missingno as msno
#from kneed import KneeLocator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression

#### Packages to verify multicolineality
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

### Packages for using models 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
from sklearn.model_selection import KFold
#from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RepeatedStratifiedKFold
import multiprocessing
from sklearn.svm import SVC
#from sklearn.datasets import make_classification


#### Packages for cross validation and hyperparameter tuning
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

In [None]:

# --- Locate the project folder and load the modelling dataset ---------------
# NOTE(review): the root below is tied to one user's OneDrive layout; set
# `repo_root` by hand when running on another machine.
repo_root = Path.home().joinpath("OneDrive", "Documentos")
project_dir = repo_root / "Mental_Health_State_Prediction"

# Read the YAML settings that name the input file.
# NOTE(review): this path treats "model_training.ipynb" as a folder — confirm
# that config.yaml really lives there.
config_path = project_dir / "Notebooks" / "model_training.ipynb" / "config.yaml"
with open(config_path, "r") as config_file:
    config = yaml.safe_load(config_file)

# Only the feature file is read from the config; loading a separate target
# file (config["target_file"]) is currently disabled.
data_filename = config["data_file"]

# Data folder plus configured file name give the full CSV path.
data_path = project_dir / "Data" / "final_dataset_for_model_training"
data_dataset_path = data_path / data_filename

# index_col=False keeps pandas from using the first CSV column as the index.
dataset1 = pd.read_csv(data_dataset_path, sep=",", index_col=False)

dataset1.head()


Unnamed: 0.1,Unnamed: 0,Program Category,Region Served,Age Group,Sex,Religious Preference,Veteran Status,Cultural Group,Serious Mental Illness,Smokes,...,Employment Status,Mental Illness,Cash Assistance Situation,Education Group,Unknown Insurance Coverage,Insured_or_Not,Has_Public_Insurance,Has_Private_or_Other_Insurance,Confirmed_Medicaid_Managed,Mental Illness.1
0,0,OUTPATIENT,DOWNSTATE,ADULT,MALE,UNKNOWN,NON-VETERAN/UNKNOWN,Hispanic,YES,NO,...,EMPLOYED,YES,No/Unknown,Educated,False,Yes,Yes,No,Yes,YES
1,1,OUTPATIENT,NEW YORK CITY,ADULT,MALE,RELIGIOUS,NON-VETERAN/UNKNOWN,Majority US,YES,NO,...,EMPLOYED,YES,No/Unknown,Educated,False,Yes,Yes,No,Yes,YES
2,2,OUTPATIENT,DOWNSTATE,ADULT,MALE,RELIGIOUS,NON-VETERAN/UNKNOWN,Majority US,YES,YES,...,EMPLOYED,YES,No/Unknown,Educated,False,Yes,Yes,No,Yes,YES
3,3,COMMUNITY/SUPPORTIVE,NEW YORK CITY,ADULT,MALE,UNKNOWN,NON-VETERAN/UNKNOWN,Unknown,YES,YES,...,EMPLOYED,YES,Receiving Cash Assistance,Educated,False,Yes,Yes,No,No,YES
4,4,OUTPATIENT,UPSTATE,ADULT,FEMALE,SPIRITUAL/NON-RELIGIOUS,NON-VETERAN/UNKNOWN,Majority US,YES,YES,...,EMPLOYED,YES,Receiving Cash Assistance,Educated,False,Yes,Yes,No,Yes,YES


In [83]:
# Print column names, non-null counts and dtypes of the loaded dataset.
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194117 entries, 0 to 194116
Data columns (total 33 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   Unnamed: 0                      194117 non-null  int64 
 1   Program Category                194117 non-null  object
 2   Region Served                   194117 non-null  object
 3   Age Group                       194117 non-null  object
 4   Sex                             194117 non-null  object
 5   Religious Preference            194117 non-null  object
 6   Veteran Status                  194117 non-null  object
 7   Cultural Group                  194117 non-null  object
 8   Serious Mental Illness          194117 non-null  object
 9   Smokes                          194117 non-null  object
 10  Diagnosis                       194117 non-null  object
 11  Disorder Group                  194117 non-null  object
 12  Mental disability             

In [84]:
# Print the distinct values of every column to inspect its categories.
separator = "-" * 40
for col, values in dataset1.items():
    print(f"Column: {col}")
    print(values.unique())
    print(separator)

Column: Unnamed: 0
[     0      1      2 ... 194114 194115 194116]
----------------------------------------
Column: Program Category
['OUTPATIENT' 'COMMUNITY/SUPPORTIVE' 'CRISIS/INPATIENT']
----------------------------------------
Column: Region Served
['DOWNSTATE' 'NEW YORK CITY' 'UPSTATE']
----------------------------------------
Column: Age Group
['ADULT' 'CHILD' 'UNKNOWN']
----------------------------------------
Column: Sex
['MALE' 'FEMALE' 'UNKNOWN']
----------------------------------------
Column: Religious Preference
['UNKNOWN' 'RELIGIOUS' 'SPIRITUAL/NON-RELIGIOUS']
----------------------------------------
Column: Veteran Status
['NON-VETERAN/UNKNOWN' 'VETERAN']
----------------------------------------
Column: Cultural Group
['Hispanic' 'Majority US' 'Unknown' 'Immigrant/Other Lang']
----------------------------------------
Column: Serious Mental Illness
['YES' 'NO' 'UNKNOWN']
----------------------------------------
Column: Smokes
['NO' 'YES' 'UNKNOWN']
-----------------------

In [85]:
# Remove the saved row-number column ('Unnamed: 0') and 'Mental Illness.1',
# which appears to be a duplicate of the 'Mental Illness' column.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the columns are already gone.
dataset1 = dataset1.drop(columns=['Unnamed: 0', 'Mental Illness.1'], errors="ignore")

In [86]:
## Label encoding (binary features)
# Labels grouped by the code they receive: 1 for the "yes/present" variants,
# 0 for their counterparts.
positive_labels = [
    'Yes', 'YES', True,
    'CHRONICAL MEDICAL CONDITION',
    'YES, CHRONIC ILLNESS',
    'VETERAN',
]
negative_labels = [
    'No', 'NO', False,
    'NO CHRONICAL MEDICAL CONDITION',
    'NO,  CHRONIC ILLNESS',
    'NON-VETERAN/UNKNOWN',
]
binary_map = {
    **dict.fromkeys(positive_labels, 1),
    **dict.fromkeys(negative_labels, 0),
}

# Columns holding exactly two labels, all covered by binary_map.
binary_cols = [
    'Mental Illness',
    'Insured_or_Not',
    'Has_Public_Insurance',
    'Has_Private_or_Other_Insurance',
    'Confirmed_Medicaid_Managed',
    'Unknown Insurance Coverage',
    'Chronical diseases',
    'Otherchron_group',
    'Veteran Status',
]

# Translate each listed column through binary_map; Series.map yields NaN for
# any label that is not a key of the mapping.
dataset1[binary_cols] = dataset1[binary_cols].apply(
    lambda labels: labels.map(binary_map)
)

In [87]:
### Ordinal Encoding (Categorical features)

# Education group (assumed order: Low < Others < Educated)
education_map = {'Low Educated': 1, 'Others/Unknown': 2, 'Educated': 3}

# Employment status (assumed order: Unemployed/Unknown < Not in labor force < Employed)
# NOTE(review): 'UNEMPLOYED/UNKNOW' is kept exactly as written because it must
# match the label stored in the data — confirm against the upstream cleaning step.
employ_map = {'EMPLOYED': 3, 'NOT IN LABOR FORCE': 2, 'UNEMPLOYED/UNKNOW': 1}

ordinal_maps = {
    'Education Group': education_map,
    'Employment Status': employ_map,
}

for ordinal_col, label_to_rank in ordinal_maps.items():
    # Ranks written by an earlier run of this cell pass through unchanged, so
    # executing the cell twice does not wipe the column to NaN.
    lookup = {**label_to_rank, **{rank: rank for rank in label_to_rank.values()}}
    encoded = dataset1[ordinal_col].map(lookup)

    # Series.map silently yields NaN for labels missing from the mapping; fail
    # loudly instead, before dataset1 is modified.
    unmapped = dataset1.loc[encoded.isna(), ordinal_col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unmapped values in '{ordinal_col}': {list(unmapped)}"
        )

    dataset1[ordinal_col] = encoded

In [92]:
# Categorical columns to expand into indicator (dummy) columns.
# NOTE(review): 'Serious Mental Illness', 'Diagnosis' and 'Disorder Group' look
# closely related to the 'Mental Illness' target — confirm they do not leak it.
onehot_cols = [
    'Program Category',
    'Region Served',
    'Age Group',
    'Sex',
    'Disorder Group',
    'Mental disability',
    'Impairment Group',
    'Users Canabis',
    'Smoking treatment',
    'Service_drug_alcohol_opiod',
    'Other_testchronic_group',
    'Heartchronic',
    'Brainchronic',
    'Cash Assistance Situation',
    'Household Composition',
    'Religious Preference',
    'Diagnosis',
    'Cultural Group',
    'Serious Mental Illness',
    'Smokes',
]

# One-hot encode the listed columns; drop_first=True omits the first level of
# each feature. The trailing astype(int) casts every column of the resulting
# frame (dummies and previously encoded columns alike) to integers.
X1 = pd.get_dummies(dataset1, columns=onehot_cols, drop_first=True).astype(int)

X1.head()

Unnamed: 0,Veteran Status,Chronical diseases,Otherchron_group,Employment Status,Mental Illness,Education Group,Unknown Insurance Coverage,Insured_or_Not,Has_Public_Insurance,Has_Private_or_Other_Insurance,...,Diagnosis_NO ADDITIONAL DIAGNOSIS,Diagnosis_NOT MI/DEVELOPMENT/ORGANIC/SUBSTANCEADDICTIVE/DISORDER,Diagnosis_UNKNOWN,Cultural Group_Immigrant/Other Lang,Cultural Group_Majority US,Cultural Group_Unknown,Serious Mental Illness_UNKNOWN,Serious Mental Illness_YES,Smokes_UNKNOWN,Smokes_YES
0,0,0,0,3,1,3,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,3,1,3,0,1,1,0,...,0,0,0,0,1,0,0,1,0,0
2,0,0,0,3,1,3,0,1,1,0,...,0,1,0,0,1,0,0,1,0,1
3,0,0,0,3,1,3,0,1,1,0,...,0,1,0,0,0,1,0,1,0,1
4,0,1,0,3,1,3,0,1,1,0,...,0,0,0,0,1,0,0,1,0,1


In [93]:
# Print column names, non-null counts and dtypes of the encoded frame.
X1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194117 entries, 0 to 194116
Data columns (total 52 columns):
 #   Column                                                                 Non-Null Count   Dtype
---  ------                                                                 --------------   -----
 0   Veteran Status                                                         194117 non-null  int64
 1   Chronical diseases                                                     194117 non-null  int64
 2   Otherchron_group                                                       194117 non-null  int64
 3   Employment Status                                                      194117 non-null  int64
 4   Mental Illness                                                         194117 non-null  int64
 5   Education Group                                                        194117 non-null  int64
 6   Unknown Insurance Coverage                                             194117 non-null  int6

In [94]:
# Print the distinct values of every encoded column as a sanity check.
separator = "-" * 40
for col, values in X1.items():
    print(f"Column: {col}")
    print(values.unique())
    print(separator)

Column: Veteran Status
[0 1]
----------------------------------------
Column: Chronical diseases
[0 1]
----------------------------------------
Column: Otherchron_group
[0 1]
----------------------------------------
Column: Employment Status
[3 2 1]
----------------------------------------
Column: Mental Illness
[1 0]
----------------------------------------
Column: Education Group
[3 2 1]
----------------------------------------
Column: Unknown Insurance Coverage
[0 1]
----------------------------------------
Column: Insured_or_Not
[1 0]
----------------------------------------
Column: Has_Public_Insurance
[1 0]
----------------------------------------
Column: Has_Private_or_Other_Insurance
[0 1]
----------------------------------------
Column: Confirmed_Medicaid_Managed
[1 0]
----------------------------------------
Column: Program Category_CRISIS/INPATIENT
[0 1]
----------------------------------------
Column: Program Category_OUTPATIENT
[1 0]
---------------------------------------

In [97]:
### Target (y) and predictor (X) variables
target_col = 'Mental Illness'

# y is an independent copy of the target column; X is every remaining column.
y = X1[target_col].copy()
X = X1.drop(columns=[target_col])
X.head()

Unnamed: 0,Veteran Status,Chronical diseases,Otherchron_group,Employment Status,Education Group,Unknown Insurance Coverage,Insured_or_Not,Has_Public_Insurance,Has_Private_or_Other_Insurance,Confirmed_Medicaid_Managed,...,Diagnosis_NO ADDITIONAL DIAGNOSIS,Diagnosis_NOT MI/DEVELOPMENT/ORGANIC/SUBSTANCEADDICTIVE/DISORDER,Diagnosis_UNKNOWN,Cultural Group_Immigrant/Other Lang,Cultural Group_Majority US,Cultural Group_Unknown,Serious Mental Illness_UNKNOWN,Serious Mental Illness_YES,Smokes_UNKNOWN,Smokes_YES
0,0,0,0,3,3,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,3,3,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,0
2,0,0,0,3,3,0,1,1,0,1,...,0,1,0,0,1,0,0,1,0,1
3,0,0,0,3,3,0,1,1,0,0,...,0,1,0,0,0,1,0,1,0,1
4,0,1,0,3,3,0,1,1,0,1,...,0,0,0,0,1,0,0,1,0,1
