#### Importing libraries

In [2]:
import pickle

import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder

#### Loading data 

In [3]:
# Read the processed dataset; row 0 of the CSV supplies the column names.
processed_csv_path = "../data/processed/data/data.csv"
data = pd.read_csv(processed_csv_path, header=0)

#### Encoding categorical variables (LabelEncoder)

In [4]:
# Encode on a copy so the loaded frame `data` is left as it was read.
data_encoded = data.copy()

# One fitted LabelEncoder per object-dtype column, keyed by column name.
label_encoders = {}
object_columns = data_encoded.select_dtypes(include=["object"]).columns
for column_name in object_columns:
    encoder = LabelEncoder()
    # Values are cast to str before fitting, so every entry is encoded as text.
    column_as_text = data_encoded[column_name].astype(str)
    data_encoded[column_name] = encoder.fit_transform(column_as_text)
    label_encoders[column_name] = encoder

### Analysis of the significance of categorical variables using the chi² test

In [None]:
# Chi² scoring of every column of `data_encoded` against the target column.
# The target column is assumed to be named 'Mental Illness'.
#
# NOTE(review): scikit-learn documents `chi2` for non-negative, frequency-like
# features (booleans, counts). Here it receives integer label codes, so a score
# can depend on which code a category happened to get — confirm this is
# acceptable, or score one-hot columns / contingency tables instead.
target_column = 'Mental Illness'
X_chi2 = data_encoded.drop(columns=[target_column])
y_chi2 = data_encoded[target_column]

chi2_scores, p_values = chi2(X_chi2, y_chi2)

# Scores labelled by feature name, sorted from highest to lowest.
chi2_results = pd.Series(chi2_scores, index=X_chi2.columns).sort_values(ascending=False)
# Keep the p-values labelled by feature name as well (same order as X_chi2.columns)
# rather than leaving them as an anonymous array.
chi2_p_values = pd.Series(p_values, index=X_chi2.columns)

print("Ranking of categorical variables according to chi2 :")
# The series is sorted in descending order, so tail(20) shows the 20 lowest scores.
print(chi2_results.tail(20))

Classement des variables catégorielles selon le chi2 (plus le score est élevé, plus l'association avec la cible est forte) :
Sex                               18.641248
Public Assistance Cash Program    16.159539
Region Served                     13.126061
Criminal Justice Status            8.151811
Medicaid Managed Insurance         6.303055
Living Situation                   5.744376
Other Insurance                    5.586460
Race                               4.619652
Alzheimer or Dementia              1.591413
Transgender                        1.490449
Opioid 12m Service                 1.383592
Autism Spectrum                    1.036827
Veterans Disability Benefits       1.017512
Veterans Cash Assistance           0.929992
Alcohol 12m Service                0.848235
Sexual Orientation                 0.368403
Preferred Language                 0.335821
Speech Impairment                  0.075401
Veteran Status                     0.061611
Opioid Related Disorder            0.00

<span style="color:gray">
Identification of variables weakly associated with the target (chi² score < 1 or < 2)
</span>

In [None]:
# Count the variables whose chi² score falls under each candidate cut-off.
n_below_1 = chi2_results.lt(1).sum()
n_below_2 = chi2_results.lt(2).sum()

print(n_below_1, "variables with chi2 < 1")
print(n_below_2, "variables with chi2 < 2")

7 variables avec chi2 < 1
12 variables avec chi2 < 2


<span style="color:gray">
Filtering variables according to the chi² score (threshold ≥ 2)
</span>

In [7]:
# Keep the variables whose chi² score is at least 2, then append the target column.
meets_threshold = chi2_results >= 2
retained_columns = chi2_results[meets_threshold].index.tolist()
retained_columns = retained_columns + ['Mental Illness']
data_encoded_cleaned = data_encoded[retained_columns]

<span style="color:gray">
Showing the new shape of the dataset after removing the weakly associated variables
</span>

In [8]:
# Display the (rows, columns) shape of the filtered dataset.
data_encoded_cleaned.shape

(189850, 57)

#### Saving pre-processed data (before and after feature selection)

In [None]:
# Write both encoded datasets to CSV without the DataFrame index:
# first the full encoded frame, then the chi²-filtered one.
preprocessed_csv_path = "../data/processed/data/data_preprocessed.csv"
cleaned_csv_path = "../data/processed/data/data_preprocessed_cleaned.csv"

data_encoded.to_csv(preprocessed_csv_path, index=False)
data_encoded_cleaned.to_csv(cleaned_csv_path, index=False)

: 