#### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

#### Reading the dataset & deletion of duplicated data

In [None]:
# Load the semicolon-separated raw export and discard exact duplicate rows
# in a single chained expression.
df = (
    pd.read_csv("../data/raw/Patients.csv", sep=";")
    .drop_duplicates()
)

#### Definition and grouping of different variables in the dataset

In [None]:
# Name of the column treated as the target in the distribution plots below.
target_col = "Mental Illness"

# Thematic grouping of the dataset columns.
# Keys are group labels (reused later as figure titles and output folder names);
# values are the column names belonging to each group.
# Some listed columns are dropped from `df` further down; the groups are then
# re-filtered against the remaining columns to build `categories`.
categories_column = {
    "Socio-demographic information": [
        "Survey Year","Program Category","Region Served","Age Group","Sex","Transgender","Sexual Orientation",
        "Hispanic Ethnicity","Race","Preferred Language","Religious Preference"
    ],
    "Family and social situation": [
        "Living Situation","Household Composition","Veteran Status","Criminal Justice Status",
        "Three Digit Residence Zip Code"
    ],
    "Professional and educational background": [
        "Employment Status","Number Of Hours Worked Each Week","Education Status","Special Education Services"
    ],
    "Developmental disorders": [
        "Serious Mental Illness","Intellectual Disability","Autism Spectrum","Other Developmental Disability"
    ],
    "Addictions / Substances": [
        "Alcohol Related Disorder","Drug Substance Disorder","Opioid Related Disorder",
        "Cannabis Recreational Use","Cannabis Medicinal Use","Smokes",
        "Received Smoking Medication","Received Smoking Counseling",
        "Alcohol 12m Service","Opioid 12m Service","Drug/Substance 12m Service"
    ],
    "Physical health / Medical conditions": [
        "Mobility Impairment Disorder","Hearing Impairment","Visual Impairment","Speech Impairment",
        "Hyperlipidemia","High Blood Pressure","Diabetes","Obesity","Heart Attack","Stroke","Other Cardiac",
        "Pulmonary Asthma","Alzheimer or Dementia","Kidney Disease","Liver Disease","Endocrine Condition",
        "Neurological Condition","Traumatic Brain Injury","Joint Disease","Cancer",
        "Other Chronic Med Condition","No Chronic Med Condition","Unknown Chronic Med Condition"
    ],
    "Medical diagnosis": [
        "Principal Diagnosis Class","Additional Diagnosis Class"
    ],
    "Financial and social assistance": [
        "SSI Cash Assistance","SSDI Cash Assistance","Veterans Disability Benefits","Veterans Cash Assistance",
        "Public Assistance Cash Program","Other Cash Benefits"
    ],
    "Health insurance": [
        "Medicaid and Medicare Insurance","No Insurance","Unknown Insurance Coverage","Medicaid Insurance",
        "Medicaid Managed Insurance","Medicare Insurance","Private Insurance","Child Health Plus Insurance","Other Insurance"
    ]
}

#### Visualization of the data


In [None]:
# Dataset dimensions, followed by a preview of the first five rows.
print(f"Shape du dataset : {df.shape}")
df.head(n=5)

In [None]:
# Preview of the last five rows.
df.tail(n=5)

In [None]:
# List the distinct values observed in every column, one line per column.
for col, series in df.items():
    uniques = series.unique()
    print(f"{col}: {uniques}")

In [None]:
# Per-column summary of the frame (dtype and non-null count).
df.info()

In [None]:
# How many columns there are of each dtype.
print()
print("Variable types :")
df.dtypes.value_counts()

In [None]:
# Split the column names by storage type: numeric, text (object) and boolean.
num_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(df.select_dtypes(include=["object"]).columns)
bool_cols = list(df.select_dtypes(include=["bool"]).columns)

In [None]:
# Report the dtype-based column groups; only the count is shown for the
# categorical columns.
print(f"\nNumerical variables : {num_cols}")
print(f"Categorical variables : {len(cat_cols)}")
print(f"Boolean variables : {bool_cols}")

<span style="color:gray">
Removal of the numerical columns and of the columns ‘Serious Mental Illness’, ‘Principal Diagnosis Class’ and ‘Additional Diagnosis Class’, as the latter are too closely related to the target variable; the categories dictionary is then updated accordingly.
</span>

In [None]:
# Information not useful for the study or too close to the target:
# every numeric column plus the three diagnosis-related columns is removed.
df = df.drop(
    columns=num_cols + ["Serious Mental Illness", "Principal Diagnosis Class", "Additional Diagnosis Class"]
)

# Rebuild the thematic groups so they only reference columns still present in df.
categories = {}
for cat, cols in categories_column.items():
    categories[cat] = [col for col in cols if col in df.columns]

<span style="color:gray">
Checking whether there are missing values
</span>

In [None]:
# Number of missing (NaN) entries per column.
df.isnull().sum()

#### Visualization of the data without treating missing values

In [None]:
# Target overview on the raw frame: distinct values, relative frequencies,
# and a count plot that is saved to disk and displayed.
print("Unique values of the target :", df[target_col].unique())
print("\nDistribution of the target :")
print(df[target_col].value_counts(normalize=True))

os.makedirs("../data/processed/df/graph", exist_ok=True)
fig, ax = plt.subplots()
sns.countplot(x=target_col, data=df, ax=ax)
ax.set_title("Distribution of the target")
fig.savefig("../data/processed/df/graph/target_distribution.png", bbox_inches='tight')
plt.show()

In [None]:
# Variables catégorielles
for cat, cols in categories.items():  
    if not cols:
        continue

    cat_dir = f"../data/processed/df/graph/univariate_analysis/{cat.replace('/', '_').replace(' ', '_')}"
    os.makedirs(cat_dir, exist_ok=True)

    plt.figure(figsize=(14, 4 * len(cols)))

    plot_idx = 1
    for col in cols:
        if col not in df.columns:
            continue

        # Affichage groupé par catégories
        plt.subplot(len(cols), 1, plot_idx)
        ax = sns.countplot(y=col, data=df, order=df[col].value_counts().index)

        # Ajouter les pourcentages sur les barres
        total = len(df[col].dropna())
        for container in ax.containers:
            ax.bar_label(container, labels=[f"{(x/total*100):.1f}%" for x in container.datavalues])

        plt.title(f"Distribution of {col}")
        plt.xlabel("Count")

        # Sauvegarde individuelle
        fig_indiv, ax_indiv = plt.subplots(figsize=(10, 6))
        ax2 = sns.countplot(y=col, data=df, order=df[col].value_counts().index, ax=ax_indiv)
        for container in ax2.containers:
            ax2.bar_label(container, labels=[f"{(x/total*100):.1f}%" for x in container.datavalues])
        ax_indiv.set_title(f"Distribution of {col} ({cat})")
        ax_indiv.set_xlabel("Count")
        plt.tight_layout()
        safe_col = col.replace('/', '_').replace(' ', '_')
        fig_indiv.savefig(f"{cat_dir}/{safe_col}_distribution.png", bbox_inches='tight')
        plt.close(fig_indiv)

        plot_idx += 1

    plt.suptitle(f"{cat}", fontsize=20, y=1.02)
    plt.tight_layout()
    plt.show()

#### Data cleansing

<span style="color:gray">
In our case, there are no missing values in the strict sense, but some can be considered as such. We duplicate the base DataFrame and replace these values with nan, so that they are recognised as missing by the pandas library.
</span>

In [None]:
# Work on a copy of df and turn the textual placeholders for "no information"
# into real NaN values so pandas counts them as missing.
data = df.copy()
missing_values_list = ['DATA NOT AVAILABLE', 'UNKNOWN', 'NOT APPLICABLE']

# Only the text columns that survived the earlier column drop are processed.
cat_cols_valid = [name for name in cat_cols if name in data.columns]
data[cat_cols_valid] = data[cat_cols_valid].replace(missing_values_list, np.nan)

# Missing-value count per column, largest first.
data.isna().sum().sort_values(ascending=False)


<span style="color:gray">
Columns with more than 40% missing values are deleted.
</span>

In [None]:
# Remove the two columns judged too sparse to keep.
sparse_columns = ["Number Of Hours Worked Each Week", "Special Education Services"]
data = data.drop(columns=sparse_columns)

In [None]:
# Missing-value count per remaining column, largest first.
data.isnull().sum().sort_values(ascending=False)

In [None]:
# (rows, columns) of `data` after the column removal.
data.shape

<span style="color:gray">
The data is duplicated again, this time to create a DataFrame containing only usable variables, removing all rows with missing values.
</span>

In [None]:
# Complete-case frame: keep only the rows of `data` with no missing value.
# dropna returns a new frame, so `data` itself is left untouched.
cleaned_data = data.dropna(axis=0, how="any")

In [None]:
# (rows, columns) of the complete-case frame.
cleaned_data.shape

<span style="color:gray">
In the data DataFrame, missing values are replaced with the most frequent modality.
</span>

In [None]:
# Fill the missing entries of every categorical column of `data` with that
# column's most frequent value, then print the remaining missing counts.
# NOTE(review): if target_col is among cat_cols, its missing values are imputed
# here as well — confirm that imputing the target is intended.
for col in data.columns:
    if col not in cat_cols:
        continue

    modes = data[col].mode()
    if modes.empty:
        # A column with no observed value has no mode; indexing [0] would raise.
        print(f"Column “{col}”: no observed value, imputation skipped")
        continue

    # mode() is sorted, so ties resolve to the first value in sort order.
    mode_value = modes.iloc[0]
    print(f"Column “{col}”: most frequent modality = '{mode_value}'")
    data[col] = data[col].fillna(mode_value)

print(data.isna().sum().sort_values(ascending=False))

#### Visualisation of distributions for the dataframe data

<span style="color:gray">
Visualisation of the distribution of the target
</span>

In [None]:
# Target overview on the imputed frame: distinct values, relative frequencies,
# and a count plot that is saved to disk and displayed.
print("Unique values of the target :", data[target_col].unique())
print("\nDistribution of the target :")
print(data[target_col].value_counts(normalize=True))

os.makedirs("../data/processed/data/graph", exist_ok=True)
fig, ax = plt.subplots()
sns.countplot(x=target_col, data=data, ax=ax)
ax.set_title("Distribution of the target")
fig.savefig("../data/processed/data/graph/target_distribution.png", bbox_inches='tight')
plt.show()

<span style="color:gray">
Visualisation of the distribution of each variable of the dataset (univariate analysis)
</span>

In [None]:
# Univariate analysis of the imputed frame `data`.
# For every thematic group: one count plot per column stacked in a single
# figure (shown inline), plus one PNG per column saved under the group's folder.
for cat, cols in categories.items():
    # `categories` was built against df; `data` has lost columns since then, so
    # filter first and size the grid on what will actually be drawn (otherwise
    # the figure keeps empty rows for the missing columns).
    plot_cols = [col for col in cols if col in data.columns]
    if not plot_cols:
        continue

    cat_dir = f"../data/processed/data/graph/univariate_analysis/{cat.replace('/', '_').replace(' ', '_')}"
    os.makedirs(cat_dir, exist_ok=True)

    # squeeze=False keeps a 2-D axes array even when the group has one column.
    fig_group, axes_group = plt.subplots(
        len(plot_cols), 1, figsize=(14, 4 * len(plot_cols)), squeeze=False
    )

    for ax, col in zip(axes_group[:, 0], plot_cols):
        order = data[col].value_counts().index
        # Percentages are expressed relative to the non-missing values.
        total = data[col].count()

        # Grouped display by category
        sns.countplot(y=col, data=data, order=order, ax=ax)
        for container in ax.containers:
            ax.bar_label(container, labels=[f"{(x/total*100):.1f}%" for x in container.datavalues])
        ax.set_title(f"Distribution of {col}")
        ax.set_xlabel("Count")

        # Same plot on its own figure, saved individually then closed to free memory
        fig_indiv, ax_indiv = plt.subplots(figsize=(10, 6))
        sns.countplot(y=col, data=data, order=order, ax=ax_indiv)
        for container in ax_indiv.containers:
            ax_indiv.bar_label(container, labels=[f"{(x/total*100):.1f}%" for x in container.datavalues])
        ax_indiv.set_title(f"Distribution of {col} ({cat})")
        ax_indiv.set_xlabel("Count")
        fig_indiv.tight_layout()
        safe_col = col.replace('/', '_').replace(' ', '_')
        fig_indiv.savefig(f"{cat_dir}/{safe_col}_distribution.png", bbox_inches='tight')
        plt.close(fig_indiv)

    fig_group.suptitle(f"{cat}", fontsize=20, y=1.02)
    fig_group.tight_layout()
    plt.show()

<span style="color:gray">
Visualisation of the distribution of each variable of the dataset in relation to the target (bivariate analysis)
</span>

In [None]:
# Bivariate analysis of `data`: for every thematic group, draw each column's
# count plot split by the target in a 3-column grid (shown inline), and save
# each column's plot as its own PNG under the group's output folder.
for cat, cols in categories.items():
    if not cols:
        continue
    # Keep only the columns still present in `data`.
    plot_cols = [col for col in cols if col in data.columns]
    if not plot_cols:
        continue

    n = len(plot_cols)
    ncols = 3  
    # Ceiling division: enough rows to hold n plots, 3 per row.
    nrows = (n + ncols - 1) // ncols

    plt.figure(figsize=(6 * ncols, 5 * nrows))

    # Output folder named after the group ('/' and spaces replaced by '_').
    cat_dir = f"../data/processed/data/graph/bivariate_analysis/{cat.replace('/', '_').replace(' ', '_')}"
    os.makedirs(cat_dir, exist_ok=True)

    for i, col in enumerate(plot_cols, 1):
        plt.subplot(nrows, ncols, i)

        ax = sns.countplot(
            data=data,
            x=col,
            hue=target_col,
            order=data[col].value_counts().index
        )
        # Bar labels are percentages of the whole frame, not of each category.
        total = len(data)
        for container in ax.containers:
            ax.bar_label(container, 
                         labels=[f"{(v/total*100):.1f}%" for v in container.datavalues],
                         fontsize=8)

        plt.title(f"{col}", fontsize=12)
        plt.xticks(rotation=45, ha="right")
        plt.ylabel("Number of observations")
        plt.legend(title=target_col)

        # Individual save: the same plot redrawn on its own figure
        fig_indiv, ax_indiv = plt.subplots(figsize=(8, 5))
        ax2 = sns.countplot(
            data=data,
            x=col,
            hue=target_col,
            order=data[col].value_counts().index,
            ax=ax_indiv
        )
        for container in ax2.containers:
            ax2.bar_label(container, 
                          labels=[f"{(v/total*100):.1f}%" for v in container.datavalues],
                          fontsize=8)
        ax_indiv.set_title(f"{col} ({cat})", fontsize=12)
        ax_indiv.set_xticklabels(ax_indiv.get_xticklabels(), rotation=45, ha="right")
        ax_indiv.set_ylabel("Number of observations")
        ax_indiv.legend(title=target_col)
        # fig_indiv is the current figure here, so this lays out the individual plot.
        plt.tight_layout()
        safe_col = col.replace('/', '_').replace(' ', '_')
        fig_indiv.savefig(f"{cat_dir}/{safe_col}_target_effectif.png", bbox_inches='tight')
        # NOTE(review): the next plt.subplot call relies on pyplot making the
        # grouped figure current again once fig_indiv is closed — confirm.
        plt.close(fig_indiv)

    plt.suptitle(f"{cat}", fontsize=20, y=1.02)
    plt.tight_layout()
    plt.show()

#### Visualisation of distributions for the dataframe cleaned_data (without any missing values)

<span style="color:gray">
Visualisation of the distribution of the target
</span>

In [None]:
# Target overview on the complete-case frame: distinct values, relative
# frequencies, and a count plot that is saved to disk and displayed.
# Every statistic is read from cleaned_data (the unique values were previously
# taken from `data` by mistake).
print("Unique values of the target :", cleaned_data[target_col].unique())
print("\nDistribution of the target :")
print(cleaned_data[target_col].value_counts(normalize=True))

os.makedirs("../data/processed/cleaned_data/graph", exist_ok=True)
plt.figure()
sns.countplot(x=target_col, data=cleaned_data)
plt.title("Distribution of the target")
plt.savefig("../data/processed/cleaned_data/graph/target_distribution.png", bbox_inches='tight')
plt.show()

<span style="color:gray">
Visualisation of the distribution of each variable of the dataset (univariate analysis)
</span>

In [None]:
# Univariate analysis of the complete-case frame `cleaned_data`.
# For every thematic group: one count plot per column stacked in a single
# figure (shown inline), plus one PNG per column saved under the group's folder.
for cat, cols in categories.items():
    # Filter against the frame that is actually plotted, and size the grid on
    # the surviving columns so no empty subplot rows are left in the figure.
    plot_cols = [col for col in cols if col in cleaned_data.columns]
    if not plot_cols:
        continue

    cat_dir = f"../data/processed/cleaned_data/graph/univariate_analysis/{cat.replace('/', '_').replace(' ', '_')}"
    os.makedirs(cat_dir, exist_ok=True)

    # squeeze=False keeps a 2-D axes array even when the group has one column.
    fig_group, axes_group = plt.subplots(
        len(plot_cols), 1, figsize=(14, 4 * len(plot_cols)), squeeze=False
    )

    for ax, col in zip(axes_group[:, 0], plot_cols):
        order = cleaned_data[col].value_counts().index
        # Percentages are expressed relative to the non-missing values.
        total = cleaned_data[col].count()

        # Grouped display by category
        sns.countplot(y=col, data=cleaned_data, order=order, ax=ax)
        for container in ax.containers:
            ax.bar_label(container, labels=[f"{(x/total*100):.1f}%" for x in container.datavalues])
        ax.set_title(f"Distribution of {col}")
        ax.set_xlabel("Count")

        # Same plot on its own figure, saved individually then closed to free memory
        fig_indiv, ax_indiv = plt.subplots(figsize=(10, 6))
        sns.countplot(y=col, data=cleaned_data, order=order, ax=ax_indiv)
        for container in ax_indiv.containers:
            ax_indiv.bar_label(container, labels=[f"{(x/total*100):.1f}%" for x in container.datavalues])
        ax_indiv.set_title(f"Distribution of {col} ({cat})")
        ax_indiv.set_xlabel("Count")
        fig_indiv.tight_layout()
        safe_col = col.replace('/', '_').replace(' ', '_')
        fig_indiv.savefig(f"{cat_dir}/{safe_col}_distribution.png", bbox_inches='tight')
        plt.close(fig_indiv)

    fig_group.suptitle(f"{cat}", fontsize=20, y=1.02)
    fig_group.tight_layout()
    plt.show()

<span style="color:gray">
Visualisation of the distribution of each variable of the dataset in relation to the target (bivariate analysis)
</span>

In [None]:
# Bivariate analysis of `cleaned_data`: for every thematic group, draw each
# column's count plot split by the target in a 3-column grid (shown inline),
# and save each column's plot as its own PNG under the group's output folder.
for cat, cols in categories.items():
    if not cols:
        continue
    # Filter against the frame that is actually plotted (cleaned_data, not data).
    plot_cols = [col for col in cols if col in cleaned_data.columns]
    if not plot_cols:
        continue

    n = len(plot_cols)
    ncols = 3
    # Ceiling division: enough rows to hold n plots, 3 per row.
    nrows = (n + ncols - 1) // ncols

    plt.figure(figsize=(6 * ncols, 5 * nrows))

    # Output folder
    cat_dir = f"../data/processed/cleaned_data/graph/bivariate_analysis/{cat.replace('/', '_').replace(' ', '_')}"
    os.makedirs(cat_dir, exist_ok=True)

    # Bar labels are percentages of the whole frame, not of each category.
    total = len(cleaned_data)

    for i, col in enumerate(plot_cols, 1):
        order = cleaned_data[col].value_counts().index

        plt.subplot(nrows, ncols, i)
        ax = sns.countplot(
            data=cleaned_data,
            x=col,
            hue=target_col,
            order=order
        )
        for container in ax.containers:
            ax.bar_label(container,
                         labels=[f"{(v/total*100):.1f}%" for v in container.datavalues],
                         fontsize=8)

        plt.title(f"{col}", fontsize=12)
        plt.xticks(rotation=45, ha="right")
        plt.ylabel("Number of observations")
        plt.legend(title=target_col)

        # Individual save: the same plot redrawn on its own figure
        fig_indiv, ax_indiv = plt.subplots(figsize=(8, 5))
        ax2 = sns.countplot(
            data=cleaned_data,
            x=col,
            hue=target_col,
            order=order,
            ax=ax_indiv
        )
        for container in ax2.containers:
            ax2.bar_label(container,
                          labels=[f"{(v/total*100):.1f}%" for v in container.datavalues],
                          fontsize=8)
        ax_indiv.set_title(f"{col} ({cat})", fontsize=12)
        ax_indiv.set_xticklabels(ax_indiv.get_xticklabels(), rotation=45, ha="right")
        ax_indiv.set_ylabel("Number of observations")
        ax_indiv.legend(title=target_col)
        # Lay out the individual figure explicitly rather than via the current-figure state.
        fig_indiv.tight_layout()
        safe_col = col.replace('/', '_').replace(' ', '_')
        fig_indiv.savefig(f"{cat_dir}/{safe_col}_target_effectif.png", bbox_inches='tight')
        # NOTE(review): the next plt.subplot call relies on pyplot making the
        # grouped figure current again once fig_indiv is closed — confirm.
        plt.close(fig_indiv)

    plt.suptitle(f"{cat}", fontsize=20, y=1.02)
    plt.tight_layout()
    plt.show()

#### Creation of a dataframe using some feature engineering

#### Saves each dataframe in its dedicated folder for use in other notebooks.

In [None]:
# Persist the three frames for the downstream notebooks:
#   df           – deduplicated data with leakage/numeric columns removed
#   data         – placeholders turned into NaN, sparse columns dropped, mode-imputed
#   cleaned_data – complete cases only
# The target folders are created explicitly so this cell does not depend on the
# plotting cells having created them as a side effect.
for frame, out_path in (
    (df, "../data/processed/df/df.csv"),
    (data, "../data/processed/data/data.csv"),
    (cleaned_data, "../data/processed/cleaned_data/cleaned_data.csv"),
):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    frame.to_csv(out_path, index=False)