In [1]:
import sys
from pathlib import Path
import pandas as pd
import os

# Add project root to Python path.
# The parent directory is only the project root while the kernel's working
# directory is inside the `notebooks` folder. Once this cell has changed
# directory (or when the kernel was started from the root), the current
# directory already is the root, so re-running the cell must not climb further.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if "notebooks" in _cwd.parts else _cwd

if PROJECT_ROOT != _cwd:
    # Relative paths used later in the notebook are resolved from the root.
    os.chdir(PROJECT_ROOT)

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Import custom utility functions
from src.config.utils import (
    read_csv,
    left_join_excel_sheets,
    aggregate_count,
    aggregate_sum,
    write_excel,
)

from src.config.config import DATA_DIR, RESULTS_DIR, MOMENT_OF_SUICIDE_FEATURES, SOCIO_DEMOGRAPHIC_FEATURES

In [40]:
# Load the encoded dataset from the "processed" data folder.
# NOTE(review): assumes DATA_DIR is a pathlib.Path - confirm in src.config.config.
csv_file_path = DATA_DIR.joinpath("processed", "encoded_data.csv")
df_encoded = read_csv(csv_file_path)

In [41]:
# Load the LCA class assignments from the results workbook.
# NOTE(review): presumably the helper joins the workbook's sheets on "ID",
# starting from no base frame - confirm in src.config.utils.
excel_file_path = DATA_DIR.joinpath("processed", "lca_group_results.xlsx")
df_lca_classes = left_join_excel_sheets(
    excel_file_path,
    base_df=None,
    on="ID",
)

In [42]:
# Load the group-set table from the "processed" data folder.
# NOTE(review): assumes DATA_DIR is a pathlib.Path - confirm in src.config.config.
csv_file_path = DATA_DIR.joinpath("processed", "group_set.csv")
df_groups = read_csv(csv_file_path)

In [43]:
# De-duplicated feature name prefixes taken from both feature lists.
FEATURES = list({*SOCIO_DEMOGRAPHIC_FEATURES, *MOMENT_OF_SUICIDE_FEATURES})

# Encoded columns whose name begins with any of the feature prefixes
# (str.startswith accepts a tuple of candidate prefixes).
_feature_prefixes = tuple(FEATURES)
columns_to_aggregate = [
    name for name in df_encoded.columns if name.startswith(_feature_prefixes)
]

# Columns of the LCA results whose name ends with "_class".
class_columns = [name for name in df_lca_classes.columns if name.endswith("_class")]

# Columns of the group table whose name begins with "Group".
group_columns = [name for name in df_groups.columns if name.startswith("Group")]

In [44]:
# Attach the LCA class columns to the encoded data; the left join keeps every
# encoded row, matched on "ID".
df = pd.merge(
    df_encoded,
    df_lca_classes[["ID", *class_columns]],
    how="left",
    on="ID",
)

In [45]:
# Attach the group columns as well, again with a left join on "ID".
df = pd.merge(
    df,
    df_groups[["ID", *group_columns]],
    how="left",
    on="ID",
)

In [46]:
# Keep only the year, the LCA class columns, the feature columns and the
# group columns, in that order.
selected_columns = ["DateY", *class_columns, *columns_to_aggregate, *group_columns]
df = df[selected_columns]

In [47]:
# Cast the year column to integer.
# `assign` returns a new frame rather than writing into `df`; `df` is the
# result of a column selection, and item assignment on such a frame triggers
# pandas' SettingWithCopyWarning when copy-on-write is not enabled.
# NOTE: astype(int) raises if DateY holds missing values.
df = df.assign(DateY=df["DateY"].astype(int))

In [48]:
# Display the columns of the merged frame as a quick sanity check.
df.columns

Index(['DateY', 'LCA_Group_AF_class', 'LCA_Group_AG_class',
       'LCA_Group_AGF_class', 'Gender', 'Marital_Cohabitant',
       'Marital_Cohabiting', 'Marital_Divorced', 'Marital_Married',
       'Marital_Separated', 'Marital_Single', 'Marital_Widowed',
       'Education_Higher', 'Education_LowerSecondary', 'Education_PrePrimary',
       'Education_Primary', 'Education_Secondary', 'Education_Vocational',
       'WorkInfo_Agriculturalist', 'WorkInfo_Employed', 'WorkInfo_Student',
       'WorkInfo_Unemployed', 'Income_Benefits', 'Income_Dependent',
       'Income_NoSteady', 'Income_Steady', 'Fatal', 'Place_Forest',
       'Place_House', 'Place_Institution', 'Place_Isolation', 'Place_Other',
       'Place_PoliceArmy', 'Place_Railway', 'Place_Road', 'Place_School',
       'Place_UtilitySpaces', 'Place_WaterRes', 'Place_Work',
       'Method_Drowning', 'Method_Drugs', 'Method_Gas', 'Method_Hanging',
       'Method_Jumping', 'Method_Other', 'Method_Poisoning',
       'Method_Schooting', 'Me

# LCA class exploration per group set (Group_AF, Group_AG, Group_AGF)

In [49]:
# Group sets to export, fixed explicitly (replaces the list derived from df_groups).
group_columns = [f"Group_{suffix}" for suffix in ("AF", "AG", "AGF")]

In [50]:
# Write one Excel workbook per group set. Each workbook receives:
#   "cr"          - aggregate_count of the LCA class column by group
#   "fc_<group>"  - aggregate_sum of the feature columns by LCA class, one group at a time
#   "cy_<group>"  - aggregate_count of DateY by LCA class, one group at a time
# The exact table layout is defined by the helpers in src.config.utils.
for group_column in group_columns:
    # Name of the LCA class column that belongs to this group set.
    class_column = f"LCA_{group_column}_class"

    results_path = (
        RESULTS_DIR
        / "lca_class_exploration"
        / f"lca_class_exploration_{group_column}.xlsx"
    )
    # Make sure the output folder exists before the first write.
    results_path.parent.mkdir(parents=True, exist_ok=True)

    # In group
    count_result = aggregate_count(
        df,
        group_columns=[group_column],
        value_columns=[class_column],
        header="Count",
    )

    # First write uses mode "w"; every later sheet is added with mode "a".
    write_excel(
        file_path=results_path,
        data=count_result,
        sheet_name="cr",
        mode="w",
        index=True,
    )

    # Rows without a match in the earlier left merges carry NaN in the group
    # column. NaN never equals itself, so it would yield empty slices, repeated
    # "*_nan" sheet names and (with string labels) an unsortable mix of types.
    # Drop it before enumerating the groups.
    groups = sorted(df[group_column].dropna().unique().tolist())

    for group in groups:
        df_exploration = df[df[group_column] == group]

        # In Features
        feature_result = aggregate_sum(
            df_exploration,
            group_columns=[class_column],
            value_columns=columns_to_aggregate,
        )
        feature_result = feature_result.sort_index()

        write_excel(
            file_path=results_path,
            data=feature_result,
            sheet_name=f"fc_{group}",
            mode="a",
            index=True,
        )

        # In Years
        count_years_result = aggregate_count(
            df_exploration,
            group_columns=[class_column],
            value_columns=["DateY"],
        )
        count_years_result = count_years_result.sort_index()

        write_excel(
            file_path=results_path,
            data=count_years_result,
            sheet_name=f"cy_{group}",
            mode="a",
            index=True,
        )
