In [None]:
import sys
from pathlib import Path
import pandas as pd
import os

# Locate the project root and make it the working directory / import root.
#
# The root is the nearest directory (starting at the current working directory
# and walking upwards) that contains the `src` package imported below. Searching
# instead of blindly taking `cwd.parent` keeps this cell idempotent: once the
# cell has changed into the root, re-running it resolves to the same directory
# rather than to the root's parent. If no `src` directory is found, fall back
# to the parent of the working directory (the notebook is expected to live in
# <project>/notebooks).
_cwd = Path.cwd()
PROJECT_ROOT = next(
    (folder for folder in (_cwd, *_cwd.parents) if (folder / "src").is_dir()),
    _cwd.parent,
)

# When started from inside a "notebooks" directory, run from the project root
if "notebooks" in _cwd.parts:
    os.chdir(PROJECT_ROOT)

# Add project root to Python path (only once)
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Import custom utility functions
from src.config.utils import (
    read_csv,
    aggregate_count,
    aggregate_sum,
    write_excel
)

from src.config.config import DATA_DIR, RESULTS_DIR, MOMENT_OF_SUICIDE_FEATURES, SOCIO_DEMOGRAPHIC_FEATURES

In [2]:
# Load the two processed inputs: the encoded records and the group assignments
processed_dir = DATA_DIR / "processed"
df_encoded = read_csv(processed_dir / "encoded_data.csv")
csv_file_path = processed_dir / "group_set.csv"
df_groups = read_csv(csv_file_path)

In [3]:
# De-duplicated union of both configured feature lists (order is not significant)
FEATURES = list(set(SOCIO_DEMOGRAPHIC_FEATURES + MOMENT_OF_SUICIDE_FEATURES))

# Encoded columns whose name begins with one of the feature names
feature_prefixes = tuple(FEATURES)
columns_to_aggregate = [
    name for name in df_encoded.columns if name.startswith(feature_prefixes)
]

# Grouping columns are the ones whose name begins with "Group"
group_columns = [name for name in df_groups.columns if name.startswith("Group")]

In [4]:
# Attach the group labels to every encoded record (left join on "ID")
group_lookup = df_groups[["ID", *group_columns]]
df = df_encoded.merge(group_lookup, on="ID", how="left")

In [5]:
# Keep only the year, the grouping columns and the encoded feature columns.
# The explicit .copy() detaches the selection from the merged frame, so later
# column assignments on `df` (such as casting "DateY") act on an independent
# frame and do not trigger pandas' SettingWithCopyWarning.
df = df[["DateY"] + group_columns + columns_to_aggregate].copy()

In [14]:
# Store the year column as integers
year_as_int = df["DateY"].astype(int)
df["DateY"] = year_as_int

# Group Count

In [7]:
# Aggregate counts over the four grouping columns with the project helper.
# Judging by the displayed output, the result has one row per (Column, Value)
# pair and a single "Count" column; rows are then ordered by index.
count_group_columns = ["Group_A", "Group_AF", "Group_AG", "Group_AGF"]
count_result = aggregate_count(
    df, group_columns=count_group_columns, header="Count"
).sort_index()
count_result

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Column,Value,Unnamed: 2_level_1
Group_A,00_18,10777
Group_A,19_34,39482
Group_A,35_64,62147
Group_A,65,14628
Group_AF,00_18_0,9435
Group_AF,00_18_1,1342
Group_AF,19_34_0,25729
Group_AF,19_34_1,13753
Group_AF,35_64_0,29480
Group_AF,35_64_1,32667


In [8]:
# Save the group counts to the "count_result" sheet of the results workbook.
# NOTE(review): mode="w" presumably creates/overwrites the file — confirm in write_excel.
count_workbook_path = RESULTS_DIR / "group_exploration.xlsx"
write_excel(
    data=count_result,
    file_path=count_workbook_path,
    sheet_name="count_result",
    index=True,
    mode="w",
)

# Feature Count

In [9]:
# Sum every encoded feature column over the four grouping columns with the
# project helper, then order the rows by index so they line up with the
# sorted count table.
sum_group_columns = ["Group_A", "Group_AF", "Group_AG", "Group_AGF"]
feature_result = aggregate_sum(
    df, group_columns=sum_group_columns, value_columns=columns_to_aggregate
).sort_index()
feature_result

Unnamed: 0_level_0,Unnamed: 1_level_0,Gender,Marital_Cohabitant,Marital_Cohabiting,Marital_Divorced,Marital_Married,Marital_Separated,Marital_Single,Marital_Widowed,Education_Higher,Education_LowerSecondary,...,Context_Other,Context_FamilyConflict,Context_SchoolWork,Context_MentalHealth,Context_HealthLoss,Context_HeartBreak,Context_CloseDeath,Context_Finances,Context_Crime,Context_Disability
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Group_A,00_18,3416,9,2,2,12,0,10752,0,0,252,...,1207,1585,1275,4674,63,1919,169,24,28,9
Group_A,19_34,29398,2455,289,1109,6163,68,29341,57,3192,445,...,5629,6294,979,13949,671,9397,716,1950,285,74
Group_A,35_64,48912,3365,451,8117,32311,458,15312,2133,7178,64,...,10603,11213,942,22913,3906,5530,1629,5256,533,340
Group_A,65,11016,210,27,946,7829,50,1048,4518,1361,10,...,1994,915,38,5384,4708,133,721,562,47,251
Group_AF,00_18_0,2528,9,2,0,9,0,9415,0,0,230,...,1038,1415,1110,4157,56,1631,148,18,20,9
Group_AF,00_18_1,888,0,0,2,3,0,1337,0,0,22,...,169,170,165,517,7,288,21,6,8,0
Group_AF,19_34_0,17247,1688,215,696,4024,39,19023,44,2035,318,...,3423,4274,738,9153,375,6315,493,1174,136,39
Group_AF,19_34_1,12151,767,74,413,2139,29,10318,13,1157,127,...,2206,2020,241,4796,296,3082,223,776,149,35
Group_AF,35_64_0,20649,2002,315,3588,15023,182,7408,962,3722,39,...,4363,6326,558,10720,1213,3455,865,2135,207,139
Group_AF,35_64_1,28263,1363,136,4529,17288,276,7904,1171,3456,25,...,6240,4887,384,12193,2693,2075,764,3121,326,201


In [10]:
# Add the per-group feature sums as the "feature_result" sheet.
# NOTE(review): mode="a" presumably appends to the existing workbook — confirm in write_excel.
feature_workbook_path = RESULTS_DIR / "group_exploration.xlsx"
write_excel(
    data=feature_result,
    file_path=feature_workbook_path,
    sheet_name="feature_result",
    index=True,
    mode="a",
)

# Feature Percentage 

In [11]:
# Guard: the row-wise division below is only meaningful when both tables
# carry identical indices.
indices_match = feature_result.index.equals(count_result.index)
if not indices_match:
    raise ValueError("Indices of feature_result and count_result do not match.")

# Feature sums as a percentage of each row's "Count", rounded to one decimal
group_sizes = count_result["Count"]
percentage_feature_result = (feature_result.div(group_sizes, axis=0) * 100).round(1)

percentage_feature_result


Unnamed: 0_level_0,Unnamed: 1_level_0,Gender,Marital_Cohabitant,Marital_Cohabiting,Marital_Divorced,Marital_Married,Marital_Separated,Marital_Single,Marital_Widowed,Education_Higher,Education_LowerSecondary,...,Context_Other,Context_FamilyConflict,Context_SchoolWork,Context_MentalHealth,Context_HealthLoss,Context_HeartBreak,Context_CloseDeath,Context_Finances,Context_Crime,Context_Disability
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Group_A,00_18,31.7,0.1,0.0,0.0,0.1,0.0,99.8,0.0,0.0,2.3,...,11.2,14.7,11.8,43.4,0.6,17.8,1.6,0.2,0.3,0.1
Group_A,19_34,74.5,6.2,0.7,2.8,15.6,0.2,74.3,0.1,8.1,1.1,...,14.3,15.9,2.5,35.3,1.7,23.8,1.8,4.9,0.7,0.2
Group_A,35_64,78.7,5.4,0.7,13.1,52.0,0.7,24.6,3.4,11.6,0.1,...,17.1,18.0,1.5,36.9,6.3,8.9,2.6,8.5,0.9,0.5
Group_A,65,75.3,1.4,0.2,6.5,53.5,0.3,7.2,30.9,9.3,0.1,...,13.6,6.3,0.3,36.8,32.2,0.9,4.9,3.8,0.3,1.7
Group_AF,00_18_0,26.8,0.1,0.0,0.0,0.1,0.0,99.8,0.0,0.0,2.4,...,11.0,15.0,11.8,44.1,0.6,17.3,1.6,0.2,0.2,0.1
Group_AF,00_18_1,66.2,0.0,0.0,0.1,0.2,0.0,99.6,0.0,0.0,1.6,...,12.6,12.7,12.3,38.5,0.5,21.5,1.6,0.4,0.6,0.0
Group_AF,19_34_0,67.0,6.6,0.8,2.7,15.6,0.2,73.9,0.2,7.9,1.2,...,13.3,16.6,2.9,35.6,1.5,24.5,1.9,4.6,0.5,0.2
Group_AF,19_34_1,88.4,5.6,0.5,3.0,15.6,0.2,75.0,0.1,8.4,0.9,...,16.0,14.7,1.8,34.9,2.2,22.4,1.6,5.6,1.1,0.3
Group_AF,35_64_0,70.0,6.8,1.1,12.2,51.0,0.6,25.1,3.3,12.6,0.1,...,14.8,21.5,1.9,36.4,4.1,11.7,2.9,7.2,0.7,0.5
Group_AF,35_64_1,86.5,4.2,0.4,13.9,52.9,0.8,24.2,3.6,10.6,0.1,...,19.1,15.0,1.2,37.3,8.2,6.4,2.3,9.6,1.0,0.6


In [12]:
# Add the per-group feature percentages as the "percentage_feature_result" sheet.
# NOTE(review): mode="a" presumably appends to the existing workbook — confirm in write_excel.
percentage_workbook_path = RESULTS_DIR / "group_exploration.xlsx"
write_excel(
    data=percentage_feature_result,
    file_path=percentage_workbook_path,
    sheet_name="percentage_feature_result",
    index=True,
    mode="a",
)

# Group Count Years

In [15]:
# Aggregate counts over the four grouping columns, broken down by "DateY".
# Judging by the displayed output, each distinct year becomes its own column;
# rows are then ordered by index.
yearly_group_columns = ["Group_A", "Group_AF", "Group_AG", "Group_AGF"]
count_years_result = aggregate_count(
    df, group_columns=yearly_group_columns, value_columns=["DateY"]
).sort_index()
count_years_result

Unnamed: 0_level_0,Unnamed: 1_level_0,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Group_A,00_18,356,442,481,475,730,772,950,843,1496,2093,2139
Group_A,19_34,2265,3137,3200,3056,3605,3473,3772,3729,4342,4359,4544
Group_A,35_64,4297,5115,5081,5011,5518,5616,5734,5960,6396,6488,6931
Group_A,65,994,1146,1109,1215,1277,1299,1491,1474,1541,1570,1512
Group_AF,00_18_0,209,315,362,372,614,675,852,736,1369,1937,1994
Group_AF,00_18_1,147,127,119,103,116,97,98,107,127,156,145
Group_AF,19_34_0,934,1724,1824,1812,2347,2302,2536,2513,3148,3210,3379
Group_AF,19_34_1,1331,1413,1376,1244,1258,1171,1236,1216,1194,1149,1165
Group_AF,35_64_0,1016,1723,1877,2024,2605,2718,2933,3207,3606,3748,4023
Group_AF,35_64_1,3281,3392,3204,2987,2913,2898,2801,2753,2790,2740,2908


In [21]:
# Add the per-group yearly counts as the "count_years_result" sheet.
# NOTE(review): mode="a" presumably appends to the existing workbook — confirm in write_excel.
yearly_workbook_path = RESULTS_DIR / "group_exploration.xlsx"
write_excel(
    data=count_years_result,
    file_path=yearly_workbook_path,
    sheet_name="count_years_result",
    index=True,
    mode="a",
)

In [20]:
# Yearly totals within each "Column" group, broadcast back onto every row so
# the frame keeps the same shape as count_years_result
group_year_totals = count_years_result.groupby("Column").transform("sum")

# Each count as a percentage of its group's total for that year (one decimal)
percentage_years_result = (
    count_years_result.div(group_year_totals).mul(100).round(1)
)

percentage_years_result

Unnamed: 0_level_0,Unnamed: 1_level_0,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Group_A,00_18,4.5,4.5,4.9,4.9,6.6,6.9,8.0,7.0,10.9,14.4,14.1
Group_A,19_34,28.6,31.9,32.4,31.3,32.4,31.1,31.6,31.1,31.5,30.0,30.0
Group_A,35_64,54.3,52.0,51.5,51.4,49.6,50.3,48.0,49.6,46.4,44.7,45.8
Group_A,65,12.6,11.6,11.2,12.5,11.5,11.6,12.5,12.3,11.2,10.8,10.0
Group_AF,00_18_0,2.6,3.2,3.7,3.8,5.5,6.0,7.1,6.1,9.9,13.3,13.2
Group_AF,00_18_1,1.9,1.3,1.2,1.1,1.0,0.9,0.8,0.9,0.9,1.1,1.0
Group_AF,19_34_0,11.8,17.5,18.5,18.6,21.1,20.6,21.2,20.9,22.9,22.1,22.3
Group_AF,19_34_1,16.8,14.4,13.9,12.7,11.3,10.5,10.3,10.1,8.7,7.9,7.7
Group_AF,35_64_0,12.8,17.5,19.0,20.7,23.4,24.4,24.6,26.7,26.2,25.8,26.6
Group_AF,35_64_1,41.5,34.5,32.5,30.6,26.2,26.0,23.4,22.9,20.3,18.9,19.2


In [22]:
# Add the per-group yearly percentages as the "percentage_years_result" sheet.
# NOTE(review): mode="a" presumably appends to the existing workbook — confirm in write_excel.
yearly_percentage_workbook_path = RESULTS_DIR / "group_exploration.xlsx"
write_excel(
    data=percentage_years_result,
    file_path=yearly_percentage_workbook_path,
    sheet_name="percentage_years_result",
    index=True,
    mode="a",
)