# Importing Data

In [None]:
#importing libraries
import pandas as pd
from IPython.display import display
import numpy as np

In [65]:
# Read the three raw CSV files that live in the local data/ folder.
suicide_data, hdi_data, gini_coef_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/suicide_rates_1990-2022.csv',
        'data/hdi.csv',
        'data/gini-coefficient.csv',
    )
)


In [66]:
# Preview the first five suicide-rate records.
suicide_data.head(n=5)

Unnamed: 0,RegionCode,RegionName,CountryCode,CountryName,Year,Sex,SuicideCount,CauseSpecificDeathPercentage,StdDeathRate,DeathRatePer100K,Population,GDP,GDPPerCapita,GNI,GNIPerCapita,InflationRate,EmploymentPopulationRatio
0,EU,Europe,ALB,Albania,1992,Male,33,0.331959,2.335802,2.076386,3247039.0,652175000.0,200.85222,906184200.0,1740.0,226.005421,45.315
1,EU,Europe,ALB,Albania,1992,Female,14,0.19186,0.86642,0.874563,3247039.0,652175000.0,200.85222,906184200.0,1740.0,226.005421,45.315
2,EU,Europe,ALB,Albania,1993,Male,46,0.477724,3.330938,2.937233,3227287.0,1185315000.0,367.279225,1024263000.0,2110.0,85.004751,47.798
3,EU,Europe,ALB,Albania,1993,Female,27,0.385164,1.755077,1.686025,3227287.0,1185315000.0,367.279225,1024263000.0,2110.0,85.004751,47.798
4,EU,Europe,ALB,Albania,1994,Male,37,0.419406,2.678796,2.332619,3207536.0,1880951000.0,586.416135,1216681000.0,2300.0,22.565053,50.086


In [67]:
# Preview the first five Human Development Index records.
hdi_data.head(n=5)

Unnamed: 0,ISO3,Country,Human Development Groups,UNDP Developing Regions,HDI Rank (2021),Human Development Index (1990),Human Development Index (1991),Human Development Index (1992),Human Development Index (1993),Human Development Index (1994),...,Material footprint per capita (tonnes) (2012),Material footprint per capita (tonnes) (2013),Material footprint per capita (tonnes) (2014),Material footprint per capita (tonnes) (2015),Material footprint per capita (tonnes) (2016),Material footprint per capita (tonnes) (2017),Material footprint per capita (tonnes) (2018),Material footprint per capita (tonnes) (2019),Material footprint per capita (tonnes) (2020),Material footprint per capita (tonnes) (2021)
0,AFG,Afghanistan,Low,SA,180.0,0.273,0.279,0.287,0.297,0.292,...,1.86,1.88,1.66,1.62,1.66,1.41,1.32,1.38,1.38,1.38
1,AGO,Angola,Medium,SSA,148.0,,,,,,...,4.09,4.53,3.97,3.59,2.79,2.64,2.28,2.18,2.18,2.18
2,ALB,Albania,High,ECA,67.0,0.647,0.629,0.614,0.617,0.624,...,12.44,11.49,13.14,12.61,14.39,14.46,12.85,12.96,12.96,12.96
3,AND,Andorra,Very High,,40.0,,,,,,...,,,,,,,,,,
4,ARE,United Arab Emirates,Very High,AS,26.0,0.728,0.739,0.742,0.748,0.755,...,49.56,49.68,55.49,59.76,64.95,75.61,65.97,68.95,68.95,68.95


In [68]:
# Preview the first five Gini-coefficient records.
gini_coef_data.head(n=5)

Unnamed: 0,Country,Year,Gini coefficient
0,Albania,1996,0.270103
1,Albania,2002,0.31739
2,Albania,2005,0.305957
3,Albania,2008,0.299847
4,Albania,2012,0.289605


## Summary of the Dataset

 - List all the features (columns) in each dataset.
 - How many records does each dataset contain?

In [69]:
# Print the column names and the row count of each raw dataset.
datasets_to_describe = [
    ("Suicide Dataset Summary:", suicide_data),
    ("Human Development Index Dataset Summary:", hdi_data),
    ("Gini Coefficient Dataset Summary:", gini_coef_data),
]

for position, (heading, frame) in enumerate(datasets_to_describe):
    if position > 0:
        # blank-line separator between two consecutive summaries
        print("\n")
    print(heading)
    print("-----------------------")
    print("All Features:")
    print(list(frame.columns))
    print("Total Entries:", frame.shape[0])

Suicide Dataset Summary:
-----------------------
All Features:
['RegionCode', 'RegionName', 'CountryCode', 'CountryName', 'Year', 'Sex', 'SuicideCount', 'CauseSpecificDeathPercentage', 'StdDeathRate', 'DeathRatePer100K', 'Population', 'GDP', 'GDPPerCapita', 'GNI', 'GNIPerCapita', 'InflationRate', 'EmploymentPopulationRatio']
Total Entries: 5928


Human Development Index Dataset Summary:
-----------------------
All Features:
['ISO3', 'Country', 'Human Development Groups', 'UNDP Developing Regions', 'HDI Rank (2021)', 'Human Development Index (1990)', 'Human Development Index (1991)', 'Human Development Index (1992)', 'Human Development Index (1993)', 'Human Development Index (1994)', 'Human Development Index (1995)', 'Human Development Index (1996)', 'Human Development Index (1997)', 'Human Development Index (1998)', 'Human Development Index (1999)', 'Human Development Index (2000)', 'Human Development Index (2001)', 'Human Development Index (2002)', 'Human Development Index (2003)', 'H

### Cleaning the data

- Check for missing values (`isnull`), then decide how to replace or reformat them.
- Are there any missing values that were filled in using a default?
- Which parts of the data were entered by humans? Are there any misspellings?
- Combining datasets: group by?
- Merging columns if needed?
- Cleaning the data format of entries if needed.
- Plots.

In [70]:
# Suicide data first: find the rows in which too many columns are empty.

# Largest share of columns (in percent) a row may be missing before it is flagged.
null_percent = 25

total_rows, total_cols = suicide_data.shape
null_count_per_row = suicide_data.isna().sum(axis=1)
row_null_pct = null_count_per_row.div(total_cols).mul(100)

# Independent copy of the flagged rows so they can be inspected safely.
rows_greater_than_null_perc = suicide_data[row_null_pct.gt(null_percent)].copy()
number_of_null = len(rows_greater_than_null_perc)

# Guard against an empty frame when turning the count into a percentage.
percent_null_rows = (number_of_null / total_rows) * 100 if total_rows > 0 else 0.0
percent_str = f"{percent_null_rows:.2f}"

print(f"total rows: {total_rows}")
print(f"Rows with >{null_percent}% nulls: {number_of_null} ({percent_str}%)\n")

# Within the flagged rows, rank the columns by how often they are empty.
print("\nColumns most frequently missing in these rows:")
col_missing = rows_greater_than_null_perc.isna().sum().sort_values(ascending=False)
display(col_missing.loc[col_missing.gt(0)])

# Alias read by the cell that drops these rows; shown as the cell's output.
high_missing_suicide = rows_greater_than_null_perc
high_missing_suicide


total rows: 5928
Rows with >25% nulls: 310 (5.23%)


Columns most frequently missing in these rows:


InflationRate                310
GNIPerCapita                 310
GNI                          310
GDPPerCapita                 310
GDP                          310
EmploymentPopulationRatio    296
Population                   296
DeathRatePer100K               4
StdDeathRate                   4
dtype: int64

Unnamed: 0,RegionCode,RegionName,CountryCode,CountryName,Year,Sex,SuicideCount,CauseSpecificDeathPercentage,StdDeathRate,DeathRatePer100K,Population,GDP,GDPPerCapita,GNI,GNIPerCapita,InflationRate,EmploymentPopulationRatio
690,EU,Europe,BIH,Bosnia and Herzegovina,1991,Male,457,2.686655,19.391724,20.268772,4502386.0,,,,,,37.430
691,EU,Europe,BIH,Bosnia and Herzegovina,1991,Female,74,0.541331,3.270640,3.268984,4502386.0,,,,,,37.430
1525,NAC,North America and the Caribbean,CUB,Cuba,2021,Male,1476,1.604104,18.024194,26.273074,11256372.0,,,,,,53.125
1526,NAC,North America and the Caribbean,CUB,Cuba,2021,Female,322,0.425583,3.761503,5.649540,11256372.0,,,,,,53.125
2034,EU,Europe,EST,Estonia,1991,Male,319,3.321187,41.456444,43.689533,1561314.0,,,,,,65.767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5923,AF,Africa,MYT,Mayotte,2015,Female,0,0.000000,0.000000,0.000000,,,,,,,
5924,AF,Africa,REU,R?union,2016,Male,68,2.636681,15.399045,16.144005,,,,,,,
5925,AF,Africa,REU,R?union,2016,Female,17,0.796253,3.521507,3.790134,,,,,,,
5926,AF,Africa,REU,R?union,2015,Male,79,3.212688,18.702514,18.895318,,,,,,,


In [71]:
# Drop every row whose share of missing columns exceeds `null_percent`.
#
# The rows are selected from the *current* frame rather than from the index
# labels stored in `high_missing_suicide`: this cell resets the index, so on a
# second run those stored labels would point at different, well-populated rows
# and silently delete them. Rebuilding the mask makes a re-run drop 0 rows.
row_null_share = suicide_data.isnull().sum(axis=1) / suicide_data.shape[1] * 100
indexes_to_drop = suicide_data.index[row_null_share > null_percent].tolist()

suicide_data = suicide_data.drop(indexes_to_drop).reset_index(drop=True)
print(f"Dropped {len(indexes_to_drop)} rows. Remaining rows: {len(suicide_data)}")


Dropped 310 rows. Remaining rows: 5618


In [72]:
# GDP gets its own check: count the rows where it is still missing.
gdp_is_null = suicide_data['GDP'].isna()
gdp_null_count = gdp_is_null.sum()
print(f"Number of null values in GDP column: {gdp_null_count}")

# Keep the affected rows around for inspection, then remove them from the frame.
gdp_null_rows = suicide_data.loc[gdp_is_null]
indexes_to_drop = gdp_null_rows.index.tolist()
suicide_data = suicide_data.drop(index=indexes_to_drop).reset_index(drop=True)
print(f"Dropped {len(indexes_to_drop)} rows. Remaining rows: {len(suicide_data)}")



Number of null values in GDP column: 52
Dropped 52 rows. Remaining rows: 5566


In [75]:
# After the drops above, measure what is still missing.
null_mask = suicide_data.isna()
remaining_null_count = null_mask.to_numpy().sum()
print(f"Total remaining null values in the dataset: {remaining_null_count}")

# Rows that still contain at least one null.
remaining_null_rows = suicide_data.loc[null_mask.any(axis=1)]
display(remaining_null_rows)

# Columns that still contain at least one null (in original column order).
remaining_null_columns = suicide_data.columns[null_mask.any(axis=0)].tolist()
display(remaining_null_columns)

Total remaining null values in the dataset: 1394


Unnamed: 0,RegionCode,RegionName,CountryCode,CountryName,Year,Sex,SuicideCount,CauseSpecificDeathPercentage,StdDeathRate,DeathRatePer100K,Population,GDP,GDPPerCapita,GNI,GNIPerCapita,InflationRate,EmploymentPopulationRatio
180,CSA,Central and South America,ARG,Argentina,1991,Male,1459,1.031992,9.805622,9.011736,33105763.0,1.900000e+11,5730.723810,1.280000e+11,7740.0,,56.747
181,CSA,Central and South America,ARG,Argentina,1991,Female,502,0.445751,2.957554,2.992192,33105763.0,1.900000e+11,5730.723810,1.280000e+11,7740.0,,56.747
182,CSA,Central and South America,ARG,Argentina,1992,Male,1699,1.176186,11.201349,10.359882,33568285.0,2.290000e+11,6815.329330,2.040000e+11,8540.0,,56.867
183,CSA,Central and South America,ARG,Argentina,1992,Female,510,0.440784,2.963026,2.996263,33568285.0,2.290000e+11,6815.329330,2.040000e+11,8540.0,,56.867
186,CSA,Central and South America,ARG,Argentina,1993,Male,1755,1.191891,11.455239,10.561155,34027240.0,2.370000e+11,6957.417499,2.420000e+11,9390.0,,55.269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5561,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,1995,Female,0,0.000000,0.000000,0.000000,68398.0,5.772815e+08,8440.034526,5.587536e+08,14340.0,,
5562,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,1998,Male,0,0.000000,0.000000,0.000000,72572.0,7.278593e+08,10029.477750,6.789197e+08,16940.0,,
5563,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,1998,Female,0,0.000000,0.000000,0.000000,72572.0,7.278593e+08,10029.477750,6.789197e+08,16940.0,,
5564,AF,Africa,SYC,Seychelles,2021,Male,7,1.455301,13.708896,13.885307,99258.0,1.286687e+09,12963.058980,1.311264e+09,28840.0,,


['CauseSpecificDeathPercentage',
 'StdDeathRate',
 'DeathRatePer100K',
 'GNI',
 'GNIPerCapita',
 'InflationRate',
 'EmploymentPopulationRatio']

In [None]:
cols_to_fill = remaining_null_columns


def impute_columns(df, group_col='country', time_col='year', cols=cols_to_fill):
    """Fill missing values in ``cols`` and flag every cell that was filled.

    Each requested column that exists in ``df`` is filled in stages; a later
    stage only touches cells the earlier ones left empty:

    1. linear interpolation inside each group, rows ordered by ``time_col``
       (numeric columns, needs both the group and the time column),
    2. forward- then backward-fill inside each group,
    3. the group median (numeric columns),
    4. the column-wide median, or the most frequent value for a non-numeric
       column.

    Before those stages, a missing ``DeathRatePer100K`` is computed as
    ``suicides_no / population * 100000`` when the frame has both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. A copy is made; the argument is not modified.
    group_col : str or list of str, default 'country'
        Column(s) that identify one series. Group-wise stages run only when
        every listed column is present in ``df``.
    time_col : str, default 'year'
        Column used to order rows inside a group before interpolating.
    cols : list of str
        Columns to impute; names absent from ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by the group/time columns that exist (index
        labels are kept), with the gaps filled and one boolean
        ``<col>_imputed`` column per imputed column.

    Warns
    -----
    UserWarning
        When the group or time column is missing, because the result then
        degrades towards a plain column-wide median.
    """
    import warnings  # local import: only used for the notice below

    df = df.copy()

    # Accept a single grouping column or several.
    if group_col is None:
        group_cols = []
    elif isinstance(group_col, str):
        group_cols = [group_col]
    else:
        group_cols = list(group_col)
    has_group = bool(group_cols) and all(key in df.columns for key in group_cols)
    has_time = time_col in df.columns

    if not has_group or not has_time:
        warnings.warn(
            f"impute_columns: group_col={group_col!r} and/or time_col={time_col!r} "
            "not found in the frame; the group-wise steps that need them are skipped.",
            stacklevel=2,
        )

    # Order rows so that interpolation and ffill/bfill walk each series in time order.
    sort_keys = (group_cols if has_group else []) + ([time_col] if has_time else [])
    if sort_keys:
        df = df.sort_values(sort_keys, kind='stable')

    target_cols = [c for c in cols if c in df.columns]

    # Remember what was missing *before* anything is filled, so that values
    # derived in step 1 are still reported as imputed at the end.
    originally_null = {c: df[c].isnull() for c in target_cols}

    # 1) try to compute DeathRatePer100K if possible
    # NOTE(review): the suicide frame loaded in this notebook names these columns
    # 'SuicideCount' / 'Population', so this step is skipped for it. Its 'Population'
    # value is identical on the Male and Female rows of a country-year, so it looks
    # like a whole-country figure -- confirm before using it as the denominator of a
    # per-sex rate.
    if 'DeathRatePer100K' in target_cols and {'suicides_no', 'population'}.issubset(df.columns):
        mask = (
            df['DeathRatePer100K'].isnull()
            & df['suicides_no'].notnull()
            & df['population'].notnull()
            & (df['population'] != 0)
        )
        df.loc[mask, 'DeathRatePer100K'] = (
            df.loc[mask, 'suicides_no'] / df.loc[mask, 'population']
        ) * 100000

    for c in target_cols:
        is_numeric = pd.api.types.is_numeric_dtype(df[c])

        if has_group:
            # transform() returns a result aligned to df's own index, whereas
            # groupby(...).apply() prepends the group keys on pandas >= 2.0 and
            # can no longer be assigned back. fillna(<result>) only writes into
            # the gaps, so rows whose group key is null keep their values.
            if is_numeric and has_time:
                interpolated = df.groupby(group_cols)[c].transform(
                    lambda s: s.interpolate(method='linear', limit_direction='both')
                )
                df[c] = df[c].fillna(interpolated)

            # forward/back fill within group
            carried = df.groupby(group_cols)[c].transform(lambda s: s.ffill().bfill())
            df[c] = df[c].fillna(carried)

            # fill remaining with group median
            if is_numeric:
                df[c] = df[c].fillna(df.groupby(group_cols)[c].transform('median'))

        # final fallback: global median (for numeric) or mode (if somehow non-numeric)
        if is_numeric:
            global_fill = df[c].median()
        else:
            mode = df[c].mode(dropna=True)
            global_fill = mode.iloc[0] if not mode.empty else ""
        df[c] = df[c].fillna(global_fill)

        # flag imputed rows (originally null and now filled); keep flags from a
        # previous run so re-executing the cell does not reset them to False
        flag_col = f"{c}_imputed"
        earlier_flags = df[flag_col].eq(True) if flag_col in df.columns else False
        df[flag_col] = (originally_null[c] & df[c].notnull()) | earlier_flags

    # report summary
    summary = {c: int(df[f"{c}_imputed"].sum()) for c in target_cols}
    print("Imputation summary (counts filled):")
    display(summary)
    return df


# Apply to suicide_data.
# The frame's columns are 'CountryName' / 'Year' (not the function defaults), and it
# holds one Male and one Female row per country-year, so each (country, sex) pair is
# imputed as its own time series instead of mixing the two sexes.
suicide_data = impute_columns(
    suicide_data,
    group_col=['CountryName', 'Sex'],
    time_col='Year',
    cols=cols_to_fill,
)
