Author: Niamh Hogan

# Suicide Mortality in Ireland: Demographic Trends and EU Comparison (2012–2019)

In [116]:
# Imports

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


## <b>Data Cleansing</b>

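In this section, the four raw datasets are read in, trimmed to the columns and rows needed for the analysis, and converted to appropriate data types.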

<b>Step 1: Read in datasets</b>

In [117]:
# Load the CSO file of Irish deaths by year, age group and sex
irish_age_sex_df = pd.read_csv(filepath_or_buffer='./data/irishdata_year_age_sex_cso.csv')

# Preview the first three rows to confirm the file parsed as expected
irish_age_sex_df.head(n=3)

Unnamed: 0,Statistic Label,Year,Sex,Cause of Death,Age Group at Death,UNIT,VALUE
0,Revised Deaths Occurring,2007,Both sexes,X60-X84 Intentional self-harm,Under 1 year,Number,
1,Revised Deaths Occurring,2007,Both sexes,X60-X84 Intentional self-harm,1 - 4 years,Number,
2,Revised Deaths Occurring,2007,Both sexes,X60-X84 Intentional self-harm,5 - 9 years,Number,


In [118]:
# Load the CSO file of Irish deaths by year, county and sex
irish_counties_df = pd.read_csv(filepath_or_buffer='./data/irishdata_year_counties_sex_cso.csv')

# Preview the first three rows to confirm the file parsed as expected
irish_counties_df.head(n=3)

Unnamed: 0,Statistic Label,Year,Sex,County,Cause of Death,UNIT,VALUE
0,Deaths Occuring,2015,Both sexes,Ireland,Intentional self-harm (X60-X84),Number,500.0
1,Deaths Occuring,2015,Both sexes,Carlow County Council,Intentional self-harm (X60-X84),Number,7.0
2,Deaths Occuring,2015,Both sexes,Dublin City Council,Intentional self-harm (X60-X84),Number,54.0


In [119]:
# Load the WHO deaths file (who_eu_deaths.csv).
# The first 30 lines of the file are skipped before parsing
# (presumably a metadata preamble above the header row - confirm against the raw file).
# low_memory=False makes pandas read the file in one pass instead of in chunks.
eu_deaths_df = pd.read_csv(
    './data/who_eu_deaths.csv',
    low_memory=False,
    skiprows=30,
)

# Preview the first three rows to confirm the file parsed as expected
eu_deaths_df.head(n=3)

Unnamed: 0,COUNTRY,COUNTRY_GRP,AGE_GRP_LIST,SEX,SUBNATIONAL_MDB,YEAR,VALUE
0,ALB,,TOTAL,FEMALE,,1987.0,25.0
1,ALB,,TOTAL,FEMALE,,1988.0,22.0
2,ALB,,TOTAL,FEMALE,,1989.0,15.0


In [120]:
# Load the EU population file (eu_pop_2012_2022.csv)
eu_pop_df = pd.read_csv(filepath_or_buffer='./data/eu_pop_2012_2022.csv')

# Preview the first three rows to confirm the file parsed as expected
eu_pop_df.head(n=3)

Unnamed: 0,Time,geo,Value,age,sex,unit
0,2012,AT,8408121,TOTAL,T,NR
1,2012,BE,11075889,TOTAL,T,NR
2,2012,BG,7327224,TOTAL,T,NR


<b>Step 2: Drop Unnecessary Columns</b>

In [121]:
# irish_age_sex_df: columns regarded as unnecessary for this analysis
drop_col_list1 = [
    "Statistic Label",
    "Cause of Death",
    "UNIT",
]

# Rebind the name to the frame with those columns removed
irish_age_sex_df = irish_age_sex_df.drop(columns=drop_col_list1)

# Show the first three rows to confirm which columns remain
print(irish_age_sex_df.head(n=3))

   Year         Sex Age Group at Death  VALUE
0  2007  Both sexes       Under 1 year    NaN
1  2007  Both sexes        1 - 4 years    NaN
2  2007  Both sexes        5 - 9 years    NaN


In [122]:
# irish_counties_df: columns regarded as unnecessary for this analysis
drop_col_list2 = [
    "Statistic Label",
    "Cause of Death",
    "UNIT",
]

# Rebind the name to the frame with those columns removed
irish_counties_df = irish_counties_df.drop(columns=drop_col_list2)

# Show the first three rows to confirm which columns remain
print(irish_counties_df.head(n=3))

   Year         Sex                 County  VALUE
0  2015  Both sexes                Ireland  500.0
1  2015  Both sexes  Carlow County Council    7.0
2  2015  Both sexes    Dublin City Council   54.0


In [123]:
# eu_deaths_df: columns regarded as unnecessary for this analysis
drop_col_list3 = [
    "COUNTRY_GRP",
    "AGE_GRP_LIST",
    "SUBNATIONAL_MDB",
]

# Rebind the name to the frame with those columns removed
eu_deaths_df = eu_deaths_df.drop(columns=drop_col_list3)

# Show the first three rows to confirm which columns remain
print(eu_deaths_df.head(n=3))

  COUNTRY     SEX    YEAR  VALUE
0     ALB  FEMALE  1987.0   25.0
1     ALB  FEMALE  1988.0   22.0
2     ALB  FEMALE  1989.0   15.0


In [124]:
# eu_pop_df: columns regarded as unnecessary for this analysis
drop_col_list4 = [
    "age",
    "sex",
    "unit",
]

# Rebind the name to the frame with those columns removed
eu_pop_df = eu_pop_df.drop(columns=drop_col_list4)

# Show the first three rows to confirm which columns remain
print(eu_pop_df.head(n=3))

   Time geo     Value
0  2012  AT   8408121
1  2012  BE  11075889
2  2012  BG   7327224


<b>Step 3: Dropping Unnecessary Rows</b>

In [125]:
# eu_deaths_df: restrict to EU member states

# The 27 three-letter country codes used to identify EU member states
eu_members = [
    "AUT", "BEL", "BGR", "HRV", "CYP", "CZE", "DNK", "EST", "FIN",
    "FRA", "DEU", "GRC", "HUN", "IRL", "ITA", "LVA", "LTU", "LUX",
    "MLT", "NLD", "POL", "PRT", "ROU", "SVK", "SVN", "ESP", "SWE",
]

# Keep only the rows whose COUNTRY code appears in eu_members
eu_deaths_df = eu_deaths_df.loc[eu_deaths_df["COUNTRY"].isin(eu_members)]

# List the remaining country codes in alphabetical order
countries = sorted(eu_deaths_df["COUNTRY"].unique())
print(countries)

['AUT', 'BEL', 'BGR', 'CYP', 'CZE', 'DEU', 'DNK', 'ESP', 'EST', 'FIN', 'FRA', 'GRC', 'HRV', 'HUN', 'IRL', 'ITA', 'LTU', 'LUX', 'LVA', 'MLT', 'NLD', 'POL', 'PRT', 'ROU', 'SVK', 'SVN', 'SWE']


In [126]:
# eu_deaths_df: restrict to the study period

# Keep rows whose YEAR lies in 2012-2019 (both endpoints included)
eu_deaths_df = eu_deaths_df.loc[eu_deaths_df["YEAR"].between(2012, 2019)]

print(eu_deaths_df.head(n=3))

    COUNTRY     SEX    YEAR  VALUE
110     AUT  FEMALE  2012.0  289.0
111     AUT  FEMALE  2013.0  324.0
112     AUT  FEMALE  2014.0  324.0


<b>Step 4: Checking & Converting data types </b>

In [127]:
# irish_age_sex_df: inspect column dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() appends a stray "None" line.
irish_age_sex_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Year                960 non-null    int64  
 1   Sex                 960 non-null    object 
 2   Age Group at Death  960 non-null    object 
 3   VALUE               782 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 30.1+ KB
None


In [128]:
# Cast VALUE to pandas' nullable integer dtype so that missing counts stay as <NA>
irish_age_sex_df["VALUE"] = irish_age_sex_df["VALUE"].astype(pd.Int64Dtype())

In [129]:
# List the distinct age-group labels present before converting them
print(pd.unique(irish_age_sex_df["Age Group at Death"]))

['Under 1 year' '1 - 4 years' '5 - 9 years' '10 - 14 years'
 '15 - 19 years' '20 - 24 years' '25 - 29 years' '30 - 34 years'
 '35 - 39 years' '40 - 44 years' '45 - 49 years' '50 - 54 years'
 '55 - 59 years' '60 - 64 years' '65 - 69 years' '70 - 74 years'
 '75 - 79 years' '80 - 84 years' '85 years and over' 'All ages']


In [130]:
# Convert an "Age Group at Death" label to an integer
def age_to_int(age_str):
    """Return the lower bound of an age-group label as an integer.

    Parameters
    ----------
    age_str : str
        An age-group label such as 'Under 1 year', '5 - 9 years',
        '85 years and over' or 'All ages'. Leading and trailing whitespace
        is ignored. Missing values (None / NaN / pd.NA) are accepted.

    Returns
    -------
    int or float
        0 for 'Under 1 year'; the leading number for 'A - B years' and
        'N years and over' labels; np.nan for 'All ages' and for missing
        values, since neither has a single numeric age.

    Raises
    ------
    ValueError
        If the label is none of the above and does not start with an integer.
    """
    # Missing entries have no label to parse; pass them through as NaN.
    if pd.isna(age_str):
        return np.nan

    # Normalise whitespace so labels such as 'All ages ' still match exactly.
    label = age_str.strip()

    if label == 'All ages':
        return np.nan
    if label == 'Under 1 year':
        return 0
    if 'and over' in label:
        # e.g. '85 years and over' -> 85
        return int(label.split()[0])
    # e.g. '5 - 9 years' -> 5
    return int(label.split(' - ')[0])

# Replace each age-group label with the value returned by age_to_int; the
# nullable Int64 dtype keeps the missing value produced for 'All ages'
irish_age_sex_df["Age Group at Death"] = (
    irish_age_sex_df["Age Group at Death"]
    .map(age_to_int)
    .astype('Int64')
)


In [131]:
# Converting 'All ages' to a midpoint value.
# These rows hold <NA> at this point because age_to_int returns NaN for 'All ages'.
# NOTE(review): 'All ages' presumably denotes a total across every age band rather
# than an age, so 42 acts as a sentinel here - confirm downstream code treats it so.
all_ages_midpoint = 42

# Substitute the midpoint for every <NA> entry in the column
irish_age_sex_df["Age Group at Death"] = (
    irish_age_sex_df["Age Group at Death"].fillna(value=all_ages_midpoint)
)

# Confirm that no <NA> values remain
print(irish_age_sex_df["Age Group at Death"].unique())

<IntegerArray>
[0, 1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 42]
Length: 20, dtype: Int64


In [132]:
# Re-check irish_age_sex_df dtypes after the conversions.
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print(), which would add a stray "None" line to the output.
irish_age_sex_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year                960 non-null    int64 
 1   Sex                 960 non-null    object
 2   Age Group at Death  960 non-null    Int64 
 3   VALUE               782 non-null    Int64 
dtypes: Int64(2), int64(1), object(1)
memory usage: 32.0+ KB
None


In [133]:
# irish_counties_df: inspect column dtypes and non-null counts.
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print(), which would add a stray "None" line to the output.
irish_counties_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    768 non-null    int64  
 1   Sex     768 non-null    object 
 2   County  768 non-null    object 
 3   VALUE   745 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 24.1+ KB
None


In [134]:
# eu_deaths_df: inspect column dtypes and non-null counts.
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print(), which would add a stray "None" line to the output.
eu_deaths_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 606 entries, 110 to 5878
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   COUNTRY  606 non-null    object 
 1   SEX      606 non-null    object 
 2   YEAR     606 non-null    float64
 3   VALUE    606 non-null    float64
dtypes: float64(2), object(2)
memory usage: 23.7+ KB
None


In [135]:
# eu_pop_df: inspect column dtypes and non-null counts.
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print(), which would add a stray "None" line to the output.
eu_pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Time    297 non-null    int64 
 1   geo     297 non-null    object
 2   Value   297 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.1+ KB
None


<b>Step 5: Set Indexes</b>

<b>Step 6: Sort Columns</b>

<b>Step 7: Save Cleaned Data</b>