In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import utils
# Notebook-wide display settings: show up to 200 columns and render floats
# with ten decimal places instead of scientific notation.
pd.options.display.max_columns = 200
pd.options.display.float_format = '{:.10f}'.format
# Use the ggplot look for every matplotlib figure below.
plt.style.use('ggplot')


In [None]:
# Load the raw suicide dataset from the local datasets directory.
df = pd.read_csv(filepath_or_buffer='datasets/suicide_ds_2016.csv')

In [None]:
# Print a structural summary of the raw frame.
df.info()
# NOTE(review): lists the names exposed by the local `utils` module; this looks
# like leftover exploration -- confirm whether the output is still wanted.
dir(utils)

In [None]:
# Peek at the first five rows of the raw data.
df.head(n=5)

In [None]:
# Peek at the last five rows of the raw data.
df.tail(n=5)

In [None]:
# Number of distinct values in every column.
df.nunique(axis=0)

In [None]:
# Number of missing values in every column.
df.isnull().sum(axis=0)

# Tidy data
- Drop columns:
    - remove HDI for year
    - remove country-year
- Rename columns:
    - 'suicides/100k pop'  => suicides_per_100k_pop
    - ' gdp_for_year ($) ' => gdp_for_year
    - 'gdp_per_capita ($)' => gdp_per_capita
- Remove data points:
    - remove data from 2016 due to incompleteness

In [None]:
# 'HDI for year' has a lot of missing data and 'country-year' is not needed,
# so both columns are dropped.
# NOTE: 'generation' is also considered controversial in the Kaggle examples,
# but it is kept here.
COLUMNS_TO_DROP = ['HDI for year', 'country-year']

# Rename columns to snake_case and strip the currency suffix.
COLUMN_RENAMES = {
    'suicides/100k pop': 'suicides_per_100k_pop',
    # The leading and trailing spaces in the key below are intended!
    ' gdp_for_year ($) ': 'gdp_for_year',
    'gdp_per_capita ($)': 'gdp_per_capita',
}

tidy_df = df.drop(columns=COLUMNS_TO_DROP).rename(columns=COLUMN_RENAMES)
tidy_df.columns


In [None]:
# Find (country, year) groups with missing rows: count the non-null entries of
# every column per group ...
counts = tidy_df.groupby(['country', 'year']).count()
# ... and keep only the groups in which every column has fewer than 12 entries.
# (The result is not the cell's last expression, so it is not displayed.)
counts.mask(counts >= 12).dropna()
# For 16 countries there is data missing in 2016
# => remove that year to prevent misinterpretation.
tidy_df = tidy_df.loc[tidy_df['year'].ne(2016)]

In [None]:
# One row per (country, year) pair present in the data, key columns only.
country_year_pairs = (
    tidy_df.groupby(['country', 'year'])
    .count()
    .reset_index()
    .loc[:, ['country', 'year']]
)
# Per country: how many years have data (the count lives in the 'year' column).
country_years = country_year_pairs.groupby('country').count()
# Bar chart of how many countries have a given number of years with data.
ax = country_years.value_counts(sort=False).plot.bar()
ax.set_xlabel('#years with data')
ax.set_ylabel('frequency')

plot above shows available data.  
31 is max (1985 to 2015)  
What's a reasonable cutoff?  
10 years?

In [None]:
# Collect the names of countries that have at most MIN_DATA_YEARS years of
# data available (the comparison below is inclusive).
MIN_DATA_YEARS = 3
years_per_country = country_years.reset_index()
has_too_few_years = years_per_country['year'] <= MIN_DATA_YEARS
names = years_per_country.loc[has_too_few_years, 'country'].values
names

In [None]:
# Drop every row whose country is in `names`: build the membership mask with
# .isin and keep only the rows where it is False.
is_sparse_country = tidy_df['country'].isin(names)
tidy_df = tidy_df.loc[~is_sparse_country]
tidy_df.shape

## Add continent info to data
uses the Python `pycountry_convert` package

In [None]:
import pycountry_convert as pc
from pycountry import pycountry
# # [c.name for c in pycountry.countries]
# Country spellings used in the dataset, mapped to the spelling that is passed
# to the pycountry_convert lookups instead.
name_map = dict([
    ('Saint Vincent and Grenadines', 'Saint Vincent and the Grenadines'),
    ('Republic of Korea', 'Korea, Republic of'),
])


# def country_name_to_continent_code(country):
#     if country in name_map:
#         country = name_map[country]
#     country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
#     # print(country_code)
#     continent_code = pc.country_alpha2_to_continent_code(country_code)
#     # print(continent_name)
#     return pc.convert_continent_code_to_continent_name(continent_code)

# tidy_df['continent'] = [country_name_to_continent_code(country) for country in tidy_df.country]
# tidy_df

In [None]:
# Number of distinct years with data for each country in tidy_df,
# sorted from fewest to most recorded years.
recorded_years_per_country = (
    tidy_df.groupby('country')['year']
    .nunique()
    .rename('recorded_years')
    .reset_index()
    .sort_values(by='recorded_years')
)
# ISO alpha-3 code for every country; name_map substitutes the spellings that
# are looked up instead of the dataset's own country names.
recorded_years_per_country['country_code'] = [
    pc.country_name_to_country_alpha3(name_map.get(c, c))
    for c in recorded_years_per_country.country
]

import geopandas
# NOTE(review): `geopandas.datasets` is deprecated in recent geopandas
# releases -- confirm the installed version still ships this dataset.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

# The bundled Natural Earth table carries the placeholder '-99' in `iso_a3`
# for some countries (France and Norway among them). Without the real codes
# those rows never match in the merge below and would be drawn as having
# 0 recorded years, so patch them -- only where the placeholder is present.
ISO_A3_FIXES = {'France': 'FRA', 'Norway': 'NOR'}
for world_name, iso_code in ISO_A3_FIXES.items():
    needs_fix = (world['name'] == world_name) & (world['iso_a3'] == '-99')
    world.loc[needs_fix, 'iso_a3'] = iso_code

# Left merge keeps every country shape, including those without any data.
merged_df = pd.merge(
    world,
    recorded_years_per_country,
    left_on='iso_a3',
    right_on='country_code',
    how='left',
)

# Countries absent from the suicide data get 0 recorded years instead of NaN.
merged_df['recorded_years'] = merged_df['recorded_years'].fillna(0)
merged_df

## Plot showing the number of recorded years per country on a worldmap

In [None]:

# Choropleth: colour every country by its number of recorded years.
choropleth_options = dict(
    column='recorded_years',
    cmap='viridis',
    legend=True,
    figsize=(15, 5),
)
merged_df.plot(**choropleth_options)


# Plot suicide per 100k rates for country, year etc
Confused about the per-100k column: how should it be aggregated to get a per-year value? Taking the mean does not seem to match the visualizations on Kaggle.  
=> Not possible. Instead, recalculate the rate by summing population and suicides_no over all categories, and then take the mean over the years.

In [None]:
def add_per100k(df):
    """Add a 'per100k' entry holding the suicide rate per 100,000 people.

    The rate is computed as ``suicides_no / population * 100_000`` from the
    entries of the same name in ``df``.

    Args:
        df: object providing 'suicides_no' and 'population' entries and
            supporting item assignment. It is modified in place.

    Returns:
        The same ``df`` object, now carrying the 'per100k' entry.
    """
    rate = df['suicides_no'] / df['population']
    df['per100k'] = rate * 100_000
    return df

In [None]:
# Sum population and suicides over all rows of each (country, year) group,
# then derive the yearly rate per 100k from those totals.
by_country_and_year = tidy_df.groupby(['country', 'year'])[
    ['population', 'suicides_no']].sum(numeric_only=True)
add_per100k(by_country_and_year)

# All-time average of the yearly rate per country. The column is selected
# explicitly: GroupBy.mean takes no column argument -- its first positional
# parameter is `numeric_only`, so `.mean('per100k')` only worked by accident.
country_alltime_avg = by_country_and_year.groupby('country')['per100k'].mean()
print(country_alltime_avg.filter(items=['Germany']))

# Horizontal bar chart of the 20 countries with the highest average rate.
country_alltime_avg = country_alltime_avg.sort_values()
country_alltime_avg.tail(20).plot.barh()


In [None]:
# ger_2015 = tidy_df[tidy_df.year == 2015][tidy_df.country == 'Germany'][['population','suicides_no']].sum(numeric_only=True)
# ger_2015['per100k'] = ((ger_2015.suicides_no / ger_2015.population) * 100_000)
# ger_2015

In [None]:
# Total population and suicides over all rows of 1995, plus the resulting
# rate per 100k.
rows_1995 = tidy_df.loc[tidy_df['year'] == 1995, ['population', 'suicides_no']]
avg_1995 = add_per100k(rows_1995.sum())
avg_1995


# NOTE
The reported values on Kaggle are not averages of the per-country per-100k rates but rather the ratio: sum of all suicides / sum of all population.

In [None]:
# Line plot of the per-100k rate across all countries for each year.
# NOTE(review): tidy_df is replaced here by whatever utils.load_suicide_data()
# returns -- presumably the cleaned frame built above; verify against utils.
tidy_df = utils.load_suicide_data()

# Yearly totals over all countries, then the rate derived from those totals.
avg_per_year = add_per100k(
    tidy_df.groupby(['year'])[['population', 'suicides_no']].sum(numeric_only=True)
)
yearly_rate = avg_per_year.per100k

fig, ax = plt.subplots(figsize=(10, 5))  # for size of figure
t = sns.lineplot(data=yearly_rate, markers=True, ax=ax, marker="o")
# Label every second year on the x axis.
ticks = np.arange(1985, 2016, 2)
t.set_xticks(ticks, labels=ticks)
# Dashed reference line at the mean of the yearly rates.
t.axhline(yearly_rate.mean(), linestyle='--', color='blue')


## group trends separated by gender

In [None]:
# Total population and suicides per sex, and the rate derived from the totals.
avg_by_sex = add_per100k(
    tidy_df.groupby('sex')[['population', 'suicides_no']].sum(numeric_only=True)
)
# One bar per sex, with a y tick at every integer from 0 to 21.
p = avg_by_sex['per100k'].plot.bar()
p.set_yticks(np.arange(22))
p.set_ylabel('Suicides per 100k')

In [None]:
# Same comparison as a trend over time: rate per (sex, year) from group totals.
avg_by_sex_per_year = add_per100k(
    tidy_df.groupby(['sex', 'year'])[['population', 'suicides_no']].sum(numeric_only=True)
).reset_index()
male = avg_by_sex_per_year.loc[avg_by_sex_per_year['sex'] == 'male']
female = avg_by_sex_per_year.loc[avg_by_sex_per_year['sex'] == 'female']

# Three panels: both sexes together, male only, female only.
fig, ax = plt.subplots(1, 3, figsize=(12, 5))
yticks = np.arange(5, 25, 2)  # identical y ticks on every panel
line_options = dict(x="year", y="per100k", markers=True, marker="o")
t1 = sns.lineplot(data=avg_by_sex_per_year, ax=ax[0], hue="sex", **line_options)
t1.set_yticks(yticks)
t2 = sns.lineplot(data=male, ax=ax[1], color='blue', **line_options)
t2.set_yticks(yticks)
t3 = sns.lineplot(data=female, ax=ax[2], **line_options)
t3.set_yticks(yticks)

## analyze country trends

In [None]:
# Yearly rate per 100k for every country, computed from the summed population
# and suicide counts of each (country, year) group.
country_year_totals = add_per100k(
    tidy_df.groupby(['country', 'year'])[['population', 'suicides_no']].sum(numeric_only=True)
)
# Flatten the group index and keep only the columns needed for the trend fit.
by_country_and_year = country_year_totals.reset_index().loc[
    :, ['country', 'year', 'per100k']]


def trendline(x, y, order=1):
    """Return the linear coefficient of a polynomial least-squares fit.

    Args:
        x: x coordinates of the sample points.
        y: y coordinates of the sample points.
        order: degree of the fitted polynomial (default 1, a straight line).

    Returns:
        The second-to-last fitted coefficient as a float. For the default
        ``order=1`` this is the slope of the fitted line.
    """
    coefficients = np.polyfit(x, y, order)
    # np.polyfit orders coefficients from highest degree down, so the
    # second-to-last entry belongs to the x**1 term.
    linear_coefficient = coefficients[-2]
    return float(linear_coefficient)

def apply_trendline(df):
    """Fit a trend to the 'per100k' values of ``df`` over its 'year' values.

    Args:
        df: frame with 'year' and 'per100k' columns.

    Returns:
        The value returned by ``trendline`` for those two columns.
    """
    years = df['year'].to_numpy()
    rates = df['per100k'].to_numpy()
    return trendline(years, rates)

# Trend value per country, ordered from the largest value down to the smallest.
trends_ascending = by_country_and_year.groupby('country').apply(apply_trendline).sort_values()
trends = trends_ascending.iloc[::-1]
# One dot per country: trend value on the x axis, country name on the y axis.
fig, ax = plt.subplots(figsize=(3, 12))
sns.scatterplot(x=trends.values, y=trends.index, ax=ax)


In [None]:
# `trends` is ordered from largest to smallest value, so its first six entries
# are the strongest increases and its last six the strongest decreases.
strongest_increase = trends.index[:6]
strongest_decrease = trends.index[-6:]

# Yearly rate curves for the six countries at the top of `trends`.
top_n_incr_trends = by_country_and_year.loc[
    by_country_and_year['country'].isin(strongest_increase)]
sns.relplot(data=top_n_incr_trends, x='year', y='per100k', col='country', col_wrap=2, kind='line')

# Yearly rate curves for the six countries at the bottom of `trends`.
top_n_decr_trends = by_country_and_year.loc[
    by_country_and_year['country'].isin(strongest_decrease)]
sns.relplot(data=top_n_decr_trends, x='year', y='per100k', col='country', col_wrap=2, kind='line')
