# Suicide analysis
### Lukas Forst

In [76]:
import pandas as pd
import numpy as np
import plotly as plt
import seaborn as sns

In [78]:
# Initialise plotly's notebook mode up front; without this call the figures
# below sometimes fail to render. Note that `plt` is the plotly package here
# (see the import cell), not matplotlib.pyplot.
plt.offline.init_notebook_mode(connected=True)

### Data insight

Now let's look at the data.

In [341]:
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('san_data.csv')
# Preview a few random rows. The fixed seed keeps the displayed sample the
# same on every "Restart & Run All".
data.sample(5, random_state=0)

Unnamed: 0.1,Unnamed: 0,country,year,sex,age,suicides_no,population,gdp_for_year,gdp_per_capita,generation,continent
16705,16706,Mexico,2005,Male,25-34,956,9355305.0,877476200000.0,9071,Generation X,Americas
17130,17131,Netherlands,1999,Male,15-24,82,960000.0,441975300000.0,29799,Generation X,Europe
20846,20847,Russian Federation,1993,Male,55-74,10118,10681300.0,435083700000.0,3160,Silent,Europe
9390,9391,France,2014,Female,35-54,778,8608404.0,2852166000000.0,47318,Generation X,Europe
13611,13612,Japan,2012,Male,25-34,2244,7384000.0,6203213000000.0,51379,Millenials,Asia


We need to clean the data first: the `id` column, which pandas reads in as `Unnamed: 0`, carries no information, so we drop it.

In [342]:
# Remove the leftover id column by name pattern instead of by position, and
# rebind instead of mutating in place. Re-running this cell is then a no-op,
# whereas a positional in-place drop would remove the next real column
# (`country`, then `year`, ...) on every re-run.
data = data.loc[:, ~data.columns.str.startswith('Unnamed')]
# Fixed seed so the preview is reproducible.
data.sample(5, random_state=0)

Unnamed: 0,country,year,sex,age,suicides_no,population,gdp_for_year,gdp_per_capita,generation,continent
6903,Cyprus,2000,Female,25-34,0,71455.0,10183320000.0,11635,Generation X,Asia
16634,Mexico,1999,Male,25-34,726,8214958.0,600232900000.0,6800,Generation X,Americas
15939,Malta,2003,Male,35-54,4,57171.0,5456584000.0,14444,Boomers,Europe
7326,Czech Republic,2012,Female,55-74,106,1333013.0,207376400000.0,20898,Boomers,Europe
1134,Armenia,2006,Female,75+,2,78684.0,6384452000.0,2310,Silent,Asia


In [338]:
# Summary statistics of the data set; the rendered output lists the numeric
# columns only.
data.describe()

Unnamed: 0,year,suicides_no,population,gdp_for_year,gdp_per_capita
count,27660.0,27660.0,27660.0,27660.0,27660.0
mean,2001.173102,243.413485,1850689.0,447100900000.0,16815.56833
std,8.418754,904.490185,3920658.0,1457574000000.0,18861.585497
min,1985.0,0.0,278.0,46919620.0,251.0
25%,1994.0,3.0,97535.25,8976208000.0,3436.0
50%,2002.0,25.0,430725.0,48013610000.0,9283.0
75%,2008.0,132.0,1491041.0,260202400000.0,24796.0
max,2015.0,22338.0,43805210.0,18120710000000.0,126352.0


In [330]:
def group_yearly(df, subset=None, agg=None):
    '''
    Aggregate a data frame per year.

    Selects the `subset` columns of `df`, groups the rows by the `year`
    column and applies the `agg` mapping to every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `year` column plus the columns named in `subset`.
    subset : list of str, optional
        Columns to keep before grouping; it has to include `'year'`.
        Defaults to `['year', 'suicides_no', 'population']`.
    agg : dict, optional
        Column -> aggregation mapping handed to the group-by `agg` call.
        Defaults to summing `population` and `suicides_no`.

    Returns
    -------
    pandas.DataFrame
        One row per year (the year is the index), one column per key of `agg`.
    '''
    # Build the defaults per call so that no mutable list/dict object is
    # shared between calls through the function signature.
    if subset is None:
        subset = ['year', 'suicides_no', 'population']
    if agg is None:
        agg = {'population': 'sum', 'suicides_no': 'sum'}
    return df[subset].groupby('year').agg(agg)

Let's visualise yearly values.

In [331]:
# Total population and suicide count per year (group_yearly's default aggregation).
yearly = group_yearly(data)
# One row per year, so describe() summarises the spread of the yearly totals.
yearly.describe()

Unnamed: 0,population,suicides_no
count,31.0,31.0
mean,1651292000.0,217187.645161
std,291711100.0,42969.612753
min,1008600000.0,116063.0
25%,1539583000.0,207556.5
50%,1745247000.0,233408.0
75%,1850237000.0,243515.5
max,1997297000.0,256119.0


In [306]:
# The years (index of the yearly frame) as a plain array; used as the x axis of the plots.
years = yearly.index.values

Import the graphic tools.

In [307]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

And plot the yearly data.

In [308]:
# Suicide counts and population differ by orders of magnitude, so each series
# gets its own y axis on a shared x axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# summed suicides on the primary (left) axis
fig.add_trace(
    go.Scatter(x = years, y = yearly['suicides_no'], mode = 'lines', name = 'Suicides'),
    secondary_y=False
)
# summed population on the secondary (right) axis
fig.add_trace(
    go.Scatter(x = years, y = yearly['population'], mode = 'lines', name = 'Population'),
    secondary_y=True
)

fig.update_layout(title_text="Suicides and population")

fig.update_xaxes(title_text="years")

fig.update_yaxes(title_text="suicides count", secondary_y=False)
fig.update_yaxes(title_text="population", secondary_y=True)

fig.show()

Let's see some normalized data - suicides per 100 000 people, to see the trends.

In [311]:
def get_suicides_per_100k(df):
    '''
    Return the number of suicides per 100 000 inhabitants.

    `df` has to provide `suicides_no` and `population` entries; the result is
    computed element-wise as 100 000 * suicides_no / population.
    '''
    suicides = df['suicides_no']
    population = df['population']
    return suicides * 100_000 / population

In [312]:
# Rebuild the yearly totals and attach the normalised rate as an extra column.
yearly = group_yearly(data)
yearly['suicides_per_100k'] = get_suicides_per_100k(yearly)
years = yearly.index.values

# Single line chart: suicide rate over time.
fig = go.Figure(
    data=[
        go.Scatter(
            x=years,
            y=yearly['suicides_per_100k'],
            mode='lines',
            name='Suicides per 100 000 people',
        )
    ]
)

fig.update_layout(title_text="Suicides per 100 000 people")
fig.update_xaxes(title_text="years")
fig.update_yaxes(title_text="Suicides per 100 000 people")

fig.show()

**Graph insights**
We can see that from 1988 onwards there was an increase in suicides per capita, with a peak in 1995. Since then, the suicide rate has been decreasing again.

Now let's look at social groups, starting with genders.

In [314]:
# Yearly totals and rate per 100 000 people, computed separately for each sex.
males_yearly = group_yearly(data[data['sex'] == 'Male'])
males_yearly['suicides_per_100k'] = get_suicides_per_100k(males_yearly)

females_yearly = group_yearly(data[data['sex'] == 'Female'])
females_yearly['suicides_per_100k'] = get_suicides_per_100k(females_yearly)


fig = go.Figure()

# Each trace takes its x values from its own frame's index, so the cell does
# not depend on a `years` array computed from a different frame in another
# cell, and the x/y values are aligned by construction.
fig.add_trace(
    go.Scatter(
        x = males_yearly.index.values,
        y = males_yearly['suicides_per_100k'],
        mode = 'lines',
        name = 'Males'
    )
)

fig.add_trace(
    go.Scatter(
        x = females_yearly.index.values,
        y = females_yearly['suicides_per_100k'],
        mode = 'lines',
        name = 'Females'
    )
)

fig.update_layout(title_text="Suicides per 100 000 people")

fig.update_xaxes(title_text="years")
fig.update_yaxes(title_text="Suicides per 100 000 people")

fig.show()

There seems to be a significant difference between the male and the female suicide rate. I will investigate this trend further in the following chapters.

### Countries related

In [317]:
# Per-country totals over the whole period, the derived rate per 100 000
# people, ordered from the highest rate to the lowest.
countries_data = (
    data
    .groupby('country')
    .agg({'population': 'sum', 'suicides_no': 'sum'})
    .assign(suicides_per_100k=get_suicides_per_100k)
    .sort_values('suicides_per_100k', ascending=False)
)

# Plain (unweighted) mean of the per-country rates; used as the reference line in the plots.
countries_mean = countries_data['suicides_per_100k'].mean()

In [320]:
def plot_countries_rate(df, title):
    '''
    Draw a horizontal bar chart of suicide rates per country with a mean line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `country` column and a `suicides_per_100k` column,
        one row per bar.
    title : str
        Title of the figure.

    Returns
    -------
    The plotly figure; it is returned, not shown, so the caller decides how
    to display it.

    Notes
    -----
    The reference line reads the notebook-level `countries_mean`, so that
    value has to be computed before this function is called.
    '''
    fig = px.bar(df,
        y='country',
        x='suicides_per_100k',
        orientation='h', color='country',
        title=title
    )

    # Vertical reference line: the mean rate repeated once per row of `df`.
    # The length comes from `df` itself so the function does not depend on
    # any other notebook variable for its size.
    fig.add_trace(go.Scatter(
        x=np.repeat(countries_mean, len(df)),
        y=df['country'],
        line=go.scatter.Line(color="black"),
        hoverinfo='none'
    ))

    fig.update_layout(showlegend=False)

    return fig

Now let's look at some country-related data: graphs of the countries with the highest and the lowest suicide rates. The vertical line is the mean over all countries.

In [321]:
# Ten highest rates, highest first; reset_index() turns the `country` index
# into the column that plot_countries_rate plots on the y axis.
highest_rates = (
    countries_data['suicides_per_100k']
    .nlargest(10)
    .sort_values(ascending=False)
    .reset_index()
)
plot_countries_rate(
    highest_rates,
    'Top 10 Countries with highest average of suicide per 100 000 population'
)

In [322]:
# Ten lowest rates, ordered from the highest of them down to the lowest;
# reset_index() turns the `country` index into a regular column for plotting.
lowest_rates = (
    countries_data['suicides_per_100k']
    .nsmallest(10)
    .sort_values(ascending=False)
    .reset_index()
)
plot_countries_rate(
    lowest_rates,
    '10 Countries with smallest average of suicide per 100 000 population'
)

Interestingly, Dominica and Saint Kitts and Nevis have no recorded suicides whatsoever. This could mean that the data are missing some values, or that the people in these states are really happy with themselves.

In [323]:
# Countries whose overall suicide rate is exactly zero.
countries_data.loc[countries_data['suicides_per_100k'].eq(0)]

Unnamed: 0_level_0,population,suicides_no,suicides_per_100k
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dominica,66400.0,0,0.0
Saint Kitts and Nevis,117300.0,0,0.0


### Data correlation

Let's dig deeper into the data and check whether there is any correlation between the dataset's features.

In [348]:
# Pairwise correlation of the numeric features. The text columns (country,
# sex, age, ...) are excluded explicitly, because recent pandas versions
# raise on non-numeric columns instead of silently skipping them.
data.select_dtypes(include='number').corr()

Unnamed: 0,year,suicides_no,population,gdp_for_year,gdp_per_capita
year,1.0,-0.002953,0.011583,0.09722,0.338854
suicides_no,-0.002953,1.0,0.616105,0.430011,0.062168
population,0.011583,0.616105,1.0,0.710834,0.083006
gdp_for_year,0.09722,0.430011,0.710834,1.0,0.304813
gdp_per_capita,0.338854,0.062168,0.083006,0.304813,1.0
