# Suicide analysis
### Lukas Forst

In [3]:
import pandas as pd
import numpy as np
import plotly as plt
import seaborn as sns

In [4]:
# Fix plotly sometimes not showing graphs: explicitly switch plotly's offline
# renderer into notebook mode. Note that `plt` is the `plotly` package here
# (see the imports), not matplotlib.pyplot.
plt.offline.init_notebook_mode(connected=True)

### Data insight

Now let's look at the data.

In [5]:
# Read the raw dataset and peek at five randomly chosen rows.
data = pd.read_csv(filepath_or_buffer='san_data.csv')
data.sample(n=5)

Unnamed: 0.1,Unnamed: 0,country,year,sex,age,suicides_no,population,gdp_for_year,gdp_per_capita,generation,continent
11189,11190,Guyana,2004,Female,15-24,17,62536.0,785918800.0,1180,Millenials,Americas
20509,20510,Romania,1992,Male,55-74,610,1942300.0,25121670000.0,1183,Silent,Europe
24467,24468,Sweden,2006,Male,5-14,2,538858.0,420032100000.0,48977,Millenials,Europe
3470,3471,Belgium,1989,Male,35-54,422,1267600.0,165100100000.0,17659,Silent,Europe
23588,23589,Spain,2001,Female,25-34,93,3333937.0,625975800000.0,16169,Generation X,Europe


We must polish the data: the `id` column (interpreted by pandas as `Unnamed: 0`) is useless. Also, `country`, `sex`, `generation`, `age` and `continent` are in fact categories rather than plain objects.

In [6]:
# Drop the redundant ID column, which pandas loads as `Unnamed: 0`.
# Dropping it by name (instead of by position, in place) keeps this cell
# idempotent: re-running it no longer removes the next column (`country`).
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Convert the label-like object columns into pandas categoricals.
for column in ('country', 'sex', 'generation', 'continent', 'age'):
    data[column] = data[column].astype('category')

In [8]:
# Preview two random rows to verify the cleaned-up frame.
data.sample(n=2)

Unnamed: 0,country,year,sex,age,suicides_no,population,gdp_for_year,gdp_per_capita,generation,continent
24377,Sweden,1999,Female,35-54,141,1200105.0,270847900000.0,32328,Boomers,Europe
27614,Uzbekistan,2011,Male,15-24,264,3211080.0,45915190000.0,1767,Millenials,Asia


### Year data

In [9]:
def group_yearly(df, subset=None, agg=None):
    '''
    Aggregate a data frame per year.

    Selects the `subset` columns of `df`, groups the rows by `year` and
    aggregates every group with the `agg` mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in `subset`.
    subset : list of str, optional
        Columns to keep before grouping; must include 'year'.
        Defaults to ['year', 'suicides_no', 'population'].
    agg : dict, optional
        Mapping of column name to aggregation, passed to `.agg`.
        Defaults to {'population': 'sum', 'suicides_no': 'sum'}.

    Returns
    -------
    pandas.DataFrame
        One row per year (the index) holding the aggregated columns.
    '''
    # Build the defaults per call instead of sharing mutable default arguments.
    if subset is None:
        subset = ['year', 'suicides_no', 'population']
    if agg is None:
        agg = {'population': 'sum', 'suicides_no': 'sum'}
    return df[subset].groupby('year').agg(agg)

Let's visualise yearly values.

In [10]:
# Sum population and suicide counts per year, then summarise the totals.
yearly = group_yearly(df=data)
yearly.describe()

Unnamed: 0,population,suicides_no
count,31.0,31.0
mean,1651292000.0,217187.645161
std,291711100.0,42969.612753
min,1008600000.0,116063.0
25%,1539583000.0,207556.5
50%,1745247000.0,233408.0
75%,1850237000.0,243515.5
max,1997297000.0,256119.0


In [11]:
# Years present in the aggregated data; reused as the x-axis of the plots below.
years = yearly.index.values

Import the graphic tools.

In [12]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

And plot the yearly data.

In [13]:
# Plot the yearly suicide counts and the yearly population in one chart.
# The two series are on very different scales, so the population is drawn
# against its own (secondary) y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# add summed suicides (primary y-axis)
fig.add_trace(
    go.Scatter(x = years, y = yearly['suicides_no'], mode = 'lines', name = 'Suicides'),
    secondary_y=False
)
# add populations (secondary y-axis)
fig.add_trace(
    go.Scatter(x = years, y = yearly['population'], mode = 'lines', name = 'Population'),
    secondary_y=True
)

fig.update_layout(title_text="Suicides and population")

fig.update_xaxes(title_text="years")

fig.update_yaxes(title_text="suicides count", secondary_y=False)
fig.update_yaxes(title_text="population", secondary_y=True)

fig.show()

Let's see some normalized data - suicides per 100 000 people, to see the trends.

In [14]:
def get_suicides_per_100k(df):
    '''
    Return the number of suicides per 100 000 people for every row of `df`,
    computed from its `suicides_no` and `population` columns.
    '''
    scaled_suicides = df['suicides_no'] * 100_000
    return scaled_suicides / df['population']

In [15]:
# Recompute the yearly totals and derive the normalised suicide rate.
yearly = group_yearly(data)
yearly['suicides_per_100k'] = get_suicides_per_100k(yearly)
years = yearly.index.values

# Single line chart of the yearly rate.
fig = go.Figure(
    data=[
        go.Scatter(
            x=years,
            y=yearly['suicides_per_100k'],
            mode='lines',
            name='Suicides per 100 000 people',
        )
    ]
)

fig.update_layout(title_text="Suicides per 100 000 people")
fig.update_xaxes(title_text="years")
fig.update_yaxes(title_text="Suicides per 100 000 people")

fig.show()

**Graph insights**
We can see that from 1988 there was an increase in suicides per capita, with a peak in 1995. Since then, the suicide rate has been decreasing again.

Now let's look at social groups, starting with genders.

In [16]:
# Yearly suicide rate (per 100 000 people) computed separately for each sex.
males_yearly = group_yearly(data[data['sex'] == 'Male'])
males_yearly['suicides_per_100k'] = get_suicides_per_100k(males_yearly)

females_yearly = group_yearly(data[data['sex'] == 'Female'])
females_yearly['suicides_per_100k'] = get_suicides_per_100k(females_yearly)


fig = go.Figure()

# Every trace takes its x values from its own frame's year index rather than
# from a `years` array built from a different frame in another cell, so x and
# y always line up even if one sex has no rows for some year.
for label, sex_yearly in (('Males', males_yearly), ('Females', females_yearly)):
    fig.add_trace(
        go.Scatter(
            x = sex_yearly.index.values,
            y = sex_yearly['suicides_per_100k'],
            mode = 'lines',
            name = label
        )
    )

fig.update_layout(title_text="Suicides per 100 000 people")

fig.update_xaxes(title_text="years")
fig.update_yaxes(title_text="Suicides per 100 000 people")

fig.show()

There seems to be a significant difference between the male suicide rate and the female suicide rate. I will investigate this trend further in the following chapters.

### Countries related

In [17]:
# Total population and suicides per country, the resulting rate per
# 100 000 people, ordered from the highest rate to the lowest.
countries_data = (
    data
    .groupby('country')
    .agg({'population': 'sum', 'suicides_no': 'sum'})
    .assign(suicides_per_100k=get_suicides_per_100k)
    .sort_values('suicides_per_100k', ascending=False)
)

# Unweighted mean of the per-country rates.
countries_mean = countries_data['suicides_per_100k'].mean()

In [18]:
def plot_countries_rate(df, title, reference_mean=None):
    '''
    Build a horizontal bar chart of suicide rates per country.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'country' and 'suicides_per_100k'.
    title : str
        Title of the figure.
    reference_mean : float, optional
        Rate at which the black reference line is drawn. When omitted, the
        notebook-level `countries_mean` is used.

    Returns
    -------
    The plotly figure; it is not shown by this function.
    '''
    # Fall back to the global mean only when the caller gave no explicit value,
    # so existing two-argument calls keep working unchanged.
    if reference_mean is None:
        reference_mean = countries_mean

    fig = px.bar(df,
        y='country',
        x='suicides_per_100k',
        orientation='h', color='country',
        title=title
    )

    # Reference line: the same x value repeated once for every plotted country.
    fig.add_trace(go.Scatter(
        x=np.repeat(reference_mean, len(df)),
        y=df['country'],
        line=go.scatter.Line(color="black"),
        hoverinfo='none'
    ))

    fig.update_layout(showlegend=False)

    return fig

Now let's look at some country-related data: graphs of the countries with the highest and the lowest suicide rates. The vertical line is the mean over all countries.

In [19]:
# The ten countries with the highest rate, ordered from highest to lowest.
highest_rates = (
    countries_data['suicides_per_100k']
    .nlargest(10)
    .sort_values(ascending=False)
    .reset_index()
)
plot_countries_rate(
    df=highest_rates,
    title='Top 10 Countries with highest average of suicide per 100 000 population',
)

In [20]:
# The ten countries with the lowest rate, ordered from highest to lowest.
lowest_rates = (
    countries_data['suicides_per_100k']
    .nsmallest(10)
    .sort_values(ascending=False)
    .reset_index()
)
plot_countries_rate(
    df=lowest_rates,
    title='10 Countries with smallest average of suicide per 100 000 population',
)

Interestingly, Dominica and Saint Kitts and Nevis have no suicides whatsoever. This could mean that the data are missing some values, or that the people in these states are really happy with themselves.

In [21]:
# Countries whose recorded suicide rate is exactly zero.
countries_data.loc[countries_data['suicides_per_100k'].eq(0)]

Unnamed: 0_level_0,population,suicides_no,suicides_per_100k
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dominica,66400.0,0,0.0
Saint Kitts and Nevis,117300.0,0,0.0


### Continents data

In [41]:
# Total population and suicides per continent plus the resulting rate.
continent_data = (
    data
    .groupby('continent')
    .agg({'population': 'sum', 'suicides_no': 'sum'})
    .assign(suicides_per_100k=get_suicides_per_100k)
)

# Bar height is the rate; bar colour encodes the absolute suicide count.
fig = px.bar(
    data_frame=continent_data.reset_index(),
    x='continent',
    y='suicides_per_100k',
    color='suicides_no',
)
fig.show()

We can clearly see that the suicide rate differs across the continents. I will get back to it in the hypothesis testing section.

### Data correlation

Let's dig deeper into the data and check whether there is some correlation between the dataset features.

In [22]:
def categorize(df, cats):
    '''
    Replace, in place, every categorical column of `df` named in `cats`
    with the integer codes of its categories.
    '''
    for column_name in cats:
        category_codes = df[column_name].cat.codes
        df[column_name] = category_codes

In [34]:
# Work on a copy of the data, extended by the per-row suicide rate, so the
# original frame stays untouched.
correlation_data = data.assign(suicides_per_100k=get_suicides_per_100k)

In [35]:
# Compute the pairwise correlation of the numeric features, rounded to
# 3 decimals. The numeric columns are selected explicitly: newer pandas
# versions (2.0+) no longer drop non-numeric columns silently in `corr()`
# and fail on the categorical columns instead.
correlation_table = correlation_data.select_dtypes(include='number').corr().round(3)
correlation_table

Unnamed: 0,year,suicides_no,population,gdp_for_year,gdp_per_capita,suicides_per_100k
year,1.0,-0.003,0.012,0.097,0.339,-0.04
suicides_no,-0.003,1.0,0.616,0.43,0.062,0.307
population,0.012,0.616,1.0,0.711,0.083,0.008
gdp_for_year,0.097,0.43,0.711,1.0,0.305,0.025
gdp_per_capita,0.339,0.062,0.083,0.305,1.0,0.002
suicides_per_100k,-0.04,0.307,0.008,0.025,0.002,1.0


Table contains a lot of unnecessary statistics as we are analyzing only suicide rates. 

In [36]:
# Show only the correlations with suicides_per_100k, strongest first.
suicide_rate_correlation = correlation_table['suicides_per_100k']
suicide_rate_correlation.sort_values(ascending=False)

suicides_per_100k    1.000
suicides_no          0.307
gdp_for_year         0.025
population           0.008
gdp_per_capita       0.002
year                -0.040
Name: suicides_per_100k, dtype: float64

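Apart from the obvious correlation between `suicides_per_100k` and `suicides_no`, none of the remaining features shows a notable linear correlation with the suicide rate (all absolute values are at most 0.04).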

## Hypothesis testing

In [42]:
from scipy import stats

In [61]:
# Mean suicide rate (per 100 000 people) over the European rows.
# `data` has no rate column and the original 'suicides_pe' key raised a
# KeyError, so the rate is computed with the get_suicides_per_100k helper.
# NOTE(review): this is the unweighted mean of the per-row rates - confirm
# that this is the statistic the hypothesis test needs.
europe_rates = get_suicides_per_100k(data[data.continent == 'Europe'])
europe_mean = europe_rates.mean()

Unnamed: 0,country,year,sex,age,suicides_no,population,gdp_for_year,gdp_per_capita,generation,continent
0,Albania,1987,Male,15-24,21,312900.0,2.156625e+09,796,Generation X,Europe
1,Albania,1987,Male,35-54,16,308000.0,2.156625e+09,796,Silent,Europe
2,Albania,1987,Female,15-24,14,289700.0,2.156625e+09,796,Generation X,Europe
3,Albania,1987,Male,75+,1,21800.0,2.156625e+09,796,G.I. Generation,Europe
4,Albania,1987,Male,25-34,9,274300.0,2.156625e+09,796,Boomers,Europe
...,...,...,...,...,...,...,...,...,...,...
26683,United Kingdom,2015,Female,25-34,181,4414464.0,2.885570e+12,47240,Millenials,Europe
26684,United Kingdom,2015,Female,75+,108,3070457.0,2.885570e+12,47240,Silent,Europe
26685,United Kingdom,2015,Female,15-24,104,3966564.0,2.885570e+12,47240,Millenials,Europe
26686,United Kingdom,2015,Female,5-14,6,3663221.0,2.885570e+12,47240,Generation Z,Europe
