# Importing Data

In [1]:
import pandas as pd
import numpy as py

In [2]:
data_raw= pd.read_csv('../input/suicide-rates-overview-1985-to-2016/master.csv')
data_raw.head(10)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
5,Albania,1987,female,75+ years,1,35600,2.81,Albania1987,,2156624900,796,G.I. Generation
6,Albania,1987,female,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,Silent
7,Albania,1987,female,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,Boomers
8,Albania,1987,male,55-74 years,1,137500,0.73,Albania1987,,2156624900,796,G.I. Generation
9,Albania,1987,female,5-14 years,0,311000,0.0,Albania1987,,2156624900,796,Generation X


# Basic Cleanup and Filtering

In [3]:
#renaming columns
data_raw.columns = ['country', 'year', 'sex', 'age', 'suicide_nos', 'population', 'suicide/100k_pop', 'county_year', 'HDI_for_year', 'gdp_for_year($)', 'gdp_per_capita($)', 'generation']

In [4]:
data_raw.dtypes

country               object
year                   int64
sex                   object
age                   object
suicide_nos            int64
population             int64
suicide/100k_pop     float64
county_year           object
HDI_for_year         float64
gdp_for_year($)       object
gdp_per_capita($)      int64
generation            object
dtype: object

In [5]:
#removing * , * from gdp_for_year($) values and changing type to int
for i in range(0, len(data_raw)):
    data_raw['gdp_for_year($)'] = data_raw['gdp_for_year($)'][i].replace(',',"")

data_raw.head()

Unnamed: 0,country,year,sex,age,suicide_nos,population,suicide/100k_pop,county_year,HDI_for_year,gdp_for_year($),gdp_per_capita($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [6]:
data_raw['gdp_for_year($)'] = data_raw['gdp_for_year($)'].astype('int')

In [7]:
data_raw.isna().any()

country              False
year                 False
sex                  False
age                  False
suicide_nos          False
population           False
suicide/100k_pop     False
county_year          False
HDI_for_year          True
gdp_for_year($)      False
gdp_per_capita($)    False
generation           False
dtype: bool

In [8]:
print(data_raw['HDI_for_year'].isna().count())
print(data_raw['HDI_for_year'].count())
print(data_raw['HDI_for_year'].isna().count() - data_raw['HDI_for_year'].count())

27820
8364
19456


In [9]:
#dropping HDI_for_year col as num of null val > num of non-null val 
data_raw.drop(columns = 'HDI_for_year', inplace = True)

In [10]:
data_raw.describe()

Unnamed: 0,year,suicide_nos,population,suicide/100k_pop,gdp_for_year($),gdp_per_capita($)
count,27820.0,27820.0,27820.0,27820.0,27820.0,27820.0
mean,2001.258375,242.574407,1844794.0,12.816097,2156625000.0,16866.464414
std,8.469055,902.047917,3911779.0,18.961511,0.0,18887.576472
min,1985.0,0.0,278.0,0.0,2156625000.0,251.0
25%,1995.0,3.0,97498.5,0.92,2156625000.0,3447.0
50%,2002.0,25.0,430150.0,5.99,2156625000.0,9372.0
75%,2008.0,131.0,1486143.0,16.62,2156625000.0,24874.0
max,2016.0,22338.0,43805210.0,224.97,2156625000.0,126352.0


# Visualization

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [12]:
dataVis = data_raw.copy()
dataVis.head(3)

Unnamed: 0,country,year,sex,age,suicide_nos,population,suicide/100k_pop,county_year,gdp_for_year($),gdp_per_capita($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,2156624900,796,Generation X


In [13]:
# countries with highest suicide rates segreated bases on age and sex
age_sex_country_wise = dataVis.groupby(['country','sex', 'age'], as_index = False).suicide_nos.sum().sort_values('suicide_nos', ascending = False)
fig = px.bar(age_sex_country_wise, x = 'country', y = 'suicide_nos', color = 'age', hover_name = 'sex')
fig.show()

In [14]:
#Suicide rate based on the years and sex
fig = px.histogram(dataVis, x = 'year', y = 'suicide_nos', color = 'sex', nbins = 65)
fig.show()

In [15]:
# top 10 countries which highest suicide rates
country_wise = dataVis.groupby(['country'], as_index = False).suicide_nos.sum().sort_values('suicide_nos', ascending = False).head(10)
fig = px.bar(country_wise, x = 'country', y = 'suicide_nos' )
fig.show()