In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
df = pd.read_csv("master.csv", thousands=',')
df.head()

In [54]:
countries_regions = pd.read_csv("countries-regions.csv")
countries_regions = countries_regions.set_index("name")["sub-region"]
countries_regions = countries_regions.append(pd.Series({'Czech Republic': "Eastern Europe", 
                                                        "Macau": "Eastern Asia",
                                                       "Republic of Korea": "Eastern Asia",
                                                       "Saint Vincent and Grenadines": "Latin America and the Caribbean",
                                                       "United Kingdom": "Western Europe",
                                                       "United States": "Northern America"}))
countries_regions.head()

Afghanistan         Southern Asia
Åland Islands     Northern Europe
Albania           Southern Europe
Algeria           Northern Africa
American Samoa          Polynesia
dtype: object

In [55]:
df['region'] = df.country.apply(lambda x: countries_regions[x])
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation,region
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X,Southern Europe
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent,Southern Europe
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X,Southern Europe
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation,Southern Europe
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers,Southern Europe


## One-hot encode categorical variables

In [56]:
ages = pd.get_dummies(df.age, prefix="age")
ages.head()

Unnamed: 0,age_15-24 years,age_25-34 years,age_35-54 years,age_5-14 years,age_55-74 years,age_75+ years
0,1,0,0,0,0,0
1,0,0,1,0,0,0
2,1,0,0,0,0,0
3,0,0,0,0,0,1
4,0,1,0,0,0,0


In [57]:
sexes = pd.get_dummies(df.sex, prefix="sex")
sexes.head()

Unnamed: 0,sex_female,sex_male
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1


In [58]:
regions = pd.get_dummies(df.region, prefix="region")
regions.head()

Unnamed: 0,region_Australia and New Zealand,region_Central Asia,region_Eastern Asia,region_Eastern Europe,region_Latin America and the Caribbean,region_Melanesia,region_Micronesia,region_Northern America,region_Northern Europe,region_South-eastern Asia,region_Southern Asia,region_Southern Europe,region_Sub-Saharan Africa,region_Western Asia,region_Western Europe
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


## Training set

In [59]:
X = pd.concat([ages, sexes, regions, df[["population", "year", "gdp_per_capita ($)"]]], axis=1)
X.head()

Unnamed: 0,age_15-24 years,age_25-34 years,age_35-54 years,age_5-14 years,age_55-74 years,age_75+ years,sex_female,sex_male,region_Australia and New Zealand,region_Central Asia,...,region_Northern Europe,region_South-eastern Asia,region_Southern Asia,region_Southern Europe,region_Sub-Saharan Africa,region_Western Asia,region_Western Europe,population,year,gdp_per_capita ($)
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,312900,1987,796
1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,308000,1987,796
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,289700,1987,796
3,0,0,0,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,21800,1987,796
4,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,274300,1987,796


In [66]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, df["suicides/100k pop"])

## Linear regression

In [87]:
model = ElasticNet()
model.fit(Xtrain, ytrain)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [92]:
np.sqrt(mean_squared_error(ytest, model.predict(Xtest)))

17.684084650797804