In [1]:
import pandas as pd
import psycopg2
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
conn = psycopg2.connect(dbname="adsdb", user="adsdb", password="adsdb")

In [3]:
df_final = pd.read_sql('SELECT * from exploitation.demographic_deaths;', conn, params = dict(name="adsdb"))

# Feature selection

In [4]:
df_final.shape

(2014, 37)

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedKFold, GridSearchCV
from sklearn.inspection import permutation_importance

In [6]:
df_final.columns

Index(['Year', 'Country', 'Deaths', 'Name',
       'Fertility rate, total (births per woman)', 'GDP (constant LCU)',
       'GDP (current LCU)', 'GDP (current US$)',
       'GDP deflator (base year varies by country)', 'GDP growth (annual %)',
       'GDP per capita (current LCU)', 'GDP per capita (current US$)',
       'GDP per capita, PPP (constant 2011 international $)',
       'GDP per capita, PPP (current international $)',
       'GDP, PPP (constant 2011 international $)',
       'GDP, PPP (current international $)', 'GNI (current LCU)',
       'GNI per capita (current LCU)',
       'GNI per capita, Atlas method (current US$)',
       'GNI per capita, PPP (current international $)',
       'General government total expenditure (current LCU)',
       'Life expectancy at birth, total (years)',
       'Mortality rate, infant (per 1,000 live births)',
       'Official exchange rate (LCU per US$, period average)',
       'PPP conversion factor, GDP (LCU per international $)',
       '

In [7]:
# Droping columns with NaN in predictor
df = df_final[df_final['Deaths'].notna()]
df = df[df['Total population (thousands)'].notna()]
df['Deaths/Pop'] = (df['Deaths']/(df['Total population (thousands)']*1000))*100
df = df.drop(columns=['Deaths', 'Name'])
df.head()

Unnamed: 0,Year,Country,"Fertility rate, total (births per woman)",GDP (constant LCU),GDP (current LCU),GDP (current US$),GDP deflator (base year varies by country),GDP growth (annual %),GDP per capita (current LCU),GDP per capita (current US$),...,Population aged 25-64 years (thousands),Population aged 65 years or older (thousands),Population growth (annual %),Poverty headcount ratio at $3.20 a day (PPP) (% of population),"Prevalence of HIV, total (% of population ages 15-49)",Price level ratio of PPP conversion factor (GDP) to market exch,Rural population (% of total population),Total debt service (% of GNI),Total population (thousands),Deaths/Pop
0,1994,3380,1.911,4688180000000.0,3689090000000.0,146683500000.0,78.68917,7.99702,62659.41008,2491.42784,...,27588.383,3098.052,0.91714,19.2,1.9,0.39664,69.895,5.46978,58875.269,0.518938
1,1994,4050,1.81,1480126000000.0,993286500000.0,156162400000.0,67.10822,5.33246,190789.88375,29995.57962,...,2800.528,800.323,0.33771,,,1.3863,15.048,,5211.334,1.172425
2,1994,4274,1.67,34424150000.0,16995340000.0,20162940000.0,49.37039,6.20553,3178.87894,3771.35952,...,2646.994,571.808,0.39405,,,0.46834,43.402,,5362.896,0.95812
3,1995,2320,,,185887100.0,,,,18875.62134,,...,5.183,0.488,,,,,,,9.848,1.015435
4,1995,2430,3.072,9077419000.0,305683000.0,691590500.0,3.36751,0.0,691.82371,1565.21202,...,186.966,22.568,1.53991,,0.8,0.33136,33.924,,441.851,0.347402


In [8]:
# Train test datasets
train=df.sample(frac=0.75,random_state=200)
test=df.drop(train.index)

y_train = train["Deaths/Pop"]
X_train = train.drop(columns ="Deaths/Pop")
y_test = test["Deaths/Pop"]
X_test = test.drop(columns ="Deaths/Pop")

X_train.shape, X_test.shape

((1365, 35), (455, 35))

### Missings

In [9]:
df.isnull().sum(axis=0)/len(df)

Year                                                               0.000000
Country                                                            0.000000
Fertility rate, total (births per woman)                           0.101099
GDP (constant LCU)                                                 0.076374
GDP (current LCU)                                                  0.039560
GDP (current US$)                                                  0.070879
GDP deflator (base year varies by country)                         0.076374
GDP growth (annual %)                                              0.077473
GDP per capita (current LCU)                                       0.039560
GDP per capita (current US$)                                       0.070879
GDP per capita, PPP (constant 2011 international $)                0.085714
GDP per capita, PPP (current international $)                      0.085714
GDP, PPP (constant 2011 international $)                           0.085714
GDP, PPP (cu

In [10]:
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [11]:
numeric_features_names = X_train.select_dtypes(include=np.number).columns.to_list()
categorical_features_names = list(set(X_train.columns.to_list())-set(numeric_features_names))

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ("scaler", StandardScaler())]
)

categorical_transformer = SimpleImputer(strategy='constant', fill_value='missing')

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features_names),
        ('cat', categorical_transformer, categorical_features_names)
    ])

X_train_prep = preprocessor.fit_transform(X_train)
X_train = pd.DataFrame(X_train_prep, index=X_train.index, columns=X_train.columns)

### Random forest Regressor Model

In [12]:
param_grid = {'n_estimators': list(range(50,500,100)),
              'max_features': ['auto', 'sqrt', 'log2'],
              'max_depth'   : [3, 10, 20],
             }

grid = GridSearchCV(
    estimator = RandomForestRegressor(random_state = 123),
    param_grid = param_grid,
    scoring = 'neg_root_mean_squared_error',
    n_jobs = -1,
    cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=123), 
    refit = True,
    return_train_score = True
)

grid.fit(X = X_train, y = y_train)

resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)').drop(columns = 'params').sort_values('mean_test_score', ascending = False).head(20)

Unnamed: 0,param_max_depth,param_max_features,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
43,20,log2,350,-0.058375,0.004041,-0.02209,0.000456
38,20,sqrt,350,-0.058375,0.004041,-0.02209,0.000456
42,20,log2,250,-0.058494,0.003819,-0.022351,0.000496
37,20,sqrt,250,-0.058494,0.003819,-0.022351,0.000496
44,20,log2,450,-0.058614,0.003995,-0.022041,0.000352
39,20,sqrt,450,-0.058614,0.003995,-0.022041,0.000352
41,20,log2,150,-0.059686,0.003903,-0.022694,0.000573
36,20,sqrt,150,-0.059686,0.003903,-0.022694,0.000573
35,20,sqrt,50,-0.062174,0.003873,-0.024463,0.000564
40,20,log2,50,-0.062174,0.003873,-0.024463,0.000564


In [13]:
final_model = make_pipeline(preprocessor, RandomForestRegressor(**grid.best_params_))

In [14]:
trained_model = final_model.fit(X_train, y_train)
trained_model.predict(X_test)

array([0.60575506, 0.73740196, 0.62336269, 0.69595357, 0.59446057,
       0.59557482, 0.61192775, 0.62197889, 0.5989841 , 0.59310653,
       0.64418726, 0.61090553, 0.62324317, 0.61621194, 0.70946334,
       0.59881034, 0.58947285, 0.61098505, 0.60078101, 0.76530268,
       0.60449779, 0.58603877, 0.59066225, 0.70032869, 0.67857477,
       0.59562474, 0.60270159, 0.59402866, 0.61480106, 0.61150169,
       0.73361012, 0.59470609, 0.6170553 , 0.63174685, 0.60726557,
       0.59350032, 0.59562474, 0.58683972, 0.59303614, 0.65236826,
       0.63637178, 0.64636861, 0.75852535, 0.61447044, 0.63096811,
       0.62798756, 0.61309835, 0.59704218, 0.60638628, 0.59389055,
       0.61601083, 0.59896233, 0.59214116, 0.59488974, 0.58729923,
       0.59320528, 0.58948912, 0.60738673, 0.62200254, 0.59459605,
       0.60310377, 0.64663452, 0.74675801, 0.60557205, 0.76329523,
       0.5958132 , 0.73440244, 0.59215759, 0.66494191, 0.79916446,
       0.58715359, 0.58666865, 0.57993878, 0.62670164, 0.59172

In [15]:
from joblib import dump, load
from datetime import datetime

dump(trained_model, f'./models/model-{datetime.now().timestamp()}.joblib')

['./models/model-1642968148.80823.joblib']