In [1]:
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error

In [2]:
data = pd.read_csv('russian_demography.csv', delimiter=',')
data

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,Republic of Adygea,1.9,14.2,12.3,84.66,52.42
1,1990,Altai Krai,1.8,12.9,11.1,80.24,58.07
2,1990,Amur Oblast,7.6,16.2,8.6,69.55,68.37
3,1990,Arkhangelsk Oblast,3.7,13.5,9.8,73.26,73.63
4,1990,Astrakhan Oblast,4.7,15.1,10.4,77.05,68.01
...,...,...,...,...,...,...,...
2375,2017,Chuvash Republic,-1.3,11.3,12.6,76.63,62.00
2376,2017,Chukotka Autonomous Okrug,3.7,13.1,9.4,58.23,70.00
2377,2017,Sakha (Yakutia) Republic,6.4,14.5,8.1,70.10,65.50
2378,2017,Yamalo-Nenets Autonomous Okrug,9.1,14.0,4.9,53.57,83.70


In [3]:
data.dropna(inplace=True)
encoder = preprocessing.LabelEncoder()
data['region'] = encoder.fit_transform(data[['region']].squeeze())
data

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,47,1.9,14.2,12.3,84.66,52.42
1,1990,0,1.8,12.9,11.1,80.24,58.07
2,1990,2,7.6,16.2,8.6,69.55,68.37
3,1990,3,3.7,13.5,9.8,73.26,73.63
4,1990,4,4.7,15.1,10.4,77.05,68.01
...,...,...,...,...,...,...,...
2375,2017,10,-1.3,11.3,12.6,76.63,62.00
2376,2017,9,3.7,13.1,9.4,58.23,70.00
2377,2017,62,6.4,14.5,8.1,70.10,65.50
2378,2017,82,9.1,14.0,4.9,53.57,83.70


In [4]:
train = data[data['year'] != 2017]
test = data[data['year'] == 2017]

In [5]:
train

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,47,1.9,14.2,12.3,84.66,52.42
1,1990,0,1.8,12.9,11.1,80.24,58.07
2,1990,2,7.6,16.2,8.6,69.55,68.37
3,1990,3,3.7,13.5,9.8,73.26,73.63
4,1990,4,4.7,15.1,10.4,77.05,68.01
...,...,...,...,...,...,...,...
2290,2016,10,0.0,13.2,13.2,74.05,61.30
2291,2016,9,3.6,13.6,10.0,56.18,69.20
2292,2016,62,7.6,16.0,8.4,67.83,65.40
2293,2016,82,10.1,15.4,5.3,51.10,83.70


In [6]:
test

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
2295,2017,47,-2.0,10.6,12.6,80.22,47.3
2296,2017,0,-3.2,10.8,14.0,83.61,56.3
2297,2017,2,-1.6,11.8,13.4,75.14,67.3
2298,2017,3,-2.7,10.5,13.2,81.16,78.0
2299,2017,4,0.7,12.1,11.4,78.19,66.5
...,...,...,...,...,...,...,...
2375,2017,10,-1.3,11.3,12.6,76.63,62.0
2376,2017,9,3.7,13.1,9.4,58.23,70.0
2377,2017,62,6.4,14.5,8.1,70.10,65.5
2378,2017,82,9.1,14.0,4.9,53.57,83.7


In [7]:
x_train = train.drop('birth_rate', axis=1)
y_train = train[['birth_rate']]
x_test = test.drop('birth_rate', axis=1)
y_test = test[['birth_rate']]

In [8]:
x_train

Unnamed: 0,year,region,npg,death_rate,gdw,urbanization
0,1990,47,1.9,12.3,84.66,52.42
1,1990,0,1.8,11.1,80.24,58.07
2,1990,2,7.6,8.6,69.55,68.37
3,1990,3,3.7,9.8,73.26,73.63
4,1990,4,4.7,10.4,77.05,68.01
...,...,...,...,...,...,...
2290,2016,10,0.0,13.2,74.05,61.30
2291,2016,9,3.6,10.0,56.18,69.20
2292,2016,62,7.6,8.4,67.83,65.40
2293,2016,82,10.1,5.3,51.10,83.70


In [9]:
y_train

Unnamed: 0,birth_rate
0,14.2
1,12.9
2,16.2
3,13.5
4,15.1
...,...
2290,13.2
2291,13.6
2292,16.0
2293,15.4


In [10]:
model = xgb.XGBRegressor(eval_metric=mean_absolute_error)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)])

[0]	validation_0-rmse:7.79545	validation_0-mean_absolute_error:7.58632
[1]	validation_0-rmse:5.27654	validation_0-mean_absolute_error:5.08431
[2]	validation_0-rmse:3.59040	validation_0-mean_absolute_error:3.41618
[3]	validation_0-rmse:2.42599	validation_0-mean_absolute_error:2.24018
[4]	validation_0-rmse:1.63014	validation_0-mean_absolute_error:1.42184
[5]	validation_0-rmse:1.13715	validation_0-mean_absolute_error:0.92614
[6]	validation_0-rmse:0.80639	validation_0-mean_absolute_error:0.61460
[7]	validation_0-rmse:0.62278	validation_0-mean_absolute_error:0.47036
[8]	validation_0-rmse:0.52289	validation_0-mean_absolute_error:0.41353
[9]	validation_0-rmse:0.48990	validation_0-mean_absolute_error:0.38805
[10]	validation_0-rmse:0.48424	validation_0-mean_absolute_error:0.38638
[11]	validation_0-rmse:0.48394	validation_0-mean_absolute_error:0.38400
[12]	validation_0-rmse:0.47744	validation_0-mean_absolute_error:0.38482
[13]	validation_0-rmse:0.48773	validation_0-mean_absolute_error:0.40045
[1

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=<function mean_absolute_error at 0x000001C5934C6C10>,
             feature_types=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_threshold=64,
             max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
             random_state=0, ...)

In [11]:
y_pred = model.predict(x_test)

In [12]:
mean_absolute_error(y_test, y_pred)

0.29183025584501376