In [1]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, f1_score

In [2]:
# Данные загружены отсюда: https://www.kaggle.com/dwdkills/russian-demography
# Читаем данные из файла
example_data = pd.read_csv("datasets/russian_demography.csv")
example_data.head()

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,Republic of Adygea,1.9,14.2,12.3,84.66,52.42
1,1990,Altai Krai,1.8,12.9,11.1,80.24,58.07
2,1990,Amur Oblast,7.6,16.2,8.6,69.55,68.37
3,1990,Arkhangelsk Oblast,3.7,13.5,9.8,73.26,73.63
4,1990,Astrakhan Oblast,4.7,15.1,10.4,77.05,68.01


In [3]:
# Так как список регионов меняется от года к году, в данных есть строки без значений. Удалим их
example_data.dropna(inplace=True)

In [4]:
val_test_size = round(0.2*len(example_data))
print(val_test_size)

463


In [5]:
train_val, test = train_test_split(example_data, test_size=val_test_size, random_state=1)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=1)
print(len(train), len(val), len(test))

1389 463 463


In [6]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale = ['year', 'npg', 'birth_rate', 'death_rate', 'gdw', 'urbanization']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['year', 'npg', 'birth_rate', 'death_rate',
                                  'gdw', 'urbanization'])])

In [7]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [8]:
# Устанавливаем названия столбцов
column_names = columns_to_scale + ['region']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [9]:
x_labels = column_names[0:-2]
y_labels = ['urbanization']

x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_labels]
y_test = sc_test[y_labels]
y_val = sc_val[y_labels]

In [10]:
x_train

Unnamed: 0,year,npg,birth_rate,death_rate,gdw
0,0.037037,0.479012,0.337662,0.331776,0.761251
1,0.62963,0.461728,0.406926,0.439252,0.247437
2,0.925926,0.224691,0.186147,0.649533,0.691399
3,0.666667,0.320988,0.238095,0.53271,0.359513
4,0.481481,0.192593,0.125541,0.64486,0.481668
...,...,...,...,...,...
1384,0.111111,0.251852,0.138528,0.546729,0.72424
1385,0.518519,0.049383,0.142857,0.929907,0.556038
1386,0.703704,0.160494,0.21645,0.803738,0.487055
1387,0.333333,0.251852,0.112554,0.518692,0.609209


In [11]:
parameters = {
    'max_depth':  range(1, 101, 10),
    'n_estimators': range(1, 101, 10),
}
grid = GridSearchCV(GradientBoostingRegressor(), parameters, scoring='accuracy',  refit=False, cv=2, n_jobs=-1)
print(grid)

GridSearchCV(cv=2, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'max_depth': range(1, 101, 10),
                         'n_estimators': range(1, 101, 10)},
             refit=False, scoring='accuracy')


In [None]:
grid.fit(x_train, y_train)
grid.best_params_

In [None]:
score_list = []
model_list = [GradientBoostingRegressor(**grid.best_params_),
              KNeighborsRegressor(n_neighbors=5),
              KNeighborsRegressor(n_neighbors=10),
              KNeighborsRegressor(n_neighbors=15),
              KNeighborsRegressor(n_neighbors=20),
              KNeighborsRegressor(n_neighbors=25)]
for i in range(len(model_list)):
    model_list[i].fit(x_train, y_train)
    score_list.append(model_list[i].score(x_val, y_val))
    print(score_list[i])

In [None]:
best_model = model_list[score_list.index(max(score_list))]
print(best_model)

In [None]:
test_pred = best_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse, sep='\n')