Загружаем библиотеки

In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2, mean_absolute_error as mae, mean_squared_error as mse
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, cross_val_score

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

matplotlib.rcParams.update({'font.size': 14})

Загружаем данные

In [96]:
train_ds = pd.read_csv('C:/Users/Glsteel/train.csv')
test_ds = pd.read_csv('C:/Users/Glsteel/test.csv')

Изучаем датасет.

In [97]:
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [98]:
test_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [99]:
train_ds['Id'] = train_ds['Id'].astype(str) #преобразуем столбец Id в строковую переменную
test_ds['Id'] = test_ds['Id'].astype(str)

In [100]:
train_object_features = train_ds.select_dtypes(include=['object']) #выделяем количественные переменные
train_num_features = train_ds.select_dtypes(exclude=['object']) #выделяем категориальные переменные
train_object_features.head()

Unnamed: 0,Id,Ecology_2,Ecology_3,Shops_2
0,14038,B,B,B
1,15053,B,B,B
2,4765,B,B,B
3,5809,B,B,B
4,10783,B,B,B


In [101]:
train_object_features['Shops_2'].value_counts()
train_object_features['Ecology_2'].value_counts()
train_object_features['Ecology_3'].value_counts()

B    9725
A     275
Name: Ecology_3, dtype: int64

In [102]:
train_num_features.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [103]:
test_ds.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,51.2792,1.91,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,44.179466,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,77.0,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [115]:
len(train_ds) - train_ds.count() #определяем нулевые значения 

Id               0
DistrictId       0
Rooms            0
Square           0
LifeSquare       0
KitchenSquare    0
Floor            0
HouseFloor       0
HouseYear        0
Ecology_1        0
Ecology_2        0
Ecology_3        0
Social_1         0
Social_2         0
Social_3         0
Healthcare_1     0
Helthcare_2      0
Shops_1          0
Shops_2          0
Price            0
dtype: int64

In [116]:
len(test_ds) - test_ds.count()

Id               0
DistrictId       0
Rooms            0
Square           0
LifeSquare       0
KitchenSquare    0
Floor            0
HouseFloor       0
HouseYear        0
Ecology_1        0
Ecology_2        0
Ecology_3        0
Social_1         0
Social_2         0
Social_3         0
Healthcare_1     0
Helthcare_2      0
Shops_1          0
Shops_2          0
dtype: int64

Заменяем нулевые значения, корректируем выбросы

In [106]:
train_ds.loc[train_ds['LifeSquare'].isnull(), 'LifeSquare'] = train_ds['Square'] - train_ds['KitchenSquare']
test_ds.loc[test_ds['LifeSquare'].isnull(), 'LifeSquare'] = test_ds['Square'] - test_ds['KitchenSquare']
#убрали нулевые значения признака "lifeSquare" путем вычитания Площади кухни из Общей площади

In [107]:
train_ds.loc[train_ds['Healthcare_1'].isnull(), 'Healthcare_1'] = train_ds['Healthcare_1'].median
test_ds.loc[test_ds['Healthcare_1'].isnull(), 'Healthcare_1'] = test_ds['Healthcare_1'].median
#приравняли пропущенные значения признака "Healthcare_1" к медианному значению

In [108]:
train_ds.loc[train_ds['Rooms'] > 5, 'Rooms'] = train_ds['Rooms'].median()
train_ds.loc[train_ds['Rooms'] == 0, 'Rooms'] = train_ds['Rooms'].median()
test_ds.loc[test_ds['Rooms'] > 5, 'Rooms'] = test_ds['Rooms'].median()
test_ds.loc[test_ds['Rooms'] == 0, 'Rooms'] = test_ds['Rooms'].median()
#приравняли количество комнат равное нулю и большее 5 к моде.

In [109]:
train_ds.loc[train_ds['Square'] > 200, 'Square'] = train_ds['Square'].median()
train_ds.loc[train_ds['Square'] < 20, 'Square'] = train_ds['Square'].median()
test_ds.loc[test_ds['Square'] > 200, 'Square'] = test_ds['Square'].median()
test_ds.loc[test_ds['Square'] < 20, 'Square'] = test_ds['Square'].median()
#убираем выбросы по признаку Площадь

In [110]:
train_ds.loc[train_ds['LifeSquare'] > 200, 'LifeSquare'] = train_ds['LifeSquare'].median()
train_ds.loc[train_ds['LifeSquare'] < 10, 'LifeSquare'] = train_ds['LifeSquare'].median()
test_ds.loc[test_ds['LifeSquare'] > 200, 'LifeSquare'] = test_ds['LifeSquare'].median()
test_ds.loc[test_ds['LifeSquare'] < 10, 'LifeSquare'] = test_ds['LifeSquare'].median()
#убираем выбросы по признаку Жилая площадь

In [111]:
train_ds.loc[train_ds['KitchenSquare'] > 25, 'KitchenSquare'] = train_ds['KitchenSquare'].median()
test_ds.loc[test_ds['KitchenSquare'] > 25, 'KitchenSquare'] = test_ds['KitchenSquare'].median()
#убираем выбросы по признаку Площадь кухни

In [112]:
train_ds.loc[train_ds['LifeSquare'] > train_ds['Square'], 'LifeSquare'] = train_ds['Square'] - train_ds['KitchenSquare']
test_ds.loc[test_ds['LifeSquare'] > test_ds['Square'], 'LifeSquare'] = test_ds['Square'] - test_ds['KitchenSquare']
#убираем случаи, когда жилая площадь больше общей                                                                                   

In [113]:
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null object
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       10000 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     10000 non-null object
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(7), int64(8), object(5)
memory usage: 1.5+ MB


In [114]:
test_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null object
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     5000 non-null object
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(6), int64(8), object(5)
memory usage: 742.3+ KB


In [62]:
test_ds.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,51.2792,1.9062,56.51174,41.315858,5.7008,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1.3194,4.2428
std,44.179466,0.805935,18.623536,18.892673,3.821282,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1.47994,4.777365
min,0.0,1.0,20.011215,10.692499,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0
25%,21.0,1.0,41.97776,27.830435,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,0.0,1.0
50%,37.0,2.0,52.920459,36.931498,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,1.0,3.0
75%,77.0,2.0,66.26976,49.74912,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,2.0,6.0
max,212.0,5.0,189.679576,168.729035,24.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,6.0,23.0


In [117]:
train_ds.drop(['Id'],axis=1,inplace=True)
train_ds.drop(['Ecology_2'],axis=1,inplace=True)
train_ds.drop(['Ecology_3'],axis=1,inplace=True)
train_ds.drop(['Shops_2'],axis=1,inplace=True)
train_ds.drop(['Healthcare_1'],axis=1,inplace=True)

test_ds.drop(['Id'],axis=1,inplace=True)
test_ds.drop(['Ecology_2'],axis=1,inplace=True)
test_ds.drop(['Ecology_3'],axis=1,inplace=True)
test_ds.drop(['Shops_2'],axis=1,inplace=True)
test_ds.drop(['Healthcare_1'],axis=1,inplace=True)

In [118]:
train_ds.keys()

Index(['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Price'],
      dtype='object')

In [119]:
feature_names = ['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
                 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
                 'Social_3', 'Helthcare_2', 'Shops_1']

target_name = 'Price'

In [120]:
train_ds.head()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Price
0,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,33,7976,5,0,11,184966.93073
1,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,46,10309,1,1,16,300009.450063
2,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,7759,0,1,3,220925.908524
3,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,5735,3,0,5,175616.227217
4,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,5776,1,2,4,150226.531644


In [121]:
X = pd.DataFrame(train_ds, columns=['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1'])
X.head()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1
0,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,33,7976,5,0,11
1,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,46,10309,1,1,16
2,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,7759,0,1,3
3,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,5735,3,0,5
4,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,5776,1,2,4


In [122]:
y = pd.DataFrame(train_ds, columns=[target_name])
y.head(5)

Unnamed: 0,Price
0,184966.93073
1,300009.450063
2,220925.908524
3,175616.227217
4,150226.531644


Работаем с тренировочной выборкой

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [124]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [77]:
#Осуществляем выбор гиперпараметров.
rf_mod_1 = RandomForestRegressor(random_state=42)

random_grid = {
    'n_estimators': np.arange(200, 501, 20),
    'max_depth': np.arange(2, 51, 2),
    'max_features': [0.5, 0.6, 0.7, 0.8, 0.9],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

random_search = RandomizedSearchCV(
    estimator=rf_mod_1,
    param_distributions=random_grid,
    n_iter=50,
    scoring='r2',
    cv=10,
    verbose=2,
    random_state=42
)

random_search.fit(X_train_scaled, y_train)

print(random_search.best_score_)
print(random_search.best_params_)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV] n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44, total=   5.2s
[CV] n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.4s remaining:    0.0s


[CV]  n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44, total=   5.2s
[CV] n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44 
[CV]  n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44, total=   5.3s
[CV] n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44 
[CV]  n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44, total=   5.2s
[CV] n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44 
[CV]  n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44, total=   5.2s
[CV] n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44 
[CV]  n_estimators=260, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=44, total=   5.3s
[CV] n_estimators=260, min_samples_split=2, min_samples_leaf=

[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32, total=   6.5s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32, total=   6.5s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32, total=   6.5s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32, total=   6.8s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=2, max_features=0.8, max_depth=32, total=   6.8s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=

[CV]  n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42, total=   5.6s
[CV] n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42 
[CV]  n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42, total=   5.7s
[CV] n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42 
[CV]  n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42, total=   5.8s
[CV] n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42 
[CV]  n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42, total=   5.8s
[CV] n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42 
[CV]  n_estimators=340, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=42, total=   5.7s
[CV] n_estimators=340, min_samples_split=5, min_samples_leaf=

[CV]  n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6, total=   2.2s
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6 
[CV]  n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6, total=   2.2s
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6 
[CV]  n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6, total=   2.2s
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6 
[CV]  n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6, total=   2.2s
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6 
[CV]  n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_features=0.6, max_depth=6, total=   2.2s
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=4, max_fe

[CV]  n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16, total=   6.9s
[CV] n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16 
[CV]  n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16, total=   7.2s
[CV] n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16 
[CV]  n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16, total=   6.9s
[CV] n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16 
[CV]  n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16, total=   7.0s
[CV] n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16 
[CV]  n_estimators=420, min_samples_split=2, min_samples_leaf=1, max_features=0.5, max_depth=16, total=   6.8s
[CV] n_estimators=420, min_samples_split=2, min_samples_leaf=

[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26, total=   6.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26, total=   6.8s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26, total=   7.0s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26, total=   6.7s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=0.5, max_depth=26, total=   6.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=

[CV]  n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6, total=   4.2s
[CV] n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6 
[CV]  n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6, total=   4.2s
[CV] n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6 
[CV]  n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6, total=   4.3s
[CV] n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6 
[CV]  n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6, total=   4.2s
[CV] n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6 
[CV]  n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_features=0.8, max_depth=6, total=   4.2s
[CV] n_estimators=420, min_samples_split=5, min_samples_leaf=1, max_fe

[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10, total=   6.3s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10, total=   5.7s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10, total=   4.8s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10, total=   4.7s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10 
[CV]  n_estimators=280, min_samples_split=2, min_samples_leaf=4, max_features=0.9, max_depth=10, total=   4.7s
[CV] n_estimators=280, min_samples_split=2, min_samples_leaf=

[CV]  n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30, total=   5.5s
[CV] n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30 
[CV]  n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30, total=   5.4s
[CV] n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30 
[CV]  n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30, total=   5.4s
[CV] n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30 
[CV]  n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30, total=   5.3s
[CV] n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30 
[CV]  n_estimators=260, min_samples_split=5, min_samples_leaf=4, max_features=0.8, max_depth=30, total=   5.4s
[CV] n_estimators=260, min_samples_split=5, min_samples_leaf=

[CV]  n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24, total=   4.6s
[CV] n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24 
[CV]  n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24, total=   4.6s
[CV] n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24 
[CV]  n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24, total=   4.5s
[CV] n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24 
[CV]  n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24, total=   4.6s
[CV] n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24 
[CV]  n_estimators=220, min_samples_split=10, min_samples_leaf=2, max_features=0.8, max_depth=24, total=   4.6s
[CV] n_estimators=220, min_samples_split=10, min_sam

[CV]  n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44, total=   9.7s
[CV] n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44 
[CV]  n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44, total=   9.7s
[CV] n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44 
[CV]  n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44, total=   9.8s
[CV] n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44 
[CV]  n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44, total=  10.0s
[CV] n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44 
[CV]  n_estimators=420, min_samples_split=10, min_samples_leaf=2, max_features=0.9, max_depth=44, total=   9.7s
[CV] n_estimators=420, min_samples_split=10, min_sam

[CV]  n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42, total=   6.8s
[CV] n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42 
[CV]  n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42, total=   7.9s
[CV] n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42 
[CV]  n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42, total=   6.6s
[CV] n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42 
[CV]  n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42, total=   6.4s
[CV] n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42 
[CV]  n_estimators=460, min_samples_split=10, min_samples_leaf=4, max_features=0.5, max_depth=42, total=   6.5s
[CV] n_estimators=460, min_samples_split=10, min_sam

[CV]  n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4, total=   2.8s
[CV] n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4 
[CV]  n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4, total=   2.7s
[CV] n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4 
[CV]  n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4, total=   2.9s
[CV] n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4 
[CV]  n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4, total=   2.8s
[CV] n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4 
[CV]  n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_features=0.7, max_depth=4, total=   2.8s
[CV] n_estimators=460, min_samples_split=2, min_samples_leaf=1, max_fe

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 51.3min finished


0.735945501348817
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.5, 'max_depth': 26}


In [125]:
# обучаем подобранную модель на всех тренировочных данных и проверяем качество.

scaler = MinMaxScaler()

X_scaled = scaler.fit_transform(X)

rfr_mod = RandomForestRegressor(
    n_estimators=400,
    max_depth=26,
    max_features=0.5,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

rfr_mod.fit(X_scaled, y)

scores = cross_val_score(model, X_scaled, y, scoring='r2', cv=10, n_jobs=-1)

print(f'cv mean: {np.round(np.mean(scores), 4)}')
print(f'cv std:  {np.round(np.std(scores), 4)}')

#y_pred_rfr = rfr_mod.predict(X)
#r2(y, y_pred_rfr)

cv mean: 0.7413
cv std:  0.024


Работаем с тестовой выборкой

In [126]:
X_test = pd.DataFrame(test_ds, columns=['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1'])
X_test.head()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1
0,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,11,2748,1,0,0
1,74,2.0,69.263183,68.263183,1.0,6,1.0,1977,0.075779,6,1437,3,0,2
2,190,1.0,52.919578,15.948246,12.0,2,5.0,1909,0.0,30,7538,87,5,5
3,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,23,4583,3,3,3
4,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,2,629,1,0,0


In [127]:
y_test = pd.DataFrame(train_ds, columns=[target_name])
y_test.head()

Unnamed: 0,Price
0,184966.93073
1,300009.450063
2,220925.908524
3,175616.227217
4,150226.531644


In [129]:
# масштабируем новые данные подготовленным на тренировочных данных объектом класса MinMaxScaler

X_test_scaled = scaler.transform(X_test)

In [130]:
# предсказываем цены квартир для новых наблюдений
y_test_pred = rfr_mod.predict(X_test_scaled)

In [92]:
# записываем id объекта и предсказанную цену в датасет

predictions = pd.DataFrame({
    'Id': X_test.index,
    'Price': y_test_pred
})
predictions.head()

Unnamed: 0,Id,Price
0,0,341515.621378
1,1,371344.725479
2,2,327040.548148
3,3,329737.513976
4,4,311184.70451


In [93]:
predictions.describe()

Unnamed: 0,Id,Price
count,5000.0,5000.0
mean,2499.5,332746.201678
std,1443.520003,15335.638532
min,0.0,300466.361122
25%,1249.75,322846.900649
50%,2499.5,332332.122458
75%,3749.25,337489.910333
max,4999.0,407526.072659


In [131]:
#сохраняем результат предсказаний
predictions.to_csv('GAzanov_predictions.csv', sep=',', index=False, encoding='utf-8')


In [132]:
y.describe()

Unnamed: 0,Price
count,10000.0
mean,214138.857399
std,92872.293865
min,59174.778028
25%,153872.633942
50%,192269.644879
75%,249135.462171
max,633233.46657
