### Задание для курсового проекта
#### Метрика: R2 - коэффициент детерминации (sklearn.metrics.r2_score)

#### Подключение библиотек и скриптов

In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Загрузка данных
## Описание датасета

#### Id - идентификационный номер квартиры
#### DistrictId - идентификационный номер района
#### Rooms - количество комнат
#### Square - площадь
#### LifeSquare - жилая площадь
#### KitchenSquare - площадь кухни
#### Floor - этаж
#### HouseFloor - количество этажей в доме
#### HouseYear - год постройки дома
#### Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности
#### Social_1, Social_2, Social_3 - социальные показатели местности
#### Healthcare_1, Helthcare_2 - показатели местности, связанные с охраной здоровья
#### Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров
#### Price - цена квартиры

In [3]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    """Print the R2 score of the predictions on the train and validation splits.

    Each score is rounded to 3 decimal places and printed on its own line,
    the train score first and the validation score second.

    Parameters:
        train_true_values: true target values of the training split.
        train_pred_values: predicted target values for the training split.
        test_true_values: true target values of the validation split.
        test_pred_values: predicted target values for the validation split.
    """
    splits = (
        ("Train R2:\t", train_true_values, train_pred_values),
        ("Valid R2:\t", test_true_values, test_pred_values),
    )
    for label, true_values, pred_values in splits:
        print(label + str(round(r2(true_values, pred_values), 3)))

    # No figure is created inside this function, so this call only has an
    # effect if a figure is already open.
    plt.show()

In [4]:
TRAIN_DATASET_PATH = './train.csv' # labelled data, loaded into train_df (contains the Price target)
TEST_DATASET_PATH = './test.csv' # unlabelled data, loaded into test_df for the final predictions

In [5]:
# Load the training data and display its last five rows.
train_df = pd.read_csv(TRAIN_DATASET_PATH)
train_df.tail()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
9995,1260,61,2.0,49.090728,33.272626,6.0,3,12.0,1981,0.300323,B,B,52,10311,6,,1,9,B,119367.455796
9996,16265,27,2.0,64.307684,37.03842,9.0,13,0.0,1977,0.072158,B,B,2,629,1,,0,0,A,199715.148807
9997,2795,178,1.0,29.648057,16.555363,5.0,3,5.0,1958,0.460556,B,B,20,4386,14,,1,5,B,165953.91258
9998,14561,21,1.0,32.330292,22.32687,5.0,3,9.0,1969,0.194489,B,B,47,8004,3,125.0,3,5,B,171842.411855
9999,7202,94,1.0,35.815476,22.301367,6.0,9,9.0,1975,0.127376,B,B,43,8429,3,,3,9,B,177685.627486


In [6]:
# Load the test data and display its last five rows.
test_df = pd.read_csv(TEST_DATASET_PATH)
test_df.tail()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
4995,10379,29,2.0,43.177521,30.339945,5.0,6,5.0,1962,0.06966,B,B,31,6119,4,,1,2,B
4996,16138,38,3.0,93.698122,94.521465,10.0,21,27.0,2018,0.060753,B,B,15,2787,2,520.0,0,7,B
4997,3912,101,1.0,33.656723,19.003259,5.0,2,5.0,1966,0.038693,B,B,28,6533,1,1015.0,2,5,B
4998,5722,10,1.0,38.635155,20.976257,9.0,8,14.0,1970,0.08904,B,B,33,7976,5,,0,11,B
4999,11004,21,2.0,67.122742,33.944344,13.0,9,17.0,2009,0.194489,B,B,47,8004,3,125.0,3,5,B


In [7]:
# Store both identifier columns of the training frame as strings.
for id_column in ('Id', 'DistrictId'):
    train_df[id_column] = train_df[id_column].astype(str)

In [8]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,10000.0,1.8905,0.839512,0.0,1.0,2.0,2.0,19.0
Square,10000.0,56.315775,21.058732,1.136859,41.774881,52.51331,65.900625,641.0652
LifeSquare,7887.0,37.199645,86.241209,0.370619,22.769832,32.78126,45.128803,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0
Social_2,10000.0,5352.1574,4006.799803,168.0,1564.0,5285.0,7227.0,19083.0


In [9]:
train_df.describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


### Rooms

In [10]:
train_df['Rooms'].value_counts().sort_index()

0.0        8
1.0     3705
2.0     3880
3.0     2235
4.0      150
5.0       18
6.0        1
10.0       2
19.0       1
Name: Rooms, dtype: int64

In [11]:
test_df['Rooms'].value_counts().sort_index()

0.0        2
1.0     1769
2.0     2030
3.0     1099
4.0       90
5.0        7
6.0        2
17.0       1
Name: Rooms, dtype: int64

#### Значения меньше 1 и больше 5 считаем выбросами (в ноутбуке ниже эти значения не исправляются)

## Square, KitchenSquare, LifeSquare

### Square

In [12]:
train_df['Square'].value_counts().sort_index()

1.136859      1
1.988943      1
2.377248      1
2.596351      1
2.954309      1
             ..
212.932361    1
275.645284    1
409.425181    1
604.705972    1
641.065193    1
Name: Square, Length: 10000, dtype: int64

In [13]:
square = train_df['Square'] < 20
train_df.loc[square].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,21.0,21.0,20.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,10.0,21.0,21.0,21.0
mean,1.428571,9.535226,17.049,1.857143,7.761905,8.380952,1978.190476,0.161617,23.52381,4760.0,4.285714,1547.8,1.857143,4.142857,196294.071932
std,0.87014,7.073706,21.459359,2.535463,7.993152,8.102322,16.536684,0.107945,19.15625,3969.757776,5.367894,1177.678394,1.930951,3.650832,142438.658387
min,0.0,1.136859,0.873147,0.0,1.0,0.0,1960.0,0.007122,1.0,264.0,0.0,125.0,0.0,0.0,74222.110465
25%,1.0,4.380726,4.615146,1.0,3.0,1.0,1967.0,0.069753,5.0,1564.0,0.0,525.0,0.0,1.0,97560.720383
50%,1.0,5.129222,10.121392,1.0,5.0,5.0,1977.0,0.194489,23.0,4756.0,3.0,1477.0,1.0,4.0,126596.941798
75%,2.0,17.046188,16.16631,1.0,8.0,17.0,1977.0,0.225825,43.0,8004.0,6.0,2803.5,3.0,8.0,237716.681261
max,3.0,19.927423,79.767964,8.0,28.0,25.0,2016.0,0.319809,53.0,13670.0,16.0,2857.0,5.0,11.0,483283.488083


In [14]:
square = train_df['Square'] > 100
train_df.loc[square].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,262.0,262.0,192.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,132.0,262.0,262.0,262.0
mean,3.09542,124.086956,87.231327,9.374046,9.648855,13.064885,1995.427481,0.086629,20.877863,4637.770992,10.969466,1128.113636,1.427481,4.320611,367826.118487
std,0.948107,52.628932,58.01064,13.937923,6.981556,10.46686,20.310438,0.099206,16.741223,3809.600979,30.238627,1068.293457,1.817457,5.300571,146493.745024
min,0.0,100.141389,0.641822,0.0,1.0,0.0,1912.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0,82342.775873
25%,3.0,103.715406,63.500289,1.0,4.0,3.0,1977.0,0.014073,5.0,1279.5,1.0,219.75,0.0,0.0,232867.447926
50%,3.0,111.002795,73.237133,10.0,7.0,14.0,2002.5,0.069753,22.0,4635.0,2.5,875.0,0.5,3.0,354445.714529
75%,4.0,124.997035,103.05168,13.0,14.75,22.0,2014.0,0.093443,31.0,6398.0,5.0,1356.5,3.0,6.0,498178.94974
max,5.0,641.065193,638.163193,123.0,26.0,48.0,2019.0,0.437885,74.0,19083.0,141.0,4508.0,6.0,23.0,627525.072788


#### Меньше 20 и больше 100 будем считать выбросами

In [15]:
# Divide training areas above 1000 by 100.
square = train_df['Square'].gt(1000)
train_df.loc[square, 'Square'] = train_df['Square'].div(100)

In [16]:
# Divide test areas above 1000 by 100.
square = test_df['Square'].gt(1000)
test_df.loc[square, 'Square'] = test_df['Square'].div(100)

In [17]:
# Divide training areas above 100 by 10.
# NOTE(review): areas between 100 and 200 drop below 20 here and are then
# shifted by +20 by the later `Square < 20` step — confirm this is intended.
square = train_df['Square'].gt(100)
train_df.loc[square, 'Square'] = train_df['Square'].div(10)

In [18]:
# Divide test areas above 100 by 10.
# NOTE(review): areas between 100 and 200 drop below 20 here and are then
# shifted by +20 by the later `Square < 20` step — confirm this is intended.
square = test_df['Square'].gt(100)
test_df.loc[square, 'Square'] = test_df['Square'].div(10)

In [19]:
# Shift training areas below 20 upwards by 20.
square = train_df['Square'].lt(20)
train_df.loc[square, 'Square'] = train_df['Square'].add(20)

In [20]:
# Shift test areas below 20 upwards by 20.
square = test_df['Square'].lt(20)
test_df.loc[square, 'Square'] = test_df['Square'].add(20)

In [21]:
train_df['Square'].value_counts().sort_index()

20.033454    1
20.213128    1
20.606762    1
20.924926    1
21.121279    1
            ..
99.462854    1
99.467012    1
99.597011    1
99.805880    1
99.893009    1
Name: Square, Length: 10000, dtype: int64

In [22]:
test_df['Square'].value_counts().sort_index()

20.011215    1
21.378543    1
21.452899    1
21.511539    1
21.588919    1
            ..
99.112581    1
99.293191    1
99.519397    1
99.614906    1
99.615882    1
Name: Square, Length: 5000, dtype: int64

### KitchenSquare

In [23]:
train_df['KitchenSquare'].value_counts().sort_index()

0.0        697
1.0       2460
2.0          4
3.0         22
4.0         39
5.0       1169
6.0       1038
7.0        609
8.0       1306
9.0        843
10.0      1075
11.0       233
12.0       249
13.0        67
14.0        51
15.0        31
16.0        16
17.0        12
18.0         6
19.0        11
20.0        14
21.0         1
22.0         3
23.0         1
25.0         2
26.0         1
27.0         1
29.0         1
30.0         2
31.0         1
32.0         2
35.0         1
36.0         1
37.0         2
39.0         1
40.0         1
41.0         2
42.0         1
43.0         2
48.0         1
51.0         2
53.0         1
54.0         1
58.0         2
60.0         1
62.0         1
63.0         1
66.0         1
72.0         1
73.0         1
75.0         1
78.0         1
84.0         1
96.0         1
112.0        2
123.0        1
1970.0       1
2014.0       1
Name: KitchenSquare, dtype: int64

In [24]:
test_df['KitchenSquare'].value_counts().sort_index()

0.0       335
1.0      1191
2.0         4
3.0        11
4.0        21
5.0       612
6.0       595
7.0       297
8.0       606
9.0       417
10.0      558
11.0      107
12.0      132
13.0       33
14.0       20
15.0       17
16.0       13
17.0        5
18.0        3
19.0        2
20.0        2
22.0        1
24.0        1
30.0        1
33.0        1
34.0        1
40.0        1
41.0        1
42.0        3
44.0        1
57.0        1
60.0        1
61.0        1
62.0        1
65.0        1
97.0        1
112.0       1
620.0       1
Name: KitchenSquare, dtype: int64

In [25]:
kitchensquare = train_df['KitchenSquare'] < 3
train_df.loc[kitchensquare].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,3161.0,3161.0,1184.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,3161.0,1166.0,3161.0,3161.0,3161.0
mean,1.773173,56.970274,53.804694,0.780766,9.839924,12.893072,1987.612148,0.106386,11.755457,2828.158178,10.614363,741.048885,0.639355,3.224929,184460.560227
std,0.77508,16.540891,217.853384,0.416841,5.730857,8.577455,17.447938,0.114508,13.930342,3252.469912,34.325346,523.301366,1.196893,5.801382,81233.410482
min,0.0,20.606762,0.370619,0.0,1.0,0.0,1955.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,60502.5835
25%,1.0,42.640536,33.574023,1.0,5.0,3.0,1977.0,0.007122,2.0,469.0,0.0,540.0,0.0,0.0,131103.958685
50%,2.0,56.007234,45.892153,1.0,9.0,17.0,1977.0,0.072158,5.0,1437.0,0.0,540.0,0.0,1.0,169188.936898
75%,2.0,67.080012,64.76108,1.0,14.0,17.0,2014.0,0.195781,22.0,5648.0,3.0,1046.0,1.0,4.0,211714.807132
max,5.0,99.467012,7480.592129,2.0,37.0,117.0,2020.0,0.521867,74.0,19083.0,141.0,4702.0,6.0,23.0,633233.46657


In [26]:
kitchensquare = train_df['KitchenSquare'] > 30
train_df.loc[kitchensquare].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,36.0,36.0,35.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,12.0,36.0,36.0,36.0
mean,1.888889,49.596914,54.272334,165.611111,8.888889,12.833333,2005.333333,0.111995,19.75,4586.277778,2.75,1273.416667,0.972222,3.166667,225025.284313
std,0.918937,16.772612,34.273814,449.900769,6.422184,8.872107,16.654472,0.120576,17.911489,4292.169079,4.44249,985.898709,1.612205,3.798496,119450.007515
min,1.0,30.642769,1.626502,31.0,1.0,1.0,1971.0,0.000699,0.0,168.0,0.0,540.0,0.0,0.0,88150.01251
25%,1.0,37.588371,33.302006,40.75,4.0,3.0,1992.25,0.019968,2.0,629.0,0.0,723.75,0.0,0.0,143531.593061
50%,2.0,43.1379,45.023531,53.5,6.5,15.5,2014.5,0.072158,17.5,4327.0,1.0,992.0,0.0,1.0,193827.844495
75%,2.25,60.720097,72.097812,73.5,12.5,17.0,2016.0,0.165883,32.25,6369.0,4.0,1340.0,1.0,5.25,268888.725097
max,4.0,97.490674,138.0046,2014.0,26.0,40.0,2019.0,0.437885,53.0,14892.0,24.0,4129.0,6.0,11.0,624549.354546


#### Меньше 3 и больше 20 будем считать выбросами

In [27]:
# Shift training kitchen areas below 3 upwards by 3.
kitchensquare = train_df['KitchenSquare'].lt(3)
train_df.loc[kitchensquare, 'KitchenSquare'] = train_df['KitchenSquare'].add(3)

In [28]:
# Cap training kitchen areas at 20.
kitchensquare = train_df['KitchenSquare'].gt(20)
train_df.loc[kitchensquare, 'KitchenSquare'] = 20

In [29]:
# Shift test kitchen areas below 3 upwards by 3.
kitchensquare = test_df['KitchenSquare'].lt(3)
test_df.loc[kitchensquare, 'KitchenSquare'] = test_df['KitchenSquare'].add(3)

In [30]:
# Cap test kitchen areas at 20.
kitchensquare = test_df['KitchenSquare'].gt(20)
test_df.loc[kitchensquare, 'KitchenSquare'] = 20

In [31]:
train_df['KitchenSquare'].value_counts().sort_index()

3.0      719
4.0     2499
5.0     1173
6.0     1038
7.0      609
8.0     1306
9.0      843
10.0    1075
11.0     233
12.0     249
13.0      67
14.0      51
15.0      31
16.0      16
17.0      12
18.0       6
19.0      11
20.0      62
Name: KitchenSquare, dtype: int64

In [32]:
test_df['KitchenSquare'].value_counts().sort_index()

3.0      346
4.0     1212
5.0      616
6.0      595
7.0      297
8.0      606
9.0      417
10.0     558
11.0     107
12.0     132
13.0      33
14.0      20
15.0      17
16.0      13
17.0       5
18.0       3
19.0       2
20.0      21
Name: KitchenSquare, dtype: int64

### LifeSquare

In [33]:
train_df['LifeSquare'].value_counts().sort_index()

0.370619       1
0.641822       1
0.795539       1
0.873147       1
1.049867       1
              ..
263.542020     1
410.639749     1
461.463614     1
638.163193     1
7480.592129    1
Name: LifeSquare, Length: 7887, dtype: int64

In [34]:
test_df['LifeSquare'].value_counts().sort_index()

0.333490      1
0.567267      1
0.809265      1
0.863989      1
1.181139      1
             ..
128.913470    1
136.689191    1
136.921090    1
169.901701    1
303.071094    1
Name: LifeSquare, Length: 3959, dtype: int64

In [35]:
square = train_df['LifeSquare'] < 9
train_df.loc[square].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,104.0,205.0,205.0,205.0
mean,1.707317,55.617887,3.48663,4.131707,9.990244,10.960976,1996.44878,0.130682,15.853659,3755.570732,7.521951,607.028846,0.863415,3.302439,198654.073282
std,0.781022,17.147043,1.571329,1.731264,6.133315,9.151151,19.795859,0.108378,16.094636,3755.039495,28.741508,480.989362,1.40757,5.33452,98864.274504
min,0.0,21.136859,0.370619,3.0,1.0,0.0,1965.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0,64530.702769
25%,1.0,41.87259,2.077811,4.0,5.0,1.0,1977.0,0.041116,4.0,915.0,0.0,535.0,0.0,0.0,133552.372112
50%,2.0,55.828425,3.512299,4.0,9.0,15.0,1977.0,0.075779,6.0,1564.0,1.0,540.0,0.0,1.0,178486.385759
75%,2.0,66.542124,4.763693,4.0,14.0,17.0,2016.0,0.243205,25.0,5648.0,3.0,716.25,1.0,5.0,220109.387427
max,4.0,98.047744,8.822988,20.0,28.0,30.0,2020.0,0.521867,53.0,14892.0,141.0,2643.0,6.0,23.0,611250.912034


In [36]:
square = train_df['LifeSquare'] > 80
train_df.loc[square].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0,75.0,210.0,210.0,210.0
mean,2.82381,65.477173,139.17626,7.309524,10.095238,12.980952,2000.747619,0.089096,19.214286,4522.038095,8.742857,931.386667,0.995238,4.414286,297108.575127
std,0.897847,27.073231,512.091772,5.165621,6.743452,9.712598,20.545157,0.099194,18.589928,4526.946996,28.93584,920.760113,1.575827,5.543632,114260.971039
min,0.0,20.033454,80.046996,3.0,1.0,0.0,1912.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0,90470.43083
25%,3.0,32.411852,83.437871,4.0,5.0,3.0,1977.0,0.020741,2.0,629.0,0.0,520.0,0.0,0.0,217666.935653
50%,3.0,81.321946,89.414743,4.0,8.0,16.5,2014.0,0.069753,15.0,2942.0,1.0,645.0,0.0,2.0,259157.536117
75%,3.0,87.103374,102.711717,10.0,14.0,19.0,2016.75,0.075779,30.0,6398.0,4.0,1065.0,1.0,7.0,378271.249786
max,5.0,99.121404,7480.592129,20.0,26.0,47.0,2020.0,0.437885,74.0,19083.0,141.0,4508.0,6.0,23.0,622888.825328


#### Меньше 9 и больше 80 будем считать выбросами

In [37]:
# Fill missing training LifeSquare values with Square - KitchenSquare - offset.
# The offset is a single integer in [3, 15] shared by all filled rows. It is
# drawn from a generator with a fixed seed so that the imputed values are
# identical on every run (the module-level random.randint was unseeded).
lifesquare = train_df['LifeSquare'].isna()
life_square_offset = random.Random(42).randint(3, 15)
train_df.loc[lifesquare, 'LifeSquare'] = (
    train_df['Square'] - train_df['KitchenSquare'] - life_square_offset
)

In [38]:
# Divide training living areas above 1000 by 100.
lifesquare = train_df['LifeSquare'].gt(1000)
train_df.loc[lifesquare, 'LifeSquare'] = train_df['LifeSquare'].div(100)

In [39]:
# Divide training living areas above 80 by 10.
lifesquare = train_df['LifeSquare'].gt(80)
train_df.loc[lifesquare, 'LifeSquare'] = train_df['LifeSquare'].div(10)

In [40]:
# Shift training living areas below 9 upwards by 9.
lifesquare = train_df['LifeSquare'].lt(9)
train_df.loc[lifesquare, 'LifeSquare'] = train_df['LifeSquare'].add(9)

In [41]:
# Fill missing test LifeSquare values with Square - KitchenSquare - offset.
# The offset is a single integer in [3, 15] shared by all filled rows. It is
# drawn from a generator with a fixed seed so that the imputed values are
# identical on every run (the module-level random.randint was unseeded).
lifesquare = test_df['LifeSquare'].isna()
life_square_offset = random.Random(42).randint(3, 15)
test_df.loc[lifesquare, 'LifeSquare'] = (
    test_df['Square'] - test_df['KitchenSquare'] - life_square_offset
)

In [42]:
# Divide test living areas above 1000 by 100.
lifesquare = test_df['LifeSquare'].gt(1000)
test_df.loc[lifesquare, 'LifeSquare'] = test_df['LifeSquare'].div(100)

In [43]:
# Divide test living areas above 80 by 10.
lifesquare = test_df['LifeSquare'].gt(80)
test_df.loc[lifesquare, 'LifeSquare'] = test_df['LifeSquare'].div(10)

In [44]:
# Shift test living areas below 9 upwards by 9.
lifesquare = test_df['LifeSquare'].lt(9)
test_df.loc[lifesquare, 'LifeSquare'] = test_df['LifeSquare'].add(9)

### Сумма Площадей


In [45]:
fullsquare = train_df['Square'] < (train_df['LifeSquare'] + train_df['KitchenSquare'])
train_df.loc[fullsquare].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,431.0,1075.0,1075.0,1075.0
mean,1.773023,50.541481,53.209049,6.55814,9.869767,14.587907,1999.933953,0.108396,16.062326,3671.75814,9.504186,899.784223,0.913488,3.500465,205088.053884
std,0.883649,14.495443,13.966831,4.144853,5.856427,8.85032,19.549162,0.114839,16.387896,3741.493948,30.295576,746.061546,1.398641,5.327693,114505.446486
min,0.0,20.033454,12.580698,3.0,1.0,0.0,1916.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0,59877.592523
25%,1.0,39.477657,41.551099,4.0,5.0,9.0,1977.0,0.011654,2.0,629.0,0.0,540.0,0.0,0.0,131789.049943
50%,2.0,47.0801,52.737471,4.0,9.0,17.0,2014.0,0.072158,7.0,1892.0,1.0,705.0,0.0,1.0,171584.899672
75%,2.0,62.601247,64.269649,10.0,14.0,17.0,2016.0,0.195781,26.0,6137.0,4.0,1065.0,2.0,5.0,232963.055457
max,5.0,85.087643,79.847797,20.0,28.0,117.0,2020.0,0.486246,74.0,19083.0,141.0,4508.0,6.0,23.0,624680.099059


In [46]:
train_df['KitchenSquare'].median()

6.0

In [47]:
# Training rows whose living + kitchen area exceeds the total area:
# add 10 to the total area and reset the kitchen area to the column median.
fullsquare = (train_df['LifeSquare'] + train_df['KitchenSquare']) > train_df['Square']
train_df.loc[fullsquare, 'Square'] = train_df['Square'].add(10)
train_df.loc[fullsquare, 'KitchenSquare'] = train_df['KitchenSquare'].median()

In [48]:
# First pass: subtract 20 from the living area of training rows where
# living + kitchen area still exceeds the total area.
fullsquare = (train_df['LifeSquare'] + train_df['KitchenSquare']) > train_df['Square']
train_df.loc[fullsquare, 'LifeSquare'] = train_df['LifeSquare'].sub(20)

In [49]:
# Second pass: subtract another 20 from the living area of training rows
# that are still inconsistent.
fullsquare = (train_df['LifeSquare'] + train_df['KitchenSquare']) > train_df['Square']
train_df.loc[fullsquare, 'LifeSquare'] = train_df['LifeSquare'].sub(20)

In [50]:
# Third pass: subtract another 20 from the living area of training rows
# that are still inconsistent.
fullsquare = (train_df['LifeSquare'] + train_df['KitchenSquare']) > train_df['Square']
train_df.loc[fullsquare, 'LifeSquare'] = train_df['LifeSquare'].sub(20)

In [51]:
fullsquare = train_df['Square'] < (train_df['LifeSquare'] + train_df['KitchenSquare'])
train_df.loc[fullsquare].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,,
max,,,,,,,,,,,,,,,


In [52]:
fullsquare = train_df['Square'] < (train_df['LifeSquare'] + train_df['KitchenSquare'])
train_df.loc[fullsquare].describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,,
max,,,,,,,,,,,,,,,


In [53]:
test_df['KitchenSquare'].median()

6.0

In [54]:
# Test rows whose living + kitchen area exceeds the total area:
# add 10 to the total area and reset the kitchen area to a median.
# The median is taken from the training column, not the test column, so
# that train and test rows are corrected with the same statistic.
fullsquare = test_df['Square'] < (test_df['LifeSquare'] + test_df['KitchenSquare'])
test_df.loc[fullsquare, 'Square'] = test_df['Square'] + 10
test_df.loc[fullsquare, 'KitchenSquare'] = train_df['KitchenSquare'].median()

In [55]:
# First pass: subtract 20 from the living area of test rows where
# living + kitchen area still exceeds the total area.
fullsquare = (test_df['LifeSquare'] + test_df['KitchenSquare']) > test_df['Square']
test_df.loc[fullsquare, 'LifeSquare'] = test_df['LifeSquare'].sub(20)

In [56]:
# Second pass: subtract another 20 from the living area of test rows
# that are still inconsistent.
fullsquare = (test_df['LifeSquare'] + test_df['KitchenSquare']) > test_df['Square']
test_df.loc[fullsquare, 'LifeSquare'] = test_df['LifeSquare'].sub(20)

In [57]:
# Third pass: subtract another 20 from the living area of test rows
# that are still inconsistent.
fullsquare = (test_df['LifeSquare'] + test_df['KitchenSquare']) > test_df['Square']
test_df.loc[fullsquare, 'LifeSquare'] = test_df['LifeSquare'].sub(20)

In [58]:
fullsquare = test_df['Square'] < (test_df['LifeSquare'] + test_df['KitchenSquare'])
test_df.loc[fullsquare].describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,,,
max,,,,,,,,,,,,,,,,


## HouseFloor

In [59]:
train_df['HouseFloor'].value_counts().sort_index()

0.0       269
1.0       497
2.0        48
3.0       127
4.0       134
5.0      1031
6.0        58
7.0        59
8.0       141
9.0      1389
10.0      135
11.0       28
12.0     1074
13.0       27
14.0      625
15.0      123
16.0      576
17.0     2331
18.0       89
19.0      108
20.0      105
21.0       33
22.0      343
23.0       40
24.0      176
25.0      374
26.0        1
27.0        6
28.0        3
29.0        1
30.0       21
31.0        1
32.0        4
36.0        1
37.0        4
38.0        1
39.0        2
40.0        7
44.0        1
45.0        1
47.0        1
48.0        2
99.0        2
117.0       1
Name: HouseFloor, dtype: int64

In [60]:
test_df['HouseFloor'].value_counts()

17.0    1151
9.0      724
5.0      550
12.0     526
14.0     344
16.0     235
1.0      230
22.0     193
25.0     179
0.0      131
24.0      98
8.0       72
3.0       69
10.0      64
4.0       58
19.0      51
15.0      49
20.0      48
18.0      44
6.0       35
7.0       28
2.0       26
23.0      22
13.0      13
11.0      11
40.0       9
21.0       9
27.0       5
30.0       5
26.0       4
48.0       3
32.0       2
37.0       2
28.0       2
39.0       2
34.0       1
31.0       1
33.0       1
99.0       1
29.0       1
36.0       1
Name: HouseFloor, dtype: int64

In [61]:
# Replace training building heights above 30 floors with the column median.
housefloor = train_df['HouseFloor'].gt(30)
train_df.loc[housefloor, 'HouseFloor'] = train_df['HouseFloor'].median()

In [62]:
# Replace training building heights below 1 floor with the column median.
housefloor = train_df['HouseFloor'].lt(1)
train_df.loc[housefloor, 'HouseFloor'] = train_df['HouseFloor'].median()

#### Значения 0 и все значения выше 30 этажей примем за выбросы

In [63]:
# Replace test building heights above 30 floors with a median.
# The median is taken from the training column, not the test column, so
# that train and test rows are preprocessed with the same statistic.
housefloor = test_df['HouseFloor'] > 30
test_df.loc[housefloor, 'HouseFloor'] = train_df['HouseFloor'].median()

In [64]:
# Replace test building heights below 1 floor with a median.
# The median is taken from the training column, not the test column, so
# that train and test rows are preprocessed with the same statistic.
housefloor = test_df['HouseFloor'] < 1
test_df.loc[housefloor, 'HouseFloor'] = train_df['HouseFloor'].median()

#### HouseYear

In [65]:
train_df['HouseYear'].value_counts().sort_index()

1910          1
1912          2
1914          1
1916          1
1917          3
           ... 
2018        175
2019         79
2020         19
4968          1
20052011      1
Name: HouseYear, Length: 97, dtype: int64

In [66]:
test_df['HouseYear'].value_counts().sort_index()

1908      1
1909      1
1910      1
1911      1
1912      1
       ... 
2016    172
2017    141
2018     73
2019     28
2020      9
Name: HouseYear, Length: 97, dtype: int64

In [67]:
# Cap training construction years at 2020.
# NOTE(review): the affected values listed above are 4968 and 20052011; they
# look like typos, so capping at 2020 may not recover the real year — confirm.
houseyear = train_df['HouseYear'].gt(2020)
train_df.loc[houseyear, 'HouseYear'] = 2020

In [68]:
# Cap test construction years at 2020.
houseyear = test_df['HouseYear'].gt(2020)
test_df.loc[houseyear, 'HouseYear'] = 2020

### Floor

In [69]:
train_df['Floor'].value_counts().sort_index()

1     235
2     588
3     872
4     949
5     876
6     802
7     759
8     643
9     604
10    553
11    457
12    438
13    398
14    362
15    287
16    290
17    256
18    185
19    121
20     68
21     59
22     54
23     25
24     53
25     34
26     18
27      5
28      3
31      1
32      2
33      1
37      1
42      1
Name: Floor, dtype: int64

In [70]:
test_df['Floor'].value_counts().sort_index()

1     107
2     263
3     406
4     523
5     495
6     405
7     347
8     327
9     298
10    267
11    224
12    212
13    191
14    179
15    151
16    137
17    104
18     92
19     68
20     42
21     35
22     26
23     33
24     18
25     19
26     14
27      8
28      1
31      1
32      2
33      1
34      1
38      1
46      1
78      1
Name: Floor, dtype: int64

#### Все значения выше 30-го этажа примем за выбросы (нулевых значений в Floor нет)

In [71]:
# Replace training floors above 30 with the column median.
floor = train_df['Floor'].gt(30)
train_df.loc[floor, 'Floor'] = train_df['Floor'].median()

In [72]:
# Replace test floors above 30 with a median.
# The median is taken from the training column, not the test column, so
# that train and test rows are preprocessed with the same statistic.
floor = test_df['Floor'] > 30
test_df.loc[floor, 'Floor'] = train_df['Floor'].median()

In [73]:
# A flat cannot be above the top floor: where Floor exceeds HouseFloor in the
# training data, set Floor to HouseFloor.
floorhouse = train_df['HouseFloor'] < train_df['Floor']
train_df.loc[floorhouse, 'Floor'] = train_df['HouseFloor']

In [74]:
# A flat cannot be above the top floor: where Floor exceeds HouseFloor in the
# test data, set Floor to HouseFloor.
floorhouse = test_df['HouseFloor'] < test_df['Floor']
test_df.loc[floorhouse, 'Floor'] = test_df['HouseFloor']

## Healthcare_1

In [75]:
train_df['Healthcare_1'].head()

0       NaN
1    1183.0
2     240.0
3       NaN
4       NaN
Name: Healthcare_1, dtype: float64

In [76]:
# Impute missing training Healthcare_1 values with the column median.
healthcare_median = train_df['Healthcare_1'].median()
train_df['Healthcare_1'] = train_df['Healthcare_1'].fillna(healthcare_median)

In [77]:
test_df['Healthcare_1'].head()

0    1036.0
1       NaN
2       NaN
3       NaN
4     990.0
Name: Healthcare_1, dtype: float64

In [78]:
# Impute missing test Healthcare_1 values with a median.
# The median is taken from the training column, not the test column, so
# that train and test rows are imputed with the same value.
test_df.loc[test_df['Healthcare_1'].isnull(), 'Healthcare_1'] = train_df['Healthcare_1'].median()

### Ecology_2_bin, Ecology_3_bin, Shops_2_bin

In [79]:
# Encode the two-level categorical training columns as 0/1 in new *_bin columns.
# Series.map is used instead of Series.replace: replace relies on an implicit
# object-to-int downcast that newer pandas releases deprecate, while map
# yields numeric codes directly (any value other than 'A'/'B' becomes NaN).
binary_codes = {'A': 0, 'B': 1}
for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
    train_df[column + '_bin'] = train_df[column].map(binary_codes)

In [80]:
# Encode the two-level categorical test columns as 0/1 in new *_bin columns.
# Series.map is used instead of Series.replace: replace relies on an implicit
# object-to-int downcast that newer pandas releases deprecate, while map
# yields numeric codes directly (any value other than 'A'/'B' becomes NaN).
binary_codes = {'A': 0, 'B': 1}
for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
    test_df[column + '_bin'] = test_df[column].map(binary_codes)

In [81]:
train_df.describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price,Ecology_2_bin,Ecology_3_bin,Shops_2_bin
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.8905,55.018805,36.452414,6.6312,7.8939,12.8669,1984.8724,0.118858,24.687,5352.1574,8.0392,1026.3589,1.3195,4.2313,214138.857399,0.9903,0.9725,0.9175
std,0.839512,16.132579,15.285155,2.593658,5.128964,6.100304,18.416347,0.119025,17.532614,4006.799803,23.831875,746.662828,1.493601,4.806341,92872.293865,0.098015,0.163543,0.275139
min,0.0,20.213128,9.011504,3.0,1.0,1.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028,0.0,0.0,0.0
25%,1.0,41.73371,23.339088,4.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,830.0,0.0,1.0,153872.633942,1.0,1.0,1.0
50%,2.0,52.330931,33.642751,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879,1.0,1.0,1.0
75%,2.0,65.680737,46.239755,8.0,11.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,990.0,2.0,6.0,249135.462171,1.0,1.0,1.0
max,19.0,99.893009,79.963266,20.0,28.0,30.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657,1.0,1.0,1.0


In [82]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,10000.0,1.8905,0.839512,0.0,1.0,2.0,2.0,19.0
Square,10000.0,55.018805,16.132579,20.213128,41.73371,52.330931,65.680737,99.893009
LifeSquare,10000.0,36.452414,15.285155,9.011504,23.339088,33.642751,46.239755,79.963266
KitchenSquare,10000.0,6.6312,2.593658,3.0,4.0,6.0,8.0,20.0
Floor,10000.0,7.8939,5.128964,1.0,4.0,7.0,11.0,28.0
HouseFloor,10000.0,12.8669,6.100304,1.0,9.0,13.0,17.0,30.0
HouseYear,10000.0,1984.8724,18.416347,1910.0,1974.0,1977.0,2001.0,2020.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.521867
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0
Social_2,10000.0,5352.1574,4006.799803,168.0,1564.0,5285.0,7227.0,19083.0


In [83]:
train_df.shape

(10000, 23)

In [84]:
test_df.shape

(5000, 22)

In [85]:
train_df.columns.tolist()

['Id',
 'DistrictId',
 'Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Ecology_2',
 'Ecology_3',
 'Social_1',
 'Social_2',
 'Social_3',
 'Healthcare_1',
 'Helthcare_2',
 'Shops_1',
 'Shops_2',
 'Price',
 'Ecology_2_bin',
 'Ecology_3_bin',
 'Shops_2_bin']

In [86]:
# Columns fed to the model. Every name appears exactly once: a repeated name
# would make DataFrame selection by this list return duplicated columns.
feature_names = [
    'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
    'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2_bin', 'Ecology_3_bin',
    'Shops_2_bin', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2',
    'Shops_1',
]

# Column the model is trained to predict.
target_name = ['Price']

In [87]:
# Keep only the modelling columns: features + target for the training frame,
# features + Id for the test frame.
train_columns = feature_names + target_name
train_df = train_df[train_columns]
test_columns = feature_names + ['Id']
test_df = test_df[test_columns]

In [88]:
# Feature matrix and target frame (the target stays a one-column DataFrame).
X = train_df.loc[:, feature_names]
y = train_df.loc[:, target_name]

In [89]:
# Hold out 33% of the labelled rows for validation; the fixed random_state
# keeps the shuffled split the same on every run.
split_options = dict(test_size=0.33, shuffle=True, random_state=21)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [90]:
# Random forest hyper-parameters, collected in one place.
rf_params = {
    'max_depth': 8,
    'min_samples_split': 100,
    'n_estimators': 500,
    'n_jobs': -1,
    'random_state': 39,
}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=8, min_samples_split=100, n_estimators=500,
                      n_jobs=-1, random_state=39)

In [91]:
# Predict on both splits, then print the train and validation R2 scores.
y_train_preds, y_test_preds = (rf_model.predict(part) for part in (X_train, X_test))

evaluate_preds(y_train, y_train_preds, y_test, y_test_preds)

Train R2:	0.688
Valid R2:	0.642


In [94]:
# Keep a copy of the test Ids in their own frame.
preds_final = pd.DataFrame({'Id': test_df['Id'].copy()})

# Index the test frame by Id and keep only the model features.
test_df = test_df.set_index('Id')[feature_names]

In [96]:
# Predict prices for the prepared test features.
y_pred_final = rf_model.predict(X=test_df)

In [98]:
# Load the submission template that the predicted prices are written into.
submission_df = pd.read_csv('./sample_submission.csv')

In [99]:
# Put the predictions into the template and save it as CSV.
# NOTE(review): predictions are assigned by row position, which assumes the
# template rows are in the same order as the test rows — confirm.
submission_df = submission_df.assign(Price=y_pred_final)
submission_df.to_csv('./predictions.csv', sep=',', encoding='utf-8', index=False)

submission_df.head()

Unnamed: 0,Id,Price
0,4567,158971.49906
1,5925,127446.434617
2,960,162404.308198
3,3848,176828.658153
4,746,171847.171778
