In [4]:
import numpy as np
import pandas as pd

In [5]:
data = pd.read_csv('data/train.csv')

In [6]:
data.shape

(10000, 20)

In [7]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


### train_test_split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the split repeatable.
train, valid = train_test_split(data, test_size=0.3, random_state = 42)

In [10]:
train.shape, valid.shape

((7000, 20), (3000, 20))

### Prepare train, valid

In [11]:
def clean_rooms(df, max_rooms=6):
    """Cap implausibly large room counts.

    Every row whose ``Rooms`` value is greater than ``max_rooms`` is
    overwritten with ``max_rooms``. Missing values are left untouched because
    the comparison is False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``Rooms`` column. It is modified in place.
    max_rooms : int or float, default 6
        Largest room count that is kept as-is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so calls can be chained.
    """
    too_many_rooms = df['Rooms'] > max_rooms
    df.loc[too_many_rooms, 'Rooms'] = max_rooms
    return df

In [12]:
def clean_square(df, min_square=6, square_fill=15, min_kitchen=4):
    """Replace implausibly small total and kitchen areas.

    * ``Square`` values below ``min_square`` are replaced with ``square_fill``
      (note: a separate fill value, not the threshold itself).
    * ``KitchenSquare`` values below ``min_kitchen`` are raised to
      ``min_kitchen``.

    Missing values are left untouched because the comparisons are False for
    NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``Square`` and ``KitchenSquare`` columns. It is
        modified in place.
    min_square : int or float, default 6
        Threshold below which ``Square`` is considered invalid.
    square_fill : int or float, default 15
        Value written into invalid ``Square`` cells.
    min_kitchen : int or float, default 4
        Lower bound applied to ``KitchenSquare``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so calls can be chained.
    """
    tiny_flat = df['Square'] < min_square
    df.loc[tiny_flat, 'Square'] = square_fill

    tiny_kitchen = df['KitchenSquare'] < min_kitchen
    df.loc[tiny_kitchen, 'KitchenSquare'] = min_kitchen
    return df

In [13]:
def clean_year(df, max_year=2020):
    """Cap construction years that lie beyond ``max_year``.

    Every row whose ``HouseYear`` is greater than ``max_year`` is overwritten
    with ``max_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``HouseYear`` column. It is modified in place.
    max_year : int, default 2020
        Latest year that is kept as-is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so calls can be chained.
    """
    in_the_future = df['HouseYear'] > max_year
    df.loc[in_the_future, 'HouseYear'] = max_year
    return df

In [14]:
# Share of training flats located in each district.
# rename_axis + reset_index(name=...) always yields the columns
# ['DistrictId', 'flat_qty_distr'], no matter how value_counts() labels its
# result. (pandas >= 2.0 names the values 'proportion' and the index
# 'DistrictId', so renaming {'index': ..., 'DistrictId': ...} after a plain
# reset_index() would rename the key column away and break the later merge.)
distr_info1 = (
    train['DistrictId']
    .value_counts(normalize=True)
    .rename_axis('DistrictId')
    .reset_index(name='flat_qty_distr')
)

# Mean training price for every (district, room count) combination.
distr_stat_dr = (
    train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_dr'})
)

# Mean training price for every room count, used as a fallback for unseen
# (district, room count) pairs.
distr_stat_r = (
    train.groupby(['Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_r'})
)

# Overall mean training price: the last-resort fallback.
mean_price = train['Price'].mean()

In [15]:
def add_distr_info1(df, distr_info1, default_share=0.000143):
    """Attach the per-district share of flats as ``flat_qty_distr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DistrictId`` column. It is not modified.
    distr_info1 : pandas.DataFrame
        Lookup table with columns ``DistrictId`` and ``flat_qty_distr``.
    default_share : float, default 0.000143
        Value used for districts that are absent from ``distr_info1``.

    Returns
    -------
    pandas.DataFrame
        A new frame (left join result, so row order is kept and the index is
        reset) with the extra ``flat_qty_distr`` column and no missing values
        in it.
    """
    df = pd.merge(df, distr_info1, on='DistrictId', how='left')
    # Districts missing from the lookup come back as NaN from the left join.
    df['flat_qty_distr'] = df['flat_qty_distr'].fillna(default_share)
    return df

In [16]:
def add_distr_stats(df, distr_stat_dr, distr_stat_r, mean_price):
    """Attach mean-price features with a two-level fallback.

    ``mean_price_dr`` is looked up by (``DistrictId``, ``Rooms``) and
    ``mean_price_r`` by ``Rooms`` alone, both via left joins. Gaps are filled
    from the coarser level: ``mean_price_r`` falls back to ``mean_price`` and
    ``mean_price_dr`` falls back to the (already filled) ``mean_price_r``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``DistrictId`` and ``Rooms`` columns.
    distr_stat_dr : pandas.DataFrame
        Columns ``DistrictId``, ``Rooms``, ``mean_price_dr``.
    distr_stat_r : pandas.DataFrame
        Columns ``Rooms``, ``mean_price_r``.
    mean_price : float
        Fallback for room counts missing from ``distr_stat_r``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``mean_price_dr`` and ``mean_price_r`` appended.
    """
    enriched = df.merge(distr_stat_dr, how='left', on=['DistrictId', 'Rooms'])
    enriched = enriched.merge(distr_stat_r, how='left', on='Rooms')

    # Fill the coarse level first so it can serve as the fallback for the fine one.
    room_level_price = enriched['mean_price_r'].fillna(mean_price)
    enriched['mean_price_r'] = room_level_price
    enriched['mean_price_dr'] = enriched['mean_price_dr'].fillna(room_level_price)
    return enriched

In [17]:
def add_cat_fts(df, cat_fts= ('Ecology_2', 'Ecology_3', 'Shops_2')):
    """Binary-encode the categorical flag columns.

    Each column named in ``cat_fts`` is overwritten with 1 where its value
    equals ``'B'`` and 0 everywhere else. The frame is modified in place and
    returned.

    Note that applying the function a second time to the same frame turns
    every value into 0, because the already-encoded integers never equal
    ``'B'``.
    """
    for flag_column in cat_fts:
        is_b = df[flag_column].eq('B')
        df[flag_column] = is_b.astype(int)
    return df

In [18]:
train['Healthcare_1'].isnull().mean()

0.4797142857142857

In [19]:
train['Healthcare_1'].describe()

count    3642.000000
mean     1133.876167
std      1018.354716
min        30.000000
25%       325.000000
50%       900.000000
75%      1548.000000
max      4849.000000
Name: Healthcare_1, dtype: float64

In [20]:
def fillna_healthcare_1(df):
    """Replace missing ``Healthcare_1`` values with 0.

    The previous chained assignment (``df = df['Healthcare_1'] = ...``)
    rebound ``df`` to the filled Series before the column assignment ran, so
    the function handed back a Series instead of the DataFrame. Only the
    column is assigned now.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Healthcare_1`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with no missing values in ``Healthcare_1``.
    """
    df['Healthcare_1'] = df['Healthcare_1'].fillna(0)
    return df

In [21]:
def prepare_data(df, distr_info1, distr_stat_dr, distr_stat_r, mean_price):
    """Run the feature-engineering steps on one data split.

    Steps, in order:
      1. ``add_distr_info1`` - district share of flats.
      2. ``add_distr_stats`` - mean-price features with fallbacks.
      3. ``add_cat_fts``     - binary encoding of the 'A'/'B' flag columns.

    The lookup tables and ``mean_price`` are passed in by the caller, so the
    same statistics can be applied to every split.

    Returns the transformed DataFrame.
    """
    with_share = add_distr_info1(df, distr_info1)
    with_prices = add_distr_stats(with_share, distr_stat_dr, distr_stat_r, mean_price)
    # NOTE: the Healthcare_1 imputation step (fillna_healthcare_1) is currently disabled.
    return add_cat_fts(with_prices)

In [22]:
# NOTE(review): distr_stat_dr / distr_stat_r were averaged over these same training rows,
# so each training row's mean_price_* features already contain its own Price.
# This cell also rebinds `train`, so running it a second time feeds the already
# transformed frame back into prepare_data.
train = prepare_data(train, distr_info1, distr_stat_dr, distr_stat_r, mean_price)
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,flat_qty_distr,mean_price_dr,mean_price_r
0,14604,23,1.0,41.68138,22.796166,8.0,14,17.0,2015,0.075779,...,1437,3,,0,2,1,88504.384965,0.056286,102427.030975,160134.810901
1,5621,23,3.0,163.495333,161.504222,12.0,5,3.0,1977,0.014073,...,475,0,,0,0,1,207007.956663,0.056286,165911.1297,290867.452543
2,235,87,1.0,39.710131,19.538663,8.0,4,17.0,1986,0.100456,...,7227,0,,1,6,0,182126.280899,0.003,169596.630515,160134.810901
3,16258,48,3.0,96.056784,98.152802,1.0,15,1.0,2017,0.041125,...,9515,5,,1,10,1,524365.550705,0.008857,382424.639356,290867.452543
4,10773,77,3.0,79.195825,44.421062,10.0,16,17.0,1984,0.298205,...,4048,3,,1,3,1,322048.43399,0.004,251751.766701,290867.452543


In [23]:
# The validation split is transformed with the statistics computed from `train`,
# not with statistics of its own prices.
valid = prepare_data(valid, distr_info1, distr_stat_dr, distr_stat_r, mean_price)
valid.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,flat_qty_distr,mean_price_dr,mean_price_r
0,3702,21,2.0,48.812195,34.679779,5.0,5,5.0,1963,0.034331,...,5487,3,,0,4,1,181530.459031,0.015286,189591.187596,215287.628931
1,12848,61,3.0,81.103039,49.310278,8.0,4,4.0,1960,0.298205,...,4048,3,,1,3,1,260456.004692,0.012143,293153.148921,290867.452543
2,2239,6,3.0,82.882978,3.97765,1.0,8,17.0,1977,0.243205,...,1564,0,540.0,0,0,1,219945.30464,0.050714,212694.797384,290867.452543
3,15611,23,1.0,33.863324,29.993297,0.0,5,4.0,1977,0.034656,...,168,0,,0,0,1,66883.280318,0.056286,102427.030975,160134.810901
4,5634,52,1.0,43.095135,,1.0,10,17.0,1977,0.371149,...,7065,1,750.0,2,5,1,114086.065201,0.013714,135270.547115,160134.810901


### Model

In [24]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price', 'flat_qty_distr',
       'mean_price_dr', 'mean_price_r'],
      dtype='object')

In [25]:
# Model inputs: three raw columns (Rooms, Square, HouseYear) plus the two engineered
# columns added by prepare_data (flat_qty_distr, mean_price_dr).
feats = ['Rooms', 'Square', 'flat_qty_distr', 'mean_price_dr', 'HouseYear']

In [26]:
from sklearn.ensemble import RandomForestRegressor as RF

In [27]:
# Random forest regressor: 20 trees, depth limited to 12, fixed random_state.
model = RF(n_estimators=20, max_depth=12, random_state= 42)

In [28]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [29]:
pred_train = model.predict(train.loc[:, feats])

In [30]:
pred_train.shape

(7000,)

In [31]:
pred_train

array([ 88135.665491  , 195363.91383013, 177067.76684429, ...,
       250107.34450127, 211098.08865696, 375400.71411946])

In [32]:
pred_valid = model.predict(valid.loc[:, feats])

In [33]:
pred_valid.shape

(3000,)

In [34]:
pred_valid

array([192766.38162951, 342464.67645458, 218998.32928814, ...,
       259812.02087707, 117343.64528877, 267700.11927034])

### Evaluate model

In [35]:
from sklearn.metrics import r2_score as R2

In [36]:
R2(train['Price'], pred_train)

0.9097793776496335

In [37]:
R2(valid['Price'], pred_valid)

0.6722210160075393

### Test

In [38]:
?prepare_data

In [39]:
test = pd.read_csv('data/test.csv')

In [40]:
test = prepare_data(test, distr_info1, distr_stat_dr, distr_stat_r, mean_price)

In [41]:
test['Price'] = model.predict(test.loc[:, feats])

In [42]:
# Write only Id and the predicted Price; index=False (the documented boolean form,
# instead of relying on None being falsy) keeps the row index out of the file.
test.loc[:, ['Id', 'Price']].to_csv('ASatyukova_predictions_v3.csv', index=False)

In [43]:
result = pd.read_csv('ASatyukova_predictions_v3.csv')

In [44]:
result.shape

(5000, 2)

In [45]:
result.head()

Unnamed: 0,Id,Price
0,725,160981.805872
1,15856,242178.798948
2,5480,93695.468187
3,15664,246425.233495
4,14275,132165.728234
