In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv('./train.csv')

In [4]:
# Column dtypes and non-null counts. In the output below only LifeSquare and
# Healthcare_1 have missing values; Ecology_2, Ecology_3 and Shops_2 are object columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [5]:
# Summary statistics for the numeric columns. The output below shows extreme values
# worth noticing: Rooms max 19, LifeSquare max ~7480, KitchenSquare max 2014,
# HouseYear max 20052010, HouseFloor min 0.
data.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [6]:
# Drop listings with 7 or more rooms, then show the remaining (rows, columns).
data = data[data["Rooms"].lt(7)]
data.shape

(9997, 20)

In [7]:
# Keep only rows whose Price lies in the closed interval [30000, 600000].
data = data[(data["Price"] >= 30000) & (data["Price"] <= 600000)]
data.shape

(9977, 20)

In [8]:
# Distinct district identifiers, in order of first appearance.
data["DistrictId"].unique()

array([ 35,  41,  53,  58,  99,  59, 154,  74,   1,  23,  28,  31,  13,
        57,  27,  85,  34,  19,   0,  48,  54, 150,  11,  50,  61,  44,
        90, 200,  43, 177, 101,  38,  21, 103,  98, 170,  94,   5,  88,
        47,   6,  52,  62,  77,  79,   9, 156,  69, 108,  96, 169,  18,
       148, 128,  10, 144, 120,  39,   7,  84, 123, 159,  70, 129,   3,
        95,  78,  45,  15, 121,  49,  25,   2, 127,  56, 109,  46,  17,
       119,  80,  86,  83, 137,  91,  92, 118, 187,  36, 106, 161,  22,
        63, 167, 113,  66,  29,  37,  20,   8, 181,  40,  93,  30,  81,
       107, 130, 100,  12, 125,  67,  26,  89,  24, 111, 138,  64, 124,
       183,  55, 157, 151,  33,  87, 126, 155, 164, 105, 110, 132,  71,
        73, 189,  75, 122, 163, 141,  76, 115, 139, 143, 201, 165, 190,
       193, 174,  60, 114, 172, 134, 117, 149, 153, 131, 145,  82, 160,
        42,  32, 133, 208, 171, 176, 173, 147,  97, 142, 195, 180, 196,
       112,  72, 175,   4, 186, 146, 191, 168, 192,  14, 199, 13

In [9]:
# Mean Price per district as a two-column lookup table
# (DistrictId, district_mean_price), one row per district.
# A single groupby pass replaces one full-frame scan per district;
# sort=False keeps the districts in order of first appearance, matching
# what data['DistrictId'].unique() would give.
# NOTE(review): this is computed before the train/validation split in a later
# cell, so validation rows contribute their own Price to this feature and the
# validation score is likely optimistic (target leakage).
district_mean_price = (
    data.groupby('DistrictId', sort=False)['Price']
        .mean()
        .reset_index(name='district_mean_price')
)

In [10]:
# Attach the per-district mean price to every listing. All keys in
# district_mean_price were derived from data itself, so every row finds a match.
data = data.merge(district_mean_price, how='outer', on='DistrictId')

In [11]:
# Where LifeSquare is missing, estimate it as 82% of the total Square.
data['LifeSquare'] = data['LifeSquare'].fillna(data['Square'] * 0.82)

In [12]:
# Discard listings whose total area is 16 or less, then show the remaining shape.
data = data[data["Square"].gt(16)]
data.shape

(9964, 21)

In [13]:
# Average total area for listings with at most one room and for three-room
# listings; used below as replacement values for implausibly small areas.
square_mean_1 = data['Square'][data['Rooms'] <= 1].mean()
square_mean_3 = data['Square'][data['Rooms'] == 3].mean()

In [14]:
# Area sanity corrections on the training frame, applied in order.

# 1) Tiny listings (both areas below 15): replace Square with the mean area for
#    the same room count (<=1 room, then exactly 3 rooms).
# NOTE(review): an earlier cell already kept only rows with Square > 16, so
# these two replacements cannot match any training row.
for room_mask, fallback in (
    (data['Rooms'] <= 1, square_mean_1),
    (data['Rooms'] == 3, square_mean_3),
):
    undersized = (data['Square'] < 15) & (data['LifeSquare'] < 15)
    data.loc[undersized & room_mask, 'Square'] = fallback

# 2) Plausible total area but implausibly small living area: copy Square over.
implausible_life = (data['Square'] > 15) & (data['LifeSquare'] < 15)
data.loc[implausible_life, 'LifeSquare'] = data.loc[implausible_life, 'Square']

# 3) Living area larger than total area: overwrite with Square + 0.82 * Square.
# NOTE(review): this yields 1.82 * Square, which is still larger than Square;
# possibly Square * 0.82 was intended. The test-set cell applies the same rule,
# so any change has to be made in both places.
life_exceeds_total = data['LifeSquare'] > data['Square']
data.loc[life_exceeds_total, 'LifeSquare'] = (data['Square'] + data['Square'] * 0.82)

In [15]:
# One-hot encode the non-numeric columns (Ecology_2, Ecology_3, Shops_2 are the
# object columns per data.info()); the resulting <column>_<value> names, e.g.
# Ecology_2_A, are the ones listed in `fts` further down.
data = pd.get_dummies(data)

In [16]:
# Healthcare_1 is missing for roughly half of the rows (5202 of 10000 non-null
# in data.info()), so the column is removed rather than imputed.
data = data.drop(labels='Healthcare_1', axis='columns')

In [17]:
# Hold out 30% of the rows for validation; the fixed random_state makes the
# split repeatable for the same input frame.
train, valid = train_test_split(data, test_size=0.3, random_state=666)

In [18]:
# Random-forest regressor: 1000 trees, depth capped at 15, at least 2 samples
# per leaf, fixed seed, all CPU cores.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=15,
    min_samples_leaf=2,
    random_state=666,
    n_jobs=-1,
)

In [19]:
# Feature columns fed to the model, in this order.
# 'Helthcare_2' is spelled as it appears in the CSV header.
fts = (
    # apartment / building attributes
    ['Rooms', 'Square', 'LifeSquare', 'HouseYear']
    # neighbourhood indicators
    + ['Ecology_1', 'Social_1', 'Social_2', 'Social_3', 'Helthcare_2', 'Shops_1']
    # one-hot columns produced by pd.get_dummies
    + ['Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A', 'Shops_2_B']
    # engineered feature
    + ['district_mean_price']
)

In [20]:
# Fit the forest on the training split: selected features -> Price.
model.fit(train[fts], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=666, verbose=0, warm_start=False)

In [21]:
# In-sample predictions, used only to gauge how closely the model fits the training split.
pred = model.predict(train[fts])

In [22]:
# R^2 on the training split.
r2_score(y_true=train['Price'], y_pred=pred)

0.9249410126122298

In [23]:
# Predictions for the held-out validation split.
pred_valid = model.predict(valid[fts])

In [24]:
# R^2 on the validation split; compare with the training R^2 to judge overfitting.
r2_score(y_true=valid['Price'], y_pred=pred_valid)

0.7434110218438185

In [25]:
# Load the test set to be scored; the path is relative to the notebook's working directory.
test = pd.read_csv('./test.csv')

In [26]:
# Mirror the training preprocessing: Healthcare_1 is not used as a feature.
test = test.drop(labels='Healthcare_1', axis='columns')

In [27]:
# Area sanity corrections on the test frame, mirroring the training cell.
# The room-count conditions must read test['Rooms']: using the training frame's
# Rooms column would align on index labels and pick up room counts from
# unrelated training rows.
# NOTE(review): in the training cells the LifeSquare NaN fill runs before these
# corrections, while for the test frame it runs in a later cell; rows with a
# missing LifeSquare are therefore skipped by the LifeSquare comparisons here.

# Tiny listings (both areas below 15): replace Square with the training-set
# mean area for the same room count.
test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']<=1), 'Square'] = square_mean_1

test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']==3), 'Square'] = square_mean_3

# Plausible total area but implausibly small living area: copy Square over.
test.loc[(test['Square'] > 15) & (test['LifeSquare'] < 15), 'LifeSquare'] = test['Square']

# Living area larger than total area: same rule as applied to the training frame.
test.loc[test['Square'] < test['LifeSquare'], 'LifeSquare'] = (test['Square']+test['Square']*0.82)

In [28]:
# One-hot encode the non-numeric columns of the test frame.
# NOTE(review): the dummy columns created depend on the values present in
# test.csv; if a category seen in training is absent here, the matching column
# named in `fts` will be missing at prediction time — verify.
test = pd.get_dummies(test)

In [29]:
# Where LifeSquare is missing, estimate it as 82% of the total Square
# (same rule as for the training frame).
test['LifeSquare'] = test['LifeSquare'].fillna(test['Square'] * 0.82)

In [30]:
# Lookup table DistrictId -> district_mean_price for the districts in the test set.
# `data` already carries the district_mean_price column the model was trained
# on, so those values are reused as-is. Recomputing the mean from data['Price']
# here would run on the frame after the Square > 16 filter and could give a
# district a different value than it had during fitting.
train_district_means = data.groupby('DistrictId')['district_mean_price'].first()
test_districts = test['DistrictId'].unique()
# Districts that do not occur in the training frame get NaN from reindex.
district_mean_price = pd.DataFrame({
    'DistrictId': test_districts,
    'district_mean_price': train_district_means.reindex(test_districts).values,
})
# Fallback for unseen districts: mean of the known per-district means (NaNs are skipped).
district_mean_price_mean = district_mean_price['district_mean_price'].mean()

In [31]:
# Attach the district feature to every test row. A left join keeps exactly the
# rows of `test`, in their original order, and never adds rows for keys that
# exist only in the lookup table, so the submission has one row per test Id
# in the same order as test.csv.
test = pd.merge(test, district_mean_price, on='DistrictId', how='left')

In [32]:
# Districts with no training data have no mean price; fall back to the average
# of the known district means.
test['district_mean_price'] = test['district_mean_price'].fillna(district_mean_price_mean)

In [33]:
# Score the test set with the fitted model, using the same feature columns as in training.
pred_test = model.predict(test[fts])

In [34]:
# Store the predictions next to their rows as the Price column.
test = test.assign(Price=pred_test)

In [35]:
# Submission frame: only the listing Id and its predicted Price.
SGoryushko_predictions = test[['Id', 'Price']]

In [36]:
# Write the submission file without the DataFrame index column.
# `index` is documented as a bool, so pass False explicitly instead of relying
# on None being falsy.
SGoryushko_predictions.to_csv('SGoryushko_predictions.csv', index=False)