In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.max_columns = 100
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [15]:
data = pd.read_csv('train.csv')

In [16]:
test = pd.read_csv('test.csv')

In [17]:
data.shape, test.shape

((10000, 20), (5000, 19))

### Точечные корректировки

In [18]:
data.loc[data['HouseYear'] == 4968, 'HouseYear'] = 1968

In [19]:
data.loc[data['HouseYear'] == 20052011, 'HouseYear'] = 2008

In [20]:
data = data.drop(4853, axis = 0)

In [21]:
data.loc[data['Rooms'] == 19, 'Rooms'] = 1

In [22]:
data.loc[data['Rooms'] > 5, 'Rooms'] = 2

In [23]:
test.loc[test['Rooms'] == 17, 'Rooms'] = 2

In [24]:
data_list_square_id1 = [14990, 16550, 4071, 8961, 15886]

In [25]:
for i in data_list_square_id1:
    data.loc[data['Id'] == i, 'LifeSquare'] = data['Square']

In [26]:
data_list_square_id2 = [10527, 13265]

In [27]:
for i in data_list_square_id2:
    data.loc[data['Id'] == i, 'Square'] = data['LifeSquare']

In [28]:
test.loc[test['Id'] == 11533, 'LifeSquare'] = 48

In [29]:
test.loc[test['Id'] == 170, 'Square'] = 62

In [30]:
data = data.drop(212, axis = 0)

In [31]:
data.loc[(data['Square'] > 400) & (data['Rooms'] == 2), ['Square', 'LifeSquare']] = 56.0 

In [32]:
data.loc[(data['Square'] > 400) & (data['Rooms'] == 1), ['Square', 'LifeSquare']] = 40.0  

In [33]:
test.loc[test['Id'] == 15759, 'Floor'] = 22

### Функции корректировки

In [34]:
def corrected_rooms(df):
    df.loc[(df['Rooms'] == 0) & (df['Square'] < 56), 'Rooms'] = 1.0
    df.loc[(df['Rooms'] == 0) & (df['Square'] > 55) & (df['Square'] < 76), 'Rooms'] = 2.0
    df.loc[(df['Rooms'] == 0) & (df['Square'] > 75) & (df['Square'] < 96), 'Rooms'] = 3.0
    df.loc[(df['Rooms'] == 0) & (df['Square'] > 115) & (df['Square'] < 167), 'Rooms'] = 5.0
    df.loc[(df['Rooms'] == 0) & (df['Square'] > 167), 'Rooms'] = 6.0

In [35]:
def lifesquare_filling(df):
    df['LifeSquare'] = df['LifeSquare'].fillna(10)

In [36]:
def corrected_square(df):
    df.loc[df['Square'] < 25, 'Square'] = 25

In [37]:
def corrected_lifesquare(df):
    df.loc[(df['Square'] // 2) > df['LifeSquare'], 'LifeSquare'] = df['Square'] // 2

In [38]:
def corrected_floor(df):
    df.loc[df['Floor'] > df['HouseFloor'], 'HouseFloor'] = df['Floor']

### Заключительная корректировка

In [39]:
data = pd.get_dummies(data)

In [40]:
test = pd.get_dummies(test)

In [41]:
all_df = pd.merge(data,test, on = ['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B'], how = 'outer')

In [42]:
a = all_df.groupby('DistrictId', as_index=False)['Healthcare_1'].mean()

In [43]:
data = pd.merge(data, a, on = ['DistrictId','Healthcare_1'], how = 'left')

In [44]:
def healthcare_1_corrected(df):
    df['Healthcare_1'] = df['Healthcare_1'].fillna(data['Healthcare_1'].mean())

### Запуск функций

In [45]:
def start_functions(df):
    corrected_rooms(df)
    lifesquare_filling(df)
    corrected_square(df)
    corrected_lifesquare(df)
    corrected_floor(df)
    healthcare_1_corrected(df)

In [46]:
start_functions(data)
start_functions(test)

### train_test_split

In [47]:
from sklearn.model_selection import train_test_split

In [169]:
train, valid = train_test_split(data, test_size = 0.25, random_state = 42)

In [170]:
train.shape, valid.shape

((7498, 23), (2500, 23))

### Построение модели

In [171]:
from sklearn.ensemble import RandomForestRegressor as RF

In [172]:
model = RF(n_estimators = 20, max_depth = 12, random_state = 42)

In [173]:
feats = ['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare',
       'HouseYear','Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2',  'Shops_1',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B']

In [174]:
model.fit(train.loc[:,feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [175]:
from sklearn.metrics import r2_score as r2

In [176]:
def get_prediction(model, df, feats=feats, evaluate = 'Yes'):
    pred = model.predict(df.loc[:,feats])
    if evaluate == 'Yes':
        r2_value = r2(df['Price'], pred)
        print('R2: {}'.format(r2_value))
    return pred

In [177]:
pred_train = get_prediction(model=model, df=train)

R2: 0.886447981907206


In [178]:
pred_valid = get_prediction(model=model, df=valid)

R2: 0.7265787804538915


### Тест

In [58]:
test['Price'] = get_prediction(model, test, feats=feats, evaluate = 'No')

In [59]:
test['Price'].describe()

count      5000.000000
mean     215398.018468
std       77567.920734
min       64250.129513
25%      166011.844956
50%      195914.363847
75%      249253.283170
max      550125.973977
Name: Price, dtype: float64

In [60]:
#test.loc[:, ['Id', 'Price']].to_csv('IBrilenkov_predictions.csv', index=False)