In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/sklearn warnings
# (e.g. SettingWithCopyWarning, deprecations) are hidden too — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100
%matplotlib inline

In [2]:
# Load the labelled training data; the path is relative to the working directory.
data = pd.read_csv('input/train.csv')

In [3]:
from sklearn.model_selection import train_test_split 

In [4]:
# Hold out 30% of the rows for validation; random_state pins the shuffle so
# the split is the same on every run.
train, valid = train_test_split(data, test_size=0.3, random_state=42)

#### Train

In [5]:
train.shape, valid.shape

((7000, 20), (3000, 20))

In [6]:
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
9069,14604,23,1.0,41.68138,22.796166,8.0,14,17.0,2015,0.075779,B,B,6,1437,3,,0,2,B,88504.384965
2603,5621,23,3.0,163.495333,161.504222,12.0,5,3.0,1977,0.014073,B,B,2,475,0,,0,0,B,207007.956663
7738,235,87,1.0,39.710131,19.538663,8.0,4,17.0,1986,0.100456,B,B,43,7227,0,,1,6,A,182126.280899
1579,16258,48,3.0,96.056784,98.152802,1.0,15,1.0,2017,0.041125,B,B,46,9515,5,,1,10,B,524365.550705
5058,10773,77,3.0,79.195825,44.421062,10.0,16,17.0,1984,0.298205,B,B,16,4048,3,,1,3,B,322048.43399


#### Prepare train

In [16]:
def fill_life_square(df, ratio=0.8):
    """Return a copy of ``df`` with missing ``LifeSquare`` values imputed.

    A missing ``LifeSquare`` is replaced by ``ratio * Square`` taken from the
    same row; values that are already present are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Square`` and ``LifeSquare`` columns.
    ratio : float, default 0.8
        Fraction of ``Square`` used as the stand-in for a missing
        ``LifeSquare``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If ``Square`` or ``LifeSquare`` is not a column of ``df``.
    """
    # Work on a copy: callers pass slices produced by train_test_split, and
    # writing into those in place would mutate (or warn about) the parent frame.
    filled = df.copy()
    filled['LifeSquare'] = filled['LifeSquare'].fillna(ratio * filled['Square'])
    return filled

In [18]:
def get_cat_features(df):
    """Return a copy of ``df`` with its categorical columns encoded as numbers.

    * ``Ecology_2`` is label-encoded: ``'A'`` -> 1, ``'B'`` -> 2. Any other
      value (including NaN) is left as it is.
    * Every remaining object/categorical column is one-hot encoded by
      ``pd.get_dummies``. ``Ecology_3`` and ``Shops_2`` are pinned to the
      levels ``A`` and ``B`` first, so both ``*_A`` and ``*_B`` columns are
      produced even when a particular frame happens to contain only one of
      the two levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Ecology_2`` column.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If ``Ecology_2`` is not a column of ``df``.
    """
    encoded = df.copy()

    ecology_codes = {'A': 1, 'B': 2}
    # Mapping through a function makes pandas re-infer the dtype of the result,
    # so an all-A/B column becomes a numeric column. Assigning ints into an
    # object column with .loc can leave the dtype as object, in which case
    # get_dummies below would one-hot encode Ecology_2 as well.
    encoded['Ecology_2'] = encoded['Ecology_2'].map(
        lambda label: ecology_codes.get(label, label)
    )

    levels = ['A', 'B']
    for col in ('Ecology_3', 'Shops_2'):
        # Only pin the levels when the column really is A/B-valued; otherwise
        # pd.Categorical would silently turn unexpected labels into NaN.
        if col in encoded.columns and encoded[col].dropna().isin(levels).all():
            encoded[col] = pd.Categorical(encoded[col], categories=levels)

    return pd.get_dummies(encoded)

In [19]:
def prepare_data(df):
    """Run the preprocessing steps on ``df`` and return the result.

    The frame is first passed through ``fill_life_square`` and the output of
    that step is then passed through ``get_cat_features``.
    """
    return get_cat_features(fill_life_square(df))

In [21]:
# Preprocess the training split; `train` is rebound to the prepared frame.
train = prepare_data(train)

In [22]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2',
       'Shops_1', 'Price', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B'],
      dtype='object')

#### Prepare valid

In [23]:
# Run the validation split through the same preprocessing as the training split.
valid = prepare_data(valid)

#### Model

In [25]:
from sklearn.ensemble import RandomForestRegressor as RF

In [96]:
# Random forest regressor: 100 trees, depth capped at 12, max_features=6,
# and a fixed random_state so repeated fits give the same model.
model = RF(n_estimators=100, max_depth=12, max_features=6, random_state=1)

In [97]:
# Columns fed to the model. Compared with the prepared frame's columns this
# leaves out Id, DistrictId, Healthcare_1, the target Price, and the
# Ecology_3_B / Shops_2_B dummies (the *_A dummies are kept).
# 'Helthcare_2' is spelled exactly as the column appears in the data — do not
# "correct" it here.
fts = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Social_1', 'Social_2', 'Social_3','Helthcare_2',
       'Shops_1', 'Ecology_3_A', 'Shops_2_A']

In [98]:
# Fit on the training split only, using the selected feature columns and Price as target.
model.fit(train.loc[:, fts], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [99]:
# Predictions on the same rows the model was fitted on.
pred_train = model.predict(train.loc[:, fts])

In [100]:
# Predictions on the held-out validation rows.
pred_valid = model.predict(valid.loc[:, fts])

In [101]:
pred_train

array([108436.26222845, 241638.10042145, 177007.63766696, ...,
       237924.16493799, 205141.30707794, 381570.4812441 ])

In [102]:
pred_valid

array([192033.84385113, 315508.94332963, 214045.4121392 , ...,
       263523.51306729, 111185.12319967, 271761.53926243])

In [103]:
from sklearn.metrics import r2_score as r2

In [104]:
# R^2 on the training split (true values first, predictions second).
# This is measured on data the model has already seen.
r2(train['Price'], pred_train)

0.8920507423207589

In [105]:
# R^2 on the held-out validation split; compare with the training score above
# to see how much the model overfits.
r2(valid['Price'], pred_valid)

0.7205516347148269

#### Test

In [107]:
# Load the test data; a Price column is added to it further down.
test = pd.read_csv('input/test.csv')

In [108]:
# Run the test set through the same preprocessing as train/valid.
test = prepare_data(test)

In [109]:
# Store the model's predictions as a new Price column on the test frame.
test['Price'] = model.predict(test.loc[:, fts])

In [110]:
# Write only Id and the predicted Price; index=False keeps the row index out of the CSV.
test.loc[:, ['Id', 'Price']].to_csv('SShirkin_predictions.csv', index=False)