In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.max_columns = 100

In [9]:
# Read the training set from train.csv (path is relative to the working directory).
data = pd.read_csv('train.csv')

In [10]:
# Read the test set from test.csv (path is relative to the working directory).
test = pd.read_csv('test.csv')

In [11]:
# Show (rows, columns) of both frames as a quick sanity check.
data.shape, test.shape

((10000, 20), (5000, 19))

### Точечные корректировки

In [12]:
# Replace the HouseYear value 4968 with 1968.
# NOTE(review): presumably a typo in the raw data -- confirm.
data.loc[data['HouseYear'] == 4968, 'HouseYear'] = 1968

In [13]:
# Replace the HouseYear value 20052011 with 2008.
# NOTE(review): looks like a "2005-2011" range stored as one number -- confirm.
data.loc[data['HouseYear'] == 20052011, 'HouseYear'] = 2008

In [14]:
# Drop the training row whose index LABEL is 4853 (label-based, not positional).
# NOTE(review): the reason for dropping this row is not recorded here.
data = data.drop(4853, axis = 0)

In [15]:
# Treat a room count of 19 as 1.
data.loc[data['Rooms'] == 19, 'Rooms'] = 1

In [16]:
# Any remaining room count above 5 is set to 2. This runs after the cell
# above, so rows that had Rooms == 19 are already 1 and are not touched here.
data.loc[data['Rooms'] > 5, 'Rooms'] = 2

In [17]:
# Same kind of fix for the test set: a room count of 17 becomes 2.
test.loc[test['Rooms'] == 17, 'Rooms'] = 2

In [18]:
data_list_square_id1 = [14990, 16550, 4071, 8961, 15886]

In [19]:
for i in data_list_square_id1:
    data.loc[data['Id'] == i, 'LifeSquare'] = data['Square']

In [20]:
data_list_square_id2 = [10527, 13265]

In [21]:
for i in data_list_square_id2:
    data.loc[data['Id'] == i, 'Square'] = data['LifeSquare']

In [22]:
# Hand-set LifeSquare for test row Id 11533.
test.loc[test['Id'] == 11533, 'LifeSquare'] = 48

In [23]:
# Hand-set Square for test row Id 170.
test.loc[test['Id'] == 170, 'Square'] = 62

In [24]:
# Drop the training row whose index LABEL is 212 (label-based, not positional).
# NOTE(review): the reason for dropping this row is not recorded here.
data = data.drop(212, axis = 0)

In [25]:
# Two-room flats with Square above 400: set both Square and LifeSquare to 56.0.
data.loc[(data['Square'] > 400) & (data['Rooms'] == 2), ['Square', 'LifeSquare']] = 56.0 

In [26]:
# One-room flats with Square above 400: set both Square and LifeSquare to 40.0.
data.loc[(data['Square'] > 400) & (data['Rooms'] == 1), ['Square', 'LifeSquare']] = 40.0  

In [27]:
# Hand-set Floor for test row Id 15759.
test.loc[test['Id'] == 15759, 'Floor'] = 22

### Функции корректировки

In [28]:
def corrected_rooms(df):
    """Impute ``Rooms`` from ``Square`` for rows where ``Rooms`` is 0.

    The frame is modified in place; nothing is returned. Rows with a
    non-zero ``Rooms`` value, or with a missing ``Square``, are left alone.

    Area bands (in the units of ``Square``):
        Square < 56          -> 1 room
        56 <= Square < 76    -> 2 rooms
        76 <= Square < 96    -> 3 rooms
        96 <= Square <= 115  -> 4 rooms
        115 < Square < 167   -> 5 rooms
        Square >= 167        -> 6 rooms

    The previous version had no 4-room band and excluded Square == 167, so
    such rows silently kept ``Rooms == 0``. The other bands are unchanged.
    """
    # Capture the mask once: the bands are disjoint, so no row can be
    # assigned twice and the result does not depend on the order below.
    missing = df['Rooms'] == 0
    square = df['Square']

    bands = [
        (square < 56, 1.0),
        ((square >= 56) & (square < 76), 2.0),
        ((square >= 76) & (square < 96), 3.0),
        ((square >= 96) & (square <= 115), 4.0),
        ((square > 115) & (square < 167), 5.0),
        (square >= 167, 6.0),
    ]
    for in_band, rooms in bands:
        df.loc[missing & in_band, 'Rooms'] = rooms

In [29]:
def lifesquare_filling(df):
    """Replace missing ``LifeSquare`` values with the placeholder 10 (in place).

    Existing values are kept; only NaN entries receive the placeholder.
    """
    life = df['LifeSquare']
    # Keep every value that is present, substitute 10 where it is absent.
    df['LifeSquare'] = life.where(life.notna(), 10)

In [30]:
def corrected_square(df):
    """Raise every ``Square`` below 25 up to 25 (in place).

    Values of 25 or more, and missing values, are left unchanged.
    """
    too_small = df['Square'].lt(25)
    df.loc[too_small, 'Square'] = 25

In [31]:
def corrected_lifesquare(df):
    """Lift ``LifeSquare`` to at least ``Square // 2`` (in place).

    Rows whose ``LifeSquare`` is already at or above the floor-divided half
    of ``Square`` are untouched; rows with a missing ``LifeSquare`` are also
    untouched because the comparison is False for NaN.
    """
    half_square = df['Square'] // 2
    undersized = df['LifeSquare'] < half_square
    # The right-hand Series is aligned on the index, so each selected row
    # receives its own half-square value.
    df.loc[undersized, 'LifeSquare'] = half_square

In [32]:
def corrected_floor(df):
    """Make ``HouseFloor`` at least as large as ``Floor`` (in place).

    Where a flat's ``Floor`` exceeds the recorded ``HouseFloor``, the
    building height is replaced by the flat's floor. Other rows are unchanged.
    """
    floor = df['Floor']
    above_top = df['HouseFloor'] < floor
    # Index-aligned assignment: each selected row gets its own Floor value.
    df.loc[above_top, 'HouseFloor'] = floor

### Заключительная корректировка

In [33]:
# One-hot encode the training frame. With no `columns` argument,
# pd.get_dummies encodes the object/category columns and leaves the rest as-is.
data = pd.get_dummies(data)

In [34]:
# Encode the test frame independently of the training frame.
# NOTE(review): the two frames end up with the same dummy columns only if
# every category value occurs in both files -- confirm, or encode them together.
test = pd.get_dummies(test)

In [35]:
all_df = pd.merge(data,test, on = ['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B'], how = 'outer')

In [36]:
a = all_df.groupby('DistrictId', as_index=False)['Healthcare_1'].mean()

In [37]:
data = pd.merge(data, a, on = ['DistrictId','Healthcare_1'], how = 'left')

In [38]:
def healthcare_1_corrected(df, reference=None):
    """Fill missing ``Healthcare_1`` in ``df`` with a reference mean (in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``Healthcare_1`` column is filled.
    reference : pandas.DataFrame, optional
        Frame whose ``Healthcare_1`` mean is used as the fill value. When
        omitted, the notebook-level ``data`` frame is used, which is what the
        function always did before this parameter existed.

    Returns
    -------
    None
    """
    if reference is None:
        # Backward-compatible default: fall back to the notebook global.
        reference = data
    fill_value = reference['Healthcare_1'].mean()
    df['Healthcare_1'] = df['Healthcare_1'].fillna(fill_value)

In [39]:
def health_dist(df):
    """Return a copy of ``df`` with a ``Health_District`` column added.

    ``Health_District`` is the mean ``Healthcare_1`` of the row's
    ``DistrictId``, computed from ``df`` itself. Districts whose mean is NaN
    receive the mean of the ``Health_District`` column instead. The input
    frame is not modified; the result comes from a left merge and therefore
    carries a fresh 0..n-1 index.
    """
    district_means = (
        df.groupby('DistrictId', as_index=False)['Healthcare_1']
        .mean()
        .rename(columns={'Healthcare_1': 'Health_District'})
    )
    merged = df.merge(district_means, how='left', on=['DistrictId'])
    overall_mean = merged['Health_District'].mean()
    merged['Health_District'] = merged['Health_District'].fillna(overall_mean)
    return merged

### Запуск функций

In [40]:
def start_functions(df):
    """Apply every in-place cleaning step to ``df``, in a fixed order.

    The order matters: LifeSquare is filled and Square is floored before
    LifeSquare is compared against half of Square.
    """
    cleaning_steps = (
        corrected_rooms,
        lifesquare_filling,
        corrected_square,
        corrected_lifesquare,
        corrected_floor,
        healthcare_1_corrected,
    )
    for step in cleaning_steps:
        step(df)

In [41]:
# Run the in-place cleaning steps on both frames. `data` goes first; the
# Healthcare_1 fill for `test` reads the mean from the global `data` frame.
start_functions(data)
start_functions(test)

# Add the Health_District feature. health_dist returns a new frame, so the
# result has to be re-bound. Each frame's district means are computed from
# that frame alone.
# NOTE(review): train and test therefore get different Health_District values
# for the same district -- confirm this is intended.
data = health_dist(data)
test = health_dist(test)

### train_test_split

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split reproducible.
train, valid = train_test_split(data, test_size = 0.2, random_state = 42)

### Построение модели

In [44]:
from sklearn.ensemble import RandomForestRegressor as RF

In [81]:
?RF

In [78]:
model = RF(n_estimators = 5000,  max_depth = 30, random_state = 42, )

In [79]:
# Columns fed to the model, in this order. KitchenSquare, Floor and HouseFloor
# are not in the list. 'Helthcare_2' is spelled the same way as in the merge
# key list earlier in the notebook, so it is presumably the dataset's real
# column name -- do not "correct" it here.
feats = ['DistrictId', 'Rooms', 'Square', 'LifeSquare', 
       'HouseYear','Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2',  'Shops_1',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B','Health_District']

In [80]:
# Fit on the training split only; `valid` is kept for scoring.
model.fit(train.loc[:,feats], train['Price'])

MemoryError: could not allocate 917504 bytes

In [None]:
from sklearn.metrics import r2_score as r2

In [None]:
def get_prediction(model, df, feats=None, evaluate='Yes'):
    """Predict with ``model`` on ``df`` and optionally print the R2 score.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    df : pandas.DataFrame
        Frame holding the feature columns; must also hold ``Price`` when
        ``evaluate`` is on.
    feats : list of str, optional
        Feature columns passed to the model. When omitted, the notebook-level
        ``feats`` list is looked up at CALL time. The former ``feats=feats``
        default was frozen when the ``def`` cell ran, so it failed on a fresh
        kernel if ``feats`` was not defined yet and went stale if ``feats``
        was edited afterwards.
    evaluate : str or bool, default 'Yes'
        ``'Yes'`` or ``True`` prints R2 of the predictions against
        ``df['Price']``; any other value skips scoring. ``'Yes'`` and ``'No'``
        behave as before.

    Returns
    -------
    The value returned by ``model.predict``.
    """
    if feats is None:
        # The parameter shadows the global of the same name, so read the
        # notebook-level list through globals().
        feats = globals()['feats']

    pred = model.predict(df.loc[:, feats])

    if evaluate is True or evaluate == 'Yes':
        r2_value = r2(df['Price'], pred)
        print('R2: {}'.format(r2_value))
    return pred

In [None]:
# Predict on the training split; the default evaluate='Yes' also prints R2.
pred_train = get_prediction(model=model, df=train)

In [None]:
# Same for the held-out validation split.
pred_valid = get_prediction(model=model, df=valid)

### Тест

In [61]:
# Predict for the test frame and store the result as a new 'Price' column.
# evaluate='No' skips the R2 step, which would need a 'Price' column in `test`.
test['Price'] = get_prediction(model, test, feats=feats, evaluate = 'No')

In [62]:
# Summary statistics of the predicted prices as a sanity check.
test['Price'].describe()

count      5000.000000
mean     215043.712177
std       78230.648611
min       64431.387379
25%      164467.063045
50%      195975.261189
75%      248580.678908
max      560938.884947
Name: Price, dtype: float64

In [63]:
#test.loc[:, ['Id', 'Price']].to_csv('IBrilenkov_predictions.csv', index=False)