### Курсовой проект Python для Data Science. Бриленков Илья

In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

In [147]:
# Load the training set from the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [148]:
# Load the test set from the working directory.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [149]:
# Show (rows, columns) for both frames.
(data.shape, test.shape)

((10000, 20), (5000, 19))

### Точечные корректировки

In [150]:
# A HouseYear of 4968 is replaced with 1968.
data.loc[data['HouseYear'].eq(4968), 'HouseYear'] = 1968

In [151]:
# A HouseYear of 20052011 is replaced with 2008.
data.loc[data['HouseYear'].eq(20052011), 'HouseYear'] = 2008

In [152]:
# Remove the row whose index label is 4853.
data = data.drop(index=4853)

In [153]:
# Rows reporting 19 rooms are set to 1 room.
data.loc[data['Rooms'].eq(19), 'Rooms'] = 1

In [154]:
# Any remaining row with more than 5 rooms is set to 2 rooms.
data.loc[data['Rooms'].gt(5), 'Rooms'] = 2

In [155]:
# Test rows reporting 17 rooms are set to 2 rooms.
test.loc[test['Rooms'].eq(17), 'Rooms'] = 2

In [156]:
# Ids of training rows whose LifeSquare is overwritten with Square.
data_list_square_id1 = [
    14990,
    16550,
    4071,
    8961,
    15886,
]

In [157]:
# For every listed Id, overwrite LifeSquare with the row's own Square
# (the right-hand Series is aligned to the selected rows by index).
data.loc[data['Id'].isin(data_list_square_id1), 'LifeSquare'] = data['Square']

In [158]:
# Ids of training rows whose Square is overwritten with LifeSquare.
data_list_square_id2 = [
    10527,
    13265,
]

In [159]:
# For every listed Id, overwrite Square with the row's own LifeSquare
# (the right-hand Series is aligned to the selected rows by index).
data.loc[data['Id'].isin(data_list_square_id2), 'Square'] = data['LifeSquare']

In [160]:
# Manual fix: test flat 11533 gets LifeSquare = 48.
test.loc[test['Id'].eq(11533), 'LifeSquare'] = 48

In [161]:
# Manual fix: test flat 170 gets Square = 62.
test.loc[test['Id'].eq(170), 'Square'] = 62

In [162]:
# Remove the row whose index label is 212.
data = data.drop(index=212)

In [163]:
# Two-room flats with Square above 400 get Square and LifeSquare set to 56.0.
data.loc[data['Square'].gt(400) & data['Rooms'].eq(2), ['Square', 'LifeSquare']] = 56.0

In [164]:
# One-room flats with Square above 400 get Square and LifeSquare set to 40.0.
data.loc[data['Square'].gt(400) & data['Rooms'].eq(1), ['Square', 'LifeSquare']] = 40.0

In [165]:
# Manual fix: test flat 15759 gets Floor = 22.
test.loc[test['Id'].eq(15759), 'Floor'] = 22

### Функции корректировки

In [166]:
def corrected_rooms(df):
    """Replace a room count of 0 with an estimate derived from ``Square``.

    The frame is modified in place and nothing is returned. Only rows whose
    ``Rooms`` equals 0 are touched; rows with a missing ``Square`` keep 0.

    Square band      -> Rooms
    below 56         -> 1
    [56, 76)         -> 2
    [76, 96)         -> 3
    [96, 115]        -> 4
    (115, 167)       -> 5
    167 and above    -> 6

    The 4-room band and the exact value 167 used to fall through every rule,
    leaving such rows with 0 rooms; all other bands are unchanged.
    """
    # Evaluate the selection once: the bands are disjoint, so a row can be
    # assigned by at most one of the statements below.
    zero_rooms = df['Rooms'] == 0
    square = df['Square']

    df.loc[zero_rooms & (square < 56), 'Rooms'] = 1.0
    df.loc[zero_rooms & (square >= 56) & (square < 76), 'Rooms'] = 2.0
    df.loc[zero_rooms & (square >= 76) & (square < 96), 'Rooms'] = 3.0
    df.loc[zero_rooms & (square >= 96) & (square <= 115), 'Rooms'] = 4.0
    df.loc[zero_rooms & (square > 115) & (square < 167), 'Rooms'] = 5.0
    df.loc[zero_rooms & (square >= 167), 'Rooms'] = 6.0

In [167]:
def lifesquare_filling(df):
    """Replace missing ``LifeSquare`` values with 10 (modifies ``df`` in place)."""
    life_square = df['LifeSquare']
    # Keep every present value; substitute 10 only where the value is missing.
    df['LifeSquare'] = life_square.where(life_square.notna(), 10)

In [168]:
def corrected_square(df):
    """Raise every ``Square`` below 25 up to 25 (modifies ``df`` in place)."""
    too_small = df['Square'].lt(25)
    df.loc[too_small, 'Square'] = 25

In [169]:
def corrected_lifesquare(df):
    """Ensure ``LifeSquare`` is at least ``Square // 2`` (modifies ``df`` in place)."""
    half_square = df['Square'] // 2
    below_half = half_square > df['LifeSquare']
    # The full Series on the right is aligned by index to the selected rows.
    df.loc[below_half, 'LifeSquare'] = half_square

In [170]:
def corrected_floor(df):
    """Raise ``HouseFloor`` to ``Floor`` where the flat's floor exceeds it.

    Modifies ``df`` in place and returns nothing.
    """
    house_too_low = df['HouseFloor'] < df['Floor']
    # The full Series on the right is aligned by index to the selected rows.
    df.loc[house_too_low, 'HouseFloor'] = df['Floor']

### Заключительная корректировка

In [171]:
# Convert the categorical columns of the training frame to indicator columns
# (the later code refers to them as Ecology_2_A, Ecology_2_B, ... Shops_2_B).
data = pd.get_dummies(data=data)

In [172]:
# Convert the categorical columns of the test frame to indicator columns.
test = pd.get_dummies(data=test)

In [173]:
# Outer-join train and test on every listed column so that rows from both
# frames end up in a single frame.
all_df = data.merge(
    test,
    how='outer',
    on=[
        'Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
        'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
        'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1',
        'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
        'Shops_2_B',
    ],
)

In [174]:
# Mean Healthcare_1 per district over the combined frame;
# the result has the two columns DistrictId and Healthcare_1.
a = all_df.groupby('DistrictId', as_index=False)['Healthcare_1'].agg('mean')

In [175]:
# Left-join the per-district Healthcare_1 means `a` onto the training frame.
# NOTE(review): `a` holds only 'DistrictId' and 'Healthcare_1', and both are
# used as join keys here, so this merge adds no new column to `data`.
# If the intent was to fill missing Healthcare_1 with the district mean,
# this statement does not do that - confirm with the author.
# NOTE(review): presumably the result also gets a fresh 0..n-1 index
# (pandas column-on-column merge) - confirm nothing relies on the old labels.
data = pd.merge(data, a, on = ['DistrictId','Healthcare_1'], how = 'left')

In [176]:
def healthcare_1_corrected(df):
    """Fill missing ``Healthcare_1`` values in ``df`` (modifies ``df`` in place).

    The fill value is the mean of ``Healthcare_1`` in the module-level
    ``data`` frame, whichever frame is passed in as ``df``.
    """
    fill_value = data['Healthcare_1'].mean()
    df['Healthcare_1'] = df['Healthcare_1'].fillna(fill_value)

In [177]:
def health_dist(df):
    """Return a copy of ``df`` with a per-district ``Health_District`` column.

    ``Health_District`` is the mean of ``Healthcare_1`` over the rows of ``df``
    sharing the same ``DistrictId``. Districts whose mean is missing receive
    the mean of the ``Health_District`` column itself. The input frame is not
    modified; the merged frame is returned.
    """
    district_means = (
        df.groupby('DistrictId', as_index=False)['Healthcare_1']
        .mean()
        .rename(columns={'Healthcare_1': 'Health_District'})
    )
    merged = df.merge(district_means, how='left', on=['DistrictId'])
    overall_mean = merged['Health_District'].mean()
    merged['Health_District'] = merged['Health_District'].fillna(overall_mean)
    return merged

### Запуск функций

In [178]:
def start_functions(df):
    """Apply every in-place correction step to ``df``, in a fixed order."""
    steps = (
        corrected_rooms,
        lifesquare_filling,
        corrected_square,
        corrected_lifesquare,
        corrected_floor,
        healthcare_1_corrected,
    )
    for step in steps:
        step(df)

In [179]:
# Run the in-place corrections on the training frame first:
# healthcare_1_corrected reads the global `data`, so the fill value used for
# `test` below comes from the already-corrected training frame.
start_functions(data)
start_functions(test)

# health_dist returns a new frame, so the result has to be re-assigned.
data = health_dist(data)
test = health_dist(test)

### train_test_split

In [180]:
from sklearn.model_selection import train_test_split

In [181]:
# Hold out 20% of the training frame for validation; fixed seed for repeatability.
train, valid = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

### Построение модели

In [182]:
from sklearn.ensemble import RandomForestRegressor as RF

In [183]:
# Random forest regressor: 500 trees, depth capped at 18, fixed seed.
model = RF(
    random_state=42,
    max_depth=18,
    n_estimators=500,
)

In [186]:
# Columns fed to the model, in the order the model is fitted on.
feats = [
    'DistrictId',
    'Rooms',
    'Square',
    'LifeSquare',
    'HouseYear',
    'Ecology_1',
    'Social_1',
    'Social_2',
    'Social_3',
    'Healthcare_1',
    'Helthcare_2',
    'Shops_1',
    'Ecology_2_A',
    'Ecology_2_B',
    'Ecology_3_A',
    'Ecology_3_B',
    'Shops_2_A',
    'Shops_2_B',
    'Health_District',
]

In [187]:
# Fit the forest on the selected feature columns against Price.
model.fit(X=train[feats], y=train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=18,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [188]:
from sklearn.metrics import r2_score as r2

In [189]:
def get_prediction(model, df, feats=feats, evaluate='Yes'):
    """Predict with ``model`` on the ``feats`` columns of ``df``.

    When ``evaluate`` is exactly ``'Yes'`` the R2 score of the predictions
    against ``df['Price']`` is printed as well. The predictions are returned
    in either case.
    """
    predictions = model.predict(df.loc[:, feats])
    if evaluate != 'Yes':
        return predictions

    score = r2(df['Price'], predictions)
    print('R2: {}'.format(score))
    return predictions

In [190]:
# Predict on the training part; prints its R2 score.
pred_train = get_prediction(model, train)

R2: 0.954030801646199


In [191]:
# Predict on the held-out part; prints its R2 score.
pred_valid = get_prediction(model, valid)

R2: 0.7428982171981066


### Тест

In [192]:
# The test frame has no Price column yet, so scoring is switched off.
test['Price'] = get_prediction(model=model, df=test, feats=feats, evaluate='No')

In [193]:
# Summary statistics of the predicted prices.
test.loc[:, 'Price'].describe()

count      5000.000000
mean     214942.877854
std       78545.133093
min       64528.453346
25%      163658.712733
50%      195791.496711
75%      248078.783967
max      566781.135764
Name: Price, dtype: float64

In [194]:
# Write the Id / predicted Price pairs to CSV without the index column.
test[['Id', 'Price']].to_csv('IBrilenkov_predictions.csv', index=False)