In [343]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split

pd.options.display.max_columns = 100

warnings.filterwarnings('ignore')
%matplotlib inline

In [344]:
data = pd.read_csv('data/train.csv')

In [345]:
data.shape

(10000, 20)

### Train_test_split

In [346]:
from sklearn.model_selection import train_test_split

In [347]:
train, valid = train_test_split(data, test_size=0.30, random_state=42)

In [348]:
train.shape, valid.shape

((7000, 20), (3000, 20))

In [349]:
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
9069,14604,23,1.0,41.68138,22.796166,8.0,14,17.0,2015,0.075779,B,B,6,1437,3,,0,2,B,88504.384965
2603,5621,23,3.0,163.495333,161.504222,12.0,5,3.0,1977,0.014073,B,B,2,475,0,,0,0,B,207007.956663
7738,235,87,1.0,39.710131,19.538663,8.0,4,17.0,1986,0.100456,B,B,43,7227,0,,1,6,A,182126.280899
1579,16258,48,3.0,96.056784,98.152802,1.0,15,1.0,2017,0.041125,B,B,46,9515,5,,1,10,B,524365.550705
5058,10773,77,3.0,79.195825,44.421062,10.0,16,17.0,1984,0.298205,B,B,16,4048,3,,1,3,B,322048.43399


In [350]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 9069 to 7270
Data columns (total 20 columns):
Id               7000 non-null int64
DistrictId       7000 non-null int64
Rooms            7000 non-null float64
Square           7000 non-null float64
LifeSquare       5514 non-null float64
KitchenSquare    7000 non-null float64
Floor            7000 non-null int64
HouseFloor       7000 non-null float64
HouseYear        7000 non-null int64
Ecology_1        7000 non-null float64
Ecology_2        7000 non-null object
Ecology_3        7000 non-null object
Social_1         7000 non-null int64
Social_2         7000 non-null int64
Social_3         7000 non-null int64
Healthcare_1     3642 non-null float64
Helthcare_2      7000 non-null int64
Shops_1          7000 non-null int64
Shops_2          7000 non-null object
Price            7000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.1+ MB


In [351]:
train.loc[data['Rooms'] < 1, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
4366,456,6,0.0,81.491446,,0.0,4,0.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B,212864.799112
1397,12638,27,0.0,138.427694,136.215499,0.0,4,3.0,2016,0.075424,B,B,11,3097,0,,0,0,B,268394.744389
2269,7317,27,0.0,41.790881,,0.0,13,0.0,1977,0.211401,B,B,9,1892,0,,0,1,B,98129.976788
3911,770,28,0.0,49.483501,,0.0,16,0.0,2015,0.118537,B,B,30,6207,1,1183.0,1,0,B,217009.338463
4853,3224,27,0.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,B,B,2,469,0,,0,0,B,126596.941798


### Prepare train, valid

In [352]:
# исправим название колонки Helthcare_2
def rename_column(df):
    cols = df.columns
    for i in cols:
        if i == 'Helthcare_2':
            df = df.rename(columns={'Helthcare_2':'Healthcare_2'})
    return df

In [353]:
# заменяем NaN в колонке Healthcare_1 на 0
def fillna_healthcare_1(df):
    df['Healthcare_1'] = df['Healthcare_1'].fillna(0)
    return df   

In [354]:
# исправляем комнаты
def clean_rooms(df):
    df.loc[df['Rooms'] > 6, 'Rooms'] = 6
    df.loc[df['Rooms'] < 0, 'Rooms'] = 1
    return df

In [355]:
# исправляем площади
def clean_square(df):
    Square = dict(df.groupby('Rooms')['Square'].median())
    inx = (df['Rooms'] < 2) & (df['Square'] < 15)
    df.loc[inx, 'Square'] = df.loc[inx, 'Square'].apply(lambda x: Square[x])
    return df

In [356]:
def clean_year(df):
    df.loc[df['HouseYear'] > 2020, 'HouseYear'] = 2020
    return df

In [357]:
distr_info1 = train['DistrictId'].value_counts(normalize=True).reset_index().rename(columns={'index':'DistrictId', 'DistrictId':'flat_qty_distr'})
distr_stat_dr = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_dr'})
distr_stat_r = train.groupby(['Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_r'})
mean_price = train['Price'].mean()

In [358]:
def add_distr_info1(df, distr_info1): 
    df = pd.merge(df, distr_info1, on='DistrictId', how='left')
    df['flat_qty_distr'] = df['flat_qty_distr'].fillna(0.000143)
    return df

In [359]:
def add_distr_stats(df, distr_stat_dr, distr_stat_r, mean_price):
    df = pd.merge(df, distr_stat_dr, on=['DistrictId', 'Rooms'], how='left')
    df = pd.merge(df, distr_stat_r, on='Rooms', how='left')
    df['mean_price_r'] = df['mean_price_r'].fillna(mean_price)
    df['mean_price_dr'] = df['mean_price_dr'].fillna(df['mean_price_r'])
    return df

In [360]:
def add_cat_fts(df, cat_fts= ('Ecology_2', 'Ecology_3', 'Shops_2')):
    for col in cat_fts:
        df[col] = (df[col] == 'B').astype(int)
    return df    

In [363]:
def prepare_data(df, distr_info1, distr_stat_dr, distr_stat_r, mean_price):
    df = rename_column(df)
#     df = fillna_healthcare_1(df)
    df = clean_square(df)
#     df = add_distr_info1(df, distr_info1)
#     df = add_distr_stats(df, distr_stat_dr, distr_stat_r, mean_price)
#     df = add_cat_fts(df)

    return df

In [1]:
train = prepare_data(train, distr_info1, distr_stat_dr, distr_stat_r, mean_price)
train.head()

NameError: name 'prepare_data' is not defined

In [340]:
train['Healthcare_1'].isnull().mean()

0.4797142857142857

In [341]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 9069 to 7270
Data columns (total 20 columns):
Id               7000 non-null int64
DistrictId       7000 non-null int64
Rooms            7000 non-null float64
Square           7000 non-null float64
LifeSquare       5514 non-null float64
KitchenSquare    7000 non-null float64
Floor            7000 non-null int64
HouseFloor       7000 non-null float64
HouseYear        7000 non-null int64
Ecology_1        7000 non-null float64
Ecology_2        7000 non-null object
Ecology_3        7000 non-null object
Social_1         7000 non-null int64
Social_2         7000 non-null int64
Social_3         7000 non-null int64
Healthcare_1     3642 non-null float64
Helthcare_2      7000 non-null int64
Shops_1          7000 non-null int64
Shops_2          7000 non-null object
Price            7000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.1+ MB


In [342]:
valid = prepare_data(valid, distr_info1, distr_stat_dr, distr_stat_r, mean_price)
valid.head()

KeyError: 1.9889427240936488

### Model

In [None]:
train.columns

In [None]:
feats = ['Rooms', 'Square', 'flat_qty_distr', 'mean_price_dr', 'HouseYear']

In [None]:
from sklearn.ensemble import RandomForestRegressor as RF

In [None]:
model = RF(n_estimators=20, max_depth=12, random_state= 42)

In [None]:
model.fit(train.loc[:, feats], train['Price'])

In [None]:
pred_train = model.predict(train.loc[:, feats])

In [None]:
pred_train.shape

In [None]:
pred_train

In [None]:
pred_valid = model.predict(valid.loc[:, feats])

In [None]:
pred_valid.shape

In [None]:
pred_valid

### Evaluate model

In [None]:
from sklearn.metrics import r2_score as R2

In [None]:
R2(train['Price'], pred_train)

In [None]:
R2(valid['Price'], pred_valid)

### Test

In [None]:
?prepare_data

In [None]:
test = pd.read_csv('data/test.csv')

In [None]:
test = prepare_data(test, distr_info1, distr_stat_dr, distr_stat_r, mean_price)

In [None]:
test['Price'] = model.predict(test.loc[:, feats])

In [None]:
test.loc[:, ['Id', 'Price']].to_csv('ASatyukova_predictions_v3.csv', index=None)

In [None]:
result = pd.read_csv('ASatyukova_predictions_v3.csv')

In [None]:
result.shape

In [None]:
result.head()