In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from pylab import rcParams

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [2]:
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

from datetime import datetime

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
class DataPreprocessing:
    """Подготовка исходных данных"""

    def __init__(self):
        """Параметры класса"""
        self.medians=None
        self.kitchen_square_quantile = None
        
    def fit(self, X):
        """Сохранение статистик"""       
        # Расчет медиан
        self.medians = X.median()
        self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)
    
    def transform(self, X):
        """Трансформация данных"""

        # Rooms
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
        
        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']
        
        # KitchenSquare
        condition = (X['KitchenSquare'].isna()) \
                    | (X['KitchenSquare'] > self.kitchen_square_quantile)
        
        X.loc[condition, 'KitchenSquare'] = self.medians['KitchenSquare']

        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3
        
        # HouseFloor, Floor
        X['HouseFloor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1
        
        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']
        
        floor_outliers = X.loc[X['Floor'] > X['HouseFloor']].index
        X.loc[floor_outliers, 'Floor'] = X.loc[floor_outliers, 'HouseFloor']\
                                            .apply(lambda x: random.randint(1, x))
        
        # HouseYear
        current_year = datetime.now().year
        
        X['HouseYear_outlier'] = 0
        X.loc[X['HouseYear'] > current_year, 'HouseYear_outlier'] = 1
        
        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year
                  
        # LifeSquare
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1
        condition = (X['LifeSquare'].isna()) & \
                      (~X['Square'].isna()) & \
                      (~X['KitchenSquare'].isna())
        
        X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 3
        
        
        return X

In [4]:
def floor_to_cat(X):

    X['floor_cat'] = 0

    X.loc[X['Floor'] <= 3, 'floor_cat'] = 1  
    X.loc[(X['Floor'] > 3) & (X['Floor'] <= 5), 'floor_cat'] = 2
    X.loc[(X['Floor'] > 5) & (X['Floor'] <= 9), 'floor_cat'] = 3
    X.loc[(X['Floor'] > 9) & (X['Floor'] <= 15), 'floor_cat'] = 4
    X.loc[X['Floor'] > 15, 'floor_cat'] = 5

    return X


def year_to_cat(X):

    X['year_cat'] = 0

    X.loc[X['HouseYear'] <= 1941, 'year_cat'] = 1
    X.loc[(X['HouseYear'] > 1941) & (X['HouseYear'] <= 1945), 'year_cat'] = 2
    X.loc[(X['HouseYear'] > 1945) & (X['HouseYear'] <= 1980), 'year_cat'] = 3
    X.loc[(X['HouseYear'] > 1980) & (X['HouseYear'] <= 2000), 'year_cat'] = 4
    X.loc[(X['HouseYear'] > 2000) & (X['HouseYear'] <= 2010), 'year_cat'] = 5
    X.loc[(X['HouseYear'] > 2010), 'year_cat'] = 6

    return X

In [5]:
class FeatureGenetator():
    """Генерация новых фич"""
    
    def __init__(self):
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.house_year_max = None
        self.floor_max = None
        self.district_size = None
    
        
    def fit(self, X, y=None):
        
        X = X.copy()
        
        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}
        
        # DistrictID
        self.district_size = X['DistrictId'].value_counts().reset_index() \
                               .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})
                
        # Target encoding
        ## District, Rooms
        df = X.copy()
        
                    
        ## floor, year
        if y is not None:
            self.floor_max = df['Floor'].max()
            self.house_year_max = df['HouseYear'].max()
            df['Price'] = y.values
            df = self.floor_to_cat(df)
            df = self.year_to_cat(df)
        

        
    def transform(self, X):
        
        # Binary features
        X['Ecology_2'] = X['Ecology_2'].map(self.binary_to_numbers)  # self.binary_to_numbers = {'A': 0, 'B': 1}
        X['Ecology_3'] = X['Ecology_3'].map(self.binary_to_numbers)
        X['Shops_2'] = X['Shops_2'].map(self.binary_to_numbers)
        
        # DistrictId, IsDistrictLarge
        X = X.merge(self.district_size, on='DistrictId', how='left')
        
        X['new_district'] = 0
        X.loc[X['DistrictSize'].isna(), 'new_district'] = 1
        
        X['DistrictSize'].fillna(5, inplace=True)
        
        X['IsDistrictLarge'] = (X['DistrictSize'] > 100).astype(int)
        
        # More categorical features
#         X = self.floor_to_cat(X)  # + столбец floor_cat
#         X = self.year_to_cat(X)   # + столбец year_cat
        
        return X
    
    def floor_to_cat(self, X):
        bins = [0, 3, 5, 9, 15, self.floor_max]
        X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False)

        X['floor_cat'].fillna(-1, inplace=True) 
        return X
     
    def year_to_cat(self, X):
        bins = [0, 1941, 1945, 1980, 2000, 2010, self.house_year_max]
        X['year_cat'] = pd.cut(X['HouseYear'], bins=bins, labels=False)

        X['year_cat'].fillna(-1, inplace=True)
        return X
    
        # Переделываем 'Healthcare_1':
    def health_transform(self, X):
        X["Healthcare_1"].update(X.loc[X.Healthcare_1.isna(),].apply(lambda x: X[X["year_cat"]==x["year_cat"]]["Healthcare_1"].mean(), axis=1))
        return X

In [6]:
TRAIN_DATASET_PATH = '../input/real-estate-price-prediction-moscow/train.csv'
TEST_DATASET_PATH = '../input/real-estate-price-prediction-moscow/test.csv'

In [7]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [8]:
preprocessor = DataPreprocessing()
preprocessor.fit(train_df)
preprocessor.fit(test_df)

train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)

train_df.shape, test_df.shape

((10000, 24), (5000, 23))

In [9]:
TRAIN_DATASET_PATH = '../input/real-estate-price-prediction-moscow/train.csv'
TEST_DATASET_PATH = '../input/real-estate-price-prediction-moscow/test.csv'

In [10]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [11]:
preprocessor = DataPreprocessing()
preprocessor.fit(train_df)
preprocessor.fit(test_df)

train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)

train_df.shape, test_df.shape

((10000, 24), (5000, 23))

In [12]:
features_gen = FeatureGenetator()
features_gen.fit(train_df)
features_gen.fit(test_df)

train_df = features_gen.transform(train_df)
test_df = features_gen.transform(test_df)

train_df.shape, test_df.shape

((10000, 27), (5000, 26))

In [13]:
train_df = year_to_cat(train_df)
train_df = floor_to_cat(train_df)
train_df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Price,Rooms_outlier,HouseFloor_outlier,HouseYear_outlier,LifeSquare_nan,DistrictSize,new_district,IsDistrictLarge,year_cat,floor_cat
0,11809,27,3.0,115.027311,102.027311,10.0,4,10.0,2014,0.075424,...,305018.871089,0,0,0,1,391.0,0,1,6,2
1,3013,22,1.0,39.832524,23.169223,8.0,7,8.0,1966,0.118537,...,177734.553407,0,0,0,0,33.0,0,0,3,3
2,8215,1,3.0,78.342215,47.671972,10.0,2,17.0,1988,0.025609,...,282078.72085,0,0,0,0,344.0,0,1,4,1
3,2352,1,1.0,40.409907,34.409907,3.0,10,22.0,1977,0.007122,...,168106.00763,0,0,0,1,344.0,0,1,3,4
4,13866,94,2.0,64.285067,38.562517,9.0,16,16.0,1972,0.282798,...,343995.102962,0,0,0,0,43.0,0,0,3,5


In [14]:
train_df["Healthcare_1"].update(train_df.loc[train_df.Healthcare_1.isna(),].apply(lambda x: train_df[train_df["year_cat"]==x["year_cat"]]["Healthcare_1"].mean(), axis=1))

In [15]:
test_df = year_to_cat(test_df)
test_df = floor_to_cat(test_df)
test_df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_2,Rooms_outlier,HouseFloor_outlier,HouseYear_outlier,LifeSquare_nan,DistrictSize,new_district,IsDistrictLarge,year_cat,floor_cat
0,4567,44,1.0,36.84763,19.094182,5.0,5,9.0,1970,0.036122,...,1,0,0,0,0,46,0,0,3,2
1,5925,62,1.0,42.493907,42.568133,10.0,7,17.0,2017,0.072158,...,0,0,0,0,0,122,0,1,6,3
2,960,27,2.0,59.463678,47.463678,9.0,19,19.0,1977,0.211401,...,1,0,0,0,1,391,0,1,3,5
3,3848,23,3.0,49.64603,33.893825,6.0,2,2.0,1965,0.014073,...,1,0,0,0,0,264,0,1,3,1
4,746,74,1.0,53.837056,47.837056,3.0,8,17.0,1977,0.309479,...,1,0,0,0,1,61,0,0,3,3


In [16]:
test_df["Healthcare_1"].update(test_df.loc[test_df.Healthcare_1.isna(),].apply(lambda x: test_df[test_df["year_cat"]==x["year_cat"]]["Healthcare_1"].mean(), axis=1))

In [17]:
print('Строк в трейне: ', train_df.shape[0])
print('Строк в тесте: ', test_df.shape[0])

Строк в трейне:  10000
Строк в тесте:  5000


In [18]:
train_df.shape[1] - 1 == test_df.shape[1]

True

In [19]:
name_1 = 'Id'
train_df = train_df.drop(columns=name_1)
name_2 = 'DistrictId'
train_df = train_df.drop(columns=name_2)
name_3 = 'HouseYear'
train_df = train_df.drop(columns=name_3)
name_4 = 'new_district'
train_df = train_df.drop(columns=name_4)
name_5 = 'HouseYear_outlier'
train_df = train_df.drop(columns=name_5)
name_6 = 'Rooms_outlier'
train_df = train_df.drop(columns=name_6)
name_7 = 'Ecology_2'
train_df = train_df.drop(columns=name_7)
name_8 = 'Ecology_3'
train_df = train_df.drop(columns=name_8)
name_9 = 'IsDistrictLarge'
train_df = train_df.drop(columns=name_9)

In [20]:
PredictSet_Id = test_df.Id
PredictSet_Id.head()

0    4567
1    5925
2     960
3    3848
4     746
Name: Id, dtype: int64

In [21]:
name_1 = 'Id'
test_df = test_df.drop(columns=name_1)
name_2 = 'DistrictId'
test_df = test_df.drop(columns=name_2)
name_3 = 'HouseYear'
test_df = test_df.drop(columns=name_3)
name_4 = 'new_district'
test_df = test_df.drop(columns=name_4)
name_5 = 'HouseYear_outlier'
test_df = test_df.drop(columns=name_5)
name_6 = 'Rooms_outlier'
test_df = test_df.drop(columns=name_6)
name_7 = 'Ecology_2'
test_df = test_df.drop(columns=name_7)
name_8 = 'Ecology_3'
test_df = test_df.drop(columns=name_8)
name_9 = 'IsDistrictLarge'
test_df = test_df.drop(columns=name_9)

In [22]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rooms               10000 non-null  float64
 1   Square              10000 non-null  float64
 2   LifeSquare          10000 non-null  float64
 3   KitchenSquare       10000 non-null  float64
 4   Floor               10000 non-null  int64  
 5   HouseFloor          10000 non-null  float64
 6   Ecology_1           10000 non-null  float64
 7   Social_1            10000 non-null  int64  
 8   Social_2            10000 non-null  int64  
 9   Social_3            10000 non-null  int64  
 10  Healthcare_1        10000 non-null  float64
 11  Helthcare_2         10000 non-null  int64  
 12  Shops_1             10000 non-null  int64  
 13  Shops_2             10000 non-null  int64  
 14  Price               10000 non-null  float64
 15  HouseFloor_outlier  10000 non-null  int64  
 16  LifeS

In [23]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rooms               5000 non-null   float64
 1   Square              5000 non-null   float64
 2   LifeSquare          5000 non-null   float64
 3   KitchenSquare       5000 non-null   float64
 4   Floor               5000 non-null   int64  
 5   HouseFloor          5000 non-null   float64
 6   Ecology_1           5000 non-null   float64
 7   Social_1            5000 non-null   int64  
 8   Social_2            5000 non-null   int64  
 9   Social_3            5000 non-null   int64  
 10  Healthcare_1        5000 non-null   float64
 11  Helthcare_2         5000 non-null   int64  
 12  Shops_1             5000 non-null   int64  
 13  Shops_2             5000 non-null   int64  
 14  HouseFloor_outlier  5000 non-null   int64  
 15  LifeSquare_nan      5000 non-null   int32  
 16  Distri

In [24]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    print("Train R2:\t" + str(round(r2(train_true_values, train_pred_values), 3)))
    print("Test R2:\t" + str(round(r2(test_true_values, test_pred_values), 3)))
    
    plt.figure(figsize=(11,7))
    
    plt.subplot(121)
    sns.scatterplot(x=train_pred_values, y=train_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Train sample prediction')
    
    plt.subplot(122)
    sns.scatterplot(x=test_pred_values, y=test_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Test sample prediction')

    plt.show()

In [25]:
target_name = 'Price'
X = train_df.drop(columns=target_name)
y = train_df[target_name]

In [26]:
rf_model = RandomForestRegressor(random_state=100, criterion='mse')
rf_model.fit(X, y)

RandomForestRegressor(random_state=100)

In [27]:
y_train_preds = rf_model.predict(X)
y_test_preds = rf_model.predict(test_df)

In [28]:
r2(y, y_train_preds)

0.963669418136676

In [29]:
cv_score = cross_val_score(rf_model, X, y, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))
cv_score

array([0.74807725, 0.72290036, 0.74035074])

In [30]:
cv_score.mean()

0.7371094518148958

In [31]:
feature_importances = pd.DataFrame(zip(X.columns, rf_model.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature_name,importance
1,Square,0.431692
16,DistrictSize,0.215039
9,Social_3,0.05209
2,LifeSquare,0.042573
6,Ecology_1,0.035104
8,Social_2,0.035051
0,Rooms,0.033827
5,HouseFloor,0.025817
7,Social_1,0.021932
4,Floor,0.021352


In [32]:
predictions = rf_model.predict(test_df)
predictions

array([156443.68186696, 118645.74393578, 139049.59840717, ...,
       147412.9017364 , 196445.96818489, 266207.60590919])

In [33]:
submit = pd.read_csv('sample_submission.csv')
submit.head()

Unnamed: 0,Id,Price
0,4567,200000.0
1,5925,200000.0
2,960,200000.0
3,3848,200000.0
4,746,200000.0


In [34]:
submit['Price'] = predictions
submit.head()

Unnamed: 0,Id,Price
0,4567,156443.681867
1,5925,118645.743936
2,960,139049.598407
3,3848,126641.35491
4,746,208651.822176


In [35]:
submit.to_csv('rf_submit.csv', index=False)