In [None]:
import numpy as np
import pandas as pd
import random

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 2. Разделение датасета и подбор гиперпараметров
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# 3. Нормализация данных 
from sklearn.preprocessing import StandardScaler

# 4. Модели 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# 5. Метрики качества
from sklearn.metrics import r2_score as r2


import datetime
import gc # сборщик мусора

In [None]:
# Silence every Python warning so cell output stays short.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
import warnings
warnings.filterwarnings('ignore')
# Use a larger default font size for all matplotlib figures in this notebook.
matplotlib.rcParams.update({'font.size': 14})

In [None]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    """Print the train/test R2 scores and plot predicted vs. true values.

    Two scatter plots are drawn side by side in one figure: the left panel
    for the train sample, the right panel for the test sample. Predicted
    values go on the x axis, true values on the y axis.

    Parameters
    ----------
    train_true_values, train_pred_values : array-like
        True and predicted targets for the train sample.
    test_true_values, test_pred_values : array-like
        True and predicted targets for the test sample.
    """
    train_score = round(r2(train_true_values, train_pred_values), 3)
    print("Train R2:\t" + str(train_score))
    test_score = round(r2(test_true_values, test_pred_values), 3)
    print("Test R2:\t" + str(test_score))

    plt.figure(figsize=(18,10))

    # (subplot position, predictions, ground truth, panel title)
    panels = (
        (121, train_pred_values, train_true_values, 'Train sample prediction'),
        (122, test_pred_values, test_true_values, 'Test sample prediction'),
    )
    for position, predicted, actual, panel_title in panels:
        plt.subplot(position)
        sns.scatterplot(x=predicted, y=actual)
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title(panel_title)

    plt.show()

Пути к директориям и файлам

In [None]:
TRAIN_DATASET_PATH = '../input/real-estate-price-prediction-moscow/train.csv'
TEST_DATASET_PATH = '../input/real-estate-price-prediction-moscow/test.csv'

Описание датасета

##### Id - идентификационный номер квартиры
##### DistrictId - идентификационный номер района
##### Rooms - количество комнат
##### Square - площадь
##### LifeSquare - жилая площадь
##### KitchenSquare - площадь кухни
##### Floor - этаж
##### HouseFloor - количество этажей в доме
##### HouseYear - год постройки дома
##### Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности
##### Social_1, Social_2, Social_3 - социальные показатели местности
##### Healthcare_1, Helthcare_2 - показатели местности, связанные с охраной здоровья
##### Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров
##### Price - цена квартиры

In [None]:
# Load the train data into df_train and the test data into X_test
df_train = pd.read_csv(TRAIN_DATASET_PATH)
X_test = pd.read_csv(TEST_DATASET_PATH)



# Use the 'Id' column as the index of both df_train and X_test
df_train.set_index('Id', inplace=True)
X_test.set_index('Id', inplace=True)

# Show how many rows each data set has
print('Строк в трейне:', df_train.shape[0])
print('Строк в тесте', X_test.shape[0])

# NOTE: df_train is NOT deleted here; the EDA cells below keep using it.

In [None]:
df_train.head()

In [None]:
df_train.dtypes

In [None]:
# DistrictId is an identifier, not a quantity: store it as a string in both
# frames so it is handled as a categorical column rather than a numeric one.
df_train['DistrictId'] =df_train['DistrictId'].astype(str)
X_test['DistrictId'] = X_test['DistrictId'].astype(str)

### EDA

In [None]:
# Heatmap of pairwise correlations between the numeric columns of df_train.
plt.figure(figsize = (15,10))

sns.set(font_scale=1.4)

# Restrict to numeric columns explicitly: df_train also holds string columns
# (DistrictId was cast to str above), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently dropping it.
corr_matrix = df_train.select_dtypes(include='number').corr()
corr_matrix = np.round(corr_matrix, 2)
# Zero out weak correlations (|r| < 0.3) so only the stronger ones stand out.
corr_matrix[np.abs(corr_matrix) < 0.3] = 0

sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='coolwarm')

plt.title('Correlation matrix')
plt.show()

Гистограммы по количественным признакам

In [None]:
# Split the available columns into numeric (non-object dtype) and
# categorical (object dtype) features
num_features = list(df_train.select_dtypes(exclude='object').columns)
cat_features = list(df_train.select_dtypes(include='object').columns)

# Draw a 10-bin histogram for every numeric feature
df_train[num_features].hist( figsize=(16,16), bins=10)

plt.show()

Пропуски

In [None]:
# Смотрим где есть пропуски и создаем список таких колонок
nan_features = df_train.columns[df_train.isna().any()].tolist()
  
# Cмотрим сколько процентов значений пропущено во всех столбцах
    
for el in nan_features:
    print('В столбце', el, 'пропущенно', df_train[el].isnull().sum() * 100 / len(df_train), '% значений')

Выбросы

In [None]:
# Rooms: how many flats there are for each room count

df_train['Rooms'].value_counts().sort_index()

# Idea: if a flat has more than 6 rooms, set the value to 5 rooms;
# if it has 0 rooms, make it 1.
# NOTE(review): DataPreprocessing.transform below uses `>= 6` and the train
# median rather than the fixed value 5 mentioned here.

In [None]:
# Square: total area against price

plt.scatter(df_train['Square'], df_train['Price'])

plt.show()

# Count the rows with Square above 200 (True) versus the rest (False)
(df_train['Square'] > 200).value_counts()

# It may be worth discarding the values above 200

In [None]:
# Rows whose LifeSquare is below 50 (rows with a missing LifeSquare do not
# satisfy the comparison and are therefore excluded); show their summary.
lf_df = df_train[df_train['LifeSquare'] < 50]
lf_df.info()

In [None]:
# LifeSquare: living area against price
plt.scatter(df_train['LifeSquare'], df_train['Price'])

plt.show()

# Count the rows with LifeSquare above 200 (True) versus the rest (False)
(df_train['LifeSquare'] > 200).value_counts()

In [None]:
# KitchenSquare

df_train['KitchenSquare'].value_counts().sort_index()

In [None]:
#HouseFloor

df_train['HouseFloor'].value_counts().sort_index()

In [None]:
# Floor

df_train['Floor'].value_counts().sort_index()

In [None]:
# The data contains many errors: can a flat really be on a floor that is
# higher than the number of floors declared for the building?
# Count the rows where Floor exceeds HouseFloor.
(df_train['Floor'] > df_train['HouseFloor']).sum()

In [None]:
# HouseYear: rows with a construction year later than 2021
# NOTE(review): 2021 is hard-coded here, while DataPreprocessing.transform
# compares against the current calendar year.

df_train[df_train['HouseYear'] > 2021].head()

Категориальные переменные

In [None]:
# List the categorical (object dtype) columns
df_train.select_dtypes(include='object').columns.tolist()

In [None]:
# DistrictId
df_train['DistrictId'].value_counts()

In [None]:
# Ecology_2

df_train['Ecology_2'].value_counts()

In [None]:
# Ecology_3

df_train['Ecology_3'].value_counts()

In [None]:
# Shops_2

df_train['Shops_2'].value_counts()

Очистка данных (выбросы и пропуски)

In [None]:
class DataPreprocessing:
    """Outlier and missing-value cleaning for the raw housing frame.

    ``fit`` stores per-column medians; ``transform`` uses them to replace
    implausible values, so every split (train / validation / test) is cleaned
    with statistics computed on the frame passed to ``fit`` only.
    """

    def __init__(self):
        """Create an unfitted preprocessor."""
        # Medians of the numeric columns, filled in by fit().
        self.medians = None

    def fit(self, X):
        """Store the column medians of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame the statistics are computed on (the train split).

        Returns
        -------
        DataPreprocessing
            ``self``, so calls can be chained.
        """
        # numeric_only=True: the raw frame also has string columns, and
        # DataFrame.median() raises on those in pandas >= 2.0.
        self.medians = X.median(numeric_only=True)
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Adds the flag columns ``Rooms_outlier``, ``HouseFloor_outlier``,
        ``HouseYear_outlier`` and ``LifeSquare_nan``, and drops
        ``Healthcare_1`` when it is present.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the columns Rooms, Square, LifeSquare, KitchenSquare,
            Floor, HouseFloor and HouseYear.

        Returns
        -------
        pandas.DataFrame
            The cleaned frame, with the same index as ``X``.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        if self.medians is None:
            raise RuntimeError('DataPreprocessing.transform() called before fit()')

        # Work on a copy: callers pass slices of a larger frame (for example
        # the output of train_test_split), which must not be modified.
        X = X.copy()

        # Rooms: 0 rooms becomes 1; 6 or more rooms becomes the fitted median.
        no_rooms = X['Rooms'] == 0
        too_many_rooms = X['Rooms'] >= 6
        X['Rooms_outlier'] = 0
        X.loc[no_rooms | too_many_rooms, 'Rooms_outlier'] = 1
        X.loc[no_rooms, 'Rooms'] = 1
        X.loc[too_many_rooms, 'Rooms'] = self.medians['Rooms']

        # Square: values below 20 are rescaled (x * 2 + 20); values above 250
        # are replaced with the fitted median.
        tiny_square = X['Square'] < 20
        X.loc[tiny_square, 'Square'] = X.loc[tiny_square, 'Square'] * 2 + 20
        X.loc[X['Square'] > 250, 'Square'] = self.medians['Square']

        # KitchenSquare: raise values below 3 to 3, divide values above 1000
        # by 10, then cap everything that is still >= 21 at 20.
        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3
        huge_kitchen = X['KitchenSquare'] > 1000
        X.loc[huge_kitchen, 'KitchenSquare'] = X.loc[huge_kitchen, 'KitchenSquare'] / 10
        X.loc[X['KitchenSquare'] >= 21, 'KitchenSquare'] = 20

        # HouseFloor, Floor: flag buildings with 0 floors and flats located
        # above the top floor. The flag is computed BEFORE the repair below.
        X['HouseFloor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1

        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']
        # Re-evaluated after HouseFloor was repaired: clamp Floor to HouseFloor.
        above_top_floor = X['Floor'] > X['HouseFloor']
        X.loc[above_top_floor, 'Floor'] = X.loc[above_top_floor, 'HouseFloor']

        # HouseYear: construction years in the future are clamped to this year.
        current_year = datetime.datetime.now().year
        in_future = X['HouseYear'] > current_year
        X['HouseYear_outlier'] = 0
        X.loc[in_future, 'HouseYear_outlier'] = 1
        X.loc[in_future, 'HouseYear'] = current_year

        # Healthcare_1 is dropped entirely rather than imputed.
        if 'Healthcare_1' in X.columns:
            X = X.drop(columns='Healthcare_1')

        # LifeSquare: remember which rows were missing, then estimate them as
        # Square - KitchenSquare - 3 wherever both inputs are available.
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1
        can_estimate = (
            X['LifeSquare'].isna()
            & X['Square'].notna()
            & X['KitchenSquare'].notna()
        )
        X.loc[can_estimate, 'LifeSquare'] = (
            X.loc[can_estimate, 'Square'] - X.loc[can_estimate, 'KitchenSquare'] - 3
        )
        return X

Feature engineering

In [None]:
class FeatureGenerator():
    """Generate new features: district size, binary encodings, target encodings.

    ``fit`` learns lookup tables from the train split; ``transform`` applies
    them to any split and returns a new frame.
    """

    def __init__(self):
        """Create an unfitted generator."""
        # {DistrictId: number of train rows}, only for districts with > 50 rows.
        self.DistrictId_counts = None
        # Mapping used to encode the two-valued string columns.
        self.binary_to_numbers = None
        # Median price per (DistrictId, Rooms) and the fallback for unseen pairs.
        self.med_price_by_district = None
        self.med_price_by_district_median = None
        # Median price per (year_cat, floor_cat) and the fallback for unseen pairs.
        self.med_price_by_floor_year = None
        self.med_price_by_floor_year_median = None

    def fit(self, X, y=None):
        """Learn the lookup tables from ``X`` (and ``y`` when given).

        Parameters
        ----------
        X : pandas.DataFrame
            Train frame with the columns DistrictId, Rooms, Floor, HouseYear.
        y : pandas.Series, optional
            Target aligned positionally with ``X``. Without it the two target
            encodings are not computed and ``transform`` skips them.

        Returns
        -------
        FeatureGenerator
            ``self``, so calls can be chained.
        """
        # DistrictId: keep only districts with more than 50 train rows.
        # Original author's note: without this filter the local validation
        # score goes up, but the leaderboard score drops sharply (R2 ~ 0.65).
        district = X['DistrictId'].value_counts()
        district = district[district > 50]
        self.DistrictId_counts = dict(district)

        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}

        if y is None:
            return self

        # Target encoding works on a private copy carrying the target column.
        df = X.copy()
        df['Price'] = y.values

        # District: median price per (DistrictId, Rooms), popular districts
        # only. Filtering the rows (rather than overwriting them with NaN)
        # keeps the rest of ``df`` intact for the floor/year encoding below.
        popular = df[df['DistrictId'].isin(district.index)]
        self.med_price_by_district = (
            popular.groupby(['DistrictId', 'Rooms'], as_index=False)
            .agg({'Price': 'median'})
            .rename(columns={'Price': 'MedPriceByDistrict'})
        )
        self.med_price_by_district_median = self.med_price_by_district['MedPriceByDistrict'].median()

        # Floor / year: median price per (year_cat, floor_cat), all train rows.
        df = self.floor_to_cat(df)
        df = self.year_to_cat(df)
        self.med_price_by_floor_year = (
            df.groupby(['year_cat', 'floor_cat'], as_index=False)
            .agg({'Price': 'median'})
            .rename(columns={'Price': 'MedPriceByFloorYear'})
        )
        self.med_price_by_floor_year_median = self.med_price_by_floor_year['MedPriceByFloorYear'].median()

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the generated features added.

        The input frame is not modified and the row index of ``X`` is kept.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the columns DistrictId, Rooms, Floor, HouseYear,
            Ecology_2, Ecology_3 and Shops_2.

        Returns
        -------
        pandas.DataFrame
            ``X`` plus DistrictId_count, new_district, floor_cat, year_cat and,
            when ``fit`` received a target, MedPriceByDistrict and
            MedPriceByFloorYear.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        if self.DistrictId_counts is None or self.binary_to_numbers is None:
            raise RuntimeError('FeatureGenerator.transform() called before fit()')

        X = X.copy()
        # DataFrame.merge() returns a fresh RangeIndex; remember the original
        # one so the result stays aligned with the caller's target series.
        row_index = X.index

        # DistrictId: districts absent from the fitted table are flagged and
        # get the placeholder count 5.
        X['DistrictId_count'] = X['DistrictId'].map(self.DistrictId_counts)
        X['new_district'] = X['DistrictId_count'].isna().astype(int)
        # Assign the result back: fillna(inplace=True) on a selected column
        # does not update the frame when pandas Copy-on-Write is active.
        X['DistrictId_count'] = X['DistrictId_count'].fillna(5)

        # Binary features: 'A' -> 0, 'B' -> 1
        for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
            X[column] = X[column].map(self.binary_to_numbers)

        # More categorical features
        X = self.floor_to_cat(X)  # adds floor_cat
        X = self.year_to_cat(X)   # adds year_cat

        # Target encoding. The lookup tables have unique keys, so a left merge
        # keeps the row count and order and the index can be restored as is.
        if self.med_price_by_district is not None:
            X = X.merge(self.med_price_by_district, on=['DistrictId', 'Rooms'], how='left')
            X.index = row_index
            X['MedPriceByDistrict'] = X['MedPriceByDistrict'].fillna(self.med_price_by_district_median)

        if self.med_price_by_floor_year is not None:
            X = X.merge(self.med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
            X.index = row_index
            X['MedPriceByFloorYear'] = X['MedPriceByFloorYear'].fillna(self.med_price_by_floor_year_median)

        return X

    @staticmethod
    def floor_to_cat(X):
        """Add ``floor_cat`` (1-5, NaN when Floor is missing) to ``X`` in place.

        Bins: Floor < 3 -> 1, 3..5 -> 2, (5, 9] -> 3, (9, 15] -> 4, > 15 -> 5.
        Returns the same frame object.
        """
        floor = X['Floor']
        # np.select takes the first matching condition, so each upper bound
        # only applies to values not caught by the previous bin.
        X['floor_cat'] = np.select(
            [floor < 3, floor <= 5, floor <= 9, floor <= 15, floor > 15],
            [1, 2, 3, 4, 5],
            default=np.nan,
        )
        return X

    @staticmethod
    def year_to_cat(X):
        """Add ``year_cat`` (1-6, NaN when HouseYear is missing) to ``X`` in place.

        Bins: < 1941 -> 1, 1941..1945 -> 2, (1945, 1980] -> 3,
        (1980, 2000] -> 4, (2000, 2010] -> 5, > 2010 -> 6.
        Returns the same frame object.
        """
        year = X['HouseYear']
        X['year_cat'] = np.select(
            [year < 1941, year <= 1945, year <= 1980, year <= 2000, year <= 2010, year > 2010],
            [1, 2, 3, 4, 5, 6],
            default=np.nan,
        )
        return X

Отбор признаков

In [None]:
# Columns taken from the raw data set.
# NOTE(review): 'Helthcare_2' is spelled this way in the data set description
# above, so it presumably matches the CSV header; do not "fix" it without checking.
feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
                 'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3',
                 'Helthcare_2', 'Shops_1', 'Shops_2']

# Columns created by DataPreprocessing.transform and FeatureGenerator.transform.
new_feature_names = ['Rooms_outlier', 'HouseFloor_outlier', 'HouseYear_outlier', 'LifeSquare_nan', 'DistrictId_count',
                     'new_district', 'floor_cat', 'year_cat',  'MedPriceByDistrict', 'MedPriceByFloorYear']

# Column the models are trained to predict.
target_name = 'Price'

Разбиение на train и test

In [None]:
# Re-read the raw CSV files for modelling. These frames are separate from the
# df_train / X_test frames used in the EDA cells: 'Id' stays a regular column
# and DistrictId is not cast to str here.
train_df = pd.read_csv(TRAIN_DATASET_PATH)
test_df = pd.read_csv(TEST_DATASET_PATH)

# Split the labelled data into the feature frame X and the target y.
X = train_df.drop(columns=target_name)
y = train_df[target_name]

In [None]:
# Hold out 33% of the labelled rows for validation. random_state is fixed
# (same seed as the KFold used for cross-validation below) so that the split,
# and therefore every score in this notebook, is reproducible across re-runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=21)

In [None]:
# Fit the cleaning statistics on the train split only, then apply the same
# cleaning to the train, validation and test frames.
preprocessor = DataPreprocessing()
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)
test_df = preprocessor.transform(test_df)

# Shapes after cleaning
X_train.shape, X_valid.shape, test_df.shape

In [None]:
# Fit the feature generator (district counts and target encodings) on the
# train split and its target only, then add the features to all three frames.
features_gen = FeatureGenerator()
features_gen.fit(X_train, y_train)

X_train = features_gen.transform(X_train)
X_valid = features_gen.transform(X_valid)
test_df = features_gen.transform(test_df)

# Shapes after feature generation
X_train.shape, X_valid.shape, test_df.shape

In [None]:
# Keep only the model inputs (raw + generated features), in the same column
# order for all three frames.
X_train = X_train[feature_names + new_feature_names]
X_valid = X_valid[feature_names + new_feature_names]
test_df = test_df[feature_names + new_feature_names]

In [None]:
X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()

##### Построение моделей

LGBMRegressor

In [None]:
# Hyper-parameters of the gradient-boosting model, gathered in one place.
lgbm_params = {
    'max_depth': 20,
    'num_leaves': 31,
    'n_estimators': 500,
    'learning_rate': 0.03,
}
lgbm = LGBMRegressor(**lgbm_params)

# Train on the train split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict on the train and validation splits, then report R2 and scatter plots.
y_train_preds = lgbm.predict(X_train)
y_valid_preds = lgbm.predict(X_valid)

evaluate_preds(y_train, y_train_preds, y_valid, y_valid_preds)

In [None]:
# cv_score
# 3-fold cross-validated R2 on the train split (shuffled folds, fixed seed)
folds = KFold(n_splits=3, shuffle=True, random_state=21)
cv_score = cross_val_score(lgbm, X_train, y_train, scoring='r2', cv=folds)

# Mean and spread of the per-fold scores
mean = cv_score.mean()
std = cv_score.std()

print('R2: {:.3f} +- {:.3f}'.format(mean, std))

In [None]:
# Same evaluation as the cell before the cross-validation, repeated here.
# NOTE(review): presumably identical output, assuming cross_val_score leaves
# the fitted `lgbm` untouched — confirm, then consider removing one of the two.
y_train_preds = lgbm.predict(X_train)
y_valid_preds = lgbm.predict(X_valid)

evaluate_preds(y_train, y_train_preds, y_valid, y_valid_preds)

Важность признаков

In [None]:
# Pair every input column with the importance reported by the fitted model
feature_importances = pd.DataFrame(zip(X_train.columns, lgbm.feature_importances_), 
                                   columns=['feature_name', 'importance'])

# Most important features first
feature_importances.sort_values(by='importance', ascending=False)

Прогнозирование на тестовом датасете

In [None]:
test_df.shape

In [None]:
test_df.info()

In [None]:
# Load the sample submission; its 'Price' column is overwritten with the model
# predictions further down. The path is relative, like TRAIN_DATASET_PATH and
# TEST_DATASET_PATH, so all three files resolve from the same input directory.
submit = pd.read_csv('../input/real-estate-price-prediction-moscow/sample_submission.csv')
submit.head()

In [None]:
X.info()

In [None]:
test_df.info()

In [None]:
# Predict prices for the prepared test frame
predictions = lgbm.predict(test_df)
predictions

In [None]:
# Write the predictions into the submission frame. The assignment is
# positional (predictions carry no row labels).
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv — confirm.
submit['Price'] = predictions
submit.head()

In [None]:
submit.info()

In [None]:
# Save the submission file to the working directory, without the row index
submit.to_csv('submission.csv', index=False)

In [None]:
submit.info()