In [91]:
import pandas as pd
import numpy as np

import category_encoders as ce
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.ensemble import   (AdaBoostRegressor, 
                                ExtraTreesRegressor, 
                                GradientBoostingRegressor, 
                                RandomForestRegressor)

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import r2_score, mean_squared_error, make_scorer

from hyperopt import hp, tpe
from hyperopt.fmin import fmin


In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [119]:
def prepare_dataset(train, test):
    train = train.copy(); test = test.copy()
    
    dataset = pd.concat([train, test], keys=['train', 'test'])
    dataset['TotalSF'] = dataset['TotalBsmtSF'] + dataset['1stFlrSF'] + dataset['2ndFlrSF']
    dataset.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
    
    ## удалим столбцы, где слишком много пропусков
    nans_dataset = dataset.isna().sum(axis=0).sort_values(ascending=False) / len(dataset)
    many_missing = list(nans_dataset[nans_dataset>0.15].index)
    dataset.drop(columns=many_missing, inplace=True)
    
    # заполняем пропуски
    for col in ('GarageType', 'GarageFinish', 'GarageQual', 
                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        dataset[col] = dataset[col].fillna('No')
    
    for col in dataset.columns:
        if dataset[col].dtype == 'object':
            dataset[col] = dataset[col].fillna(dataset[col].mode()[0])
        else:
            dataset[col] = dataset[col].fillna(dataset[col].mean())
            
            
    # стандартизация        
    num_cols = dataset.select_dtypes(['int64', 'float64']).columns
    dataset[num_cols] = StandardScaler().fit_transform(dataset[num_cols])

            
    train_features = dataset.loc['train']
    test_features = dataset.loc['test']
    
    # преобразуем категориальные переменные при помощи TargetEncoder
    ce_target = ce.TargetEncoder()
    train_features = ce_target.fit_transform(train_features, train['SalePrice'])
    test_features = ce_target.transform(test_features)
    
    
    return train_features, test_features
    
    

In [120]:
trainP, testP = prepare_dataset(train, test)

In [78]:
X = trainP
y = np.log(train['SalePrice'])

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=200)

## Проверим несколько базовых моделей 

In [100]:
def get_score(preds, true, suff):    
    return {f'r2_{suff}': r2_score(preds, true),
            f'RMSE{suff}': np.sqrt(mean_squared_error(preds, true))}

def train_test(model, X_train, X_test, y_train, y_test):
    prediction_train = model.predict(X_train)
    print(model.__class__.__name__)
    prediction_test = model.predict(X_test)
    
    df = pd.DataFrame([{"model":  model.__class__.__name__,
                   **get_score(prediction_train, y_train, 'train'),
                   **get_score(prediction_test, y_test, 'test') 
                  }])
    print(df)

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=200)

for model_class in (AdaBoostRegressor,
                    ExtraTreesRegressor,
                    GradientBoostingRegressor,
                    RandomForestRegressor):
    mod = model_class().fit(X_train, y_train)
    train_test(mod, X_train, X_test, y_train, y_test)
    print()
    

AdaBoostRegressor
               model  r2_train  RMSEtrain   r2_test  RMSEtest
0  AdaBoostRegressor  0.854985   0.144918  0.815943  0.168833

ExtraTreesRegressor
                 model  r2_train  RMSEtrain   r2_test  RMSEtest
0  ExtraTreesRegressor       1.0   0.000008  0.850061  0.153681

GradientBoostingRegressor
                       model  r2_train  RMSEtrain   r2_test  RMSEtest
0  GradientBoostingRegressor  0.955743    0.08048  0.881299  0.136816

RandomForestRegressor
                   model  r2_train  RMSEtrain  r2_test  RMSEtest
0  RandomForestRegressor  0.981528    0.05153  0.83977  0.156362



## Самым перспективным выглядит GradientBoostingRegressor

## Настроим гиперпараметры:

In [103]:
def rmse_sklearn(truth, predictions):
    return mean_squared_error(truth, predictions) ** 0.5

rmse_scorer = make_scorer(rmse_sklearn, greater_is_better=True, needs_proba=False)


In [88]:
def objective(params):
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']),
              'max_features': params['max_features'],
              'learning_rate': params['learning_rate'],
             }
    
    gb = GradientBoostingRegressor(**params)
    score = cross_val_score(gb, X, y, scoring=rmse_scorer, cv=5).mean()
    print("RMSE {:.3f} params {}".format(score, params))
    return score

space_gb = {
    'n_estimators': hp.quniform('n_estimators', 200, 400, 25),
    'max_depth': hp.quniform('max_depth', 2, 5, 1),
    'max_features': hp.quniform('max_features', 0.3, 0.6, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0.04, 0.06, 0.005)
}

best = fmin(fn=objective,
            space=space_gb,
            algo=tpe.suggest,
            max_evals=10)

RMSE 0.125 params {'n_estimators': 200, 'max_depth': 2, 'max_features': 0.5, 'learning_rate': 0.05}                    
RMSE 0.121 params {'n_estimators': 325, 'max_depth': 2, 'max_features': 0.55, 'learning_rate': 0.055}                  
RMSE 0.121 params {'n_estimators': 250, 'max_depth': 3, 'max_features': 0.55, 'learning_rate': 0.055}                  
RMSE 0.119 params {'n_estimators': 275, 'max_depth': 4, 'max_features': 0.45, 'learning_rate': 0.055}                  
RMSE 0.122 params {'n_estimators': 250, 'max_depth': 5, 'max_features': 0.55, 'learning_rate': 0.045}                  
RMSE 0.123 params {'n_estimators': 350, 'max_depth': 5, 'max_features': 0.5, 'learning_rate': 0.05}                    
RMSE 0.121 params {'n_estimators': 275, 'max_depth': 3, 'max_features': 0.45, 'learning_rate': 0.06}                   
RMSE 0.121 params {'n_estimators': 325, 'max_depth': 3, 'max_features': 0.6000000000000001, 'learning_rate': 0.055}    
RMSE 0.121 params {'n_estimators': 350, 

In [104]:
# лучшие параметры
best

{'learning_rate': 0.055,
 'max_depth': 4.0,
 'max_features': 0.45,
 'n_estimators': 275.0}

In [116]:
gb = GradientBoostingRegressor(learning_rate=0.055, max_depth=4, max_features=0.45, n_estimators=275)
res = cross_val_score(gb, X, y, cv=5, scoring='neg_root_mean_squared_error')
-res.mean(), res.std()

(0.11914520115408318, 0.008251187553808005)

## Проверили, убедились при помощи кросс-валидации в устойчивости результатов
## Значение RMSE устойчиво на 0.12