In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Homework description

Check the models for overfitting: **OVERFIT MODELS!**
1. RandomForest with max_depth: 3, 5, 7, 9
2. GB with max_depth: 3, 5, 7, 9; estimators = 1000, 2000, 3000, ...
3. Create n-layer Dense network (examples n=3, 4, ..., neurons=1024, 2048, ...)

Evaluate the models on the test set. Do they overfit?

Prevent overfitting and obtain an underfit on the training set:
1. Train RF, GB with max_depth in (1, 2, 3), estimators in (10, 25, 35)
2. Train NN with n=1, neurons=10

Try some methods to obtain hyperparameters (max_depth, n, estimators, neurons, ...)
- GridSearch
- Hyperopt, skopt

Have you received good results?

In [2]:
# Load the competition's train and test tables in one pass.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [3]:
# Regression target: natural log of SalePrice (the final cell applies np.exp to predictions to undo it).
y_train = np.log(train_data['SalePrice'])

In [4]:
# Feature set: every int64/float64 column except the target and the row identifier.
numeric_dtypes = (np.int64, np.float64)
excluded_columns = ('SalePrice', 'Id')
numeric_columns = [
    name
    for name, dtype in train_data.dtypes.items()
    if dtype in numeric_dtypes and name not in excluded_columns
]

# Missing values are replaced by the sentinel -1 in both feature tables.
x_train = train_data[numeric_columns].fillna(-1)
x_test = test_data[numeric_columns].fillna(-1)

In [2]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor


In [4]:
def rmse(a, b):
    """Return the root-mean-square error between ``a`` and ``b``.

    Both arguments must support element-wise subtraction and the result
    must expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    squared_error = (a - b) ** 2
    mean_squared_error = squared_error.mean()
    return mean_squared_error ** 0.5

The overfit and underfit tasks below use only the default numeric columns.

**Overfit**

In [39]:
# Overfit experiment for random forests: for each max_depth, repeat a
# train/validation split K times and record the RMSE on both parts.
rf_metrics = []
max_depths = [3, 5, 7, 9]

K = 10

for max_depth in tqdm(max_depths):
    for k in range(K):
        # A different split seed per repetition gives K resamples per setting.
        x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=k)
        # Fit exactly once (the model used to be fitted twice in a row) and seed
        # the forest so a re-run of the cell reproduces the same scores.
        model = RandomForestRegressor(max_depth=max_depth, random_state=k).fit(x_tr, y_tr)
        y_pred_tr = model.predict(x_tr)
        y_pred_val = model.predict(x_val)
        # NOTE: the '*_mse' keys actually hold RMSE values; the key names are
        # kept because check_overfit looks them up by these names.
        rf_metrics.append({
            'algorithm' : f'RandomForest: max_depth={max_depth}',
            'train_mse' : rmse(y_tr, y_pred_tr),
            'val_mse' : rmse(y_val, y_pred_val)
        })

In [11]:
from scipy.stats import norm
# Significance level handed to check_overfit; 0.05 corresponds to 95% confidence intervals.
alpha = 0.05

def check_overfit(metrics, a):
    """Summarise repeated train/validation scores and flag overfitting per algorithm.

    Parameters
    ----------
    metrics : list of dict
        One record per fitted model with the keys 'algorithm', 'train_mse'
        and 'val_mse'.
    a : float
        Significance level; the confidence intervals have level ``1 - a``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm holding mean/std/count of both scores, the
        bounds of the normal-approximation confidence interval of each mean
        ('conf_inter_train_left/right', 'conf_inter_val_left/right') and the
        boolean column 'overfitted_(no_intersection)', which is True when the
        train and validation intervals of that algorithm do not overlap.
    """
    metrics_df = pd.DataFrame(metrics)
    grouped_metrics_df = metrics_df.groupby(['algorithm']).agg(['mean', 'std', 'count'])

    # Two-sided standard-normal quantile for the requested confidence level.
    xi = norm.ppf(1 - a / 2)

    def _interval(score_name):
        """Return (left, right) bounds of mean +/- xi * std / sqrt(count) per algorithm."""
        stats = grouped_metrics_df[score_name]
        mean = stats['mean'].to_numpy()
        half_width = xi * stats['std'].to_numpy() / stats['count'].to_numpy() ** 0.5
        return mean - half_width, mean + half_width

    # Compute every bound before adding columns to the grouped frame.
    train_left, train_right = _interval('train_mse')
    val_left, val_right = _interval('val_mse')

    grouped_metrics_df['conf_inter_train_left'] = train_left
    grouped_metrics_df['conf_inter_train_right'] = train_right
    grouped_metrics_df['conf_inter_val_left'] = val_left
    grouped_metrics_df['conf_inter_val_right'] = val_right

    # Row-wise overlap test. Previously only the first algorithm's intervals
    # were compared and that single verdict was copied to every row.
    intersection_start = np.maximum(train_left, val_left)
    intersection_end = np.minimum(train_right, val_right)
    grouped_metrics_df['overfitted_(no_intersection)'] = intersection_start > intersection_end

    return grouped_metrics_df

In [35]:
# Aggregate the per-split random-forest scores with check_overfit and display the summary table.
rf_metrics_df = check_overfit(rf_metrics, alpha)
rf_metrics_df

In [41]:
# Overfit experiment for gradient boosting: grid over max_depth and
# n_estimators, K repeated train/validation splits per setting.
gb_metrics = []
max_depths = [3, 5, 7, 9]
estimators = [1000, 2000, 3000, 4000]

K = 5

for max_depth in tqdm(max_depths):
    for n_estimators in tqdm(estimators):
        for k in range(K):
            x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=k)
            # Fit exactly once: the model used to be trained twice in a row,
            # doubling the run time of these very large ensembles. The seed
            # makes a re-run of the cell reproducible.
            model = GradientBoostingRegressor(
                max_depth=max_depth, n_estimators=n_estimators, random_state=k
            ).fit(x_tr, y_tr)
            y_pred_tr = model.predict(x_tr)
            y_pred_val = model.predict(x_val)
            # NOTE: the '*_mse' keys hold RMSE values; names kept for check_overfit.
            gb_metrics.append({
                'algorithm' : f'GradientBoostingRegressor: max_depth={max_depth}, n_estimators={n_estimators}',
                'train_mse' : rmse(y_tr, y_pred_tr),
                'val_mse' : rmse(y_val, y_pred_val)
            })

In [42]:
# Aggregate the per-split gradient-boosting scores with check_overfit and display the summary table.
gb_metrics_df = check_overfit(gb_metrics, alpha)
gb_metrics_df

In [52]:
# Overfit experiment for dense networks: n hidden layers of equal width,
# K repeated train/validation splits per architecture.
nn_metrics = []
number_of_layers = [3, 4]
number_of_neurons = [1024, 2048]

K = 5

for n in number_of_layers:
    for neurons in number_of_neurons:
        # Same width for each of the n hidden layers.
        hidden_layer_sizes = (neurons,) * n
        for k in tqdm(range(K)):
            x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=k)
            # Fit exactly once (the network used to be trained twice in a row)
            # and seed it so a re-run of the cell reproduces the scores.
            model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, random_state=k).fit(x_tr, y_tr)
            y_pred_tr = model.predict(x_tr)
            y_pred_val = model.predict(x_val)
            # NOTE: the '*_mse' keys hold RMSE values; names kept for check_overfit.
            nn_metrics.append({
                'algorithm' : f'MLPRegressor: hidden_layer_sizes={hidden_layer_sizes}',
                'train_mse' : rmse(y_tr, y_pred_tr),
                'val_mse' : rmse(y_val, y_pred_val)
            })

nn_metrics_df = check_overfit(nn_metrics, alpha)
nn_metrics_df

**Underfit**

In [16]:
# Underfit experiment for random forests: shallow trees and few estimators,
# K repeated train/validation splits per setting.
rf_metrics_underfit = []
max_depths = [1, 2, 3]
estimators = [10, 25, 35]

K = 10

for max_depth in tqdm(max_depths):
    for n_estimators in tqdm(estimators):
        for k in range(K):
            x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=k)
            # Fit exactly once (the model used to be fitted twice in a row)
            # and seed it so a re-run of the cell reproduces the scores.
            model = RandomForestRegressor(
                n_estimators=n_estimators, max_depth=max_depth, random_state=k
            ).fit(x_tr, y_tr)
            y_pred_tr = model.predict(x_tr)
            y_pred_val = model.predict(x_val)
            # NOTE: the '*_mse' keys hold RMSE values; names kept for check_overfit.
            rf_metrics_underfit.append({
                'algorithm' : f'RandomForest: n_estimators={n_estimators}, max_depth={max_depth}',
                'train_mse' : rmse(y_tr, y_pred_tr),
                'val_mse' : rmse(y_val, y_pred_val)
            })

rf_metrics_underfit_df = check_overfit(rf_metrics_underfit, alpha)
rf_metrics_underfit_df

In [22]:
# Visual check of the shallow random forests: one panel per
# (max_depth, n_estimators) pair showing predictions against the true
# targets for the first 50 validation rows.
max_depths = [1, 2, 3]
estimators = [10, 25, 35]

plt.rcParams["figure.figsize"] = (20,20)

fig, axs = plt.subplots(len(max_depths), len(estimators))

sample_ids = np.arange(start=1, stop=51)

for i, max_depth in enumerate(tqdm(max_depths)):
    for j, n_estimators in enumerate(tqdm(estimators)):
        x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=j*(i+1))
        # Fit exactly once; the model used to be fitted twice in a row.
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth).fit(x_tr, y_tr)
        # Predict on the validation rows so both curves describe the same
        # houses (train predictions used to be drawn against validation targets).
        y_pred_val = model.predict(x_val)
        axs[i, j].plot(sample_ids, y_pred_val[:50], label='prediction')
        axs[i, j].plot(sample_ids, y_val.iloc[:50], label='validation')
        axs[i, j].set_title(f'max_depth={max_depth}, n_estimators={n_estimators}')
        # Attach the legend to this panel; plt.legend() only targets the current axes.
        axs[i, j].legend()

In [13]:
# Underfit experiment for gradient boosting: shallow trees and few estimators,
# K repeated train/validation splits per setting.
gb_metrics_underfit = []
max_depths = [1, 2, 3]
estimators = [10, 25, 35]

K = 10

for max_depth in tqdm(max_depths):
    for n_estimators in tqdm(estimators):
        for k in range(K):
            x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=k)
            # Fit exactly once (the model used to be fitted twice in a row)
            # and seed it so a re-run of the cell reproduces the scores.
            model = GradientBoostingRegressor(
                max_depth=max_depth, n_estimators=n_estimators, random_state=k
            ).fit(x_tr, y_tr)
            y_pred_tr = model.predict(x_tr)
            y_pred_val = model.predict(x_val)
            # NOTE: the '*_mse' keys hold RMSE values; names kept for check_overfit.
            gb_metrics_underfit.append({
                'algorithm' : f'GradientBoostingRegressor: max_depth={max_depth}, n_estimators={n_estimators}',
                'train_mse' : rmse(y_tr, y_pred_tr),
                'val_mse' : rmse(y_val, y_pred_val)
            })

gb_metrics_underfit_df = check_overfit(gb_metrics_underfit, alpha)
gb_metrics_underfit_df

In [23]:
# Visual check of the shallow gradient-boosting models: one panel per
# (max_depth, n_estimators) pair showing predictions against the true
# targets for the first 50 validation rows.
# The grid is defined here so the cell does not depend on values left over
# from another cell.
max_depths = [1, 2, 3]
estimators = [10, 25, 35]

plt.rcParams["figure.figsize"] = (20,20)

fig, axs = plt.subplots(len(max_depths), len(estimators))

sample_ids = np.arange(start=1, stop=51)

for i, max_depth in enumerate(tqdm(max_depths)):
    for j, n_estimators in enumerate(tqdm(estimators)):
        x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=j*(i+1))
        # Fit exactly once; the model used to be fitted twice in a row.
        model = GradientBoostingRegressor(max_depth=max_depth, n_estimators=n_estimators).fit(x_tr, y_tr)
        # Predict on the validation rows so both curves describe the same
        # houses (train predictions used to be drawn against validation targets).
        y_pred_val = model.predict(x_val)
        axs[i, j].plot(sample_ids, y_pred_val[:50], label='prediction')
        axs[i, j].plot(sample_ids, y_val.iloc[:50], label='validation')
        axs[i, j].set_title(f'max_depth={max_depth}, n_estimators={n_estimators}')
        # Attach the legend to this panel; plt.legend() only targets the current axes.
        axs[i, j].legend()

In [15]:
# Underfit experiment for a tiny network: a single hidden layer of 10 neurons,
# K repeated train/validation splits.
nn_metrics_underfit = []
hidden_layer_sizes = (10, )
K = 10

for k in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=k)
    # Fit exactly once (the network used to be trained twice in a row)
    # and seed it so a re-run of the cell reproduces the scores.
    model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, random_state=k).fit(x_tr, y_tr)
    y_pred_tr = model.predict(x_tr)
    y_pred_val = model.predict(x_val)
    # NOTE: the '*_mse' keys hold RMSE values; names kept for check_overfit.
    nn_metrics_underfit.append({
        'algorithm' : f'MLPRegressor: hidden_layer_sizes={hidden_layer_sizes}',
        'train_mse' : rmse(y_tr, y_pred_tr),
        'val_mse' : rmse(y_val, y_pred_val)
    })

nn_metrics_underfit_df = check_overfit(nn_metrics_underfit, alpha)
nn_metrics_underfit_df

In [24]:
# Visual check of the tiny network: predictions against the true targets
# for the first 200 validation rows.
plt.rcParams["figure.figsize"] = (15, 15)

hidden_layer_sizes = (10, )

x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, random_state=42)
# Fit exactly once; the network used to be trained twice in a row.
model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes).fit(x_tr, y_tr)
# Predict on the validation rows so both curves describe the same houses
# (train predictions used to be drawn against validation targets).
y_pred_val = model.predict(x_val)
sample_ids = np.arange(start=1, stop=201)
plt.plot(sample_ids, y_pred_val[:200], label='prediction')
plt.plot(sample_ids, y_val.iloc[:200], label='validation')
plt.title(f'hidden_layer_sizes={hidden_layer_sizes}')
plt.legend()
plt.show()

In [14]:
from catboost import CatBoostRegressor
from catboost import Pool
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from sklearn.model_selection import KFold

In [20]:
import time

In [35]:
# Time a default CatBoost fit on the raw numeric columns.
# Note: x_train is rebound here without the fillna(-1) applied earlier in the notebook.
x_train = train_data[numeric_columns]

model_cpu = CatBoostRegressor()
# perf_counter is a monotonic high-resolution clock, so the measured interval
# is not affected by system clock adjustments the way time.time() can be.
start = time.perf_counter()
model_cpu.fit(x_train, y_train, verbose=False)
end = time.perf_counter()
print('CPU time: ', end - start)

In [18]:
# Time the same CatBoost fit with task_type='GPU' for comparison with the CPU run.
model_gpu = CatBoostRegressor(task_type='GPU')
# perf_counter is a monotonic high-resolution clock, so the measured interval
# is not affected by system clock adjustments the way time.time() can be.
start = time.perf_counter()
model_gpu.fit(x_train, y_train, verbose=False)
end = time.perf_counter()
print('GPU time: ', end - start)

**Preprocessing: convert some of the numerical columns to categorical ones**

**hyperopt**

In [5]:
# Reload the raw tables so the preprocessing below starts from unmodified data.
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
y_train = np.log(train_data['SalePrice'])

numeric_dtypes = (np.int64, np.float64)
numeric_columns = [
    name
    for name, dtype in train_data.dtypes.items()
    if dtype in numeric_dtypes and name not in ('SalePrice', 'Id')
]

# Number of distinct values per numeric column (NaN counted as a value).
for column in numeric_columns:
    distinct_values = train_data[column].nunique(dropna=False)
    print(f'{column}: ', distinct_values)

The columns to convert were picked in a naive way.

In [6]:
# Numeric columns that will be treated as categorical features.
columns_to_categorical = [
    'MSSubClass',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'LowQualFinSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt',
    'GarageCars',
    'PoolArea',
    'MoSold',
    'YrSold',
]
train_data[columns_to_categorical]

In [7]:
# Replace missing garage years with the sentinel -1, then store the column as integers.
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(-1).astype(int)
train_data['GarageYrBlt']

In [8]:
# Turn the selected numeric columns into object columns so they count as categorical.
train_data[columns_to_categorical] = train_data[columns_to_categorical].astype('object')

# Split the columns by dtype: int64/float64 are numeric, everything else is categorical.
numeric_dtypes = [np.int64, np.float64]
categorical_columns = [
    name for name, dtype in train_data.dtypes.items() if dtype not in numeric_dtypes
]
numeric_columns = [
    name
    for name, dtype in train_data.dtypes.items()
    if dtype in numeric_dtypes and name not in ['SalePrice', 'Id']
]

# Fill missing values: -1 for numeric features, "Other" for categorical ones.
train_data[numeric_columns] = train_data[numeric_columns].fillna(-1)
train_data[categorical_columns] = train_data[categorical_columns].fillna("Other")

# Show the columns that still contain missing values, if any.
missing_counts = train_data.isna().sum()
missing_counts[missing_counts > 0]

In [16]:
# Training features for CatBoost: numeric columns first, then the categorical ones.
x_train = train_data[numeric_columns + categorical_columns]

In [94]:
# Hyperopt search space for the CatBoost hyperparameters; each entry is a
# quantised uniform distribution given as (label, low, high, step).
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 5),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.1, 0.01),
    'depth': hp.quniform('depth', 3, 10, 1),
}

# 5-fold cross-validation splitter used inside the objective.
kf = KFold(n_splits=5)

# RMSE is the default loss function
def objective(space):
    """Return the mean cross-validated RMSE of CatBoost for one hyperopt sample.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters, forwarded as keyword arguments to
        ``CatBoostRegressor``.

    Returns
    -------
    float
        Mean of the validation RMSE over the folds produced by ``kf``;
        this is the loss that hyperopt minimises.
    """
    params = dict(space)
    # hp.quniform samples floats (e.g. 465.0); tree count and depth are
    # integer-valued, so cast them before they reach CatBoost.
    for int_param in ('n_estimators', 'depth'):
        if int_param in params:
            params[int_param] = int(params[int_param])

    rmses = []
    for tr_index, val_index in kf.split(x_train):
        x_tr, y_tr = x_train.iloc[tr_index], y_train.iloc[tr_index]
        x_val, y_val = x_train.iloc[val_index], y_train.iloc[val_index]
        pool_tr = Pool(x_tr, y_tr, cat_features=categorical_columns)
        pool_val = Pool(x_val, cat_features=categorical_columns)
        model = CatBoostRegressor(**params, verbose=False)
        model.fit(pool_tr)
        y_pred = model.predict(pool_val)
        rmses.append(rmse(y_val, y_pred))

    return np.mean(rmses)


# Run 50 TPE evaluations of the objective, keeping every result in `trials`.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

# Best sampled hyperparameters, followed by the full evaluation history.
print(best)
print(trials.results)

In [9]:
# Inspect the to-be-categorical columns of the test table before converting them.
test_data[columns_to_categorical]

In [10]:
# Columns stored as float64 in the test table get the sentinel -1 for
# missing values and are then stored as integers.
for column in columns_to_categorical:
    if test_data[column].dtype != 'float64':
        continue
    test_data[column] = test_data[column].fillna(-1).astype('int')

In [11]:
# Apply the training-set preprocessing to the test table: object dtype for the
# chosen columns, then -1 / "Other" for missing numeric / categorical values.
test_data[columns_to_categorical] = test_data[columns_to_categorical].astype('object')
for column_group, fill_value in ((numeric_columns, -1), (categorical_columns, "Other")):
    test_data[column_group] = test_data[column_group].fillna(fill_value)

In [12]:
# Display the resulting dtypes of the test table after the conversions above.
test_data.dtypes

In [17]:
# Test features in the same column order as x_train.
x_test = test_data[numeric_columns + categorical_columns]

pool_train = Pool(x_train, y_train, cat_features=categorical_columns)
pool_test = Pool(x_test, cat_features=categorical_columns)

# Hyperparameters are hard-coded here (presumably taken from the hyperopt run - TODO confirm).
model = CatBoostRegressor(verbose=False, depth=6, n_estimators=465, learning_rate=0.06)
model.fit(pool_train)
y_pred = model.predict(pool_test)

# The model predicts log prices, so np.exp maps them back to SalePrice.
submit = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': np.exp(y_pred)})
submit

In [163]:
# Write the submission file to the Kaggle working directory, without the DataFrame index.
submit.to_csv('/kaggle/working/catboost_hyperopt_num_to_cat.csv', index=False)