In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Homework description

1. Take data from house pricing (kaggle)
2. Prepare metrics. Using bootstrap fit K=10 models with different hyper parameters
    - fit K linear regressions per several hyperparameters
    - K gradient boostings per several hyperparameters
    - K neural networks per several hyperparameters
1. For each model and hyperparameter setup estimate confidence interval.
2. Create a rate dashboard using avg metrics (train or validation avg metric). Take top-2 models and answer questions:
    - do for 1-st model train and validation intervals intersect each other?
    - do for 1-st and 2-nd model tr. intervals intersect each other?
    - do for 1-st and 2-nd model val. intervals intersect each other?
1. Apply t-test to 1-st model (train-validation metrics). Apply same test for 1-st and 2-nd train metrics (for validation too)
2. Which model is the best? Does its CI intersect the other CIs?


In [4]:
# Load the train and test splits of the House Prices competition from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)

Features: kept simple - only float and int columns are used (data preprocessing was not part of this homework, nor of any other homework in this module of the academy)

In [5]:
# Keep every int64/float64 column of the training frame except the target and the row id.
numeric_columns = []
for column_name, column_dtype in train_data.dtypes.items():
    is_numeric = column_dtype in [np.int64, np.float64]
    if is_numeric and column_name not in ['SalePrice', 'Id']:
        numeric_columns.append(column_name)

# Missing values are replaced by the sentinel -1 in both splits.
x_train = train_data[numeric_columns].fillna(-1)
x_test = test_data[numeric_columns].fillna(-1)

In [6]:
import matplotlib.pyplot as plt

# The models are fitted on log(SalePrice); predictions are mapped back with np.exp
# before the submission file is built.
y_train = np.log(train_data['SalePrice'])

# Explicit figure/axes with a title and axis labels so the plot stands on its own.
fig, ax = plt.subplots()
ax.hist(y_train)
ax.set(
    title='Distribution of log(SalePrice)',
    xlabel='log(SalePrice)',
    ylabel='Number of houses',
)
plt.show()

Why was the target log-transformed?
Because without the logarithm, parametric models such as linear regression or a neural network can predict sale prices (the target) that are smaller than 0 at the test stage

In [5]:
def rmse(a, b):
    """Return the root-mean-squared error between two sets of values.

    Both inputs are converted to float NumPy arrays first, so that:
      * plain Python lists/tuples are accepted (subtraction of two lists
        would otherwise raise ``TypeError``);
      * pandas objects are compared by position rather than by index label,
        which avoids silent NaNs when two Series carry different indexes.

    Parameters
    ----------
    a, b : array-like or scalar
        Predictions and targets. They must be broadcastable against each
        other (normally: same length).

    Returns
    -------
    numpy.float64
        ``sqrt(mean((a - b) ** 2))``.
    """
    errors = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

**Linear Regression (Elastic Net)**

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from tqdm import tqdm

# Number of random train/validation resplits evaluated per configuration.
K = 10

# Each entry is an [alpha, l1_ratio] pair passed to ElasticNet.
hyperparameters = [
    [1, 0.5],
    [1, 1],
    [1, 0],
    [1, 0.25],
    [1, 0.75],
    [10, 0.5],
    [10, 0.25],
    [10, 0.75],
    [10, 0],
    [10, 1]
]

preds_train = []
preds_validation = []
lr_metrics = []

for hyperparameters_i in tqdm(hyperparameters):
    alpha, l1_ratio = hyperparameters_i
    for k in range(K):
        # The split is seeded with k, so every configuration sees the same K splits.
        x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        lr.fit(x_tr, y_tr)

        preds_tr, preds_val = lr.predict(x_tr), lr.predict(x_val)
        preds_train.append(preds_tr)
        preds_validation.append(preds_val)

        # The keys say "mse" but the stored values are RMSE (see rmse());
        # the key names are kept because later cells look them up.
        record = {
            'algorithm' : f'ElasticNet: alpha={hyperparameters_i[0]}, l1_ratio={hyperparameters_i[1]}',
            'train_mse' : rmse(preds_tr, y_tr),
            'val_mse' : rmse(preds_val, y_val),
        }
        lr_metrics.append(record)

In [7]:
# One row per (configuration, split) with the train and validation metric.
lr_metrics_df = pd.DataFrame(data=lr_metrics)
lr_metrics_df.head(n=20)

In [8]:
# Aggregate the per-split metrics of each configuration into mean / std / count.
lr_runs_by_algorithm = lr_metrics_df.groupby(['algorithm'])
grouped_lr_metrics_df = lr_runs_by_algorithm.agg(['mean', 'std', 'count'])
grouped_lr_metrics_df.head()

In [9]:
from scipy.stats import norm
a = 0.05
# Two-sided quantile of the standard normal for a 95% confidence interval.
# NOTE(review): each mean is estimated from only `count` resplits; a Student-t
# quantile would give somewhat wider intervals. The normal quantile is kept
# because later cells reuse `xi`.
xi = norm.ppf(1 - a / 2)

lr_confidence_intervals_train = []
lr_confidence_intervals_val = []

# Iterate over the rows of the aggregated frame itself rather than over
# range(len(hyperparameters)): `hyperparameters` is a global that other cells
# reassign, so relying on its length breaks when cells are re-run out of order.
for algorithm_name, stats in grouped_lr_metrics_df.iterrows():
    train_stats = stats['train_mse']
    val_stats = stats['val_mse']

    # Interval = mean -/+ xi * std / sqrt(count), stored as array([lower, upper]).
    lr_confidence_intervals_train.append(
        np.array([-1, 1]) * xi * train_stats['std'] / train_stats['count'] ** 0.5 + train_stats['mean']
    )
    lr_confidence_intervals_val.append(
        np.array([-1, 1]) * xi * val_stats['std'] / val_stats['count'] ** 0.5 + val_stats['mean']
    )

In [10]:
# Show the train-metric confidence intervals: one array([lower, upper]) per ElasticNet configuration.
lr_confidence_intervals_train

In [11]:
# Show the validation-metric confidence intervals: one array([lower, upper]) per ElasticNet configuration.
lr_confidence_intervals_val

In [12]:
# Attach the lower (index 0) and upper (index 1) interval bounds as columns of the aggregated frame.
for column_name, intervals, bound_position in (
    ('conf_inter_train_left', lr_confidence_intervals_train, 0),
    ('conf_inter_train_right', lr_confidence_intervals_train, 1),
    ('conf_inter_val_left', lr_confidence_intervals_val, 0),
    ('conf_inter_val_right', lr_confidence_intervals_val, 1),
):
    grouped_lr_metrics_df[column_name] = [interval[bound_position] for interval in intervals]

Confidence intervals help to estimate quality of metrics and build statistical tests

In [13]:
# Show the aggregated ElasticNet metrics table.
grouped_lr_metrics_df

**Gradient Boosting**

In [14]:
import lightgbm as lgbm

# Each entry is an [n_estimators, max_depth] pair passed to LGBMRegressor.
hyperparameters = [
    [1000, 3],
    [2000, 3],
    [1000, 4],
    [2000, 4],
    [1000, 5],
    [2000, 5],
    [1000, 6],
    [2000, 6],
    [1000, 7],
    [2000, 7]
]

preds_train = []
preds_validation = []
gb_metrics = []

for hyperparameters_i in tqdm(hyperparameters):
    n_estimators, max_depth = hyperparameters_i
    for k in range(K):
        # The split is seeded with k, so every configuration sees the same K splits.
        x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

        lgbm_model = lgbm.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth)
        lgbm_model.fit(x_tr, y_tr)

        preds_tr, preds_val = lgbm_model.predict(x_tr), lgbm_model.predict(x_val)
        preds_train.append(preds_tr)
        preds_validation.append(preds_val)

        # The keys say "mse" but the stored values are RMSE (see rmse());
        # the key names are kept because later cells look them up.
        record = {
            'algorithm' : f'LightGBM: n_estimators={hyperparameters_i[0]}, max_depth={hyperparameters_i[1]}',
            'train_mse' : rmse(preds_tr, y_tr),
            'val_mse' : rmse(preds_val, y_val),
        }
        gb_metrics.append(record)

In [15]:
# One row per (configuration, split) with the train and validation metric.
gb_metrics_df = pd.DataFrame(data=gb_metrics)
gb_metrics_df.head(n=20)

In [16]:
# Aggregate the per-split metrics of each configuration into mean / std / count.
gb_runs_by_algorithm = gb_metrics_df.groupby(['algorithm'])
grouped_gb_metrics_df = gb_runs_by_algorithm.agg(['mean', 'std', 'count'])
grouped_gb_metrics_df.head()

In [17]:
gb_confidence_intervals_train = []
gb_confidence_intervals_val = []

# Iterate over the rows of the aggregated frame itself rather than over
# range(len(hyperparameters)): `hyperparameters` is a global that other cells
# reassign, so relying on its length breaks when cells are re-run out of order.
for algorithm_name, stats in grouped_gb_metrics_df.iterrows():
    train_stats = stats['train_mse']
    val_stats = stats['val_mse']

    # Interval = mean -/+ xi * std / sqrt(count), stored as array([lower, upper]);
    # `xi` is the normal quantile defined in the ElasticNet interval cell.
    gb_confidence_intervals_train.append(
        np.array([-1, 1]) * xi * train_stats['std'] / train_stats['count'] ** 0.5 + train_stats['mean']
    )
    gb_confidence_intervals_val.append(
        np.array([-1, 1]) * xi * val_stats['std'] / val_stats['count'] ** 0.5 + val_stats['mean']
    )

In [18]:
# Attach the lower (index 0) and upper (index 1) interval bounds as columns of the aggregated frame.
for column_name, intervals, bound_position in (
    ('conf_inter_train_left', gb_confidence_intervals_train, 0),
    ('conf_inter_train_right', gb_confidence_intervals_train, 1),
    ('conf_inter_val_left', gb_confidence_intervals_val, 0),
    ('conf_inter_val_right', gb_confidence_intervals_val, 1),
):
    grouped_gb_metrics_df[column_name] = [interval[bound_position] for interval in intervals]

grouped_gb_metrics_df

A simple MLPRegressor (nothing more advanced such as Keras), just to meet the requirements - a small number of iterations and layers to avoid spending too much time on training

In [19]:
from sklearn.neural_network import MLPRegressor

preds_train = []
preds_validation = []

mlp_metrics = []

# Upper bound on the number of training iterations of every MLP.
MAX_ITER = 1000

# Each entry is a hidden_layer_sizes tuple passed to MLPRegressor.
hyperparameters = [
    (2000, 2000),
    (2000, 2000, 2000),
]

for hyperparameters_i in hyperparameters:
    for k in tqdm(range(K)):
        # The split is seeded with k, so every configuration sees the same K splits.
        x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=k)

        # random_state seeds the weight initialisation and batch shuffling; without
        # it every run yields different metrics, so the confidence intervals and the
        # model ranking below could not be reproduced.
        mlp_model = MLPRegressor(
            hidden_layer_sizes=hyperparameters_i,
            max_iter=MAX_ITER,
            random_state=k,
        ).fit(x_tr, y_tr)

        preds_tr = mlp_model.predict(x_tr)
        preds_val = mlp_model.predict(x_val)

        preds_train.append(preds_tr)
        preds_validation.append(preds_val)

        # Named *_mse for consistency with the dict keys, but the values are RMSE (see rmse()).
        train_mse = rmse(preds_tr, y_tr)
        val_mse = rmse(preds_val, y_val)

        mlp_metrics.append({
            'algorithm' : f'MLPRegressor: hidden_layer_sizes={hyperparameters_i}, max_iter={MAX_ITER}',
            'train_mse' : train_mse,
            'val_mse' : val_mse,
        })

In [20]:
# One row per (configuration, split) with the train and validation metric.
mlp_metrics_df = pd.DataFrame(data=mlp_metrics)
mlp_metrics_df.head(n=20)

In [26]:
# Aggregate the per-split metrics of each configuration into mean / std / count.
mlp_runs_by_algorithm = mlp_metrics_df.groupby(['algorithm'])
grouped_mlp_metrics_df = mlp_runs_by_algorithm.agg(['mean', 'std', 'count'])
grouped_mlp_metrics_df.head()

In [27]:
mlp_confidence_intervals_train = []
mlp_confidence_intervals_val = []

# Iterate over the rows of the aggregated frame itself rather than over
# range(len(hyperparameters)): `hyperparameters` is a global that other cells
# reassign, so relying on its length breaks when cells are re-run out of order.
for algorithm_name, stats in grouped_mlp_metrics_df.iterrows():
    train_stats = stats['train_mse']
    val_stats = stats['val_mse']

    # Interval = mean -/+ xi * std / sqrt(count), stored as array([lower, upper]);
    # `xi` is the normal quantile defined in the ElasticNet interval cell.
    mlp_confidence_intervals_train.append(
        np.array([-1, 1]) * xi * train_stats['std'] / train_stats['count'] ** 0.5 + train_stats['mean']
    )
    mlp_confidence_intervals_val.append(
        np.array([-1, 1]) * xi * val_stats['std'] / val_stats['count'] ** 0.5 + val_stats['mean']
    )

In [30]:
# Attach the lower (index 0) and upper (index 1) interval bounds as columns of the aggregated frame.
for column_name, intervals, bound_position in (
    ('conf_inter_train_left', mlp_confidence_intervals_train, 0),
    ('conf_inter_train_right', mlp_confidence_intervals_train, 1),
    ('conf_inter_val_left', mlp_confidence_intervals_val, 0),
    ('conf_inter_val_right', mlp_confidence_intervals_val, 1),
):
    grouped_mlp_metrics_df[column_name] = [interval[bound_position] for interval in intervals]

grouped_mlp_metrics_df

In [31]:
# Stack the aggregated tables of the three model families into one dashboard.
tables_per_model_family = [grouped_lr_metrics_df, grouped_gb_metrics_df, grouped_mlp_metrics_df]
grouped_metrics_df = pd.concat(tables_per_model_family, axis=0)
grouped_metrics_df

In [32]:
# Rank all configurations by their mean validation metric (ascending, so the best comes first).
ranking_column = ('val_mse', 'mean')
sorted_metrics_df = grouped_metrics_df.sort_values(by=ranking_column)
sorted_metrics_df

In [25]:
# The two best configurations by mean validation metric.
sorted_metrics_df.head(2)

- do the 1-st model's train and validation intervals intersect each other?

No - that suggests that 1-st model is probably overfitted

- do the 1-st and 2-nd models' train intervals intersect each other?

No

- do the 1-st and 2-nd models' val intervals intersect each other?

Yes

In [33]:
from scipy.stats import ttest_rel

In [39]:
# Per-split metrics of the two top-ranked LightGBM configurations, collected on
# identical splits so that the samples are paired for the t-tests below.
first_model_train_mse, first_model_val_mse = [], []
second_model_train_mse, second_model_val_mse = [], []

first_model_val_preds = []

for split_seed in tqdm(range(K)):
    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, shuffle=True, random_state=split_seed)

    # 1-st model: n_estimators=1000, max_depth=3
    first_model = lgbm.LGBMRegressor(n_estimators=1000, max_depth=3)
    first_model.fit(x_tr, y_tr)

    first_train_pred = first_model.predict(x_tr)
    first_val_pred = first_model.predict(x_val)

    first_model_val_preds.append(first_val_pred)
    first_model_train_mse.append(rmse(first_train_pred, y_tr))
    first_model_val_mse.append(rmse(first_val_pred, y_val))

    # 2-nd model: n_estimators=1000, max_depth=7
    second_model = lgbm.LGBMRegressor(n_estimators=1000, max_depth=7)
    second_model.fit(x_tr, y_tr)

    second_model_train_mse.append(rmse(second_model.predict(x_tr), y_tr))
    second_model_val_mse.append(rmse(second_model.predict(x_val), y_val))

Student's test - test for mean difference

Example use cases:
- test whether the average (evaluation) quality of model-1 equals the average (evaluation) quality of model-2

So for alpha=0.05:
- if the p-value obtained from Student's test on test m1 vs test m2 is lower than 0.05, we reject the hypothesis of equal means and can say that their test performance is different
- if the p-value is 0.05 or higher, we cannot reject the hypothesis of equal means: there is no statistically significant evidence that their test performance differs (which is not a proof that it is the same)
- the same applies to train

In [36]:
# Paired t-test: 1-st model's train metric vs its validation metric over the same K splits.
ttest_rel(first_model_train_mse, first_model_val_mse)

In [37]:
# Paired t-test: train metric of the 1-st model vs the 2-nd model over the same K splits.
ttest_rel(first_model_train_mse, second_model_train_mse)

In [38]:
# Paired t-test: validation metric of the 1-st model vs the 2-nd model over the same K splits.
ttest_rel(first_model_val_mse, second_model_val_mse)

In [41]:
# Refit the selected configuration on the full training set and predict the test set.
final_model = lgbm.LGBMRegressor(n_estimators=1000, max_depth=3)
final_model.fit(x_train, y_train)
y_pred = final_model.predict(x_test)

# The model was trained on log(SalePrice), so predictions are exponentiated back to prices.
submit = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': np.exp(y_pred),
})

submit

In [44]:
# Write the submission (Id, SalePrice) to the Kaggle working directory, without the index column.
submit.to_csv('/kaggle/working/lgb_estimators_1000_maxdepth_3.csv', index=False)