In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [16]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from tqdm import tqdm

# Homework description

Take data from kaggle housing
1. Take your best prediction
2. Try pseudo-labeling
3. Try noisy student approach

Hypertuned CatBoostRegressor provided best score on Kaggle so far: 0.12548 public score (from Homework #5)

depth=6, n_estimators=465, learning_rate=0.06

Let's first build a bagging ensemble of 20 CatBoostRegressors with these hyperparameters and see whether it beats the current best result.

No features will be transformed from numerical to categorical

In [2]:
# Read the labelled training set and the unlabelled test set of the competition.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [57]:
# Split the training columns by dtype: int64/float64 columns are treated as numeric
# features (the target and the row Id are excluded), everything else as categorical.
numeric_dtypes = [np.int64, np.float64]
column_dtypes = list(train_data.dtypes.items())
numeric_columns = [
    name for name, dtype in column_dtypes
    if dtype in numeric_dtypes and name not in ['SalePrice', 'Id']
]
categorical_columns = [name for name, dtype in column_dtypes if dtype not in numeric_dtypes]

# Impute missing values with sentinels: the "Other" category and -1 for numbers.
for frame in (train_data, test_data):
    frame[categorical_columns] = frame[categorical_columns].fillna("Other")
    frame[numeric_columns] = frame[numeric_columns].fillna(-1)

feature_columns = numeric_columns + categorical_columns
x_train = train_data[feature_columns]
x_test = test_data[feature_columns]

# The models are trained on the log of the sale price.
y_train = np.log(train_data['SalePrice'])

In [9]:
M = 20
TEST_SIZE = 0.2

models = [
    CatBoostRegressor(verbose=False, depth=6, n_estimators=465, learning_rate=0.06)
    for _ in range(M)
]

y_pred = []
for seed, model in enumerate(tqdm(models)):
    # A different random_state per member gives every model its own training subsample.
    x_tr, _, y_tr, _ = train_test_split(x_train, y_train, test_size=TEST_SIZE, random_state=seed)
    model.fit(Pool(x_tr, y_tr, cat_features=categorical_columns))
    y_pred.append(model.predict(Pool(x_test, cat_features=categorical_columns)))

# Average the members in log space, then undo the log applied to SalePrice.
test_targets = np.exp(np.mean(y_pred, axis=0))

submit_bagging = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_targets})
print(submit_bagging)

submit_bagging.to_csv('/kaggle/working/hypertuned_20_catboosts_bagging_point2_test_split.csv', index=False)

The ensemble above achieved a public score of 0.12297, which is better than the previous best score.

Now with test size = 0.1


In [25]:
# Same bagging ensemble with a smaller hold-out, so each member sees more rows.
# M is reused from the first bagging cell.
TEST_SIZE = 0.1

models = [
    CatBoostRegressor(verbose=False, depth=6, n_estimators=465, learning_rate=0.06)
    for _ in range(M)
]

y_pred2 = []
for seed, model in enumerate(tqdm(models)):
    x_tr, _, y_tr, _ = train_test_split(x_train, y_train, test_size=TEST_SIZE, random_state=seed)
    model.fit(Pool(x_tr, y_tr, cat_features=categorical_columns))
    y_pred2.append(model.predict(Pool(x_test, cat_features=categorical_columns)))

# Average the members in log space, then undo the log applied to SalePrice.
test_targets2 = np.exp(np.mean(y_pred2, axis=0))

submit_bagging2 = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_targets2})
print(submit_bagging2)

submit_bagging2.to_csv('/kaggle/working/hypertuned_20_catboosts_bagging_point1_test_split.csv', index=False)

Test size = 0.1 helped to achieve even better public score: 0.12252

In [26]:
# Same bagging ensemble again, holding out only 5% of the rows per member.
# M is reused from the first bagging cell.
TEST_SIZE = 0.05

models = [
    CatBoostRegressor(verbose=False, depth=6, n_estimators=465, learning_rate=0.06)
    for _ in range(M)
]

y_pred3 = []
for seed, model in enumerate(tqdm(models)):
    x_tr, _, y_tr, _ = train_test_split(x_train, y_train, test_size=TEST_SIZE, random_state=seed)
    model.fit(Pool(x_tr, y_tr, cat_features=categorical_columns))
    y_pred3.append(model.predict(Pool(x_test, cat_features=categorical_columns)))

# Average the members in log space, then undo the log applied to SalePrice.
test_targets3 = np.exp(np.mean(y_pred3, axis=0))

submit_bagging3 = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_targets3})
print(submit_bagging3)

submit_bagging3.to_csv('/kaggle/working/hypertuned_20_catboosts_bagging_point05_test_split.csv', index=False)

Decreasing test size to 0.05 led to (once again) better public score: 0.12206

### Pseudo-labeling

In [58]:
# Naive perturbation: draw zero-mean Gaussian noise for every numeric test feature.
# Open questions: some columns marked as numeric are really categorical codes, and
# adding continuous noise to integer-valued features may not be appropriate.

mu, sigma = 0, 0.1
# A locally seeded generator makes the noise (and every pseudo-label derived from it)
# identical across "Restart & Run All"; the unseeded np.random.normal call was not.
noise_rng = np.random.default_rng(0)
x_test_noise = noise_rng.normal(mu, sigma, size=x_test[numeric_columns].shape)
print(x_test_noise)

In [59]:
# Perturb only the numeric feature columns; the noise matrix has the same shape.
x_test_with_noise_only_numeric = x_test[numeric_columns] + x_test_noise
x_test_with_noise_only_numeric

In [62]:
# Re-attach the untouched categorical columns to the right of the noisy numeric ones.
x_test_with_noise = pd.concat(
    objs=[x_test_with_noise_only_numeric, x_test[categorical_columns]],
    axis="columns",
)
x_test_with_noise

Get test's targets for data with noise

Approach #1 - predict new test targets (pseudo-labels) based on features with noise - don't apply noise to target itself

In [69]:
# Teacher ensemble: fit on the labelled data only, then predict the noisy test features.
# M is reused from the first bagging cell.
TEST_SIZE = 0.05

models = [
    CatBoostRegressor(verbose=False, depth=6, n_estimators=465, learning_rate=0.06)
    for _ in range(M)
]

y_pred_for_x_test_with_noise = []
for seed, model in enumerate(tqdm(models)):
    x_tr, _, y_tr, _ = train_test_split(x_train, y_train, test_size=TEST_SIZE, random_state=seed)
    model.fit(Pool(x_tr, y_tr, cat_features=categorical_columns))
    y_pred_for_x_test_with_noise.append(
        model.predict(Pool(x_test_with_noise, cat_features=categorical_columns))
    )

# important! don't use exponent here: the pseudo-labels are later appended to
# y_train, which holds log prices.
test_target_with_noise = np.mean(y_pred_for_x_test_with_noise, axis=0)

In [65]:
# Stack the labelled rows and the noisy test rows into one training frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent and yields the same re-indexed result.
x_train_combined = pd.concat([x_train, x_test_with_noise], ignore_index=True)
x_train_combined

In [71]:
# Wrap the pseudo-labels in a Series named like the target so they can be
# concatenated with y_train.
test_target_with_noise = pd.Series(test_target_with_noise, name='SalePrice')
test_target_with_noise

In [72]:
# Append the pseudo-labels to the real (log) targets, in the same row order as
# x_train_combined. Series.append was removed in pandas 2.0; pd.concat replaces it.
y_train_combined = pd.concat([y_train, test_target_with_noise], ignore_index=True)
y_train_combined

In [81]:
# Student ensemble: train on labelled + pseudo-labelled rows, predict the clean test set.
# M is reused from the first bagging cell.
TEST_SIZE = 0.05

models = [
    CatBoostRegressor(verbose=False, depth=6, n_estimators=465, learning_rate=0.06)
    for _ in range(M)
]

y_pred4 = []
for seed, model in enumerate(tqdm(models)):
    x_tr, _, y_tr, _ = train_test_split(
        x_train_combined, y_train_combined, test_size=TEST_SIZE, random_state=seed
    )
    model.fit(Pool(x_tr, y_tr, cat_features=categorical_columns))
    y_pred4.append(model.predict(Pool(x_test, cat_features=categorical_columns)))

# Average the members in log space, then undo the log applied to SalePrice.
test_targets4 = np.exp(np.mean(y_pred4, axis=0))

submit_pseudolabeling = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_targets4})
print(submit_pseudolabeling)

submit_pseudolabeling.to_csv('/kaggle/working/pseudolabeling2.csv', index=False)

The public score achieved by using pseudo-labeling approach used above: 0.12102

It is now the best score achieved.

### Noisy Student based on the following paper: https://arxiv.org/pdf/1911.04252.pdf

In [12]:
def load_and_preprocess(
    train_path="/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
    test_path="/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
):
    """Load the train/test CSVs and build model-ready features and target.

    Columns whose dtype in the training file is int64/float64 are treated as
    numeric (``SalePrice`` and ``Id`` are excluded); every other column is
    treated as categorical. Missing values are replaced by ``-1`` in numeric
    columns and by ``"Other"`` in categorical columns, in both files.

    Args:
        train_path: CSV with the labelled rows; must contain ``SalePrice``.
            Defaults to the Kaggle competition path.
        test_path: CSV with the unlabelled rows. Defaults to the Kaggle path.

    Returns:
        Tuple ``(x_train, y_train, x_test, numeric_columns, categorical_columns)``
        where ``y_train`` is the natural log of ``SalePrice`` and both feature
        frames have the columns ``numeric_columns + categorical_columns``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    numeric_dtypes = [np.int64, np.float64]
    column_dtypes = list(train_data.dtypes.items())
    numeric_columns = [
        name for name, dtype in column_dtypes
        if dtype in numeric_dtypes and name not in ['SalePrice', 'Id']
    ]
    categorical_columns = [name for name, dtype in column_dtypes if dtype not in numeric_dtypes]

    for frame in (train_data, test_data):
        frame[categorical_columns] = frame[categorical_columns].fillna("Other")
        frame[numeric_columns] = frame[numeric_columns].fillna(-1)

    feature_columns = numeric_columns + categorical_columns
    # Explicit copies: callers assign into these frames (e.g. when scaling), which
    # would otherwise trigger SettingWithCopyWarning on a column subset.
    x_train = train_data[feature_columns].copy()
    x_test = test_data[feature_columns].copy()

    y_train = np.log(train_data['SalePrice'])

    return x_train, y_train, x_test, numeric_columns, categorical_columns

In [91]:
# Mean and standard deviation of the Gaussian noise used by the noisy-student helpers.
MU = 0
SIGMA = 0.1
# Fraction of rows left out of each ensemble member's training subsample.
TEST_SIZE = 0.05
# Number of CatBoost models per ensemble.
M = 20

def train_models(x_train, y_train, categorical_columns, print_text):
    """Fit a bagging ensemble of M CatBoost regressors and return the fitted models.

    Each member is trained on the training part of a ``train_test_split`` that uses
    the module-level ``TEST_SIZE`` and the member's position as ``random_state``,
    so every model sees a different subsample.

    Args:
        x_train: Feature frame.
        y_train: Targets aligned with ``x_train``.
        categorical_columns: Column names passed to CatBoost as ``cat_features``.
        print_text: Message printed once before training starts.

    Returns:
        List of the M fitted ``CatBoostRegressor`` instances.
    """
    ensemble = [
        CatBoostRegressor(verbose=False, depth=6, n_estimators=465, learning_rate=0.06)
        for _ in range(M)
    ]
    print(print_text)
    for seed, regressor in enumerate(tqdm(ensemble)):
        x_subset, _, y_subset, _ = train_test_split(
            x_train, y_train, test_size=TEST_SIZE, random_state=seed
        )
        regressor.fit(Pool(x_subset, y_subset, cat_features=categorical_columns))

    return ensemble

def generate_pseudo_labels(x_test, categorical_columns, models):
    """Average the ensemble's predictions for ``x_test`` to obtain pseudo-labels.

    Args:
        x_test: Feature frame to label.
        categorical_columns: Column names passed to CatBoost as ``cat_features``.
        models: Fitted models exposing ``predict``.

    Returns:
        NumPy array with the element-wise mean of all models' predictions. The
        values are returned as predicted (no exponent is applied), i.e. on the
        same scale as the targets the models were trained on.
    """
    # The Pool depends only on x_test, so it is built once instead of once per model.
    pool_test = Pool(x_test, cat_features=categorical_columns)
    predictions = [model.predict(pool_test) for model in models]

    # important! don't use exponent here
    return np.asarray(np.mean(predictions, axis=0))

def add_noise(x_test, pseudo_labels, numeric_columns=None, categorical_columns=None):
    """Add Gaussian noise to the numeric features and to the pseudo-labels.

    Noise is drawn from ``N(MU, SIGMA)`` using the module-level ``MU``/``SIGMA``
    values at call time and NumPy's global random state (seed it beforehand for
    reproducible results).

    Args:
        x_test: Feature frame with numeric and categorical columns.
        pseudo_labels: 1-D NumPy array of labels aligned with ``x_test``.
        numeric_columns: Columns to perturb. When omitted, the numeric-dtype
            columns of ``x_test`` are used, so the function no longer depends on
            a notebook-level ``numeric_columns`` variable being defined.
        categorical_columns: Columns copied through unchanged. When omitted, all
            columns of ``x_test`` that are not in ``numeric_columns``.

    Returns:
        Tuple ``(x_test_with_noise, pseudo_labels_with_noise)``; the frame has the
        noisy numeric columns first, followed by the categorical columns.
    """
    if numeric_columns is None:
        numeric_columns = x_test.select_dtypes(include="number").columns.tolist()
    if categorical_columns is None:
        numeric_names = set(numeric_columns)
        categorical_columns = [name for name in x_test.columns if name not in numeric_names]

    numeric_part = x_test[numeric_columns]
    feature_noise = np.random.normal(MU, SIGMA, numeric_part.shape)
    x_test_with_noise = pd.concat([numeric_part + feature_noise, x_test[categorical_columns]], axis=1)

    label_noise = np.random.normal(MU, SIGMA, [pseudo_labels.shape[0]])
    pseudo_labels_with_noise = pseudo_labels + label_noise

    return x_test_with_noise, pseudo_labels_with_noise

def combine_data(x_train, x_test_with_noise, y_train, pseudo_labels_with_noise):
    """Stack labelled rows and pseudo-labelled rows into one training set.

    Uses ``pd.concat`` because ``DataFrame.append`` / ``Series.append`` were
    removed in pandas 2.0. The inputs are not modified.

    Args:
        x_train: Labelled feature frame.
        x_test_with_noise: Pseudo-labelled feature frame with the same columns.
        y_train: Series of targets aligned with ``x_train``.
        pseudo_labels_with_noise: Array-like of labels aligned with
            ``x_test_with_noise``.

    Returns:
        Tuple ``(x_train_combined, y_train_combined)``, both re-indexed from 0 with
        the labelled rows first.
    """
    x_train_combined = pd.concat([x_train, x_test_with_noise], ignore_index=True)
    pseudo_label_series = pd.Series(data=pseudo_labels_with_noise, name='SalePrice')
    y_train_combined = pd.concat([y_train, pseudo_label_series], ignore_index=True)

    return x_train_combined, y_train_combined

def student_predict(x_test, categorical_columns, models):
    """Predict with every model, average, and apply the exponent to the mean.

    Args:
        x_test: Feature frame to predict.
        categorical_columns: Column names passed to CatBoost as ``cat_features``.
        models: Fitted models exposing ``predict``.

    Returns:
        NumPy array ``exp(mean of the models' predictions)``.
    """
    print('Final prediction...')
    # The Pool depends only on x_test, so it is built once instead of once per model.
    pool_test = Pool(x_test, cat_features=categorical_columns)
    y_preds = [model.predict(pool_test) for model in tqdm(models)]

    return np.exp(np.mean(y_preds, axis=0))

def noisy_student(iterations, accumulate=True):
    """Run iterative teacher -> student training with noisy pseudo-labels.

    A teacher ensemble is trained on the labelled data. In every iteration the
    current teacher pseudo-labels the test set, noise is added to both the test
    features and the pseudo-labels (Approach #2), a student ensemble is trained
    on labelled + pseudo-labelled rows, and the student becomes the next teacher.

    Args:
        iterations: Number of teacher -> student rounds.
        accumulate: When True (default, the original behaviour) each round appends
            a new noisy copy of the test set to the training data of the previous
            round, so after ``k`` rounds the training set holds ``k`` pseudo-labelled
            copies, including labels produced by earlier teachers. When False,
            every student is trained on the labelled data plus only the current
            round's pseudo-labelled copy.

    Returns:
        NumPy array of final predictions for the test set, as returned by
        ``student_predict``.
    """
    # numeric_columns is unpacked but not used inside this function.
    x_labelled, y_labelled, x_test, numeric_columns, categorical_columns = load_and_preprocess()
    x_train, y_train = x_labelled, y_labelled
    teacher_models = train_models(x_train, y_train, categorical_columns, 'Learning on initial train data...')
    print('Iterative training...')
    for _ in tqdm(range(iterations)):
        pseudo_labels = generate_pseudo_labels(x_test, categorical_columns, teacher_models)
        # Approach #2 - Apply noise to both test data and pseudo-labels
        x_test_with_noise, pseudo_labels_with_noise = add_noise(x_test, pseudo_labels)
        if not accumulate:
            # Start again from the clean labelled set so stale pseudo-labels are dropped.
            x_train, y_train = x_labelled, y_labelled
        x_train, y_train = combine_data(x_train, x_test_with_noise, y_train, pseudo_labels_with_noise)
        # The freshly trained student acts as the teacher of the next round.
        teacher_models = train_models(x_train, y_train, categorical_columns, 'Learning on data with noise...')

    return student_predict(x_test, categorical_columns, teacher_models)

In [48]:
# Run ten teacher -> student rounds and display the resulting predictions.
y_pred = noisy_student(iterations=10)
y_pred

In [49]:
# Write the noisy-student predictions held in y_pred in the Id/SalePrice submission format.
submit_noisy_student = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred})

submit_noisy_student.to_csv('/kaggle/working/noisy_student.csv', index=False)

Training noisy student for 10 iterations resulted in the public score = 0.12272

In [50]:
# Repeat the experiment with only three teacher -> student rounds and save the result.
y_pred_3_iters = noisy_student(iterations=3)
submit_noisy_student_3_iters = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred_3_iters})

submit_noisy_student_3_iters.to_csv('/kaggle/working/noisy_student_3_iters.csv', index=False)

Applying the noisy student approach with 3 iterations resulted in a worse public score than with 10 iterations: 0.12450

Let's try:
1. scaling features to [0, 1] range and the target with the same type of scaler and then use inverse_transform to bring back log values
2. apply Gaussian noise with smaller sigma

In [99]:
from sklearn.preprocessing import MinMaxScaler
# Smaller noise for the [0, 1]-scaled experiment. This rebinds the module-level
# MU/SIGMA, so any add_noise call made after this cell also uses the new values.
MU = 0
SIGMA = 0.001

def scale(x_train, x_test, y_train, numeric_columns):
    """Min-max scale the numeric features and the target.

    The feature scaler is fitted on the training rows only and then applied to
    both frames; a separate scaler is fitted on the target. The input frames are
    copied first, so the caller's ``x_train``/``x_test`` are left untouched (the
    original version overwrote their numeric columns in place).

    Args:
        x_train: Training feature frame.
        x_test: Test feature frame with the same columns.
        y_train: Series of training targets.
        numeric_columns: Names of the columns to scale.

    Returns:
        Tuple ``(x_train, x_test, y_train, target_scaler)`` where ``y_train`` is an
        ``(n, 1)`` NumPy array of scaled targets and ``target_scaler`` is the fitted
        scaler needed to invert the target scaling.
    """
    x_train = x_train.copy()
    x_test = x_test.copy()

    scaler = MinMaxScaler().fit(x_train[numeric_columns])
    x_train[numeric_columns] = scaler.transform(x_train[numeric_columns])
    x_test[numeric_columns] = scaler.transform(x_test[numeric_columns])

    # MinMaxScaler expects a 2-D input, hence the single-column reshape.
    target_values = y_train.values.reshape(-1, 1)
    target_scaler = MinMaxScaler().fit(target_values)
    y_train = target_scaler.transform(target_values)

    return x_train, x_test, y_train, target_scaler
    
def add_noise_scale_version(x_test, pseudo_labels, numeric_columns=None, categorical_columns=None):
    """Add Gaussian noise to the (scaled) numeric features and to the pseudo-labels.

    Noise is drawn from ``N(MU, SIGMA)`` using the module-level ``MU``/``SIGMA``
    values at call time and NumPy's global random state.

    Args:
        x_test: Feature frame with numeric and categorical columns.
        pseudo_labels: 1-D NumPy array of labels aligned with ``x_test``.
        numeric_columns: Columns to perturb. When omitted, the numeric-dtype
            columns of ``x_test`` are used, so the function no longer depends on
            a notebook-level ``numeric_columns`` variable being defined.
        categorical_columns: Columns copied through unchanged. When omitted, all
            columns of ``x_test`` that are not in ``numeric_columns``.

    Returns:
        Tuple ``(x_test_with_noise, pseudo_labels_with_noise)``; the frame has the
        noisy numeric columns first, followed by the categorical columns.
    """
    if numeric_columns is None:
        numeric_columns = x_test.select_dtypes(include="number").columns.tolist()
    if categorical_columns is None:
        numeric_names = set(numeric_columns)
        categorical_columns = [name for name in x_test.columns if name not in numeric_names]

    numeric_part = x_test[numeric_columns]
    feature_noise = np.random.normal(MU, SIGMA, numeric_part.shape)
    x_test_with_noise = pd.concat([numeric_part + feature_noise, x_test[categorical_columns]], axis=1)

    label_noise = np.random.normal(MU, SIGMA, [pseudo_labels.shape[0]])
    pseudo_labels_with_noise = pseudo_labels + label_noise

    return x_test_with_noise, pseudo_labels_with_noise

def combine_data_scale_version(x_train, x_test_with_noise, y_train, pseudo_labels_with_noise):
    """Stack labelled and pseudo-labelled rows; accepts array or Series targets.

    Uses ``pd.concat`` because ``DataFrame.append`` / ``Series.append`` were
    removed in pandas 2.0. The inputs are not modified.

    Args:
        x_train: Labelled feature frame.
        x_test_with_noise: Pseudo-labelled feature frame with the same columns.
        y_train: Targets aligned with ``x_train``: either a Series or an array
            (for example the ``(n, 1)`` output of a scaler), which is flattened.
        pseudo_labels_with_noise: Array-like of labels aligned with
            ``x_test_with_noise``.

    Returns:
        Tuple ``(x_train_combined, y_train_combined)``, both re-indexed from 0 with
        the labelled rows first; the targets are a Series named ``SalePrice``.
    """
    x_train_combined = pd.concat([x_train, x_test_with_noise], ignore_index=True)
    pseudo_label_series = pd.Series(data=pseudo_labels_with_noise, name='SalePrice')
    if not isinstance(y_train, pd.Series):
        y_train = pd.Series(data=np.asarray(y_train).ravel(), name='SalePrice')
    y_train_combined = pd.concat([y_train, pseudo_label_series], ignore_index=True)
    return x_train_combined, y_train_combined

def student_predict_scale_version(x_test, categorical_columns, models, target_scaler):
    """Predict with every model, average, undo the target scaling, then exponentiate.

    Args:
        x_test: Feature frame to predict.
        categorical_columns: Column names passed to CatBoost as ``cat_features``.
        models: Fitted models exposing ``predict``.
        target_scaler: Fitted scaler whose ``inverse_transform`` maps the averaged
            predictions back to the unscaled target.

    Returns:
        1-D NumPy array ``exp(inverse_transform(mean prediction))``.
    """
    print('Final prediction...')
    # The Pool depends only on x_test, so it is built once instead of once per model.
    pool_test = Pool(x_test, cat_features=categorical_columns)
    y_preds = [model.predict(pool_test) for model in tqdm(models)]

    y_pred_mean = np.mean(y_preds, axis=0)
    # The scaler works on 2-D input, so reshape to one column and flatten afterwards.
    y_pred_inverse_scaled = target_scaler.inverse_transform(y_pred_mean.reshape(-1, 1))

    return np.exp(y_pred_inverse_scaled).flatten()

def noisy_student_scale_version(iterations, accumulate=True):
    """Noisy-student loop on min-max scaled features and target.

    Same teacher -> student procedure as ``noisy_student``, except that the numeric
    features and the target are scaled first and the final predictions are mapped
    back through the target scaler before the exponent is applied.

    Args:
        iterations: Number of teacher -> student rounds.
        accumulate: When True (default, the original behaviour) each round appends
            a new noisy copy of the test set to the training data of the previous
            round. When False, every student is trained on the labelled data plus
            only the current round's pseudo-labelled copy.

    Returns:
        NumPy array of final predictions for the test set, as returned by
        ``student_predict_scale_version``.
    """
    x_train, y_train, x_test, numeric_columns, categorical_columns = load_and_preprocess()
    x_labelled, x_test, y_labelled, target_scaler = scale(x_train, x_test, y_train, numeric_columns)
    x_train, y_train = x_labelled, y_labelled
    teacher_models = train_models(x_train, y_train, categorical_columns, 'Learning on initial train data...')
    print('Iterative training...')
    for _ in tqdm(range(iterations)):
        pseudo_labels = generate_pseudo_labels(x_test, categorical_columns, teacher_models)
        x_test_with_noise, pseudo_labels_with_noise = add_noise_scale_version(x_test, pseudo_labels)
        if not accumulate:
            # Start again from the clean labelled set so stale pseudo-labels are dropped.
            x_train, y_train = x_labelled, y_labelled
        x_train, y_train = combine_data_scale_version(x_train, x_test_with_noise, y_train, pseudo_labels_with_noise)
        # The freshly trained student acts as the teacher of the next round.
        teacher_models = train_models(x_train, y_train, categorical_columns, 'Learning on data with noise...')

    return student_predict_scale_version(x_test, categorical_columns, teacher_models, target_scaler)

In [100]:
# Three teacher -> student rounds on scaled data, saved in the submission format.
y_pred_3_iters_scaled = noisy_student_scale_version(iterations=3)
submit_noisy_student_3_iters_scaled = pd.DataFrame(
    {'Id': test_data['Id'], 'SalePrice': y_pred_3_iters_scaled}
)

submit_noisy_student_3_iters_scaled.to_csv('/kaggle/working/noisy_student_3_iters_scaled.csv', index=False)

Previous best public score was: 0.12102

This time, with applied scaling, noisy student helped to achieve the new best score: 0.12032
