In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Homework description

Implement simple mixture of experts:

- ROUTER: cluster your data (on features). Create 2 clusters.
- EXPERTS: Build experts (GB ensemble, M=10) per cluster
- AGGREGATOR: you should define it. You can choose: Argmax (probabilities), weighted sum, NN over experts' outputs and probabilities

In [77]:
# Load the House Prices train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [78]:
# Features are the int64/float64 columns, excluding the target ('SalePrice') and 'Id'.
numeric_columns = [
    column
    for column, dtype in train_data.dtypes.items()
    if dtype in [np.int64, np.float64] and column not in ['SalePrice', 'Id']
]

# Missing values are replaced with the sentinel -1 in both splits.
x_train, x_test = (frame[numeric_columns].fillna(-1) for frame in (train_data, test_data))

In [79]:
# Regression target: natural logarithm of the sale price.
y_train = train_data['SalePrice'].pipe(np.log)

In [80]:
def rmse(a, b):
    """Return the root-mean-square error between ``a`` and ``b``.

    Both arguments must support element-wise subtraction with each other, and
    the squared difference must expose ``.mean()`` (e.g. NumPy arrays or
    pandas Series).
    """
    residual = a - b
    mean_squared_error = (residual ** 2).mean()
    return mean_squared_error ** 0.5

In [81]:
from sklearn.cluster import KMeans

# Router configuration: number of clusters and the KMeans seed.
N_CLUSTERS = 2
KMEANS_RANDOM_STATE = 0

# Fit the router on the current feature matrix, then show how many rows land in each cluster.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_RANDOM_STATE)
kmeans.fit(x_train)
np.unique(kmeans.labels_, return_counts=True)

In [82]:
from sklearn.preprocessing import StandardScaler

# Standardise the features with statistics learned from the training split only,
# then apply that same transformation to the test split.
standard_scaler = StandardScaler()
x_train = standard_scaler.fit_transform(x_train)
x_test = standard_scaler.transform(x_test)

In [83]:
# Refit the router on the standardised features and show the resulting cluster sizes.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_RANDOM_STATE)
kmeans.fit(x_train)
np.unique(kmeans.labels_, return_counts=True)

In [84]:
# Wrap the scaled arrays back into DataFrames labelled with the feature names.
x_train, x_test = (
    pd.DataFrame(scaled, columns=numeric_columns) for scaled in (x_train, x_test)
)
x_train

In [85]:
# Attach each training row's KMeans label as a 'Cluster' column.
x_train.loc[:, 'Cluster'] = kmeans.labels_
x_train

**Catboost only, avg ensemble**

In [87]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from tqdm import tqdm

M = 10  # number of ensemble members trained for each cluster expert

# Split the training data by router (KMeans) assignment. The helper 'Cluster'
# column is dropped so every expert is trained on the same features as x_test.
cluster_masks = [x_train['Cluster'] == cluster_id for cluster_id in range(N_CLUSTERS)]
x_train_clustered_list = [x_train.loc[mask].drop(columns=['Cluster']) for mask in cluster_masks]
y_train_clustered_list = [y_train.loc[mask] for mask in cluster_masks]

# One entry per cluster expert, in cluster order.
experts_rmses_tr = []
experts_rmses_val = []
experts_preds_test = []

for x_train_clustered, y_train_clustered in zip(x_train_clustered_list, y_train_clustered_list):
    rmses_tr = []
    rmses_val = []
    y_preds_test = []
    for m in tqdm(range(M)):
        # Each member gets its own 90/10 split, determined by its index.
        x_tr, x_val, y_tr, y_val = train_test_split(x_train_clustered, y_train_clustered, test_size=0.1, random_state=m)
        model = CatBoostRegressor(verbose=False).fit(x_tr, y_tr)
        y_pred_tr = model.predict(x_tr)
        rmses_tr.append(rmse(y_tr, y_pred_tr))
        y_pred_val = model.predict(x_val)
        rmses_val.append(rmse(y_val, y_pred_val))
        y_preds_test.append(model.predict(x_test))
    # The target is log(SalePrice), so these are RMSEs in log space. They are
    # reported as-is: exponentiating an RMSE does not convert it to price units.
    experts_rmses_tr.append(np.mean(rmses_tr))
    experts_rmses_val.append(np.mean(rmses_val))
    # Average the members' log-price predictions, then map back to prices.
    experts_preds_test.append(np.exp(np.mean(y_preds_test, axis=0)))

In [88]:
# Training-split error summary: one value per cluster expert, in cluster order.
experts_rmses_tr

In [89]:
# Validation-split error summary: one value per cluster expert, in cluster order.
experts_rmses_val

In [57]:
# Aggregator: unweighted mean of the per-cluster experts' test predictions.
avg_experts_preds_test = np.asarray(experts_preds_test).mean(axis=0)
avg_experts_preds_test

In [58]:
# Build the submission table (test ids + averaged predictions) and write it out.
submit_avg = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': avg_experts_preds_test,
})

submit_avg.to_csv('/kaggle/working/mixture_of_experts_avg.csv', index=False)

**Different GB, weighted sum ensemble**

In [94]:
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# !pip install tensorflow_decision_forests
import tensorflow_decision_forests as tfdf

# Ensemble members as (library tag, estimator, weight). The weights drive a
# weighted sum of the members' log-price predictions.
# For 'keras' entries the instance stored here is never fitted: the training
# loop below builds a fresh TF-DF model for every fit instead.
gbs = [
    ('sklearn', GradientBoostingRegressor(), 0.05),
    ('sklearn', GradientBoostingRegressor(), 0.05),
    ('lgb', LGBMRegressor(), 0.1),
    ('lgb', LGBMRegressor(), 0.1),
    ('xgb', XGBRegressor(), 0.1),
    ('xgb', XGBRegressor(), 0.1),
    ('keras', tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0), 0.1),
    ('keras', tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0), 0.1),
    ('catb', CatBoostRegressor(verbose=False), 0.15),
    ('catb', CatBoostRegressor(verbose=False), 0.15),
]

# The loop indexes gbs with range(M), and the weighted sum is only a weighted
# average when the weights total 1; fail early if either assumption is broken.
assert len(gbs) == M, f"expected {M} ensemble members, got {len(gbs)}"
assert np.isclose(sum(weight for _, _, weight in gbs), 1.0), "ensemble weights must sum to 1"

# One entry per cluster expert, in cluster order.
experts_rmses_tr = []
experts_rmses_val = []
experts_preds_test = []

for x_train_clustered, y_train_clustered in zip(x_train_clustered_list, y_train_clustered_list):
    rmses_tr = []
    rmses_val = []
    y_preds_test = []
    for m in tqdm(range(M)):
        gb_name, gb_model, gb_weight = gbs[m]
        # Each member gets its own 90/10 split, determined by its index.
        x_tr, x_val, y_tr, y_val = train_test_split(x_train_clustered, y_train_clustered, test_size=0.1, random_state=m)
        if gb_name != 'keras':
            model = gb_model.fit(x_tr, y_tr)

            y_pred_tr = model.predict(x_tr)
            y_pred_val = model.predict(x_val)
            y_pred_test = model.predict(x_test)
        else:
            # TF-DF trains from a tf.data dataset that carries the label column.
            tr = pd.concat([x_tr, y_tr], axis=1)
            tf_tr = tfdf.keras.pd_dataframe_to_tf_dataset(tr, label='SalePrice', task=tfdf.keras.Task.REGRESSION)

            model = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
            model.fit(x=tf_tr, verbose=0)

            y_pred_tr = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_tr, task=tfdf.keras.Task.REGRESSION), verbose=0)
            y_pred_val = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_val, task=tfdf.keras.Task.REGRESSION), verbose=0)
            y_pred_test = model.predict(tfdf.keras.pd_dataframe_to_tf_dataset(x_test, task=tfdf.keras.Task.REGRESSION), verbose=0)

            # Flatten the per-row prediction vectors into one 1-D array so they
            # line up with the other libraries' outputs.
            y_pred_tr = np.asarray(y_pred_tr).ravel()
            y_pred_val = np.asarray(y_pred_val).ravel()
            y_pred_test = np.asarray(y_pred_test).ravel()

        rmses_tr.append(rmse(y_tr, y_pred_tr))
        rmses_val.append(rmse(y_val, y_pred_val))
        y_preds_test.append(gb_weight * y_pred_test)

    # The target is log(SalePrice), so these are RMSEs in log space. They are
    # reported as-is: exponentiating an RMSE does not convert it to price units.
    experts_rmses_tr.append(np.mean(rmses_tr))
    experts_rmses_val.append(np.mean(rmses_val))
    # Weighted sum of the members' log-price predictions, mapped back to prices.
    experts_preds_test.append(np.exp(np.sum(y_preds_test, axis=0)))

In [95]:
# Training-split error summary of the mixed-library ensemble: one value per cluster expert.
experts_rmses_tr

In [96]:
# Validation-split error summary of the mixed-library ensemble: one value per cluster expert.
experts_rmses_val

In [97]:
# Aggregator: unweighted mean of the per-cluster experts' test predictions.
avg_experts_preds_test = np.asarray(experts_preds_test).mean(axis=0)
avg_experts_preds_test

In [98]:
# Build the submission table (test ids + aggregated predictions) and write it out.
submit_weighted_ensemble = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': avg_experts_preds_test,
})

submit_weighted_ensemble.to_csv('/kaggle/working/mixture_of_experts_weighted_ensemble.csv', index=False)