In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Homework description

1. Take data from house pricing (kaggle)
2. Fit different linear regression models using different ensemble methods:
- Voting
- Soft voting
- Bootstrap aggregation (or bootstrap voting)
- Stacking
- (Almost all methods you can implement yourself or use sklearn)
3. What method is the best?

In [3]:
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [5]:
y_train = np.log(train_data['SalePrice'])

In [6]:
numeric_columns = [i for i, j in zip(train_data.columns, train_data.dtypes) if j in [np.int64, np.float64] and i not in ['SalePrice', 'Id']]
x_train = train_data[numeric_columns].fillna(-1)
x_test = test_data[numeric_columns].fillna(-1)

In [7]:
def rmse(a, b):
    return ((a - b) ** 2).mean() ** 0.5

**Voting**

In [18]:
from tqdm import tqdm
from sklearn.linear_model import Lasso, Ridge, ElasticNet

y_pred = []

for model in tqdm([Lasso(), Ridge(), ElasticNet()]):
    model.fit(x_train, y_train)
    y_pred.append(model.predict(x_test))
    
submit_voting = pd.DataFrame()
submit_voting['Id'] = test_data['Id']
submit_voting['SalePrice'] = np.exp(np.mean(y_pred, axis=0))
print(submit_voting)

submit_voting.to_csv('/kaggle/working/diff_lin_regr_models_num_only_default_voting.csv', index=False)

**Soft Voting**

In [19]:
y_pred = []

models_and_weights = [
    [Lasso(), 0.5],
    [Ridge(), 0.3],
    [ElasticNet(), 0.2],
]

for model_and_weight in tqdm(models_and_weights):
    model = model_and_weight[0]
    weight = model_and_weight[1]
    model.fit(x_train, y_train)
    y_pred.append(weight * model.predict(x_test))

submit_soft_voting = pd.DataFrame()
submit_soft_voting['Id'] = test_data['Id']
submit_soft_voting['SalePrice'] = np.exp(np.sum(y_pred, axis=0))
print(submit_soft_voting)

submit_soft_voting.to_csv('/kaggle/working/diff_lin_regr_models_num_only_default_soft_voting.csv', index=False)

**Bagging**

In [20]:
from sklearn.model_selection import train_test_split

y_pred = []

for k, model in enumerate(tqdm([Lasso(), Ridge(), ElasticNet()])):
    x_tr, _, y_tr, _ = train_test_split(x_train, y_train, test_size=0.2, random_state=k)
    model.fit(x_tr, y_tr)
    y_pred.append(model.predict(x_test))
    
submit_bagging = pd.DataFrame()
submit_bagging['Id'] = test_data['Id']
submit_bagging['SalePrice'] = np.exp(np.mean(y_pred, axis=0))
print(submit_bagging)

submit_bagging.to_csv('/kaggle/working/diff_lin_regr_models_num_only_default_bagging.csv', index=False)

In [25]:
y_train

**TODO: own implementation of stacking**  (not the one from scikit-learn as its implementation is different than from the main book for this module: "The Elements of Statistical Learning"

(Without stacking), the order of achieved public scores from the best to the worst:
1. Bagging - 0.16842
2. Voting - 0.16906
3. Soft voting - 0.17212