<a href="https://colab.research.google.com/github/IvanOM-97/DPro-Exercises/blob/master/U39T1C76EnsembleLearningTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''
APRENDIZAJE POR CONJUNTOS (combinacion de multiples modelos)

'''

In [3]:
# loading and preparing regresion dataset to be used
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# loading train.csv
#from google.colab import files
#uploaded = files.upload()

df = pd.read_csv('train.csv')
X = df[['GrLivArea', 'YearBuilt']].values
y = df['SalePrice'].values

# dividing train.csv into 80% of training data and 20% of verification data
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=0)

print(f"training size: {X_train.shape}, validation size: {X_val.shape}")

training size: (1168, 2), validation size: (292, 2)


In [4]:
# PROBLEMA 1 - EJEMPLO DE IMPLEMENTACION DE BLENDING
# fitting different models
model1 = LinearRegression()
model2 = SVR(kernel='rbf', C=1e3, gamma=0.1)
model3 = DecisionTreeRegressor(max_depth=5, random_state=0)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

# predicting on validation set
pred1 = model1.predict(X_val)
pred2 = model2.predict(X_val)
pred3 = model3.predict(X_val)

# blending: simple average
blend_pred = (pred1 + pred2 + pred3) / 3

# evaluating
mse1 = mean_squared_error(y_val, pred1)
mse2 = mean_squared_error(y_val, pred2)
mse3 = mean_squared_error(y_val, pred3)
mse_blended = mean_squared_error(y_val, blend_pred)

print("\n[blending results]")
print(f"linear regression mse: {mse1}")
print(f"svr mse: {mse2}")
print(f"decision tree mse: {mse3}")
print(f"blended model mse: {mse_blended}")


[blending results]
linear regression mse: 2942066921.6721087
svr mse: 7206623146.120989
decision tree mse: 2169961248.6656322
blended model mse: 2860614890.7194366


In [6]:
# PROBLEMA 2 - EJEMPLO DE IMPLEMENTACION DE BAGGING
np.random.seed(0)
n_models = 5
bagged_preds = []

for i in range(n_models):
  # bootstrap sample
  indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
  X_sample = X_train[indices]
  y_sample = y_train[indices]

  # using simple regresor for bagging (decision tree)
  model = DecisionTreeRegressor(max_depth=5, random_state=i)
  model.fit(X_sample, y_sample)

  # predicting on validation set
  pred = model.predict(X_val)
  bagged_preds.append(pred)

# average predicitions
bagged_pred = np.mean(bagged_preds, axis=0)

# evaluating
mse_single_tree = mean_squared_error(y_val, model3.predict(X_val))
mse_bagged = mean_squared_error(y_val, bagged_pred)

print("\n[bagging results]")
print(f"single tree mse: {mse3}")
print(f"bagged tree model mse: {mse_bagged}")




[bagging results]
single tree mse: 2169961248.6656322
bagged tree model mse: 1864051360.9653666


In [8]:
# PROBLEMA 3 - EJEMPLO DE IMPLEMENTACION DE STACKING

#stage 0:training base models
base1 = LinearRegression()
base2 = DecisionTreeRegressor(max_depth=5, random_state=1)

base1.fit(X_train, y_train)
base2.fit(X_train, y_train)

# blended data: base model predictions (level 1 features)
base1_train_pred = base1.predict(X_train)
base2_train_pred = base2.predict(X_train)

stacked_X_train = np.vstack((base1_train_pred, base2_train_pred)).T

# stage 1: trainning meta model(simple linear regression)
meta = LinearRegression()
meta.fit(stacked_X_train, y_train)

# applying to validation
base1_val_pred = base1.predict(X_val)
base2_val_pred = base2.predict(X_val)

stacked_X_val = np.vstack((base1_val_pred, base2_val_pred)).T
stacked_pred = meta.predict(stacked_X_val)

# evaluating
mse_base1 = mean_squared_error(y_val, base1_val_pred)
mse_base2 = mean_squared_error(y_val, base2_val_pred)
mse_stacked = mean_squared_error(y_val, stacked_pred)

print("\n[stacking results]")
print(f"base1 linear regression mse: {mse_base1}")
print(f"base2 decision tree mse: {mse_base2}")
print(f"stacked model mse: {mse_stacked}")


[stacking results]
base1 linear regression mse: 2942066921.6721087
base2 decision tree mse: 2169961248.6656322
stacked model mse: 2192399562.22146
