# Ensemble Learning

This notebook will be fully dedicated to ensemble learning. I will be experimenting with possible combinations of models

In [41]:
# Importing necessary libraries
import pandas as pd 
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet, SGDRegressor, LinearRegression, Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# Fixing formatting
pd.options.display.float_format = '{:,.4f}'.format

## Getting the Data

In [42]:
# Getting the data
train_prepared = pd.read_csv('/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                            '/Data/Prepared Data/prepared-training-data.csv')
train_prepared

Unnamed: 0.1,Unnamed: 0,LotFrontage,LotArea,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,CatRemod_N,CatRemod_Y,CatYearBuilt_N,CatYearBuilt_Y,MSSubClass_150,SalePrice
0,618,0.9679,0.4903,-0.5130,1.8628,-0.7203,1.9987,0.6722,1.6024,-0.8780,...,0,0,0,1,0,1,0,1,0,12.6597
1,870,-0.3244,-0.6137,-0.5130,-0.7236,-1.2487,0.8153,0.0345,-0.6749,-0.8780,...,0,0,1,0,1,0,1,0,0,11.6037
2,92,0.5925,0.7473,1.2784,-0.7236,0.7880,-0.8439,0.0162,-0.4350,-0.8780,...,0,0,1,0,0,1,1,0,0,12.0046
3,817,0.1211,0.7336,-0.5130,0.7564,1.4133,-0.2685,0.5377,1.3506,-0.8780,...,0,0,1,0,0,1,0,1,0,12.5099
4,302,1.8313,0.7964,-0.5130,0.7663,-1.2487,1.7214,0.5222,1.0585,-0.8780,...,0,0,1,0,0,1,0,1,0,12.2308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,763,0.6712,0.0750,-0.5130,2.4324,1.3525,-1.1667,0.3361,0.4376,1.2453,...,0,0,1,0,1,0,0,1,0,12.7278
1164,835,-0.3244,0.1094,1.2784,-0.7236,0.3549,0.3408,0.1929,-0.1118,-0.8780,...,0,0,1,0,1,0,1,0,0,11.7598
1165,1216,0.0745,-0.0302,-0.5130,-0.7236,-1.2487,-2.0804,-6.0587,0.5608,1.0544,...,0,0,1,0,1,0,0,1,0,11.6263
1166,559,0.1211,-2.0132,-0.5130,-0.2075,-1.2487,1.5095,0.4194,1.0914,-0.8780,...,0,0,1,0,0,1,0,1,0,12.3631


In [43]:
# Seperating data into X and y
y = train_prepared['SalePrice']
X = train_prepared.drop('SalePrice',axis=1)
X, y

(      Unnamed: 0  LotFrontage  LotArea  OverallCond  MasVnrArea  BsmtFinSF1  \
 0            618       0.9679   0.4903      -0.5130      1.8628     -0.7203   
 1            870      -0.3244  -0.6137      -0.5130     -0.7236     -1.2487   
 2             92       0.5925   0.7473       1.2784     -0.7236      0.7880   
 3            817       0.1211   0.7336      -0.5130      0.7564      1.4133   
 4            302       1.8313   0.7964      -0.5130      0.7663     -1.2487   
 ...          ...          ...      ...          ...         ...         ...   
 1163         763       0.6712   0.0750      -0.5130      2.4324      1.3525   
 1164         835      -0.3244   0.1094       1.2784     -0.7236      0.3549   
 1165        1216       0.0745  -0.0302      -0.5130     -0.7236     -1.2487   
 1166         559       0.1211  -2.0132      -0.5130     -0.2075     -1.2487   
 1167         684      -0.4325   1.1861      -0.5130     -0.0573     -1.2487   
 
       BsmtUnfSF  TotalBsmtSF  1stFlrS

## Models

In [44]:
# Function to Get Results
def fit_ml_algo(algo,X,y,cv=5):
    
    # Building/Training the model
    model = algo.fit(X.values, y.values)
    
    # Evaluation
    scores = cross_validate(estimator=algo,X=X,y=y,scoring='neg_root_mean_squared_error',cv=cv, 
                           return_train_score=True)
    train_error = np.mean(scores['train_score'] * -1)
    test_error = np.mean(scores['test_score'] * -1)
    
    # Returing errors + model
    return model, train_error, test_error

In [45]:
# Creating a dataframe to store the results
results = pd.DataFrame({
    'Model':[],
    'Model Variable Name':[],
    'Training RMSE':[],
    'Cross Val RMSE':[],
    'Difference in Error':[],
})
results

Unnamed: 0,Model,Model Variable Name,Training RMSE,Cross Val RMSE,Difference in Error


### Voting Classifier: Elastic Net + SVR + SGD

In [46]:
# Building the model
voting1 = VotingRegressor(estimators=[
    ('elasticnet',ElasticNet(random_state=0)),
    ('svr',SVR(kernel='rbf')),
    ('sgd',SGDRegressor(random_state=0,early_stopping=True)),
],n_jobs=-1)

In [47]:
# Fitting the model and getting the errors
voting1_fit, train_error, cross_val = fit_ml_algo(voting1,X, y, 5)

In [48]:
# Storing Errors in table
results = results.append({
    'Model': 'Elastic Net + SVR + SGD',
    'Model Variable Name':'voting1_fit',
    'Training RMSE': train_error,
    'Cross Val RMSE': cross_val,
    'Difference in Error': np.abs(cross_val - train_error),
},ignore_index=True)
results

Unnamed: 0,Model,Model Variable Name,Training RMSE,Cross Val RMSE,Difference in Error
0,Elastic Net + SVR + SGD,voting1_fit,179323105373951.06,181928433171231.8,2605327797280.75


### Voting Classifier: Linear Regression + Random Forest + Ridge Regression

In [49]:
# Building the model
voting2 = VotingRegressor(estimators=[
    ('linear',LinearRegression(n_jobs=-1)),
    ('forest',RandomForestRegressor(n_jobs=-1,random_state=0)),
    ('ridge',Ridge(random_state=0)),
],n_jobs=-1)

In [50]:
# Fitting the model and getting the errors
voting2_fit, train_error, cross_val = fit_ml_algo(voting2,X, y, 5)

In [51]:
# Storing Errors in table
results = results.append({
    'Model': 'Linear Regression + Random Forest + Ridge Regression',
    'Model Variable Name':'voting2_fit',
    'Training RMSE': train_error,
    'Cross Val RMSE': cross_val,
    'Difference in Error': np.abs(cross_val - train_error),
},ignore_index=True)
results

Unnamed: 0,Model,Model Variable Name,Training RMSE,Cross Val RMSE,Difference in Error
0,Elastic Net + SVR + SGD,voting1_fit,179323105373951.06,181928433171231.8,2605327797280.75
1,Linear Regression + Random Forest + Ridge Regr...,voting2_fit,0.0803,0.1229,0.0426


## Tuning Models

### Tuning Voting2

In [55]:
# Re-Initializing the Model
voting2_tuned = VotingRegressor(estimators=[
    ('linear',LinearRegression(n_jobs=-1)),
    ('forest',RandomForestRegressor(n_jobs=-1,random_state=0)),
    ('ridge',Ridge(random_state=0)),
],n_jobs=-1)

In [56]:
voting2_tuned.get_params()

{'estimators': [('linear', LinearRegression(n_jobs=-1)),
  ('forest', RandomForestRegressor(n_jobs=-1, random_state=0)),
  ('ridge', Ridge(random_state=0))],
 'n_jobs': -1,
 'verbose': False,
 'weights': None,
 'linear': LinearRegression(n_jobs=-1),
 'forest': RandomForestRegressor(n_jobs=-1, random_state=0),
 'ridge': Ridge(random_state=0),
 'linear__copy_X': True,
 'linear__fit_intercept': True,
 'linear__n_jobs': -1,
 'linear__normalize': False,
 'forest__bootstrap': True,
 'forest__ccp_alpha': 0.0,
 'forest__criterion': 'mse',
 'forest__max_depth': None,
 'forest__max_features': 'auto',
 'forest__max_leaf_nodes': None,
 'forest__max_samples': None,
 'forest__min_impurity_decrease': 0.0,
 'forest__min_impurity_split': None,
 'forest__min_samples_leaf': 1,
 'forest__min_samples_split': 2,
 'forest__min_weight_fraction_leaf': 0.0,
 'forest__n_estimators': 100,
 'forest__n_jobs': -1,
 'forest__oob_score': False,
 'forest__random_state': 0,
 'forest__verbose': 0,
 'forest__warm_start': 

In [57]:
# Setting up for RandomizedSearchCV
params = {
    'forest__n_estimators':[10,50,100,500,1000,1500],
    'forest__max_depth':[3,6,9,15,21,30,None],
    'forest__ccp_alpha':[0,0.5,1.0,1.5,3],
    'forest__max_features':[0.3,0.5,0.8,'sqrt','auto'],
    'forest__max_leaf_nodes':[3,9,18,31,100,None],
    'ridge__alpha':[1.0,1.5,3.0,5.0],
}
random = RandomizedSearchCV(estimator=voting2_tuned,param_distributions=params,scoring='neg_root_mean_squared_error',
                           n_jobs=-1,cv=5,refit=True,return_train_score=True, n_iter=50)

In [58]:
res = random.fit(X,y)

In [59]:
tuned_voting2 = res.best_estimator_

In [61]:
print(f'Test Score Mean: {-res.cv_results_["mean_test_score"].mean()}')
print(f'Test Score STD: {res.cv_results_["mean_test_score"].std()}')
print(f'Train Score Mean: {-res.cv_results_["mean_train_score"].mean()}')
print(f'Train Score STD: {res.cv_results_["mean_train_score"].std()}')

Test Score Mean: 0.16802054750898646
Test Score STD: 0.017506630527391622
Train Score Mean: 0.15439680939801906
Train Score STD: 0.02352630732556999
