In [68]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet


df = pd.read_csv("en_eff.csv")
df.columns = ['relative_compactness', 
              'surface_area', 
              'wall_area', 
              'roof_area', 
              'overall_height', 
              'orientation', 
              'glazing_area', 
              'glazing_distribution', 
              'heating_load', 
              'cooling_load']
X = df[['relative_compactness', 
        'surface_area', 
        'wall_area', 
        'roof_area', 
        'overall_height', 
        'orientation', 
        'glazing_area', 
        'glazing_distribution']]
Y = df[['heating_load']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3) # random_state controls the shuffling applied to the data before applying the split

In [32]:
def RepeatedKFoldPerf(m, x, y):
    # define model evaluation method
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(m, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    # force scores to be positive
    scores = absolute(scores)
    return 'Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores))

### Linear Regression 

In [38]:
lin_model = LinearRegression()

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(lin_model, X, Y)))

lin_model.fit(X_train, Y_train)
print("Score after fit: {}".format(lin_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 2.085 (0.260)

Score after fit: 0.921854284088875


### Polynomial Regression 

In [39]:
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
poly_model = LinearRegression()

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(poly_model, X_poly, Y)))

poly_model.fit(X_train, Y_train)
print("Score after fit: {}".format(poly_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 0.346 (0.042)

Score after fit: 0.921854284088875


### Lasso Regression 

In [44]:
lasso_model = Lasso(alpha=0.01)

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(lasso_model, X, Y)))

lasso_model.fit(X_train, Y_train)
print("Score after fit: {}".format(lasso_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 2.213 (0.240)

Score after fit: 0.9175303161178504


### Decision Tree Regression 

In [47]:
tree_model = DecisionTreeRegressor()

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(tree_model, X, Y)))

tree_model.fit(X_train, Y_train)
print("Score after fit: {}".format(tree_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 0.359 (0.052)

Score after fit: 0.9963962171218154


### Random Forest Regression 

In [55]:
rf_model = RandomForestRegressor(n_estimators = 1000, random_state = 42)

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(rf_model, X, Y.values.ravel())))

rf_model.fit(X_train, Y_train.values.ravel())
print("Score after fit: {}".format(rf_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 0.312 (0.039)

Score after fit: 0.9976633026346274


### Support Vector Regression

In [62]:
svr_model = SVR(kernel = 'rbf')

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(svr_model, X, Y.values.ravel())))

svr_model.fit(X_train, Y_train.values.ravel())
print("Score after fit: {}".format(svr_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 3.885 (0.468)

Score after fit: 0.7286712123385257


### Bayesian Ridge Regression

In [65]:
bayes_ridge_model = BayesianRidge()

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(bayes_ridge_model, X, Y.values.ravel())))

bayes_ridge_model.fit(X_train, Y_train.values.ravel())
print("Score after fit: {}".format(bayes_ridge_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 2.095 (0.257)

Score after fit: 0.9209171817646942


### Ridge Regression 

In [67]:
ridge_model = Ridge(alpha=1.0)

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(ridge_model, X, Y.values.ravel())))

ridge_model.fit(X_train, Y_train.values.ravel())
print("Score after fit: {}".format(ridge_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 2.201 (0.242)

Score after fit: 0.9181267401815916


### ElasticNet Regression 

In [77]:
elastic_model = ElasticNet(alpha=0.01, l1_ratio=1.0)

print("RKFold performance: \n{}\n".format(RepeatedKFoldPerf(elastic_model, X, Y.values.ravel())))

elastic_model.fit(X_train, Y_train.values.ravel())
print("Score after fit: {}".format(elastic_model.score(X_test, Y_test)))

RKFold performance: 
Mean MAE: 2.213 (0.240)

Score after fit: 0.9175303161178504
