## Read data

In [1]:
import csv
import numpy
import pandas
from sklearn.model_selection import GridSearchCV

pandas.set_option('display.max_columns', 500)

def read_data_as_data_frame():
    with open('EnergyEfficiencyData.csv') as csv_file:
        return pandas.DataFrame(csv.reader(csv_file, delimiter=','))


## Preprocess data - Clean data with missing values

In [2]:
def process_invalid_value(data_frame, is_invalid, invalid_type):
    if is_invalid.sum().sum() == 0: 
        print(f'Data does not contain any {invalid_type} value')
    else:
        print(f'Number of {invalid_type} value or missing value: ', is_invalid.sum().sum())
        # drop any column contain null value
        data_frame = data_frame.dropna(axis=1, how='any')
        # drop any row contain null value
        data_frame = data_frame.dropna(axis=0, how='any')
    return data_frame

def process_empty_string(data_frame):
    # drop row with empty string
    # did not use inplace=True to avoid overwriting the reference to it and avoid SettingWithCopyWarning
    data_frame = data_frame.drop(data_frame[data_frame['Relative Compactness'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Wall Area'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Roof Area'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Surface'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Overall Height'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Orientation'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Glazing Area'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Glazing Area Distribution'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Heating Load'] == ''].index)
    data_frame = data_frame.drop(data_frame[data_frame['Cooling Load'] == ''].index)
    
    return data_frame


## Preprocess data - format data

In [3]:
def format_data(data_frame): 
    # convert all object(string) to number(float)
    float_data_frame = data_frame.drop(['Orientation', 'Glazing Area Distribution'], axis=1).astype(float)
    # convert integer column to integer
    data_frame['Orientation'] = data_frame['Orientation'].astype(int)
    data_frame['Glazing Area Distribution'] = data_frame['Glazing Area Distribution'].astype(int)

    float_data_frame.insert(5, 'Orientation', data_frame['Orientation'], True)
    float_data_frame.insert(7, 'Glazing Area Distribution', data_frame['Glazing Area Distribution'], True)

    data_frame = float_data_frame
    
    return data_frame

## Preprocess data - Check and remove multicollinearity

In [4]:
def check_multicollinearity(data_frame):
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    x = data_frame.drop(['Heating Load', 'Cooling Load'], axis=1)

    # drop columns with strong multicollinearity
    x = x.drop(columns=['Wall Area', 'Roof Area', 'Surface'])
    
    # center the predictors by substracting the mean    
    x = x.subtract(x.mean())
    
    vif_data = pandas.DataFrame()
    vif_data['feature'] = x.columns
    vif_data['VIF'] = [variance_inflation_factor(x.values, i) for i in range(len(x.columns))]

    if vif_data['VIF'][vif_data['VIF'] >= 10].count() == 0:
        print('Data does not contain collinearity or multicollinearity between variables')
    else:
        print(vif_data)
        print('Result suggests high collinearity or multicollinearity between variables')


## Preprocess function

In [5]:
def preprocess_data_frame(data_frame):
    # drop last 2 empty column, use inplace=True to increase readability
    data_frame.drop([10, 11], axis=1, inplace=True)

    # make first row as header by:
    # 1. assign header with first row
    data_frame.columns = data_frame.iloc[0]
    # 2. remove first row
    data_frame = data_frame[1:]

    # rename column to readable attributes
    data_frame.columns = ['Relative Compactness',
                          'Surface',
                          'Wall Area',
                          'Roof Area',
                          'Overall Height',
                          'Orientation',
                          'Glazing Area',
                          'Glazing Area Distribution',
                          'Heating Load',
                          'Cooling Load']
    
    # check the data frame
    print(data_frame.info())

    # check if there is any nan value
    data_frame = process_invalid_value(data_frame, data_frame.isna(), 'NaN')
    # check if there is any null value
    data_frame = process_invalid_value(data_frame, data_frame.isnull(), 'Null')
    # check if there is any empty string
    data_frame = process_empty_string(data_frame)
    
    # format data
    data_frame = format_data(data_frame)
    
    # check collinearity and multicollinearity among vairables
#     check_multicollinearity(data_frame)
    data_frame = data_frame.drop(columns=['Wall Area', 'Roof Area', 'Surface'])

    # check the data frame
    print(data_frame.info())

    return data_frame


## Print result

In [6]:
def print_regression_result(regression_model_cv, x_train, x_test, y_train, y_test, y_predict, regression_model_type):
    print(regression_model_type + " Grid Search CV Best Score {}".format(regression_model_cv.best_score_))
    print(regression_model_type + " Grid Search CV Best Estimator {}".format(regression_model_cv.best_estimator_))

    print(regression_model_type + " Grid Search CV Training Score {}".
          format(regression_model_cv.score(x_train, y_train)))
    print(regression_model_type + " Grid Search CV Testing Score {}\n".
          format(regression_model_cv.score(x_test, y_test)))
    
    print("Adjusted r squared: {}".
          format(1 - (1-regression_model_cv.score(x_test, y_test))*(len(y_test)-1)/(len(y_test)-x_test.shape[1]-1)))
    
    return regression_model_cv.best_estimator_, regression_model_cv.score(x_train, y_train), regression_model_cv.score(x_test, y_test)


## Random Forest

In [15]:
def random_forest_regression(x_train, x_test, y_train, y_test):
    from sklearn.ensemble import RandomForestRegressor

    grid_parameters = {
        'bootstrap': [True, False],
        'max_depth': numpy.arange(start=10, stop=101, step=10),
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': numpy.arange(start=100, stop=501, step=100)
    }

    random_forest_cv = GridSearchCV(RandomForestRegressor(), grid_parameters, verbose=1, cv=5, n_jobs=-1)
    random_forest_cv.fit(x_train, y_train)

    y_predict = random_forest_cv.predict(x_test)

    regression_model = 'Random Forest'

    return print_regression_result(random_forest_cv, x_train, x_test, y_train, y_test, y_predict, regression_model)


## Linear Regression

In [8]:
def linear_regression(x_train, x_test, y_train, y_test):
    from sklearn.linear_model import LinearRegression

    # parameters can be improved
    grid_parameters = {
        'normalize': [True, False],
        'fit_intercept': [True, False],
    }

    linear_regression_cv = GridSearchCV(LinearRegression(), grid_parameters, verbose=1, cv=5, n_jobs=-1)
    linear_regression_cv.fit(x_train, y_train)

    y_predict = linear_regression_cv.predict(x_test)

    regression_model = 'Linear Regression'

    return print_regression_result(linear_regression_cv, x_train, x_test, y_train, y_test, y_predict, regression_model)



## Ridge Regression

In [9]:
def ridge_regression(x_train, x_test, y_train, y_test):
    from sklearn.linear_model import Ridge

    # parameters can be improved
    grid_parameters = {
        'alpha': numpy.linspace(1, 21),
        'normalize': [True, False],
        'fit_intercept': [True, False],
        'tol': [1e-5, 1e-4, 1e-3]
    }

    ridge_regression_cv = GridSearchCV(Ridge(), grid_parameters, verbose=1, cv=5, n_jobs=-1)
    ridge_regression_cv.fit(x_train, y_train)

    y_predict = ridge_regression_cv.predict(x_test)

    regression_model = 'Ridge Regression'

    return print_regression_result(ridge_regression_cv, x_train, x_test, y_train, y_test, y_predict, regression_model)

    

## Lasso Regression

In [10]:
def lasso_regression(x_train, x_test, y_train, y_test):
    from sklearn.linear_model import Lasso

    # parameters can be improved
    grid_parameters = {
        'alpha': numpy.linspace(1, 21),
        'normalize': [True, False],
        'fit_intercept': [True, False],
        'tol': [1e-5, 1e-4, 1e-3],
        'max_iter': [10000, 20000, 30000],
    }

    lasso_regression_cv = GridSearchCV(Lasso(), grid_parameters, verbose=1, cv=5, n_jobs=-1)
    lasso_regression_cv.fit(x_train, y_train)

    y_predict = lasso_regression_cv.predict(x_test)

    regression_model = 'Lasso Regression'

    return print_regression_result(lasso_regression_cv, x_train, x_test, y_train, y_test, y_predict, regression_model)

    

## Poisson Regression

In [11]:
def poission_regression(x_train, x_test, y_train, y_test):
    from sklearn.linear_model import PoissonRegressor
    # parameters can be improved
    grid_parameters = {
        'alpha': numpy.linspace(1, 21),
        'warm_start': [True, False],
        'fit_intercept': [True, False],
        'tol': [1e-5, 1e-4, 1e-3],
        'max_iter': [10000, 20000, 30000],
    }

    poisson_regression_cv = GridSearchCV(PoissonRegressor(), grid_parameters, verbose=1, cv=5, n_jobs=-1)
    poisson_regression_cv.fit(x_train, y_train)

    y_predict = poisson_regression_cv.predict(x_test)

    regression_model = 'Poisson Regression'

    return print_regression_result(poisson_regression_cv, x_train, x_test, y_train, y_test, y_predict, regression_model)

## Train Test Split 

In [12]:
def get_x_and_y(processed_data_frame):
    x = processed_data_frame.drop(['Heating Load', 'Cooling Load'], axis=1)
    y_heating = processed_data_frame['Heating Load']
    y_cooling = processed_data_frame['Cooling Load']

    return x, y_heating, y_cooling


def train_set_test_set(x, y):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    x = StandardScaler().fit_transform(x)
    return train_test_split(x, y, test_size=0.2)

## Visualise Result

In [16]:
def tabulate(results):
    header = pandas.MultiIndex.from_product([['Heating','Cooling'],
                                     ['Train Score','Test Score']],
                                    names=['Energy Load','Result'])
    idx = ['Ridge Regression', 'Lasso Regression', 'Linear Regression', 'Poisson Regression', 'Random Forest Regression']
    result_df = pandas.DataFrame(index = idx, columns = header)
    
    random_forest, ridge, lasso, linear, poisson = results
    
    result_df.loc[:, ('Heating', 'Train Score')] = [ridge[0][1], lasso[0][1], linear[0][1], poisson[0][1], random_forest[0][1]]
    result_df.loc[:, ('Heating', 'Test Score')] = [ ridge[0][2], lasso[0][2], linear[0][2], poisson[0][2], random_forest[0][2]]
    result_df.loc[:, ('Cooling', 'Train Score')] = [ridge[1][1], lasso[1][1], linear[1][1], poisson[1][1], random_forest[1][1]]
    result_df.loc[:, ('Cooling', 'Test Score')] = [ridge[1][2], lasso[1][2], linear[1][2], poisson[1][2], random_forest[1][2]]
    
    print(result_df)
    

## Main Function

In [17]:
def main():
    # The dataset contains eight attributes (or features, denoted by X1...X8) and two responses (or outcomes,
    # denoted by y1 and y2). The aim is to use the eight features to predict each of the two responses
    
    processed_data_frame = preprocess_data_frame(read_data_as_data_frame())
    x, y_heating, y_cooling = get_x_and_y(processed_data_frame)

    models = [ridge_regression, lasso_regression, linear_regression, poission_regression, random_forest_regression]
    
    def train_and_test(model):
        x_train, x_test, y_heating_train, y_heating_test = train_set_test_set(x, y_heating)
        best_heating = model(x_train, x_test, y_heating_train, y_heating_test)
        
        x_train, x_test, y_cooling_train, y_cooling_test = train_set_test_set(x, y_cooling)
        best_cooling = model(x_train, x_test, y_cooling_train, y_cooling_test)
                
        return [best_heating, best_cooling]

    results = list(map(train_and_test, models))
    tabulate(results)
    
if __name__ == '__main__':
    main()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 1 to 1296
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Relative Compactness       1296 non-null   object
 1   Surface                    1296 non-null   object
 2   Wall Area                  1296 non-null   object
 3   Roof Area                  1296 non-null   object
 4   Overall Height             1296 non-null   object
 5   Orientation                1296 non-null   object
 6   Glazing Area               1296 non-null   object
 7   Glazing Area Distribution  1296 non-null   object
 8   Heating Load               1296 non-null   object
 9   Cooling Load               1296 non-null   object
dtypes: object(10)
memory usage: 101.4+ KB
None
Data does not contain any NaN value
Data does not contain any Null value
<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 1 to 768
Data columns (total 7 columns):
 #   Col

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:    7.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Ridge Regression Grid Search CV Best Score 0.8953519490481904
Ridge Regression Grid Search CV Best Estimator Ridge(tol=1e-05)
Ridge Regression Grid Search CV Training Score 0.9000822948435954
Ridge Regression Grid Search CV Testing Score 0.928164338286723

Adjusted r squared: 0.9257374578234365
Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Ridge Regression Grid Search CV Best Score 0.8767831584084261
Ridge Regression Grid Search CV Best Estimator Ridge(tol=1e-05)
Ridge Regression Grid Search CV Training Score 0.880386840401586
Ridge Regression Grid Search CV Testing Score 0.8883040634297871

Adjusted r squared: 0.8845305520591717
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 8838 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:    9.2s finished


Lasso Regression Grid Search CV Best Score 0.8400643820577038
Lasso Regression Grid Search CV Best Estimator Lasso(max_iter=10000, tol=1e-05)
Lasso Regression Grid Search CV Training Score 0.8448710527811366
Lasso Regression Grid Search CV Testing Score 0.8434378503364446

Adjusted r squared: 0.8381485885234866
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 8838 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:    9.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


Lasso Regression Grid Search CV Best Score 0.8166124679756956
Lasso Regression Grid Search CV Best Estimator Lasso(max_iter=10000, tol=1e-05)
Lasso Regression Grid Search CV Training Score 0.8199159419922368
Lasso Regression Grid Search CV Testing Score 0.8326050429755372

Adjusted r squared: 0.826949807940927
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Linear Regression Grid Search CV Best Score 0.9042164564262677
Linear Regression Grid Search CV Best Estimator LinearRegression(normalize=True)
Linear Regression Grid Search CV Training Score 0.9066665486093857
Linear Regression Grid Search CV Testing Score 0.9028902434398559

Adjusted r squared: 0.8996095084209321
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Linear Regression Grid Search CV Best Score 0.8772062510206935
Linear Regression Grid Search CV Best Estimator LinearRegression(normalize=True)
Linear Regression Grid Search CV Training Score 0.8790934379756637
Linear Regression Grid Search CV Testing

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 2680 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 6680 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:   26.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Poisson Regression Grid Search CV Best Score 0.9160405753564682
Poisson Regression Grid Search CV Best Estimator PoissonRegressor(max_iter=10000, tol=1e-05, warm_start=True)
Poisson Regression Grid Search CV Training Score 0.9187779380519988
Poisson Regression Grid Search CV Testing Score 0.8915224505437771

Adjusted r squared: 0.8878576684675533
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 2680 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 6680 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:   25.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Poisson Regression Grid Search CV Best Score 0.8941448387383278
Poisson Regression Grid Search CV Best Estimator PoissonRegressor(max_iter=10000, tol=1e-05, warm_start=True)
Poisson Regression Grid Search CV Training Score 0.8975495416617187
Poisson Regression Grid Search CV Testing Score 0.8950873940697218

Adjusted r squared: 0.89154304927478
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 36.7min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 42.2min
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed: 44.5min finished


Random Forest Grid Search CV Best Score 0.9967447699024256
Random Forest Grid Search CV Best Estimator RandomForestRegressor(max_depth=90, n_estimators=300)
Random Forest Grid Search CV Training Score 0.9996176357977228
Random Forest Grid Search CV Testing Score 0.9974240446585745

Adjusted r squared: 0.997337019140283
Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 29.9min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 34.1min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 38.6min
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed: 40.5min finished


Random Forest Grid Search CV Best Score 0.9671955359926063
Random Forest Grid Search CV Best Estimator RandomForestRegressor(max_depth=100)
Random Forest Grid Search CV Training Score 0.995404942518749
Random Forest Grid Search CV Testing Score 0.9775618910990339

Adjusted r squared: 0.9768038468794067
Energy Load                  Heating                Cooling           
Result                   Train Score Test Score Train Score Test Score
Ridge Regression            0.844871   0.843438    0.819916   0.832605
Lasso Regression            0.906667   0.902890    0.879093   0.894048
Linear Regression           0.918778   0.891522    0.897550   0.895087
Poisson Regression          0.999618   0.997424    0.995405   0.977562
Random Forest Regression    0.900082   0.928164    0.880387   0.888304
