# Assessment 2 - Part 1: Machine Learning
<br>

## Task: train regression models that predict two indicators of energy efficiency based on 8 numerical inputs. 
<br>

### Packages needed:

- pandas
- sklearn
- numpy
- matplotlib

### 1.1 - Data Preparation

#### Importing libraries

In [3]:
## Imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

## Used for normalising the data
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

## Used for regression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

## Used for assessment of regression
from sklearn.metrics import mean_squared_error

## Used for cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

print("imported libraries")

imported libraries


#### Loading the dataset

In [4]:
energy_data = pd.read_excel("data.xlsx")

energy_data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


#### Extracting and splitting data

In [5]:
## Extracting Data

#inputs
inputs = energy_data.values[:,:8].astype(float)

#normalise the inputs
scaler = MinMaxScaler()
scaled_inputs = scaler.fit_transform(inputs)

#targets
targets_Y1 = energy_data["Y1"]
targets_Y2 = energy_data["Y2"]
targets =  energy_data.values[:,8:].astype(float)

print("scaled data")

scaled data


#### Random Sample generator

In [6]:
sample = energy_data.sample(frac=1).reset_index(drop=True)

print( sample.head())

     X1     X2     X3      X4   X5  X6    X7  X8     Y1     Y2
0  0.62  808.5  367.5  220.50  3.5   5  0.25   2  14.60  15.30
1  0.90  563.5  318.5  122.50  7.0   3  0.10   2  29.68  29.44
2  0.71  710.5  269.5  220.50  3.5   5  0.25   4  12.28  15.64
3  0.98  514.5  294.0  110.25  7.0   5  0.40   4  32.74  33.88
4  0.90  563.5  318.5  122.50  7.0   4  0.25   2  32.33  32.77


#### Setting up train and test data

In [7]:
#set up data
x_train, x_test, y_joint_train, y_joint_test = train_test_split(scaled_inputs, targets, test_size=0.3)
x1_train, x1_test, y1_train, y1_test = train_test_split(scaled_inputs, targets_Y1, test_size=0.3)
x2_train, x2_test, y2_train, y2_test = train_test_split(scaled_inputs,targets_Y2, test_size=0.3)

print("set up train and test data")

set up train and test data


### Task 1.2 - Regression

#### Regression using the "sklearn.neural_network.MLPRegressor"

In [9]:
MLP = MLPRegressor(max_iter=20000)
MLP.fit(x_train, y_joint_train)
MLP_Outputs = MLP.predict(x_train)

MLP_CV = abs(cross_val_score(MLP, x_test, y_joint_test, cv=10, scoring='neg_mean_squared_error'))
print(MLP_CV)

MLP_CV2 = abs(cross_val_score(MLP, x_train, y_joint_train, cv=10, scoring='neg_mean_squared_error'))
print(MLP_CV2)

[16.20939596 10.28982448  6.66842684 15.49162158 17.18668986 11.92337926
 11.62245768 10.6605699  16.07778821 12.88750386]
[8.7437944  7.05721733 1.38187999 1.95175357 4.3386912  6.25013752
 6.52999162 5.36780887 6.64665345 5.14780621]


#### Regression using the "sklearn.ensemble.RandomForestRegressor"

In [None]:
Forest = RandomForestRegressor()

Forest.fit(x_train, y_joint_train)
Forest_Outputs = Forest.predict(x_train)

Forest_CV = abs(cross_val_score(Forest, x_test, y_joint_test, cv=10, scoring='neg_mean_squared_error'))
print(Forest_CV)
Forest_CV2 = abs(cross_val_score(Forest, x_train, y_joint_train, cv=10, scoring='neg_mean_squared_error'))
print(Forest_CV2)


#### Regression using the "sklearn.svm.SVR"

In [None]:
SVR_poly = SVR()
SVR_1 = SVR_poly.fit(x_train, y1_train)
SVR_2 = SVR_poly.fit(x_train, y2_train)

SVR_Outputs = SVR_poly.predict(x_train)

SVR_1_CV = abs(cross_val_score(SVR_poly, x1_test, y1_test, cv=10, scoring='neg_mean_squared_error'))
SVR_2_CV = abs(cross_val_score(SVR_poly, x2_test, y2_test, cv=10, scoring='neg_mean_squared_error'))
print(SVR_1_CV)
print(SVR_2_CV)

SVR_1_CV1 = abs(cross_val_score(SVR_poly, x1_train, y1_train, cv=10, scoring='neg_mean_squared_error'))
SVR_2_CV2 = abs(cross_val_score(SVR_poly, x2_train, y2_train, cv=10, scoring='neg_mean_squared_error'))
print(SVR_1_CV1)
print(SVR_2_CV2)

### Task 1.3 - Assessment of  Regression

#### Boxplot

In [None]:
VALS_Data = [Forest_CV2, MLP_CV2, SVR_1_CV1, SVR_2_CV2]

# Creating plot
plt.boxplot(VALS_Data)
  
# Adding title 
plt.title("Cross Validation - Training data")

plt.xticks([1, 2, 3, 4], ['Forest', 'MLP', 'SVR_Y1', 'SVR_Y2'])
plt.xlabel('Regressor Model')
plt.ylabel('MSE Rate')

# show plot
plt.show()

In [None]:
CV_TEST_DATA = [Forest_CV, MLP_CV, SVR_1_CV, SVR_2_CV]

# Creating plot
plt.boxplot(CV_TEST_DATA)
  
# Adding title 
plt.title("Cross Validation - Testing data")

plt.xticks([1, 2, 3, 4], ['Forest', 'MLP', 'SVR_Y1', 'SVR_Y2'])
plt.xlabel('Regressor Model')
plt.ylabel('MSE Rate')

# show plot
plt.show()