# Regression and Prediction over multivariate data
An exploratory experiment conducted by Weiye Chen.

Todos:
* Explore Data before modeling
    * ploting and reviewing
* Modeling Time Series Data

## Import Data

In [1]:
import csv
import numpy as np

In [2]:
def loadACLoadSample():
    with open('./ACLoad.csv', newline='') as csvfile:
        table = csv.DictReader(csvfile, delimiter=',', quotechar='|')
        ones = []
        date = []
        hour = []
        temp = []
        humi = []
        sunr = []
        k_1temp = []
        k_1sunr = []
        load = []
        for row in table:
#             date.append(row['date'])
            ones.append(1.0)
#             hour.append(float(row['hour']))
            temp.append(float(row['temp']))
            humi.append(float(row['humi']))
            sunr.append(float(row['sunr']))
            k_1temp.append(float(row['k_1temp']))
            k_1sunr.append(float(row['k_1sunr']))
            load.append(float(row['load']))
        matrix_t = np.mat([np.array(ones), temp, humi, sunr, k_1temp, k_1sunr, load])
        matrix_r = np.transpose(matrix_t)
        return matrix_r

By default, this notebook environment assumes a data matrix with a row of 1.0 at the top. It's essential for OLS, however, they should be eliminated in the machine learning section.

In [3]:
matrix = loadACLoadSample()
print(matrix)

[[   1.    27.9   21.2 ...   26.8  242.6 -385.7]
 [   1.    28.8   21.4 ...   27.9  353.3 -399.2]
 [   1.    29.5   21.2 ...   28.8  491.4 -403.7]
 ...
 [   1.    31.3   22.  ...   32.4  167.8 -379.8]
 [   1.    30.1   21.6 ...   31.3   20.8 -362.7]
 [   1.    29.    21.3 ...   30.1    0.  -348. ]]


Dimension of the matrix

In [4]:
np.shape(matrix)

(299, 7)

In this notebook, columns represents ones, outdoor temperature `temp`, humidity `humi`, exposure to sun radiation `sunr`, outdoor temperature at k-1 hour `k_1temp`, exposure to sun radiation at k-1 hour `k_1sunr` and air condition power load `load`.

## Multivariate Linear Regression
Using Ordinary Least Squares (OLS).

In [5]:
import scipy.stats as stats

In [40]:
class MLR:     
    def __init__(self, x, y = None):
        
        self.init()
        if (y is None):
            self.x = np.array(matrix[:,0:-1])
            self.y = np.array(matrix[:,-1])
            shape = np.shape(matrix)
            self.k = shape[1] - 2
            self.n = shape[0]
        else:
            self.x = x
            self.y = y
            self.k = np.shape(x)[1] - 1
            self.n = np.shape(x)[0]
    
    def init(self):
        self.r2 = 0.
        self.t = []
        self.p_t = []
        self.f = 0.
        self.p_f = 0.
        self.ess = 0.
        self.rss = 0.
        self.tss = 0.
        self.r2_adj = 0.

        
    def OLS(self):
        x = self.x
        y = self.y
        x_t = np.transpose(x)
        xt_x_inv = np.matmul(x_t, x)
        xt_x_inv = np.linalg.inv(xt_x_inv)
        beta = np.matmul(xt_x_inv, x_t)
        beta = np.matmul(beta, y)
        self.beta = beta
        y_e = self.PredictionE(x)
        deviation = self.y - y_e
        deviation = deviation * deviation
        self.rss = np.sum(deviation)
        mean = np.mean(y)
        self.tss = np.sum((y-mean) * (y-mean))
        self.ess = self.tss - self.rss
        self.f = (self.ess / self.k) / (self.rss / (self.n - self.k - 1))
        self.r2 = 1 - self.rss / self.tss
        self.r2_adj = 1 - (self.rss / (self.n - self.k - 1)) / (self.tss / (self.n - 1))
        self.p_f = 1 - stats.f.cdf(self.f, self.k, (self.n - self.k - 1))
        sigma2 = self.rss / (self.n - self.k - 1)
        beta_array = np.array(np.transpose(beta)[0])
        cii = []
        for i in range(0, self.k + 1):
            cii.append(xt_x_inv[i][i])
        cii_np = np.array(cii)
        self.t = beta_array / np.sqrt(cii_np * sigma2)
        self.p_t = 1 - stats.t.cdf(np.absolute(self.t), self.n - self.k - 1)
        
    def PredictionE(self, x):
        return np.matmul(x, self.beta)
    

In [41]:
mlr = MLR(matrix)

In [42]:
mlr.OLS()

In [43]:
beta = mlr.beta
print(beta)

[[ 3.57997287e+02]
 [-1.04921325e+01]
 [-1.82365349e+01]
 [ 7.52564326e-03]
 [-6.42760743e-01]
 [ 1.92977516e-02]]


F检验显著性p value

t检验及是否处于95%置信区间之外

In [44]:
print(mlr.p_f)
print(mlr.t)
print(mlr.p_t < 0.025)

1.1102230246251565e-16
[ 17.8584512   -4.51603903 -21.69461018   0.85423894  -0.33122165
   2.11854944]
[ True  True  True False False  True]


In [45]:
def OLS(matrix):
    x = np.array(matrix[:,0:-1])
    y = np.array(matrix[:,-1])
    x_t = np.transpose(x)
    beta = np.matmul(x_t, x)
    beta = np.linalg.inv(beta)
    beta = np.matmul(beta, x_t)
    beta = np.matmul(beta, y)
    return beta

In [46]:
print("R2 = " + str(mlr.r2))
print("R2_adj = " + str(mlr.r2_adj))

R2 = 0.8423908522946908
R2_adj = 0.8397012763952827


It's suggested in the coefficient of determination that Variables and Target are highly linearly correlated.

## Data Preprocessing
Divide data into two sets with randomness, one for training and one for testing.

In [13]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

Before we move on to the machine learning model, we should normalize the data. It's very important as to improve the performance of the machine learning models.

In [14]:
std = np.array(np.std(matrix, axis = 0))[0]
print(std)

[  0.           2.24364937   2.26701032 258.50252051   2.35136086
 248.54142779  62.03627259]


In [15]:
mean = np.array(np.mean(matrix, axis = 0))[0]
print(mean)

[   1.           29.05652174   19.75451505  313.92240803   28.9548495
  326.12474916 -317.07658863]


In [16]:
matrix1 = preprocessing.scale(matrix[:, 1:])
print(matrix1)

[[-0.51546456  0.63761728  0.15232962 -0.91642654 -0.33605967 -1.10618205]
 [-0.11433237  0.7258392   0.68656039 -0.44861234  0.10933892 -1.32379667]
 [ 0.19765934  0.63761728  0.16702967 -0.06585527  0.66498069 -1.39633488]
 ...
 [ 0.99992374  0.99050495 -1.13392476  1.46517302 -0.63701553 -1.0110764 ]
 [ 0.46508081  0.81406111 -1.21438819  0.99735882 -1.22846622 -0.73543122]
 [-0.02519188  0.68172824 -1.21438819  0.48701606 -1.31215448 -0.49847307]]


In [17]:
def DivideData(matrix, ratioTesting):
    x = np.array(matrix[:,0:-1])
    y = np.array(np.transpose(matrix[:,-1]))
    print(np.shape(x), np.shape(y))
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=ratioTesting)
    return xtrain, xtest, ytrain, ytest

In [18]:
# Global Variables:
xtrain, xtest, ytrain, ytest = DivideData(matrix1, 0.25)

(299, 5) (299,)


## Regression using Machine Learning
In this section, we will use several machine Learning Model to regress the model.

In [19]:
def denormalize(array, col):
    return array * std[col] + mean[col]

#### 1. Supporting Vector Machine Regression

In [20]:
from sklearn.svm import SVR

In [21]:
def SVMR(xtrain, ytrain):
    clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
    clf.fit(xtrain, ytrain)
    return clf

In [22]:
dt = SVMR(xtrain, ytrain)

In [23]:
dt.score(xtest, ytest)

0.7962612458051609

#### 2. Random Forest (Emsembled Decision Trees with randomness)

In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
def randomForest(xtrain, xtest, ytrain, ytest):
    regr = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)
    regr.fit(xtrain, ytrain)
    return regr.feature_importances_, regr.score(xtest, ytest), regr

In [26]:
rf = randomForest(xtrain, xtest, ytrain, ytest)

In [27]:
print("Feature Importances")
print("temp,humi,sunr,k_1temp,k_1sunr,load")
print(rf[0])

Feature Importances
temp,humi,sunr,k_1temp,k_1sunr,load
[0.07698041 0.80087753 0.01528824 0.08568883 0.02116499]


In [28]:
print("Regression Score = " + str(rf[1]))

Regression Score = 0.8310752868015373


In [29]:
rf[2]

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

#### 3. Neural Network

In [30]:
from sklearn.neural_network import MLPRegressor

In [31]:
def multiLayerPerceptron(xtrain, xtest, ytrain, ytest):
    clf = MLPRegressor(solver='lbfgs', activation = 'logistic', alpha=1e-6, hidden_layer_sizes=(5, 5), random_state=1)
    clf.fit(xtrain, ytrain)
    
    return clf

In [32]:
mlp = multiLayerPerceptron(xtrain, xtest, ytrain, ytest)

In [33]:
mlp.score(xtest, ytest)

0.8713677079098294

In [34]:
mlp.predict(xtest)

array([-0.89556847, -0.34585391, -0.34266279,  1.49362466, -1.51394225,
       -0.46249741,  1.78453906, -0.07795122,  1.1570157 ,  1.07000498,
       -0.30969489,  1.04286428,  0.93296135, -1.77354117,  1.75325865,
       -0.01095141,  1.29542966, -0.81956491,  0.63104349,  1.25427681,
        1.64855792, -1.35860491,  0.4104094 ,  1.32427037,  1.13639606,
       -1.6938346 , -0.84663744, -0.58819423,  0.45633156, -0.35078152,
       -0.15292788, -0.34026327,  1.74105853,  0.86131739,  1.09612553,
       -0.43551203, -0.49242549,  0.46476272,  0.78963165, -1.64437031,
       -0.14658968,  1.29028677, -0.4157555 ,  0.39329235,  1.26002757,
        0.32877173,  1.10761753, -1.26606809, -2.26222817, -0.78384978,
        0.91400961,  0.97401071, -0.20333799,  1.70932889,  1.27745573,
       -0.41985653, -0.42434104,  0.90765711,  0.56818347,  1.63228049,
        1.0490695 ,  0.69849221,  0.1337964 , -0.15656692,  0.26586269,
        0.14764795, -1.03739094,  0.9440404 ,  0.7729561 , -0.68

In [35]:
denormalize(mlp.predict(xtest),-1)

array([-372.6343186 , -338.53207623, -338.33411059, -224.41768202,
       -410.99592294, -345.76820387, -206.37043695, -321.91239198,
       -245.29964756, -250.69746817, -336.28890509, -252.38117566,
       -259.19914405, -427.10047238, -208.31095711, -317.75597298,
       -236.71296104, -367.919341  , -277.92900275, -239.26593035,
       -214.80619983, -401.35937335, -291.61631904, -234.92379085,
       -246.57881273, -422.15577359, -369.59881967, -353.56596608,
       -288.76747963, -338.83776647, -326.56366402, -338.18525333,
       -209.0678073 , -263.64366835, -249.07704661, -344.09413141,
       -347.62483075, -288.24444172, -268.09078412, -419.08719371,
       -326.17046598, -237.03200696, -342.86851031, -292.67819696,
       -238.9091747 , -296.68081615, -248.36412546, -395.61873398,
       -457.41679221, -365.70370725, -260.37483903, -256.65259468,
       -329.69091975, -211.03619557, -237.82799674, -343.12292259,
       -343.40112531, -260.76892483, -281.82860375, -215.81599

The performance of Neural Network (MLP) needs to be improved.

## Brief Summary
The predictions achieved by three machine learning models don't significantly outperform the multiple linear regression.

Special Thanks to Anlin Li for providing a time dependent dataset.