In [2]:
#Linear Regression using MLE

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [4]:
#reading data from CSV file
dataset = dataset = pd.read_excel('weather_data.xlsx')
dataset = dataset.sample(frac = 1)
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1:].values
dataset

Unnamed: 0,dewptc,hum,wndspd,pressure,rain,smoke,temp
684,29,56,15,1001,0,0,18
570,40,33,14,1016,0,0,6
572,35,33,16,1012,0,0,6
122,29,54,7,1022,0,0,16
450,48,28,12,1003,0,0,4
...,...,...,...,...,...,...,...
789,36,56,11,1003,0,1,18
635,12,42,17,1012,0,0,10
721,20,66,6,1007,1,0,24
959,35,20,2,1013,0,1,2


In [5]:
#Augmenting the X
ones = np.tile([1],(1000,1))
X_hat = np.hstack((X,ones))
print(X_hat)

[[29 56 15 ...  0  0  1]
 [40 33 14 ...  0  0  1]
 [35 33 16 ...  0  0  1]
 ...
 [20 66  6 ...  1  0  1]
 [35 20  2 ...  0  1  1]
 [42 44 19 ...  0  0  1]]


In [6]:
datasize = X.shape[0]
X_train = X_hat[:(datasize*7)//10,:]
y_train = y[:(datasize*7)//10,:]
X_test = X_hat[(datasize*7)//10:,:]
y_test =  y[(datasize*7)//10:,:]

In [7]:
#Calculating the W using MLE
def calculate_W(X,y):
    W = np.matmul(np.transpose(X),X)
    W = np.linalg.inv(W)
    W = np.matmul(W,np.transpose(X))
    W = np.matmul(W,y)
    return W
W = calculate_W(X_train,y_train)
print(W)
print(np.matmul(X_train,W))

[[ 5.89378401e-05]
 [ 5.23486253e-01]
 [-1.14816140e-02]
 [ 1.36528647e-03]
 [ 1.96749847e+00]
 [ 1.38982310e-01]
 [-1.28688645e+01]]
[[17.64250242]
 [ 5.63492784]
 [ 5.60620878]
 [16.71605385]
 [ 3.02318259]
 [10.39784819]
 [15.13095475]
 [15.46281396]
 [29.04266408]
 [11.94598232]
 [29.05390994]
 [18.79565908]
 [13.53670262]
 [18.68722558]
 [12.03208442]
 [13.14900749]
 [23.26779482]
 [12.04007872]
 [17.55788851]
 [16.90200316]
 [14.93819733]
 [15.17244117]
 [16.67127609]
 [13.47211566]
 [13.95362338]
 [11.51958884]
 [ 8.85696251]
 [ 7.82824387]
 [17.16222567]
 [18.7082256 ]
 [ 8.64973211]
 [ 4.65497818]
 [-0.75065428]
 [22.2190648 ]
 [12.49798077]
 [ 7.27491873]
 [19.24484282]
 [ 6.18740751]
 [26.45573944]
 [28.57212669]
 [ 5.56464079]
 [22.79734643]
 [ 7.49076186]
 [12.36745011]
 [16.32356388]
 [ 9.32887463]
 [10.88366809]
 [14.50695824]
 [ 4.16392792]
 [24.04805556]
 [17.71083279]
 [ 4.08346405]
 [18.7683439 ]
 [ 7.72231105]
 [22.2841041 ]
 [17.75242176]
 [17.62525943]
 [12.377998

In [8]:
#Calculating the sigma^2 (Variance) using MLE
def calculate_var(X,y,W):
    n = X.shape[0]
    var = np.linalg.norm(y - np.matmul(X,W))**2
    var = (1/n)*var
    return var
var = calculate_var(X_train,y_train,W)
print(var)

2.9213562293450632


In [9]:
def Accuracy_of_classifier(X,y,W):
    pred = np.matmul(X,W)
    SE = 0
    for i in range(len(pred)):
        SE += (y[i][0]-pred[i][0])**2
    MSE = (1/X.shape[0])*SE
    return MSE

In [10]:
Accuracy_of_classifier(X_train,y_train,W)

2.921356229345061

In [14]:
Accuracy_of_classifier(X_test,y_test,W)

2.9011432386930354

In [22]:
def Normalize_dataset(X):
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis = 0)
    X = (X-X_mean)/X_std
    return X



In [33]:
X_train_1 = X[:(datasize*7)//10,:]
X_train_norm = Normalize_dataset(X_train_1)
X_train_norm = np.hstack((X_train_norm,ones[:700]))
print(X_train_norm)
W1= calculate_W(X_train_norm,y_train)
print(Accuracy_of_classifier(X_train_norm,y_train,W1))

[[-0.04219716  0.4181223   0.51494448 ... -0.67251926 -0.57075176
   1.        ]
 [ 0.82656793 -1.10180749  0.30525214 ... -0.67251926 -0.57075176
   1.        ]
 [ 0.4316747  -1.10180749  0.72463683 ... -0.67251926 -0.57075176
   1.        ]
 ...
 [ 0.51065335  1.27721305  0.0955598  ...  1.48694626 -0.57075176
   1.        ]
 [-1.93768463 -0.30880064  1.77309855 ... -0.67251926 -0.57075176
   1.        ]
 [-0.67402632 -0.50705235  1.5634062  ... -0.67251926 -0.57075176
   1.        ]]
2.921356229345062


In [26]:
#standardizing the features
print(X_train[0])
print(Normalize_dataset(X_hat)[0])

[  29   56   15 1001    0    0    1]
[-0.02513788  0.42563841  0.51546669 -1.74402762 -0.67341545 -0.57735027
         nan]


In [13]:
#Cost function
