# Generalized Linear Regression

## Simple linear regression on the Advertising dataset (TV vs. sales)

## Library

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Dataset

In [7]:
# Load the Advertising data; the bare file name is resolved relative to the
# notebook's current working directory.
df = pd.read_csv('Advertising.csv')

In [8]:
df

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [9]:
# Keep only the predictor (TV) and the target (sales).
# errors='ignore' makes the cell safe to re-run: once the two columns are gone,
# executing it again is a no-op instead of raising KeyError.
df = df.drop(columns=['radio', 'newspaper'], errors='ignore')
df

Unnamed: 0,TV,sales
0,230.1,22.1
1,44.5,10.4
2,17.2,9.3
3,151.5,18.5
4,180.8,12.9
...,...,...
195,38.2,7.6
196,94.2,9.7
197,177.0,12.8
198,283.6,25.5


## Train Test Split

In [11]:
# Slicing refresher for the column selection used further down:
# [:-1] keeps everything except the last item, [-1] is the last item itself.
l = list(range(1, 6))
print(l[:-1], l[-1], sep='\n')

[1, 2, 3, 4]
5


In [12]:
# Selecting a single column label returns a 1-D Series (checked with x.ndim next).
x = df.loc[:, 'TV']
x

0      230.1
1       44.5
2       17.2
3      151.5
4      180.8
       ...  
195     38.2
196     94.2
197    177.0
198    283.6
199    232.1
Name: TV, Length: 200, dtype: float64

In [13]:
x.ndim

1

#### Here the dimension of x is 1 (a pandas Series). The regression code after the train/test split works column by column (`x_train.columns`), so x has to be 2-dimensional (a DataFrame or a 2-D array) before it is split.

#### So x can be made 2-dimensional in any of these ways:
#### i)   x = np.array(df['TV']).reshape(len(x),1)
#### ii)  x = df[['TV']]
#### iii) x = df[df.columns[:-1]]

In [14]:
# x = np.array(df['TV']).reshape(len(x),1)
# x
# x.ndim

In [15]:
# x = df[['TV']]
# x
# x.ndim

In [16]:
# Features = every column except the last one. Slicing the column axis (instead of
# picking a single label) keeps the result a 2-D DataFrame even with one column.
x = df.iloc[:, :-1]
x

Unnamed: 0,TV
0,230.1
1,44.5
2,17.2
3,151.5
4,180.8
...,...
195,38.2
196,94.2
197,177.0
198,283.6


In [17]:
x.ndim

2

In [18]:
# Target = the last column. The slice -1: (rather than the scalar -1) keeps it a
# one-column DataFrame, so y is 2-D like x.
y = df.iloc[:, -1:]
y

Unnamed: 0,sales
0,22.1
1,10.4
2,9.3
3,18.5
4,12.9
...,...
195,7.6
196,9.7
197,12.8
198,25.5


In [19]:
y.ndim

2

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; random_state is pinned so the same
# split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [21]:
x_train

Unnamed: 0,TV
131,265.2
96,197.6
181,218.5
19,147.3
153,171.3
...,...
67,139.3
192,17.2
117,76.4
47,239.9


In [22]:
# reset_index() moves the original row labels into a leading 'index' column.
# The normal-equation cells below never read that column's values: position 0
# stands in for the intercept term, so len(x_train.columns) = intercept + features.
# The guard keeps this cell idempotent: running it a second time would otherwise
# prepend another column ('level_0') and silently change the size of the system.
# Note that y_train is NOT re-indexed, which is why later cells pair x and y by
# position (.values) rather than by label.
if 'index' not in x_train.columns:
    x_train = x_train.reset_index()
x_train

Unnamed: 0,index,TV
0,131,265.2
1,96,197.6
2,181,218.5
3,19,147.3
4,153,171.3
...,...,...
135,67,139.3
136,192,17.2
137,117,76.4
138,47,239.9


In [23]:
y_train

Unnamed: 0,sales
131,12.7
96,11.7
181,12.2
19,14.6
153,19.0
...,...
67,13.4
192,5.9
117,9.4
47,23.2


# Linear Regression

In [24]:
# Number of unknowns in the normal equations: the placeholder column at position 0
# (intercept) plus one per feature column.
n = x_train.shape[1]
n

2

In [25]:
# Coefficient matrix of the normal equations (n x n); zero-initialised here and
# filled in a later cell.
A = np.zeros(shape=(n, n))
A

array([[0., 0.],
       [0., 0.]])

In [26]:
# Right-hand side of the normal equations as an (n, 1) column vector; zero-initialised
# here and filled in a later cell.
H = np.zeros(shape=(n, 1))
H

array([[0.],
       [0.]])

In [27]:
# Fill A with the normal-equation sums for y = b0 + b1*x1 + ... :
#   A[0, 0]           = number of training rows
#   A[0, j] = A[j, 0] = sum(x_j)
#   A[i, j]           = sum(x_i * x_j)
# Treating position 0 as a column of ones turns all of these into the single
# product X^T X, replacing the element-by-element pandas lookups.
# copy=True guarantees that overwriting column 0 never touches x_train itself.
design = x_train.to_numpy(dtype=float, copy=True)
design[:, 0] = 1.0          # placeholder column -> intercept column of ones
A[:, :] = design.T @ design  # written in place into the pre-allocated (n, n) array

A

array([[1.40000000e+02, 2.15279000e+04],
       [2.15279000e+04, 4.29596957e+06]])

In [28]:
# Fill H with the right-hand side of the normal equations:
#   H[0] = sum(y)      H[i] = sum(x_i * y)
# i.e. X^T y with position 0 of X treated as a column of ones.
# Raw arrays are used on purpose: x_train was re-indexed while y_train still
# carries the original row labels, so the two must be paired by position.
design = x_train.to_numpy(dtype=float, copy=True)
design[:, 0] = 1.0          # placeholder column -> intercept column of ones
target = y_train[y_train.columns[0]].to_numpy(dtype=float)
H[:, 0] = design.T @ target  # written in place into the pre-allocated (n, 1) array

H

array([[  2009.8 ],
       [354203.41]])

In [29]:
# Cramer's rule needs one matrix per unknown: A with its k-th column replaced by H.
rhs = H.reshape(n,)  # flatten the (n, 1) column so it can be assigned to a matrix column
A_list = []
for col in range(n):
    replaced = np.array(A)  # independent copy, so A itself is left untouched
    replaced[:, col] = rhs
    A_list.append(replaced)

A_list

[array([[2.00980000e+03, 2.15279000e+04],
        [3.54203410e+05, 4.29596957e+06]]),
 array([[1.4000000e+02, 2.0098000e+03],
        [2.1527900e+04, 3.5420341e+05]])]

In [30]:
# Cramer's rule: b_k = det(A_k) / det(A), where A_k is A with column k replaced by H.
# det(A) is the same for every k, so it is computed once instead of once per unknown.
# A zero (or non-finite) determinant means the system has no unique solution; fail
# with a clear message rather than returning inf/nan coefficients.
det_A = np.linalg.det(A)
if det_A == 0 or not np.isfinite(det_A):
    raise ValueError('Normal-equation matrix A is singular; the coefficients are not uniquely determined.')

b_list = [np.linalg.det(A_k) / det_A for A_k in A_list]

b_list

[7.310810165411687, 0.04581434217189627]

In [31]:
# b_list[0] is the constant term b0 (the prediction cell starts every sum from it).
print('intercept :',b_list[0])

intercept : 7.310810165411687


In [32]:
# Entries 1..n-1 of b_list are the slopes, labelled b1, b2, ... in column order.
print('coefficients :')
for k, slope in enumerate(b_list[1:n], start=1):
    print('b'+str(k),':',slope)

coefficients :
b1 : 0.04581434217189627


In [33]:
x_test

Unnamed: 0,TV
18,69.2
170,50.0
107,90.4
98,289.7
177,170.2
182,56.2
5,8.7
146,240.1
12,23.8
152,197.6


In [34]:
y_test

Unnamed: 0,sales
18,11.3
170,8.4
107,8.7
98,25.4
177,11.7
182,8.7
5,7.2
146,13.2
12,9.2
152,16.6


In [35]:
# y_prediction
#
# Predict the target for every test row: y_hat = b0 + b1*x1 + ... .
# x_test has no placeholder column, so its k-th column pairs with b_list[k + 1].
# A single matrix-vector product replaces the per-cell chained lookups
# (x_test[col][row]) and the manual reset of the running sum after each row.
intercept = b_list[0]
slopes = np.asarray(b_list[1:], dtype=float)
features = x_test.to_numpy(dtype=float)
if features.shape[1] != slopes.shape[0]:
    raise ValueError('x_test must have exactly one column per slope coefficient in b_list[1:]')

# Kept as a plain list so len(y_cap) and y_cap[i] keep working in later cells.
y_cap = (intercept + features @ slopes).tolist()

y_cap

[10.48116264370691,
 9.6015272740065,
 11.452426697751111,
 20.583225092610036,
 15.108411203068432,
 9.885576195472257,
 7.709394942307185,
 18.31083372088398,
 8.401191509102818,
 16.36372417857839,
 19.282097774928182,
 11.305820802801042,
 14.485336149530642,
 15.914743625293806,
 10.811025907344561,
 12.81769409447362,
 19.479099446267337,
 7.3428802049320145,
 10.73314152565234,
 17.087590784894353,
 20.487014974049057,
 13.738562372128733,
 15.332901479710724,
 13.344559029450426,
 9.761877471608138,
 12.950555686772118,
 14.89766522907771,
 16.702750310650423,
 17.57322281191645,
 8.460750153926284,
 10.417022564666254,
 16.423282823401856,
 20.17089601306297,
 18.384136668359016,
 7.915559482080718,
 8.167538364026147,
 10.041344958856705,
 15.594043230090534,
 10.444511169969392,
 8.456168719709094,
 9.00135939155466,
 8.75396194382642,
 13.857679661775665,
 16.913496284641145,
 17.179219469238145,
 12.015943106465434,
 7.704813508089995,
 8.085072548116734,
 13.0696729764190

In [36]:
len(y_cap)

60

## Model Evaluation on Test Dataset

In [37]:
y_test

Unnamed: 0,sales
18,11.3
170,8.4
107,8.7
98,25.4
177,11.7
182,8.7
5,7.2
146,13.2
12,9.2
152,16.6


In [38]:
# R Square
#
# R^2 = 1 - SS_res / SS_tot, with
#   SS_res (sum1) = sum((y - y_hat)^2)
#   SS_tot (sum2) = sum((y - mean(y))^2)
# Both sums are taken over plain arrays, so actual and predicted values are paired
# by position; y_cap was built in x_test row order, which matches y_test row order.
y_true = y_test[y_test.columns[0]].to_numpy(dtype=float)
y_pred = np.asarray(y_cap, dtype=float)
if y_true.shape != y_pred.shape:
    raise ValueError('y_test and y_cap must contain the same number of values')

y_mean = y_true.mean()

sum1 = np.sum((y_true - y_pred) ** 2)  # residual sum of squares
sum2 = np.sum((y_true - y_mean) ** 2)  # total sum of squares

r2 = 1 - (sum1 / sum2)

print(r2)

0.725606346597073


In [39]:
from sklearn.metrics import r2_score

# Cross-check the hand-computed R^2 with scikit-learn.
# Argument order matters: actual values first, predicted values second.
r2 = r2_score(y_true=y_test, y_pred=y_cap)
print(r2)

0.7256063465970731


In [40]:
# MSE
#
# Mean squared error = sum((y - y_hat)^2) / N.
# Computed on plain arrays so actual and predicted values are paired by position,
# without building a Series for every row.
y_true = y_test[y_test.columns[0]].to_numpy(dtype=float)
y_pred = np.asarray(y_cap, dtype=float)
if y_true.shape != y_pred.shape:
    raise ValueError('y_test and y_cap must contain the same number of values')
if y_pred.size == 0:
    raise ValueError('cannot compute MSE without any predictions')

sum1 = np.sum((y_true - y_pred) ** 2)  # residual sum of squares

mse = sum1 / (len(y_cap))
print(mse)

7.497479593464671


In [41]:
# RMSE
#
# Square root of the MSE computed in the previous cell, which expresses the error
# in the same units as the target.
rmse = np.sqrt(mse)
print(rmse)

2.7381525876883983


In [42]:
from sklearn.metrics import mean_squared_error

# Cross-check the hand-computed MSE / RMSE with scikit-learn
# (actual values first, predicted values second).
mse = mean_squared_error(y_true=y_test, y_pred=y_cap)
rmse = np.sqrt(mse)
print('MSE :',mse)
print('RMSE :',rmse)

MSE : 7.497479593464671
RMSE : 2.7381525876883983
