# Import Packages

In [199]:
import csv
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy import stats
import matplotlib
matplotlib.use('qt5agg')
import matplotlib.pyplot as plt
import seaborn
import math

  # Remove the CWD from sys.path while we load stuff.


# Loading Data

In [200]:
# Read the cereals dataset into a DataFrame and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific location.
cereals_csv_path = "D:/WSU/Neural Network/Neural-Network/Cereals.csv"
cereals = pd.read_csv(cereals_csv_path)
cereals.head()

Unnamed: 0,Name,Manuf,Type,Calories,Protein,Fat,Sodium,Fiber,Carbo,Sugars,...,Weight,Cups,Rating,Cold,Nabisco,Quaker,Kelloggs,GeneralMills,Ralston,AHFP
0,100%_Bran,N,C,70,4,1,130,10.0,5.0,6.0,...,1.0,0.33,68.402973,1,1,0,0,0,0,0
1,100%_Natural_Bran,Q,C,120,3,5,15,2.0,8.0,8.0,...,1.0,1.0,33.983679,1,0,1,0,0,0,0
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,...,1.0,0.33,59.425505,1,0,0,1,0,0,0
3,All-Bran_with_Extra_Fiber,K,C,50,4,0,140,14.0,8.0,0.0,...,1.0,0.5,93.704912,1,0,0,1,0,0,0
4,Almond_Delight,R,C,110,2,2,200,1.0,14.0,8.0,...,1.0,0.75,34.384843,1,0,0,0,0,1,0


# Data Cleaning

In [201]:
# Replace missing values in every numeric column with that column's mean.
# The result is assigned back because DataFrame.fillna returns a new frame
# rather than modifying `cereals`; numeric_only=True keeps the text columns
# (Name, Manuf, Type) out of the mean computation. 'Sugars' is numeric, so it
# is covered by this single frame-wide fill and needs no separate in-place call.
cereals = cereals.fillna(cereals.mean(numeric_only=True))

In [202]:
# Check whether any missing values remain in the Sugars column
cereals['Sugars'].isna().values.any()


False

In [203]:
# Check whether any missing values remain in the Rating column
cereals['Rating'].isna().values.any()


False

In [204]:
# Check whether any missing values remain in the Fiber column
cereals['Fiber'].isna().values.any()

False

In [205]:
# Pull the two single predictors (Sugars, Fiber) and the response (Rating)
# out of the frame as (n, 1) column arrays.
x_sugar = cereals['Sugars'].values[:, np.newaxis]
x_fiber = cereals['Fiber'].values[:, np.newaxis]
y_rating = cereals['Rating'].values[:, np.newaxis]

# Defining The Model

In [206]:
# Simple linear regression: Rating explained by Sugars alone
model = LinearRegression()
result_sugar = model.fit(X=x_sugar, y=y_rating)


In [207]:
# Simple linear regression: Rating explained by Fiber alone
model = LinearRegression()
result_fiber = model.fit(X=x_fiber, y=y_rating)

In [208]:
# Multiple linear regression: Rating explained by Sugars and Fiber
x_multi_linear = cereals.loc[:, ['Sugars', 'Fiber']]
multi_linear = LinearRegression()
result_multi = multi_linear.fit(X=x_multi_linear, y=y_rating)

In [209]:
# Multiple linear regression: Rating explained by Sugars, Fiber and Fat
x_multi_linear_fat = cereals.loc[:, ['Sugars', 'Fiber', 'Fat']]
multi_linear_fat = LinearRegression()
result_multi_fat = multi_linear_fat.fit(X=x_multi_linear_fat, y=y_rating)

In [210]:
# Multiple linear regression: Rating explained by Sugars, Fiber and Protein
x_multi_linear_protein = cereals.loc[:, ['Sugars', 'Fiber', 'Protein']]
multi_linear_protein = LinearRegression()
result_multi_protein = multi_linear_protein.fit(X=x_multi_linear_protein, y=y_rating)

In [211]:
# Multiple linear regression: Rating explained by Sugars, Fiber and Sodium
x_multi_linear_sodium = cereals.loc[:, ['Sugars', 'Fiber', 'Sodium']]
multi_linear_sodium = LinearRegression()
result_multi_sodium = multi_linear_sodium.fit(X=x_multi_linear_sodium, y=y_rating)


In [212]:
class Linear_regression:
    '''Goodness-of-fit statistics for an already-fitted regression model.

    Wraps a predictor matrix, the observed response and a fitted estimator
    (any object exposing ``predict``, ``coef_`` and ``intercept_``) and
    derives sums of squares and summary statistics from them.

    Parameters
    ----------
    x : array-like exposing a two-element ``shape``
        Predictors; ``x.shape[0]`` is used as the number of observations and
        ``x.shape[1]`` as the number of predictors.
    y : array-like
        Observed response values, one per row of ``x``.
    model : fitted estimator
        Model whose predictions on ``x`` are compared against ``y``.
    '''

    def __init__(self, x, y, model):
        self.data = x
        self.target = y
        self.model = model
        # Total degrees of freedom: n - 1.
        self._dft = x.shape[0] - 1
        # Residual degrees of freedom: n - (number of predictors) - 1.
        self._dfe = x.shape[0] - x.shape[1] - 1

    def _observed(self):
        '''Return the observed response flattened to one dimension.'''
        return np.asarray(self.target).ravel()

    def _predicted(self):
        '''Return the model's predictions on the data, flattened to one dimension.

        Raises
        ------
        ValueError
            If the number of predicted values differs from the number of
            observed values.
        '''
        predicted = np.asarray(self.model.predict(self.data)).ravel()
        observed_size = self._observed().size
        if predicted.size != observed_size:
            raise ValueError(
                'model produced {0} predictions for {1} observed values'.format(
                    predicted.size, observed_size))
        return predicted

    def sse(self):
        '''returns sum of squared errors (model vs actual)'''
        # Both sides are flattened first: subtracting an (n,) target from
        # (n, 1) predictions would otherwise broadcast to an n-by-n matrix
        # and silently inflate the sum.
        residuals = self._observed() - self._predicted()
        return np.sum(residuals ** 2)

    def sst(self):
        '''returns total sum of squared errors (actual vs avg(actual))'''
        observed = self._observed()
        deviations = observed - np.mean(observed)
        return np.sum(deviations ** 2)

    def ssr(self):
        '''returns the regression sum of squares, computed as sst - sse'''
        return (self.sst() - self.sse())

    def r_squared(self):
        '''returns r^2 expressed as a percentage (0-100 scale, not 0-1)'''
        return ((1 - self.sse()/self.sst())*100)

    def adj_r_squared(self):
        '''returns adjusted r^2 expressed as a percentage (0-100 scale)'''
        return ((1 - (self.sse()/self._dfe) / (self.sst()/self._dft))*100)

    def bias(self):
        '''returns the sum of the model's intercept_ entries'''
        intercept = self.model.intercept_
        return(np.sum(intercept))

    def slope(self):
        '''returns the sum of all entries of the model's coef_

        With a single predictor this is that predictor's coefficient; with
        several predictors it is the sum of their coefficients, not any
        individual slope.
        '''
        slope = self.model.coef_
        return(np.sum(slope))

    def see(self, unbiased=False):
        '''returns the standard error of the estimate

        Parameters
        ----------
        unbiased : bool, default False
            When False, the value is sqrt(sse / N) with N the number of
            observations. When True, the residual degrees of freedom
            (N - number of predictors - 1) are used as the divisor instead.

        Raises
        ------
        ValueError
            If ``unbiased`` is True and there are no residual degrees of
            freedom left to divide by.
        '''
        if unbiased:
            if self._dfe <= 0:
                raise ValueError(
                    'not enough observations for the number of predictors')
            divisor = self._dfe
        else:
            divisor = len(self.data)
        ss = self.sse()/divisor
        s_est = math.sqrt(ss)
        return(s_est)
        

In [213]:
def print_stats(stats_obj):
    '''Print one "label value" line per fit statistic of a model wrapper.

    ``stats_obj`` must provide the methods sse, sst, ssr, r_squared,
    adj_r_squared, bias, slope and see. Each value is printed with four
    decimal places and its label padded to at least eight characters.
    Nothing is returned.
    '''
    report = (
        ('sse:', stats_obj.sse()),
        ('sst:', stats_obj.sst()),
        ('ssr:', stats_obj.ssr()),
        ('r^2:', stats_obj.r_squared()),
        ('adj_r^2:', stats_obj.adj_r_squared()),
        ('bias:', stats_obj.bias()),
        ('slope:', stats_obj.slope()),
        ('Standard Error of estimation:', stats_obj.see()),
    )
    for label, value in report:
        print('{0:8} {1:.4f}'.format(label, value))
        

# Sugar Data(Linear)

In [214]:
# Fit statistics for the Sugars-only model
temp = Linear_regression(x=x_sugar, y=y_rating, model=result_sugar)
print_stats(stats_obj=temp)

sse:     6284.8676
sst:     14996.8004
ssr:     8711.9328
r^2:     58.0919
adj_r^2: 57.5332
bias:    59.9604
slope:   -2.4614
Standard Error of estimation: 9.0345


# Fiber Data(Linear)

In [215]:
# Fit statistics for the Fiber-only model
temp = Linear_regression(x=x_fiber, y=y_rating, model=result_fiber)
print_stats(stats_obj=temp)


sse:     9879.2413
sst:     14996.8004
ssr:     5117.5591
r^2:     34.1243
adj_r^2: 33.2460
bias:    35.2566
slope:   3.4430
Standard Error of estimation: 11.3270


# Sugar and Fiber(Multi linear)

In [216]:
# Fit statistics for the Sugars + Fiber model
temp = Linear_regression(x=x_multi_linear, y=y_rating, model=result_multi)
print_stats(stats_obj=temp)

sse:     2784.7129
sst:     14996.8004
ssr:     12212.0875
r^2:     81.4313
adj_r^2: 80.9294
bias:    52.2378
slope:   0.6323
Standard Error of estimation: 6.0137


# Sugar, Fiber and Fat(Multi linear)

In [217]:
# Fit statistics for the Sugars + Fiber + Fat model
temp = Linear_regression(x=x_multi_linear_fat, y=y_rating, model=result_multi_fat)
print_stats(stats_obj=temp)

sse:     2063.8836
sst:     14996.8004
ssr:     12932.9168
r^2:     86.2378
adj_r^2: 85.6723
bias:    53.7061
slope:   -2.2716
Standard Error of estimation: 5.1772


# Sugar, Fiber and Protein(Multi linear)

In [218]:
# Fit statistics for the Sugars + Fiber + Protein model
temp = Linear_regression(x=x_multi_linear_protein, y=y_rating, model=result_multi_protein)
print_stats(stats_obj=temp)

sse:     2764.5487
sst:     14996.8004
ssr:     12232.2517
r^2:     81.5657
adj_r^2: 80.8082
bias:    50.8558
slope:   1.1026
Standard Error of estimation: 5.9919


# Sugar, Fiber and Sodium(Multi linear)

In [219]:
# Fit statistics for the Sugars + Fiber + Sodium model
temp = Linear_regression(x=x_multi_linear_sodium, y=y_rating, model=result_multi_sodium)
print_stats(stats_obj=temp)

sse:     1169.9520
sst:     14996.8004
ssr:     13826.8484
r^2:     92.1987
adj_r^2: 91.8781
bias:    60.9538
slope:   0.5046
Standard Error of estimation: 3.8980
