In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
sns.set()

In [2]:
data = pd.read_csv('auto_insurance_sweden.csv')

In [3]:
data.head()

Unnamed: 0,x,y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4


In [4]:
x = data['x']
y = data['y']

In [5]:
y


0     392.5
1      46.2
2      15.7
3     422.2
4     119.4
      ...  
58     87.4
59    209.8
60     95.5
61    244.6
62    187.5
Name: y, Length: 63, dtype: float64

In [6]:
x.shape

(63,)

# For sklearn,  you have to create an instance of the linear regression first

In [7]:
reg = LinearRegression()

# Then change the shape from 1d to 2d if you are using only one x value

In [8]:
x_matrix = x.values.reshape(-1,1)

In [9]:
x_matrix.shape

(63, 1)

#  Now fit the model

In [10]:
reg.fit(x_matrix,y)

LinearRegression()

#  To get the statistical summary, we use some commands 

In [11]:
reg.score(x_matrix,y) #this is to the R Squared

0.8333466719794502

In [12]:
reg.coef_ #This is to get the coefficient of the regression

array([3.41382356])

In [13]:
reg.intercept_ # This is to get the intercept

19.994485759114795

#  Predictions

##  In making predictions, you have to insert 2d arrays into the predict method

In [14]:
reg.predict(x_matrix)

array([388.68743025,  84.8571334 ,  64.37419204, 443.30860721,
       156.54742816, 214.58242868,  98.51242764,  67.7880156 ,
       173.61654596,  54.13272136,  37.06360356, 183.85801664,
        57.54654492,  98.51242764,  43.89125068,  26.82213288,
       101.9262512 ,  40.47742712,  30.23595644,  98.51242764,
        40.47742712,  50.7188978 ,  50.7188978 ,  30.23595644,
       118.995369  ,  43.89125068,  33.64978   ,  88.27095696,
        43.89125068,  33.64978   ,  19.99448576, 105.34007476,
        40.47742712,  37.06360356,  95.09860408,  57.54654492,
       228.23772292,  60.96036848,  33.64978   ,  74.61566272,
        64.37419204, 224.82389936, 159.96125172, 146.30595748,
       207.75478156, 159.96125172,  57.54654492, 112.16772188,
        47.30507424,  30.23595644,  78.02948628,  64.37419204,
        64.37419204,  71.20183916,  47.30507424, 118.995369  ,
       122.40919256, 101.9262512 ,  50.7188978 , 125.82301612,
        67.7880156 , 200.92713444, 108.75389832])

#   MULTIPLE LINEAR REGRESSION

In [15]:
newdata = pd.read_csv('Fish.csv')

In [16]:
newdata.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [17]:
x = newdata[['Weight', 'Height']]
y = newdata['Width']

#  You don't need to reshape the model because it already has 2d

In [18]:
reg1 = LinearRegression()
reg1.fit(x,y)

LinearRegression()

In [19]:
coefficient = reg1.coef_ # Getting the coefficient

In [20]:
intercept = reg1.intercept_

In [21]:
reg1.score(x,y) # Getting the R squared

0.833700911093449

#  Sklearn does not have a defined adjusted r squared value hence you have to create your own

##  In creating your own, you use jupyter notebook to write done the formular

$R^2_{adj.} = 1-(1-R^2)*\frac{n-1}{n-p-1}$

In [22]:
r2 = reg1.score(x,y)
n = x.shape[0]
p = x.shape[1]

adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)

In [23]:
adjusted_r2 # Finding the adjusted r squared value


0.831568871492083

##  Feature selection and f_regression

In [24]:
from sklearn.feature_selection import f_regression

In [25]:
f_regression(x,y)

(array([576.28151245, 265.79347778]), array([2.03819464e-54, 1.34754904e-35]))

In [26]:
p_values = f_regression(x,y)[1]


In [27]:
p_values

array([2.03819464e-54, 1.34754904e-35])

###  Round them up to know which one is closer to 0

In [28]:
p_values= p_values.round(3)

###  Since They are all smaller than 0.05 then they are all relevant

#  Creating a summary for the model

In [29]:
reg_summary = pd.DataFrame({'Features':[r2,adjusted_r2,reg1.coef_,reg1.intercept_,p_values]})
reg_summary = reg_summary.rename( index={0:'R Squared',1:'Adjusted R Sqaured', 2:'Coefficients',3:'Intercept',4:'P Values'})
reg_summary

Unnamed: 0,Features
R Squared,0.833701
Adjusted R Sqaured,0.831569
Coefficients,"[0.00309296824002104, 0.12473391697274713]"
Intercept,2.066487
P Values,"[0.0, 0.0]"


#  STANDARDISATION

In [30]:
from sklearn.preprocessing import StandardScaler

In [33]:
scaler = StandardScaler()

In [36]:
scaler.fit(x)

StandardScaler()

In [37]:
x_scaled = scaler.transform(x)

In [38]:
x_scaled


array([[-0.43807217,  0.59657867],
       [-0.30356218,  0.82126055],
       [-0.16344761,  0.79734129],
       [-0.09899491,  0.87977146],
       [ 0.08875862,  0.81283498],
       [ 0.14480444,  1.08395111],
       [ 0.28491901,  1.21901769],
       [-0.02333304,  0.86572884],
       [ 0.14480444,  1.17815367],
       [ 0.28491901,  1.23004114],
       [ 0.21486173,  1.23851352],
       [ 0.28491901,  1.26393066],
       [ 0.28491901,  1.12064915],
       [-0.16344761,  1.15662166],
       [ 0.56514816,  1.40037809],
       [ 0.56514816,  1.51356159],
       [ 0.8453773 ,  1.37837799],
       [ 0.8453773 ,  1.39653978],
       [ 0.59317107,  1.5592001 ],
       [ 0.70526273,  1.28789673],
       [ 0.49509087,  1.44112509],
       [ 0.80334293,  1.64359623],
       [ 0.62119399,  1.53338508],
       [ 0.78933147,  1.52072332],
       [ 0.8453773 ,  1.7013816 ],
       [ 0.91543458,  1.72934981],
       [ 0.90142313,  1.72977109],
       [ 0.88460938,  1.76609466],
       [ 1.26572101,

In [83]:
reg2 = LinearRegression()
reg2.fit(x_scaled,y)

LinearRegression()

In [84]:
reg2_summary = pd.DataFrame([['Bias'],['R^2'],['Weight'],['Height']], columns=["Features"])
reg2_summary['Weights'] = reg2.intercept_,reg2.score(x_scaled,y),reg2.coef_[0],reg2.coef_[1]
reg2_summary

Unnamed: 0,Features,Weights
0,Bias,4.417486
1,R^2,0.833701
2,Weight,1.103728
3,Height,0.532952


In [85]:
reg2.predict(x_scaled)

array([4.25192044, 4.52012747, 4.66202808, 4.77709763, 4.9486526 ,
       5.15500374, 5.38163609, 4.85312374, 5.20520914, 5.38751106,
       5.31470222, 5.40557253, 5.32921043, 4.85350711, 5.78758923,
       5.84791055, 6.08516107, 6.09484042, 5.90316335, 5.88229052,
       5.7319812 , 6.18011502, 5.92033488, 6.09916487, 6.25730635,
       6.34953625, 6.33429593, 6.33509683, 6.80221637, 7.5240365 ,
       7.16183137, 7.27596022, 7.26676785, 7.40659787, 7.20305541,
       2.70750263, 2.94077992, 3.00320535, 3.03615615, 3.21298962,
       2.87416446, 3.17603528, 3.2003416 , 3.25445066, 3.34244013,
       3.43866585, 3.31651011, 3.35965939, 3.52899426, 3.42702774,
       3.60771288, 3.50716106, 4.07068622, 3.97649496, 4.45584621,
       3.94690894, 3.91759647, 4.10785   , 5.07683145, 6.00788254,
       6.70041845, 3.09071615, 3.07246541, 3.26853414, 3.48443557,
       3.63966641, 3.56443124, 3.76429188, 3.7294271 , 3.97555321,
       4.2939174 , 4.41235304, 2.34817394, 2.60552364, 2.66718

#  Making a new data for testing the model

In [87]:
newdata

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [95]:
newdata1 = pd.DataFrame(columns=['Weight'])
newdata1['Weight'] = 242, 332,441, 232,133
newdata1['Height'] = 11,12,12.4,11.9,11.4
newdata1 = scaler.transform(newdata1)

In [96]:

reg2.predict(newdata1)

array([4.1870588 , 4.59015986, 4.97718696, 4.26838964, 3.89981883])

#  Croping a table

In [97]:
data

Unnamed: 0,x,y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4
...,...,...
58,9,87.4
59,31,209.8
60,14,95.5
61,53,244.6


In [107]:
data1 = data[:,x]

TypeError: '(slice(None, None, None),      Weight   Height
0     242.0  11.5200
1     290.0  12.4800
2     340.0  12.3778
3     363.0  12.7300
4     430.0  12.4440
..      ...      ...
154    12.2   2.0904
155    13.4   2.4300
156    12.2   2.2770
157    19.7   2.8728
158    19.9   2.9322

[159 rows x 2 columns])' is an invalid key

In [105]:
data1

Unnamed: 0,x,y
0,,
1,,
2,,
3,,
4,,
...,...,...
58,,
59,,
60,,
61,,
