# Import necessary modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import metrics

## Load DataSets

In [2]:
df = pd.read_csv("BostonHousing.csv")
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


## Checking for null and duplicate values

In [3]:
df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

In [4]:
df.duplicated().sum()

0

# Normalization

In [5]:
x = df.iloc[:, 0:-1].values
y = df.iloc[:, -1].values.reshape(-1, 1)

In [7]:
scaler_x = preprocessing.StandardScaler().fit(x)
scaler_y = preprocessing.StandardScaler().fit(y)

In [8]:
norm_x = scaler_x.transform(x)
norm_y = scaler_y.transform(y)

# Spliting Data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(norm_x, norm_y, 
                                                    test_size=0.3,
                                                   shuffle=False)

# Training Data

In [10]:
reg = LinearRegression()
model = reg.fit(X_train, y_train)

## Regression Line

In [38]:
line = reg.coef_*norm_x + reg.intercept_

# Predictions

In [15]:
print(X_test) 
y_pred = reg.predict(X_test) 

[[-0.4155122   2.94584308 -1.34627324 ...  1.63882832  0.28645448
  -0.64522879]
 [-0.40811319  2.94584308 -1.34627324 ...  1.63882832  0.21233542
  -0.99285977]
 [ 0.62485868 -0.48772236  1.01599907 ...  0.80657583  0.23086518
   0.69343084]
 ...
 [-0.41344658 -0.48772236  0.11573841 ...  1.17646583  0.44105193
  -0.98304761]
 [-0.40776407 -0.48772236  0.11573841 ...  1.17646583  0.4032249
  -0.86530163]
 [-0.41500016 -0.48772236  0.11573841 ...  1.17646583  0.44105193
  -0.66905833]]


# Errors

In [25]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 1.6753765591275767
MSE: 6.478586022406224
RMSE: 2.5453066656900547


In [34]:
y_pred

array([[-1.08801479e+00],
       [-7.78383388e-01],
       [ 6.61606505e-01],
       [ 1.82752236e-01],
       [ 1.45086631e-01],
       [-7.76235970e-02],
       [ 2.38815763e-01],
       [-1.09061977e-01],
       [-9.39094987e-01],
       [-3.70906596e-01],
       [ 2.70109432e+00],
       [-2.49983214e+00],
       [-1.37850494e+00],
       [-1.34054822e+00],
       [-9.02594069e-01],
       [ 1.02476484e+00],
       [ 1.53117156e+00],
       [ 8.98618562e-01],
       [ 4.98811065e-01],
       [-4.01057220e-01],
       [-1.60386750e-01],
       [ 3.43579321e+00],
       [ 2.01759209e+00],
       [ 1.44275299e+00],
       [ 2.96206977e+00],
       [ 1.98363948e+00],
       [ 1.28605351e+01],
       [ 2.03038441e+00],
       [ 5.37559820e-03],
       [-1.84581169e-01],
       [ 1.61773838e-01],
       [ 7.70973374e-01],
       [ 1.22488582e+00],
       [ 1.32961349e+00],
       [-4.32133458e-02],
       [-2.68533455e-01],
       [-8.72448418e-02],
       [ 1.99429226e-02],
       [-1.8