In [25]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Reading Data from CSV

In [2]:
housing_data = pd.read_csv('boston.csv')

In [3]:
housing_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


### Separating features and target set

In [8]:
features = housing_data.drop('MEDV', axis = 1)
target = housing_data['MEDV']

In [11]:
features.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [12]:
target.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64

### Normalizing feature set

In [13]:
scaler = StandardScaler()
s_feature = scaler.fit_transform(features)

In [14]:
pd.DataFrame(s_feature, columns= features.columns)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.284830,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459000,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.557160,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.557160,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.416750,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.511180,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,-0.413229,-0.487722,0.115738,-0.272599,0.158124,0.439316,0.018673,-0.625796,-0.982843,-0.803212,1.176466,0.387217,-0.418147
502,-0.415249,-0.487722,0.115738,-0.272599,0.158124,-0.234548,0.288933,-0.716639,-0.982843,-0.803212,1.176466,0.441052,-0.500850
503,-0.413447,-0.487722,0.115738,-0.272599,0.158124,0.984960,0.797449,-0.773684,-0.982843,-0.803212,1.176466,0.441052,-0.983048
504,-0.407764,-0.487722,0.115738,-0.272599,0.158124,0.725672,0.736996,-0.668437,-0.982843,-0.803212,1.176466,0.403225,-0.865302


### Splitting data for train and test

In [15]:
506 * 0.8

404.8

In [22]:
#features.loc[404:,:] test set

In [23]:
#features.loc[:404,:] train set

In [27]:
X_train, X_test, y_train, y_test = train_test_split(s_feature, target, test_size=0.2, random_state=969)

In [28]:
X_train.shape

(404, 13)

In [29]:
X_test.shape

(102, 13)

### Training a Regression Model

In [31]:
lin_regress = LinearRegression()
lin_regress.fit(X_train, y_train)

In [32]:
lin_regress.coef_

array([-0.67166291,  1.31941242,  0.03894124,  0.64868797, -1.85655234,
        2.19866836,  0.03547213, -2.97939655,  2.70726066, -2.10182682,
       -1.95808435,  0.85870616, -3.86913888])

### Performance Metrics

In [33]:
y_pred = lin_regress.predict(X_test)

In [34]:
y_pred

array([26.28463364, 38.47366825, 20.74934618, 15.09849104, 24.77465821,
       18.67049821, 16.31930964, 36.24838398, 20.16753963, 20.59718525,
       24.43387967, 23.5660649 , 24.42444542, 25.14447954, 25.36531389,
       23.97084773, 30.92379427, 23.98182094, 22.51663352, 19.27438329,
       21.422279  , 18.9366784 ,  7.93025442, 32.40702194, 22.48277856,
       22.83412843, 20.19537548, 23.17185156, 14.23322551, 24.28277002,
       28.73647003, 27.40835754, 17.94538496, 22.27587514, 13.00744708,
       25.694156  , 23.10977327, 16.83125371, 37.39222326, 25.06427209,
       24.46566553, 22.29515405, 32.39104062, 16.66690007, 22.19343908,
       41.67021601, 15.64131253, 37.68334085, 12.47918694, 10.35032785,
       34.67478214, 22.20984005, 28.09093826, 33.12279828, 24.11702659,
       18.4154556 , 21.55758537, 18.61622415, 35.81511694, 18.4206776 ,
       27.5541539 , 20.8411155 , 18.8068829 , 17.78863762, 18.45024216,
       22.20203272, 11.61547256, 44.01153436, 31.87221046, 24.12

In [35]:
mean_absolute_error(y_test, y_pred)

np.float64(3.5172180276230005)

In [36]:
mean_squared_error(y_test, y_pred)

np.float64(22.318202687677317)

In [37]:
r2_score(y_test, y_pred)

0.7805853359799123