In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# For linear regression, we will need
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In this lab, we will use the Boston house prices dataset. You can load the dataset by running the cell below.

Here are your tasks:
1. Basic EDA
2.a. Build a simple linear regression model (using only one feature) to predict the house price, <br>
b. determine your model's applicability (R^2) and accuracy (MSE) both for training and testing, and <br>
c. Visualize ground-truth vs. prediction.
3. a. Build a linear regression model with two features to predict the house price, <br>
b. determine your model's applicability (R^2) and accuracy (MSE) both for training and testing, and <br>
c. Visualize ground-truth vs. prediction.
4. Repeat task-3(a) with linear algebra

Use 80/20 train-test split.

In [None]:
# Fetch the Boston housing data that ships with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the environment pins an older release before relying on this cell.
from sklearn.datasets import load_boston
boston_dataset = load_boston()

# Build the feature frame and append the target as a `MEDV` column;
# `boston_dataset.data` carries the feature columns only.
boston = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
).assign(MEDV=boston_dataset.target)
# Dataset overview:
# https://scikit-learn.org/stable/datasets/toy_dataset.html

In [None]:
# List the fields exposed by the loaded dataset object
print(boston_dataset.keys())

In [None]:
# Print the description text bundled with the dataset
print(boston_dataset.DESCR)

In [None]:
# The data already lives in the `boston` DataFrame (built with pd.DataFrame); preview the first rows
boston.head()


In [None]:
# get the basic statistics (count, mean, std, min, quartiles, max) for every column
boston.describe()

In [None]:
# Histogram of the target column MEDV on a 12x8 canvas, 30 bins
sns.set(rc={'figure.figsize': (12, 8)})
sns.histplot(data=boston, x='MEDV', bins=30)
plt.show()

In [None]:
# Pairwise column correlations, rounded to two decimals so the
# annotated heatmap stays readable
correlation_matrix = boston.corr()
correlation_matrix = correlation_matrix.round(2)
sns.heatmap(correlation_matrix, annot=True)

Build a simple linear regression model (x = one of the features, y = medv)

In [None]:
# Single-feature setup: RM is the only predictor, MEDV the response.
# Both are reshaped into (n_samples, 1) column arrays.
X = boston['RM'].to_numpy().reshape(-1, 1)
Y = boston['MEDV'].to_numpy().reshape(-1, 1)

# 80/20 train-test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=17
)

# Report the shape of each split
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

In [None]:
# Fit a linear regression on the single-feature training split
simple_lin_model = LinearRegression()
simple_lin_model.fit(X=X_train, y=Y_train)

In [None]:
# Training-set performance of the single-feature model:
# RMSE (square root of the mean squared error) and R^2.
y_train_predict = simple_lin_model.predict(X_train)
train_mse = mean_squared_error(Y_train, y_train_predict)
rmse = np.sqrt(train_mse)
r2 = r2_score(Y_train, y_train_predict)

for report_line in (
    "The model performance for training set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
    "\n",
):
    print(report_line)

In [None]:
# Held-out performance of the single-feature model:
# RMSE (square root of the mean squared error) and R^2.
y_test_predict = simple_lin_model.predict(X_test)
test_mse = mean_squared_error(Y_test, y_test_predict)
rmse = np.sqrt(test_mse)
r2 = r2_score(Y_test, y_test_predict)

for report_line in (
    "The model performance for testing set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
):
    print(report_line)

In [None]:


# Task 2(c): visualise ground truth vs. prediction for the single-feature model.
# This cell previously repeated the testing-set metrics computed in the cell
# before it; the required visualisation was missing.
y_test_predict = simple_lin_model.predict(X_test)

plt.figure(figsize=(8, 8))
# Red y = x reference line: points on it are predicted exactly
linex = np.arange(-10, 61)
plt.plot(linex, linex, 'r')
plt.scatter(Y_test, y_test_predict)
plt.xlabel('Target')
plt.ylabel('Prediction')
plt.title('Testing (simple model)')
plt.show()

To fit a linear regression model, we select those features which have a high correlation with our target variable MEDV. By looking at the correlation matrix, we can see that RM has a strong positive correlation with MEDV (0.7), whereas LSTAT has a strong negative correlation with MEDV (-0.74).

In [None]:
# Side-by-side scatter plots of MEDV against each candidate feature
plt.figure(figsize=(16, 8))

features = ['LSTAT', 'RM']
target = boston['MEDV']

n_panels = len(features)
for panel, col in enumerate(features, start=1):
    # One panel per feature, laid out on a single row
    plt.subplot(1, n_panels, panel)
    plt.scatter(boston[col], target, marker='o')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('MEDV')

In [None]:
# Task 3(a): two-feature model.
# LSTAT and RM are the two columns picked from the correlation matrix
# (about -0.74 and 0.7 against MEDV). PTRATIO is deliberately left out here;
# it is only added in the later three-feature experiment.
X = boston[['LSTAT', 'RM']].copy()
Y = boston['MEDV']

In [None]:
# 80/20 train-test split with a fixed seed, then report each split's shape
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=17
)
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

In [None]:
# Fit a linear regression on the multi-feature training split
lin_model = LinearRegression()
lin_model.fit(X=X_train, y=Y_train)

In [None]:
# Training-set performance of the multi-feature model:
# RMSE (square root of the mean squared error) and R^2.
y_train_predict = lin_model.predict(X_train)
train_mse = mean_squared_error(Y_train, y_train_predict)
rmse = np.sqrt(train_mse)
r2 = r2_score(Y_train, y_train_predict)

for report_line in (
    "The model performance for training set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
    "\n",
):
    print(report_line)

In [None]:
# Held-out performance of the multi-feature model:
# RMSE (square root of the mean squared error) and R^2.
y_test_predict = lin_model.predict(X_test)
test_mse = mean_squared_error(Y_test, y_test_predict)
rmse = np.sqrt(test_mse)
r2 = r2_score(Y_test, y_test_predict)

for report_line in (
    "The model performance for testing set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
):
    print(report_line)

In [None]:
# Parity plot for the test split: points on the red y = x line are predicted exactly
fig, ax = plt.subplots(figsize=(8, 8))
linex = np.arange(-10, 61)
ax.plot(linex, linex, 'r')
ax.scatter(Y_test, y_test_predict)
ax.set_xlabel('Target')
ax.set_ylabel('Prediction')

In [None]:
# Three-feature experiment: LSTAT and RM plus PTRATIO as predictors,
# MEDV as the response.
three_feature_columns = ['LSTAT', 'RM', 'PTRATIO']
X = boston[three_feature_columns].copy()
Y = boston['MEDV']

In [None]:
# Full pipeline for the current X / Y: split, fit, score on both splits, plot.
# NOTE(review): this split uses random_state=5 while the earlier cells use 17,
# so the metrics are not computed on the same rows — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

# --- training-set metrics ---
y_train_predict = lin_model.predict(X_train)
train_mse = mean_squared_error(Y_train, y_train_predict)
rmse = np.sqrt(train_mse)
r2 = r2_score(Y_train, y_train_predict)

for report_line in (
    "The model performance for training set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
    "\n",
):
    print(report_line)

# --- testing-set metrics ---
y_test_predict = lin_model.predict(X_test)
test_mse = mean_squared_error(Y_test, y_test_predict)
rmse = np.sqrt(test_mse)
r2 = r2_score(Y_test, y_test_predict)

for report_line in (
    "The model performance for testing set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
):
    print(report_line)

# --- parity plot for the test split (red line is y = x) ---
fig, ax = plt.subplots(figsize=(8, 8))
linex = np.arange(-10, 61)
ax.plot(linex, linex, 'r')
ax.scatter(Y_test, y_test_predict)
ax.set_xlabel('Target')
ax.set_ylabel('Prediction')

## Linear Regression with Linear Algebra

### Training

In [None]:
from numpy.linalg import inv

# Normal-equation solution on the training split: beta = (X^T X)^{-1} X^T y
# NOTE(review): the design matrix has no column of ones, so no intercept is
# estimated here — confirm this is intended, since sklearn's LinearRegression
# fits an intercept by default and will not match these coefficients.
gram = np.dot(X_train.T, X_train)      # X^T X
moment = np.dot(X_train.T, Y_train)    # X^T y
beta = np.dot(inv(gram), moment)

# In-sample predictions from the fitted coefficients
Y_train_predict = np.dot(X_train, beta)
print(beta)

In [None]:
# Parity plot for the training split of the normal-equation model;
# points on the red y = x line are predicted exactly
fig, ax = plt.subplots(figsize=(8, 8))
linex = np.arange(-10, 61)
ax.plot(linex, linex, 'r')
ax.scatter(Y_train, Y_train_predict)
ax.set_xlabel('Target')
ax.set_ylabel('Prediction')
ax.set_title('Training')

In [None]:
# Training-set RMSE and R^2 for the normal-equation predictions
train_mse = mean_squared_error(Y_train, Y_train_predict)
rmse = np.sqrt(train_mse)
r2 = r2_score(Y_train, Y_train_predict)

for report_line in (
    "The model performance for TRAINING set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
):
    print(report_line)

### Testing

In [None]:
# Apply the fitted coefficient vector to the held-out features
Y_test_predict = np.dot(X_test, beta)

# Held-out RMSE and R^2 for the normal-equation predictions
test_mse = mean_squared_error(Y_test, Y_test_predict)
rmse = np.sqrt(test_mse)
r2 = r2_score(Y_test, Y_test_predict)

for report_line in (
    "The model performance for testing set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
):
    print(report_line)

In [None]:
# Parity plot for the test split of the normal-equation model;
# points on the red y = x line are predicted exactly
fig, ax = plt.subplots(figsize=(8, 8))
linex = np.arange(-10, 61)
ax.plot(linex, linex, 'r')
ax.scatter(Y_test, Y_test_predict)
ax.set_xlabel('Target')
ax.set_ylabel('Prediction')
ax.set_title('Testing')