In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Let's load the Boston House Pricing dataset

In [2]:

# Download the raw Boston housing text file from the URL below.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regex, so it is written as a raw string: "\s" inside a
# plain string literal is an invalid escape sequence (DeprecationWarning on
# older Pythons, SyntaxWarning from Python 3.12).
# skiprows=22 drops the leading lines of the file before the numeric rows.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record is spread over two consecutive rows of raw_df: the even rows,
# joined with the first two values of the following odd row, form the feature
# matrix; the third value of each odd row is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

### Alternative data source: `from sklearn.datasets import fetch_california_housing`

In [14]:
# Column labels for the feature matrix, in the order the columns appear in `data`.
feature_columns = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
features_frame = pd.DataFrame(data, columns=feature_columns)
price_frame = pd.DataFrame(target, columns=['Price'])

# Put the target next to the features as the last column, then preview the result.
dataset = pd.concat([features_frame, price_frame], axis=1)
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Preparing The Dataset

In [None]:
# Write the target values into the 'Price' column. The column was already
# attached from the same `target` array when `dataset` was built, so this
# overwrites it with identical values.
dataset['Price']=target

In [None]:
# Preview the first rows of the frame.
dataset.head()

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
dataset.info()

In [None]:
## Summary statistics for each column
dataset.describe()

In [None]:
## Count the missing values in each column
dataset.isnull().sum()

In [None]:
### Exploratory Data Analysis
## Pairwise correlation between the columns of the frame
dataset.corr()

In [None]:
import seaborn as sns
# Pairwise relationship plots for all columns of the frame.
sns.pairplot(dataset)

## Analyzing The Correlated Features

In [None]:
# Correlation matrix again, shown here for reference while individual features are inspected.
dataset.corr()

In [None]:
# Price plotted against the CRIM column, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(dataset['CRIM'], dataset['Price'])
ax.set_xlabel("Crime Rate")
ax.set_ylabel("Price")

In [None]:
# Price plotted against the RM column, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(dataset['RM'], dataset['Price'])
ax.set_xlabel("RM")
ax.set_ylabel("Price")

In [None]:
import seaborn as sns
# Scatter of RM against Price with seaborn's fitted regression line.
sns.regplot(x="RM",y="Price",data=dataset)

In [None]:
# Scatter of LSTAT against Price with seaborn's fitted regression line.
sns.regplot(x="LSTAT",y="Price",data=dataset)

In [None]:
# Scatter of CHAS against Price with seaborn's fitted regression line.
sns.regplot(x="CHAS",y="Price",data=dataset)

In [None]:
# Scatter of PTRATIO against Price with seaborn's fitted regression line.
sns.regplot(x="PTRATIO",y="Price",data=dataset)

In [None]:
## Independent and dependent features
# Every column except the last is a predictor; the last column is the target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Display the target series.
y

In [None]:
## Train/test split
from sklearn.model_selection import train_test_split

# test_size=0.3 holds out 30% of the rows; random_state=42 fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the training features.
X_train

In [None]:
# Display the held-out test features.
X_test

In [None]:
## Standardize the dataset
from sklearn.preprocessing import StandardScaler
# Unfitted scaler; it is fitted on the training features in the next cell.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training features and replace X_train with the scaled values.
# NOTE(review): X_train is rebound to the scaled output, so running this cell a
# second time re-fits the scaler on already-scaled values. Re-run from the
# train/test split cell rather than re-executing this cell on its own.
X_train=scaler.fit_transform(X_train)

In [None]:
# Scale the test features with the already-fitted scaler (transform only, no re-fit).
# NOTE(review): X_test is overwritten with the scaled values, so re-running this
# cell would scale it a second time.
X_test=scaler.transform(X_test)

In [None]:
import pickle

# Persist the fitted scaler to scaling.pkl.
# The with-block closes (and flushes) the file as soon as the dump finishes,
# rather than leaving an anonymous open handle for the garbage collector.
with open('scaling.pkl','wb') as scaler_file:
    pickle.dump(scaler,scaler_file)

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with its default settings.
regression=LinearRegression()

In [None]:
# Fit the model on the scaled training features and the training targets.
regression.fit(X_train,y_train)

In [None]:
## Print the fitted coefficients (the intercept is printed in the next cell)
print(regression.coef_)

In [None]:
# Print the fitted intercept.
print(regression.intercept_)

In [None]:
## Constructor settings of the estimator (these are not the fitted coefficients)
regression.get_params()

In [None]:
### Prediction with test data
# Predict targets for the scaled test features.
reg_pred=regression.predict(X_test)

In [None]:
# Display the predicted values.
reg_pred

## Assumptions

In [None]:
## Scatter plot of the actual test targets (x) against the predictions (y)
plt.scatter(y_test,reg_pred)

In [None]:
## Residuals: actual test targets minus predictions
residuals=y_test-reg_pred

In [None]:
# Display the residuals.
residuals

In [None]:
## Plot the residuals as a kernel density estimate

sns.displot(residuals,kind="kde")

In [None]:
## Scatter plot of the predictions (x) against the residuals (y)
## NOTE(review): the original note here read "uniform distribution"; presumably the
## intent is to check that the residuals show no pattern across predictions. Confirm.
plt.scatter(reg_pred,residuals)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error metrics on the test split, printed in the order MAE, MSE, RMSE.
mae = mean_absolute_error(y_test,reg_pred)
mse = mean_squared_error(y_test,reg_pred)
for metric_value in (mae, mse, np.sqrt(mse)):
    print(metric_value)

## R square and adjusted R square


Formula:

$$R^2 = 1 - \frac{SSR}{SST}$$

where:

- $R^2$ = coefficient of determination
- $SSR$ = sum of squares of residuals
- $SST$ = total sum of squares


In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the test split.
score = r2_score(y_true=y_test, y_pred=reg_pred)
print(score)

$$\text{Adjusted } R^2 = 1 - \frac{(1 - R^2)(n - 1)}{n - k - 1}$$

where:

- $R^2$: the $R^2$ of the model
- $n$: the number of observations
- $k$: the number of predictor variables

In [None]:
# Adjusted R-squared on the test split
n_obs = len(y_test)             # n: number of observations
n_predictors = X_test.shape[1]  # k: number of predictor variables
1 - (1-score)*(n_obs-1)/(n_obs-n_predictors-1)

## New Data Prediction

In [None]:
# First row of the raw feature matrix, reshaped to (1, n_features), i.e. one
# sample in 2-D layout. `boston` is never defined in this notebook; `data` is
# the feature array built when the file was loaded.
data[0].reshape(1,-1)

In [None]:
## Transformation of new data
# Scale the first raw sample with the scaler fitted on the training split.
# `boston` is never defined in this notebook; `data` is the feature array
# built when the file was loaded.
scaler.transform(data[0].reshape(1,-1))

In [None]:
# Scale the first raw sample and predict its price with the fitted model.
# `boston` is never defined in this notebook; `data` is the feature array
# built when the file was loaded.
regression.predict(scaler.transform(data[0].reshape(1,-1)))

## Pickling The Model file For Deployment

In [None]:
import pickle

In [None]:
# Persist the fitted regression model to regmodel.pkl.
# The with-block closes (and flushes) the file as soon as the dump finishes.
with open('regmodel.pkl','wb') as model_file:
    pickle.dump(regression,model_file)

In [None]:
## Prediction
# Load the model back from regmodel.pkl (written earlier in this notebook) so
# that `pickled_model` exists on a fresh kernel. Only unpickle files you trust.
with open('regmodel.pkl','rb') as model_file:
    pickled_model=pickle.load(model_file)
# `boston` is never defined in this notebook; `data` is the feature array
# built when the file was loaded.
pickled_model.predict(scaler.transform(data[0].reshape(1,-1)))

In [None]:
# Preview the first rows of the frame once more.
dataset.head()