In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

# Import data from sklearn 

In [None]:
from sklearn.datasets import load_boston
"""Our dataset is boston city in Massachusetts famously know for boston tea party
    With the help of data you have to predict the price of house in Boston
    
"""

In [None]:
boston = load_boston()

In [None]:
# type(boston) #type is sklearn bunch => dict keys & values

boston.keys()

In [None]:
# Lets check the key:DESCR values
print(boston['DESCR'])

In [None]:
print(boston.data)

# Since data is nested loop understand it and try to find first element
# print(boston.data[0]) 
# This gives single list having 13 features values we total have 506 such list

In [None]:
# This to check price our goal is to find price i.e. target here to similar to our calculated 
print(boston.target)


In [None]:
print(boston.feature_names)
boston.feature_names[0]

# Prepare the dataset 
- It's the 3rd step in 7 ML steps, sometimes it's also reffered as EDA **Exploratory Data Analysis**

In [None]:
dataset = pd.DataFrame(boston.data)

#Check what is DataFrame and various method's of it 

In [None]:
dataset

In [None]:
dataset.head()

#Gives only first 5 data

In [None]:
# Q.Can we change our column number to their names? 
# A. Our DataFrame has parameter column in which we can pass our required argument to set as column name

dataset = pd.DataFrame(boston.data,columns=boston.feature_names)
dataset.head()

In [None]:
# Q. Can we add output dependent variable in dataframe?
# A. You can it's same as adding key with it's values but size should same

dataset['Price'] = boston.target
dataset.head()

In [None]:
dataset.info()

# This gives us clear understanding of each column and row are any null(blank) values present

In [None]:
dataset.describe()

# Finding out stats measures of our dat

In [None]:
dataset.isnull()
# dataset.isnull().sum()

"""Whenever you get dataset check whether it's have missing value the info just give idea is there present null or not
But to find exactly where null value is present you should use isnull() method 
dataset.isnull().sum() just helps in counting and displaying the null 
""" 

In [None]:
### Exploratory Data Analysis:- ###

"""

############
Cor-relation When you are dealing with regression problem try to find correlation 
############          
1. Between Independent features aka Multicollinearity: How & which independent features are inter-related with each other
2. Between Independent and Output features: Which independent features are related with output and what factor

"""

dataset.corr()
# Move cursor over corr() see it's parameter pearson is most powerful way to find correlation
# Values ranges between -1,0,1 {-1: Highly -ve dependent, 0: No dependence, 1: Highly postively dependent}


In [None]:
# Since we are not able to properly figure out correlation we are going to plot various graph to get better vizualization

# import seaborn as sns
# sns.pairplot(dataset)

In [None]:
#Since it's difficult to understand from graph we can create some individual graphs of Independent vs Dependent. Later you can try to find Multicollinearity i.e. releation between independent

fig,axs = plt.subplots(4,4,figsize=(12, 9),layout="constrained")
for count,ax in enumerate(axs.flat):
    if count<13:
        ax.scatter(dataset[boston.feature_names[count]],dataset['Price'])
        ax.set_xlabel(boston.feature_names[count])
        ax.set_ylabel("Price")
    else:
        break
fig.get_layout_engine().set(w_pad=4 / 72, h_pad=4 / 72, hspace=0,wspace=0)


In [None]:
# Since with the help of plot we are ablt to find independent vs dependent correlation i.e. RM & LSTAT has releation with Price.
# We also have some seaborn features that help in visualization how a ideal regression pattern should happen between features.

import seaborn as sns
sns.regplot(x="RM",y='Price',data=dataset)

In [None]:
import seaborn as sns
sns.regplot(x="LSTAT",y='Price',data=dataset)

# This both plot show how is regression pattern between them!

In [None]:
""" 
     Now we got idea about features that have high impact on dependent output!
     You can start with finding best model for our problem
"""

X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]

""" 
     iloc is integer location [---,----] row,column
    Since we are creating model for input we will not include output in that   
"""

X.head()

In [None]:
#Splitting Training and testing data 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=3)
X_train

In [None]:
# Standardize the data i.e. feature scaling it's name as Standardize data which means normalize the range of independent variables or features of data
# Standardize scaler just change the features to same range is 1 feature CRIM in 0-40 and RAD changes from 2-30 and TAX changes from 0-1000 scaler just bring all to same scale 0 to 1 or -1 to 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)
# Here we not applied fit_transform because we don't want our model to learn from the test data also it's should unaware and unseen from it.
# If we used fit_transform model finds knowledge from that data i.e. it will learn from testing data also. 

In [None]:
print(X_train.min())
print(X_train.max())

# Q. Why we are normalizing the data in linear regression?
# A. Since we are using gradient descent in linear regression it will help to find global minima, Thus by normalizing we are setting same scale range for all features.

# Model Buidling

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
regression = LinearRegression()

In [None]:
regression.fit(X_train,y_train)

In [None]:
# Print the coefficients and intercepts


"""
Eqn of line is y = mx + c  <Slope is coefficient,wegiht) <y-intercept is called intercept,bias>
Weight(Coefficient) := In machine learning linear regression model have weight attach to each features | independent variables 
Bias(Intercept)     := Bias or intercept is point where linear line intercept at y-axis
"""


print(regression.coef_)
print(regression.intercept_)

In [None]:
# On which parameters of regrssion class the model has trained is find out by

regression.get_params()

In [None]:
# Prediction made by our model on test data

reg_pred = regression.predict(X_test)
reg_pred

In [None]:
# Now checking how our modelprediction is performing w.r.t y_test

"""
We plotting graph between truth values i.e. y_test for predicted values if that gives straight line means model perform well
When x & y both are same you get straight line.
"""

plt.scatter(y_test,reg_pred)



In [None]:
# Plot between residuals in prediction
# Residual = actual value — predicted value
# e = y — ŷ


residuals = y_test - reg_pred


In [None]:
# Plotting the residuals

""" 
Our data is normal distributed with some outilers after +10 in x-axis
"""

sns.displot(residuals,kind='kde')
sns.displot(residuals,kind='hist')

In [None]:
# Scatter plot wrt prediction and residuals

"""
This plot show our residuals i.e. our error are uniformly distributed 
"""
plt.scatter(reg_pred,residuals)


# Performance measures | Evaluation metric in Linear Regression

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

print("Absoulte Mean Error:-",mean_absolute_error(y_test,reg_pred))
print("Mean Square Error:-",mean_squared_error(y_test,reg_pred))
print("Root Mean Square Error:-",np.sqrt(mean_squared_error(y_test,reg_pred)))


## Rsquare and adjusted Rsquare

- Formula

**R^2 = 1 - SSR/SST**

*R^2 = coefficient of determination SSR = sum of square of residuals SST = total sum of squares*

In [None]:
from sklearn.metrics import r2_score
score = r2_score(y_test,reg_pred)
print(score)

## Adjusted Rsquare

- Formula

**R2 = 1 - [(1-R^2)*(n-1)/(n-k-1)]**

where:

*R2 : The R2 of the model n: The number of observations k: The number of prediction*

In [None]:
1-(1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)

# Prediction on new data


In [None]:
print(boston.data[0],"Shpae:",boston.data[0].shape)
print(boston.data[0].reshape(1,-1),"Shape:",boston.data[0].reshape(1,-1).shape)
print(boston.data.shape)

In [None]:
print(boston.data[0].shape)
# We need to transform new prediction data so that it's calculate as per our standardize way
"""
Q. Why we reshape our single input
A. Boston is our dataset with shape (506,13) means 1 single input is of shape (1,13) but when we retrive it is in single list format
    i.e. (13,) a simple list containig 13 elements so we reshape it to make (1,13) 1 row with 13 columns
"""

print(boston.data[0].reshape(1,-1).shape)


In [None]:
# Here we transform the data to make it of same range

print(scaler.transform(boston.data[0].reshape(1,-1)))



In [None]:
# Here we predicting only single input
print(regression.predict(scaler.transform(boston.data[0].reshape(1,-1))))


# Prediction on user input
- Read the commets below

In [80]:
#Note here we are testing on unseen data
# Provide your testing input here which is simple array of size 13 for eg. it will look like user_input = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13])

user_input = np.array([])  #give input data inside [] brackets 
scaler.transform(user_input.reshape(1,-1))
print(regression.predict(scaler.transform(user_input.reshape(1,-1)))) 



ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by StandardScaler.