## MyHome real estate Price Predictor


In [None]:
import pandas as pd


In [None]:
# Load the housing dataset from the Colab sample_data directory.
housing = pd.read_csv("/content/sample_data/data.csv")
# Preview the first rows to check that the columns were parsed as expected.
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
housing["CHAS"].value_counts()

0    471
1     35
Name: CHAS, dtype: int64

In [None]:
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284341,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.705587,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.884,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.208,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.625,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(20,15))
# plt.show() 

## Train test splitting



In [None]:
import numpy as np
# manual training and testing
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        The rows to split.
    test_ratio : float
        Fraction of the rows (0..1) that goes into the test set. The test set
        size is ``int(len(data) * test_ratio)``, i.e. rounded down.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Two disjoint subsets of ``data`` that together contain every row.
    """
    # Fixed seed so the same rows land in the test set on every run.
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    # Slice by the computed size (not a hard-coded row count) so test_ratio is honoured.
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# train_set, test_set = split_train_test(housing, 0.2)    

In [None]:
# Random train/test split using scikit-learn's built-in helper
# (20% of the rows go to the test set; random_state fixes the shuffle).
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)


# There is a problem with a purely random split. For example, CHAS has only two values,
# 0 and 1, with 471 zeros and 35 ones in the full dataset.
# Suppose the split gives 404 training rows and 102 test rows: if by chance none of the
# 1's land in the training data, the model learns the wrong pattern that
# there is only one possible value of CHAS, i.e. 0.

In [None]:
# To solve the problem above, stratify the split on CHAS so that the train and
# test sets keep the same proportion of CHAS=0 / CHAS=1 rows as the full dataset.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    # split() yields positional row numbers, so select with .iloc; .loc is
    # label-based and would only be correct while the index is the default 0..n-1.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
strat_test_set['CHAS'].value_counts()

0    95
1     7
Name: CHAS, dtype: int64

In [None]:
strat_train_set['CHAS'].value_counts()

0    376
1     28
Name: CHAS, dtype: int64

In [None]:
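# The 0-to-1 ratio of CHAS is preserved by the stratified split:
# 95/7 (about 13.6) in the test set ~= 376/28 (about 13.4) in the training set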

In [None]:
# Explore a copy of the training set only, so the steps below neither modify
# strat_train_set nor look at the test data.
housing = strat_train_set.copy()

## Looking for correlations

In [None]:
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.680857
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [None]:
# A value close to 1 means a strong positive correlation
# A value close to -1 means a strong negative correlation
# RM has a correlation of about 0.68 with MEDV, a fairly strong positive correlation: as RM
# increases, MEDV tends to increase. B and ZN are only weakly positively correlated with MEDV.
# Similarly, LSTAT has a strong negative correlation (-0.74): the lower LSTAT is, the higher MEDV tends to be.

In [None]:
# from pandas.plotting import scatter_matrix
# attributes = ["MEDV", "RM","ZN","LSTAT"]
# scatter_matrix(housing[attributes], figsize=(12,8))

In [None]:
# housing.plot(kind="scatter", x="RM",y="MEDV",alpha=0.8)

## Trying out Attribute combinations

In [None]:
# Try a derived attribute: TAX divided by RM. Any other combination can be tried the same way.
housing["TAXRM"]=housing["TAX"]/housing["RM"]
# Recompute the correlations so the new column can be compared with the original ones.
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)
# TAXRM is exploratory only: it is not added to the main training data for now.

MEDV       1.000000
RM         0.680857
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
TAXRM     -0.528626
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [None]:
# housing.plot(kind="scatter", x="TAXRM",y="MEDV",alpha=0.8)

In [None]:
# Separate the predictors from the target. housing is rebuilt from strat_train_set,
# so it holds the training features without MEDV (drop() returns a new DataFrame).
housing = strat_train_set.drop("MEDV", axis=1)
# The target values (MEDV) for the same training rows.
housing_labels = strat_train_set["MEDV"].copy()

## Missing attributes

In [None]:
# To take care of missing attributes, you have three options:
#     1. Get rid of the missing data points
#     2. Get rid of the whole attribute
#     3. Set the value to some value(0, mean or median)

In [None]:
# Option 1: drop the rows whose RM value is missing (returns a new DataFrame).
a=housing.dropna(subset=["RM"])
a.shape

(399, 13)

In [None]:
# Option 2: drop the RM attribute altogether.
housing.drop("RM", axis=1).shape
# The result has no RM column; drop() returns a new DataFrame, so the original housing dataframe is unchanged.

(404, 12)

In [None]:
# Option 3: compute the median of RM to use as the fill value for missing entries.
median = housing["RM"].median()
median

6.209

In [None]:
housing["RM"].fillna(median)
# fillna() returns a new Series here, so the original housing dataframe is unchanged.

254    6.108
348    6.635
476    6.484
321    6.376
326    6.312
       ...  
155    6.152
423    6.103
98     7.820
455    6.525
216    5.888
Name: RM, Length: 404, dtype: float64

In [None]:
housing.describe()    # before we started filling missing attributes

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,399.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.279481,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.716784,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.8765,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.209,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.6305,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


In [None]:
# Use SimpleImputer so that missing values are replaced by the median in a repeatable way:
# fit() learns the median of every column of the training features.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

SimpleImputer(strategy='median')

In [None]:
imputer.statistics_    # the median learned for every column (one value per feature)

array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.20900e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
       1.90000e+01, 3.90955e+02, 1.15700e+01])

In [None]:
X = imputer.transform(housing)

In [None]:
# transform() returned a plain NumPy array, so rebuild the DataFrame with the
# original column names AND row index; without index= the rows get a fresh
# 0..n-1 index and no longer line up with housing / housing_labels.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)

In [None]:
housing_tr.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.278609,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.712366,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.87875,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.209,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.63,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


## Scikit learn Design

Primarily, three types of objects

1) Estimators - An estimator estimates some parameters based on a dataset, 
e.g. the imputer, which has a fit() method and a transform() method. 
fit() - fits the dataset and calculates the internal parameters (for the imputer, the median of each column).

2) Transformers - transform method takes input and returns output based on the learnings from fit(). 
It also has a convenience function called fit_transform() which fits and then transforms.

3) Predictors - LinearRegression model is an example of predictor. 
fit() and predict() are two common functions. It also gives score() function which will 
evaluate the predictions.

## Feature Scaling


Primarily, two types of feature scaling methods:

1) Min-max scaling (Normalization) (value - min)/(max - min) 
Sklearn provides a class called MinMaxScaler for this

2) Standardization (value - mean)/std 
Sklearn provides a class called StandardScaler for this

## Creating a pipeline

In [None]:
# Instead of calling the imputer by hand, a Pipeline chains the preprocessing steps and runs them in order.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    # Step 1: fill missing values with the column median.
    ('imputer', SimpleImputer(strategy='median')),
#     add as many steps as you want...
    # Step 2: standardize each feature, (value - mean) / std.
    ('std_scaler',StandardScaler())
])


In [None]:
# Fit the pipeline on the training features and transform them in one step.
housing_num_tr = my_pipeline.fit_transform(housing)
housing_num_tr    # this is a numpy array

array([[-0.43942006,  3.12628155, -1.12165014, ..., -0.97491834,
         0.41164221, -0.86091034],
       [-0.44352175,  3.12628155, -1.35893781, ..., -0.69277865,
         0.39131918, -0.94116739],
       [ 0.15682292, -0.4898311 ,  0.98336806, ...,  0.81196637,
         0.44624347,  0.81480158],
       ...,
       [-0.43525657, -0.4898311 , -1.23083158, ..., -0.22254583,
         0.41831233, -1.27603303],
       [ 0.14210728, -0.4898311 ,  0.98336806, ...,  0.81196637,
        -3.15239177,  0.73869575],
       [-0.43974024, -0.4898311 ,  0.37049623, ..., -0.97491834,
         0.41070422,  0.09940681]])

## Selecting a desired model for the problem

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Alternatives that were tried:
# model = LinearRegression()
# model = DecisionTreeRegressor()    # this model is bad because training mse=0, which means it is overfitting
# Fix random_state so the forest, and every score reported further down, is
# reproducible when the notebook is re-run from a fresh kernel.
model = RandomForestRegressor(random_state=42)
# Train on the pipeline-transformed (imputed + standardized) training features.
model.fit(housing_num_tr, housing_labels)
housing_num_tr.shape

(404, 13)

In [None]:
some_data = housing.iloc[:5]

In [None]:
some_labels = housing_labels.iloc[:5]

In [None]:
prepared_data = my_pipeline.transform(some_data)

In [None]:
model.predict(prepared_data)

array([22.33 , 25.294, 16.593, 23.293, 23.488])

In [None]:
np.array(some_labels)

array([21.9, 24.5, 16.7, 23.1, 23. ])

## Evaluating the model

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the same data the model was trained on, so this RMSE measures
# training error only and is an optimistic estimate.
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
# Take the square root so the error is in the same units as MEDV.
rmse = np.sqrt(mse)

In [None]:
rmse

1.199907896630419

## Using better evaluation techniques - Cross Validation

In [None]:
# 10-fold cross-validation (cv=10): the training data is split into 10 folds; the model is
# trained on 9 of them and evaluated on the remaining one, repeated so that every fold is used
# once for evaluation. This gives 10 scores.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
# The scoring is the negated MSE, so flip the sign before taking the square root to get RMSE.
rmse_scores = np.sqrt(-scores)

In [None]:
rmse_scores

array([2.84260562, 3.06814212, 4.32038672, 2.47611691, 3.3029497 ,
       2.64951368, 4.69987782, 3.28315354, 3.37916056, 3.16125513])

In [None]:
def print_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy array).
    """
    print("Scores:", scores)
    # Each summary statistic is looked up by method name and printed in turn.
    for label, stat_name in (("Mean: ", "mean"), ("Standard deviation: ", "std")):
        print(label, getattr(scores, stat_name)())

In [None]:
print_scores(rmse_scores)

Scores: [2.84260562 3.06814212 4.32038672 2.47611691 3.3029497  2.64951368
 4.69987782 3.28315354 3.37916056 3.16125513]
Mean:  3.3183161813225426
Standard deviation:  0.6624337459128998


## Saving the model

In [None]:
from joblib import dump, load
# Persist the trained model to disk. Only the model is saved here, not my_pipeline,
# so anything loading this file must supply already-preprocessed features.
dump(model, 'Dragon.joblib') 

['Dragon.joblib']

## Testing the model on test data

In [None]:
# Final evaluation on the held-out stratified test set.
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
# Use transform(), not fit_transform(): the test data must be imputed and scaled with the
# statistics already learned from the training data.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# Print the predictions next to the true values for a side-by-side look.
print(final_predictions, list(Y_test))

[24.686 11.563 26.005 22.434 18.696 14.932 19.647 14.857 31.88  41.08
 19.559 11.463 24.812 29.051 19.597 11.171 31.735 14.312 23.652 18.078
 19.703 18.211 17.072 22.122 18.138 31.485 16.22  32.836  9.065 34.001
 23.59  21.346 22.951 11.335 20.886 11.23  43.126 24.733 23.296 41.52
 24.075 29.629 20.468 21.316 19.19  33.446 43.916 20.101 20.026 21.727
 20.966 14.064 20.822 15.201 25.297 32.97  41.682 29.021 19.353 20.72
 46.768 10.22  18.977 25.341 14.693 33.412 19.95  17.73  19.161 34.786
 26.045 22.671 21.54  21.976 35.653 13.005 16.161 19.963 21.052 21.435
 22.495 20.806 14.236 22.706 20.517 20.927 14.587 21.197 21.351 23.442
 18.835 26.716  6.945 26.623 18.734 29.885 19.496 31.788 14.47  27.906
 21.752 20.605] [16.5, 10.2, 30.1, 23.0, 14.4, 15.6, 19.4, 14.1, 30.3, 35.2, 23.1, 13.8, 25.0, 27.9, 19.5, 12.3, 32.2, 13.5, 23.8, 21.7, 19.2, 19.5, 10.4, 23.2, 18.6, 28.5, 15.2, 32.0, 7.2, 34.6, 20.1, 20.6, 23.6, 13.1, 23.8, 12.7, 43.1, 24.7, 22.2, 44.0, 28.1, 31.0, 21.7, 23.4, 19.5, 33.1, 4

In [None]:
final_rmse

2.9217454358921926

In [None]:
prepared_data[0]

array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.23979304, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

## Using the model

In [None]:
# Reload the persisted model and predict for a single sample.
from joblib import dump, load
import numpy as np
model = load('Dragon.joblib') 
# One row of 13 feature values. The model was fit on pipeline-transformed (imputed + standardized)
# features, so the inputs here must already be in that scaled form.
# NOTE(review): these numbers look like a hand-edited variant of prepared_data[0] rather than
# real preprocessed data - confirm before relying on the prediction.
features = np.array([[-5.43942006, 4.12628155, -1.6165014, -0.67288841, -1.42262747,
       -11.44443979304, -7.31238772,  7.61111401, -26.0016879 , -0.5778192 ,
       -0.97491834,  0.41164221, -66.86091034]])
print(model.predict(features))

[24.554]
