### Model Evaluation and Bias Variance Tradeoff

In [332]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt


### Step-1: Load Dataset

In [333]:
Root_path = 'C:\\Users\\mdmes\\OneDrive\\Desktop\\Pandas Data'
dataset_path = os.path.join(Root_path, 'housing_data.csv')

house_data = pd.read_csv(
    dataset_path
)

house_data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


### Step-2: Clean data and Separate Number and Category colum

In [334]:
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [335]:
house_data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [336]:
house_data = house_data[[
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus','price'
]]

house_data.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,13300000
1,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,12250000
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,12250000
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,12215000
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,11410000


In [337]:
Numarical_colums = house_data.select_dtypes(include='number').columns
Numarical_colums

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price'], dtype='object')

In [338]:
Categorical_colums = house_data.select_dtypes(include='object').columns
Categorical_colums

Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')

### Step-3: Starndarize & Encoding for Number and Category

In [339]:
Number_mean = house_data[Numarical_colums].mean()
Number_mean

area         5.150541e+03
bedrooms     2.965138e+00
bathrooms    1.286239e+00
stories      1.805505e+00
parking      6.935780e-01
price        4.766729e+06
dtype: float64

In [340]:
Number_std = house_data[Numarical_colums].std()
Number_std

area         2.170141e+03
bedrooms     7.380639e-01
bathrooms    5.024696e-01
stories      8.674925e-01
parking      8.615858e-01
price        1.870440e+06
dtype: float64

In [341]:
house_data[Numarical_colums] = (house_data[Numarical_colums] - Number_mean) / Number_std

house_data.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,1.045766,1.402131,1.420507,1.376952,yes,no,no,no,yes,1.516299,yes,furnished,4.562174
1,1.755397,1.402131,5.400847,2.5297,yes,no,no,no,yes,2.67695,no,furnished,4.000809
2,2.216196,0.047235,1.420507,0.224204,yes,no,yes,no,no,1.516299,yes,semi-furnished,4.000809
3,1.08263,1.402131,1.420507,0.224204,yes,no,yes,no,yes,2.67695,yes,furnished,3.982096
4,1.045766,1.402131,-0.569663,0.224204,yes,yes,yes,no,yes,1.516299,no,furnished,3.551716


In [342]:
house_data[Categorical_colums] = house_data[Categorical_colums].apply(
    lambda colum : pd.Categorical(colum).codes
)

house_data.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,1.045766,1.402131,1.420507,1.376952,1,0,0,0,1,1.516299,1,0,4.562174
1,1.755397,1.402131,5.400847,2.5297,1,0,0,0,1,2.67695,0,0,4.000809
2,2.216196,0.047235,1.420507,0.224204,1,0,1,0,0,1.516299,1,1,4.000809
3,1.08263,1.402131,1.420507,0.224204,1,0,1,0,1,2.67695,1,0,3.982096
4,1.045766,1.402131,-0.569663,0.224204,1,1,1,0,1,1.516299,0,0,3.551716


### Step-4: Split dataset by train, test & validation

In [343]:
seed =50
np.random.seed(seed)


In [344]:
def split_dataset(House_Dataset, Train_ratio=0.6, Test_Val_ratio=0.2):
    length_dataset = len(House_Dataset)
    select_data = np.random.permutation(length_dataset)

    Train_set = int(length_dataset * Train_ratio)
    Val_set = int(length_dataset * Test_Val_ratio)

    Train_indices = select_data[:Train_set]
    Val_indices = select_data[Train_set:Train_set+Val_set]
    Test_indices = select_data[Train_set + Val_set:]


    Train_ds = House_Dataset.iloc[Train_indices]
    Val_ds = House_Dataset.iloc[Val_indices]
    Test_ds = House_Dataset.iloc[Test_indices]

    Train_X = Train_ds.iloc[:,:-1]
    Train_Y = Train_ds.iloc[:,-1]

    Val_X = Val_ds.iloc[:,:-1]
    Val_Y = Val_ds.iloc[:,-1]


    Test_X = Test_ds.iloc[:,:-1]
    Test_Y = Test_ds.iloc[:,-1]

    return Train_X, Train_Y, Val_X, Val_Y, Test_X, Test_Y

Train_X, Train_Y, Val_X, Val_Y, Test_X, Test_Y = split_dataset(house_data)

In [345]:
print(len(Train_X))
print(len(Val_X))
print(len(Test_X))

327
109
109


### Step-5: Randomly initialize perameters

In [346]:
def predict_house_price(x,w,b):
    mse = np.dot(x, w) + b
    return mse

In [347]:
w = np.random.randint(100,200, size=Train_X.columns.size)
b = np.random.randint(100,200)

w

array([176, 114, 144, 139, 143, 170, 196, 120, 140, 101, 155, 116],
      dtype=int32)

### Step-6: Calculate cost/loss of dataset

In [348]:
def cost_function(X, y_true, w, b):
    y_pred = predict_house_price(X,w,b)
    mse = np.mean((y_true - y_pred)**2)
    return mse


mse = cost_function(Train_X, Train_Y, w, b)
mse

np.float64(653050.6196785802)

### Step-7: Gradeint Descent


In [349]:
def gradeint_descent(X, x_true, w, b):
    delta = 1e-9

    cost1 = cost_function(X, x_true, w, b)
    cost2 = cost_function(X, x_true, w+delta, b)
    cost3 = cost_function(X,x_true,w, b+delta)

    dw = (cost2 - cost1) / delta
    db = (cost3 - cost1) / delta

    return dw, db


In [350]:

def train_Model(Train_X, Train_Y, Val_X, Val_Y, epochs=100000, learning_rate =0.0001):
    w = np.zeros(Train_X.columns.size)
    b = 0
    for epoch in range(epochs):
        loss = cost_function(Train_X, Train_Y, w, b)
        valid_loss = cost_function(Val_X, Val_Y, w, b)

        dw, db = gradeint_descent(Train_X, Train_Y, w, b)

        w = w - learning_rate*dw
        b = b - learning_rate*db

        if epoch % 1000 == 0:
           print(f'Epoch={epoch} / {epochs} loss = {loss:0.3f}, Validation loss = {valid_loss:0.3f}')
    return w, b

w, b = train_Model(Train_X, Train_Y, Val_X, Val_Y)

Epoch=0 / 100000 loss = 1.013, Validation loss = 0.934
Epoch=1000 / 100000 loss = 0.614, Validation loss = 0.650
Epoch=2000 / 100000 loss = 0.576, Validation loss = 0.603
Epoch=3000 / 100000 loss = 0.545, Validation loss = 0.565
Epoch=4000 / 100000 loss = 0.520, Validation loss = 0.533
Epoch=5000 / 100000 loss = 0.501, Validation loss = 0.508
Epoch=6000 / 100000 loss = 0.485, Validation loss = 0.487
Epoch=7000 / 100000 loss = 0.472, Validation loss = 0.470
Epoch=8000 / 100000 loss = 0.462, Validation loss = 0.456
Epoch=9000 / 100000 loss = 0.454, Validation loss = 0.445
Epoch=10000 / 100000 loss = 0.447, Validation loss = 0.435
Epoch=11000 / 100000 loss = 0.442, Validation loss = 0.428
Epoch=12000 / 100000 loss = 0.438, Validation loss = 0.421
Epoch=13000 / 100000 loss = 0.435, Validation loss = 0.416
Epoch=14000 / 100000 loss = 0.432, Validation loss = 0.412
Epoch=15000 / 100000 loss = 0.430, Validation loss = 0.408
Epoch=16000 / 100000 loss = 0.428, Validation loss = 0.405
Epoch=1700

In [351]:
print(w)
print(b)

[0.21416544 0.21416544 0.21416544 0.21416544 0.21416544 0.21416544
 0.21416544 0.21416544 0.21416544 0.21416544 0.21416544 0.21416544]
-0.636198392073295


In [352]:
mse = cost_function(Train_X, Train_Y, w, b)
mse

np.float64(0.4209552897852878)

### Step-8: Calculate KL Divergence of Train, Test & Validation

In [353]:
from scipy.stats import entropy
def KL_Divergence(__y_true, __y_predict):
    hist_true,__=np.histogram(__y_true,bins=50,density=True)
    hist_predict,__=np.histogram(__y_predict,bins=50,density=True)
    return entropy(hist_true + 1e-10 , hist_predict + 1e-10)

In [354]:
print(f'Traing dataset KL Divergence is ->{KL_Divergence(np.array(Train_Y), predict_house_price(Train_X, w, b))}')
print(f'Testing dataset KL Divergence is ->{KL_Divergence(np.array(Test_Y), predict_house_price(Test_X, w, b))}')
print(f'Validation dataset KL Divergence is ->{KL_Divergence(np.array(Val_Y), predict_house_price(Val_X, w, b))}')


Traing dataset KL Divergence is ->0.3930924843695032
Testing dataset KL Divergence is ->8.65595890442876
Validation dataset KL Divergence is ->2.4428139420754174


### Thank You
