In [38]:
""" Class 9. Model Evaluation and Bias Variance Tradeoff
"""

import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib notebook
plt.style.use('deeplearning.mplstyle')

In [39]:
""" Define the task: House price prediction with multiple inputs using linear regression

y_pred = w * x + b where x is a real number i.e. 8.6
y_pred = w * X + b where X = [x1, x2, ..., xn] X = [1.2, 1.5, 2.6, ....]

x1, x2, x3 = (2, 4, 5)
y_pred = w1 * x1 + w2 * x2 + w3 * x3 + b
       = [w1, w2, w3] x [x1, x2, x3] + b
       = W * X + b

Objective: 
   1. Define the task
   2. Data Cleaning and Preprocessing
   3. Data Splitting
   4. Model Training
"""

ROOT_DIR = "E:\\PyCharmProjects\\pythonProject"
DATA_DIR = os.path.join(ROOT_DIR, "data")
DATASET_PATH = os.path.join(DATA_DIR, "housing.csv")

In [40]:
housing_dataset = pd.read_csv('housing.csv')
housing_dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [41]:
housing_dataset.columns


Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [42]:
housing_dataset = housing_dataset[[
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'furnishingstatus', 'price'
]]

housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,13300000
1,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,12250000
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,12250000
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,12215000
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,11410000


In [43]:
""" Data Cleaning and Preprocessing """

' Data Cleaning and Preprocessing '

In [44]:
housing_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              545 non-null    int64 
 1   bedrooms          545 non-null    int64 
 2   bathrooms         545 non-null    int64 
 3   stories           545 non-null    int64 
 4   mainroad          545 non-null    object
 5   guestroom         545 non-null    object
 6   basement          545 non-null    object
 7   hotwaterheating   545 non-null    object
 8   airconditioning   545 non-null    object
 9   parking           545 non-null    int64 
 10  prefarea          545 non-null    object
 11  furnishingstatus  545 non-null    object
 12  price             545 non-null    int64 
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [45]:
numerical_cols = housing_dataset.select_dtypes(include='number').columns

numerical_cols

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price'], dtype='object')

In [46]:
categorical_cols = housing_dataset.select_dtypes(include='object').columns
categorical_cols

Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [47]:
""" Standardization of the numerical columns """

mean = housing_dataset[ numerical_cols ].mean()
mean

area         5.150541e+03
bedrooms     2.965138e+00
bathrooms    1.286239e+00
stories      1.805505e+00
parking      6.935780e-01
price        4.766729e+06
dtype: float64

In [48]:
std = housing_dataset[ numerical_cols ].std()
std

area         2.170141e+03
bedrooms     7.380639e-01
bathrooms    5.024696e-01
stories      8.674925e-01
parking      8.615858e-01
price        1.870440e+06
dtype: float64

In [49]:
""" 
x_standardized = (x - mean) / std
x_min_max_normalized = (x - min) / (max - min)
"""
housing_dataset[ numerical_cols ] = (housing_dataset[ numerical_cols ] - mean) / std
housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,1.045766,1.402131,1.420507,1.376952,yes,no,no,no,yes,1.516299,yes,furnished,4.562174
1,1.755397,1.402131,5.400847,2.5297,yes,no,no,no,yes,2.67695,no,furnished,4.000809
2,2.216196,0.047235,1.420507,0.224204,yes,no,yes,no,no,1.516299,yes,semi-furnished,4.000809
3,1.08263,1.402131,1.420507,0.224204,yes,no,yes,no,yes,2.67695,yes,furnished,3.982096
4,1.045766,1.402131,-0.569663,0.224204,yes,yes,yes,no,yes,1.516299,no,furnished,3.551716


In [50]:
""" Design decision: Your dataset must be representative of all categories uniformly.
"""
housing_dataset['furnishingstatus'].value_counts()

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64

In [51]:
housing_dataset[categorical_cols] = housing_dataset[categorical_cols].apply(
    lambda col: pd.Categorical(col).codes
)

housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,1.045766,1.402131,1.420507,1.376952,1,0,0,0,1,1.516299,1,0,4.562174
1,1.755397,1.402131,5.400847,2.5297,1,0,0,0,1,2.67695,0,0,4.000809
2,2.216196,0.047235,1.420507,0.224204,1,0,1,0,0,1.516299,1,1,4.000809
3,1.08263,1.402131,1.420507,0.224204,1,0,1,0,1,2.67695,1,0,3.982096
4,1.045766,1.402131,-0.569663,0.224204,1,1,1,0,1,1.516299,0,0,3.551716


In [52]:
housing_dataset['furnishingstatus'].value_counts()

furnishingstatus
1    227
2    178
0    140
Name: count, dtype: int64

In [53]:
len(housing_dataset.columns)

13

In [54]:
""" Split the dataset into train, test, validation set """

# Seed value is used to reproduce the random value
seed = 142
np.random.seed(seed)

np.random.rand()

0.9020615248125082

In [55]:
np.random.permutation(10)

array([6, 8, 9, 7, 3, 4, 2, 5, 1, 0])

In [56]:
my_array = np.array(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
)

train=my_array[:6]
val=my_array[6:6+2]
test=my_array[6+2:]

print(train)
print(test)
print(val)

[1 2 3 4 5 6]
[ 9 10]
[7 8]


In [57]:
seed = 42
np.random.seed(seed)

def split_dataset(dataset, training_ratio=0.6, val_ratio=0.2):
    no_of_examples = len(dataset)
    random_indices = np.random.permutation(no_of_examples)
    
    train_size = int(training_ratio * no_of_examples)
    val_size = int(val_ratio * no_of_examples)
    
    training_indices = random_indices[:train_size]
    val_indices = random_indices[train_size:val_size + train_size]
    test_indices = random_indices[val_size + train_size:]
    
    train_ds = dataset.iloc[training_indices]
    val_ds = dataset.iloc[val_indices]
    test_ds = dataset.iloc[test_indices]
    
    train_X = train_ds.iloc[:, :-1]
    train_y = train_ds.iloc[:, -1]
    
    val_X = val_ds.iloc[:, :-1]
    val_y = val_ds.iloc[:, -1]
    
    test_X = test_ds.iloc[:, :-1]
    test_y = test_ds.iloc[:, -1]
    
    return train_X, train_y, val_X, val_y, test_X, test_y
    
train_X, train_y, val_X, val_y, test_X, test_y = split_dataset(housing_dataset)

In [58]:
print(train_X.shape)
print(len(val_X))
print(len(test_X))

(327, 12)
109
109


In [59]:
train_X.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
316,0.34535,1.402131,1.420507,0.224204,0,0,1,0,0,0.355649,0,2
77,0.62183,0.047235,1.420507,1.376952,1,0,0,0,1,-0.805002,1,0
360,-0.511737,-1.307661,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,1
90,-0.069369,0.047235,-0.569663,0.224204,1,0,0,0,1,-0.805002,0,1
493,-0.548601,0.047235,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,0


In [60]:
""" ML algorithm 

X = [[x11, x12, x13]
     [x21, X22, x23]
     [x31, X32, X33]]
     
w = [w1, w2, w3]

X dot w + b = [x11 * w1 + x12 * w2 + x13 * w3 + b] => y_pred_1
              [x21 * w1 + x22 * w2 + x23 * w3 + b] => y_pred_2
              [x31 * w1 + x32 * w2 + x33 * w3 + b] => y_pred_3
"""

def get_house_price(X, w, b):
    y_pred = np.dot(X, w) + b
    return  y_pred

In [61]:
""" Initialization """

np.random.seed(55)
w = np.random.randint(low=100, high=200, size=train_X.columns.size)
b = np.random.randint(low=100, high=200)
w
b

146

In [62]:
def cost_function(x, y_true, w, b):
    y_pred = get_house_price(x, w, b)
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

In [63]:
mse = cost_function(train_X, train_y, w, b)
print(f"MSE is {mse:0.2f}  (Parameters not learned yet.)")

MSE is 668188.93  (Parameters not learned yet.)


In [64]:
"""Gradient Descent
- Compute cost (done)
- Compute gradients
- Update parameters
""" 

def compute_gradient(X, y_true, w, b):
    delta = 1e-9
    cost_1 = cost_function(X, y_true, w, b)
    cost_2 = cost_function(X, y_true, w + delta, b)
    cost_3 = cost_function(X, y_true, w, b + delta)
    dw = (cost_2 - cost_1) / delta
    db = (cost_3 - cost_1) / delta
    return dw, db

In [65]:
w = np.zeros(train_X.columns.size)
b = 0
w

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [66]:
epochs = 10000
lr = 0.0001

for epoch in range(epochs):
    """ Compute logis """
    loss = cost_function(train_X, train_y, w, b)
    valid_loss = cost_function(val_X, val_y, w, b)
    
    """ Compute gradients """
    dw, db = compute_gradient(train_X, train_y, w, b)
    
    """ Update parameters """
    w = w - lr * dw
    b = b - lr * db
    
    if epoch % 1000 == 0:
        print(f"Epoch {epoch} / {epochs}: loss = {loss:0.2f}, val_loss = {valid_loss:0.2f}")

Epoch 0 / 10000: loss = 1.06, val_loss = 0.80
Epoch 1000 / 10000: loss = 0.67, val_loss = 0.53
Epoch 2000 / 10000: loss = 0.63, val_loss = 0.49
Epoch 3000 / 10000: loss = 0.59, val_loss = 0.46
Epoch 4000 / 10000: loss = 0.56, val_loss = 0.43
Epoch 5000 / 10000: loss = 0.54, val_loss = 0.41
Epoch 6000 / 10000: loss = 0.52, val_loss = 0.40
Epoch 7000 / 10000: loss = 0.51, val_loss = 0.39
Epoch 8000 / 10000: loss = 0.50, val_loss = 0.38
Epoch 9000 / 10000: loss = 0.49, val_loss = 0.37


In [67]:
print("The learned parameters are:")
print(w)
print(b)

The learned parameters are:
[0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675
 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675]
-0.44733748718606137


In [68]:
test_loss = cost_function(test_X, test_y, w, b)
print(f"MSE is {test_loss}")

MSE is 0.46299880934442766


In [69]:
from scipy.stats import entropy

def kl_divergence(__y_true, __y_pred):
    hist_true, __ = np.histogram(__y_true, bins=50, density=True)
    hist_pred, __ = np.histogram(__y_pred, bins=50, density=True)
    return entropy(hist_true + 1e-10, hist_pred + 1e-10)

In [70]:
print(f"KL divergence on train dataset: {kl_divergence(np.array(train_y), get_house_price(train_X, w, b))}")
print(f"KL divergence on validation dataset: {kl_divergence(np.array(val_y), get_house_price(val_X, w, b))}")
print(f"KL divergence on test dataset: {kl_divergence(np.array(test_y), get_house_price(test_X, w, b))}")

KL divergence on train dataset: 1.2066097499540789
KL divergence on validation dataset: 3.102020655078701
KL divergence on test dataset: 2.49040767409723


In [71]:
print(w,b)

[0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675
 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675] -0.44733748718606137


In [72]:
print(f"KL divergence of validation dataset: {kl_divergence(housing_dataset.iloc[val_y.index.to_list()]['price'],get_house_price(val_X,w,b))}")

KL divergence of validation dataset: 3.102020655078701


In [73]:
"""
Generalization: Models performs good generally i.e. Good student
Underfitting: Models performs badly on every data set i.e. A inattentive student
Overfitting: Models performs very good on training dataset but performs badly in test set. i.e. students who work really hard but underperformed on the final exam
"""

""" Bias 

Definition: 
    It refers to the error due to overly simplistic assumptions in the learning algorithm

High bias: means that the model is too simple to fit the data. (Underfitting)
Low bias: achieved a remarkable performance on the data. (Desired)
"""

""" Variance
Variance refers to the model's sensitivity to small fluctuations in the training data.

High Variance: occurs when the model is too complex it fits the data (including the noise) very well. (Overfitting)
Low variance: achieved when the model is able to generalize the new data. (Desired)
"""

# Bias Variance Tradeoff used in Model selection
# Task, Dataset
# We measure the performance of the model using some evaluation metrics
# For classification we use accuracy, f1, precision, recall
# Model 1, accuracy 1, f1 1, precision 1, recall 1 
# ,total_error = bias ** 2 + variance = 1
# Model 2, accuracy 2, f1 2, precision 2, recall 2, 
# total_error = bias ** 2 + variance = 0.5
# Model 3, accuracy 3, f1 3, precision 3, recall 3, 
# total_error = bias ** 2 + variance = 2

" Variance\nVariance refers to the model's sensitivity to small fluctuations in the training data.\n\nHigh Variance: occurs when the model is too complex it fits the data (including the noise) very well. (Overfitting)\nLow variance: achieved when the model is able to generalize the new data. (Desired)\n"