# Objectives 
1. Understand why feature scaling is important
2. Choice of learning rate
3. Polynomial regression
4. Understand the problem of overfitting
5. Regularization to solve overfitting

In [1]:
import numpy as np
import pandas as pd

In [2]:
housing_dataset = pd.read_csv('../Datasets/housing.csv')
housing_dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
housing_dataset = housing_dataset[[
    'area', 'price'
]]

housing_dataset.head()

Unnamed: 0,area,price
0,7420,13300000
1,8960,12250000
2,9960,12250000
3,7500,12215000
4,7420,11410000


# Feature Scalling

In [4]:
mean = housing_dataset.mean()
std = housing_dataset.std()

housing_dataset = (housing_dataset - mean) / std
housing_dataset.head()

Unnamed: 0,area,price
0,1.045766,4.562174
1,1.755397,4.000809
2,2.216196,4.000809
3,1.08263,3.982096
4,1.045766,3.551716


# Split Data

In [5]:
def split_dataset(dataset, train_ratio=0.6, val_ratio=0.2):
    no_of_examples = len(dataset)
    random_indices = np.arange(no_of_examples)
    
    train_size = int(no_of_examples * train_ratio)
    val_size = int(no_of_examples * val_ratio)
    
    train_indices = random_indices[:train_size]
    val_indices = random_indices[train_size:train_size + val_size]
    test_indices = random_indices[train_size + val_size:]
    
    train_ds = dataset.iloc[train_indices]
    val_ds = dataset.iloc[val_indices]
    test_ds = dataset.iloc[test_indices]
    
    train_X = np.array(train_ds.iloc[:, :-1])
    train_y = np.array(train_ds.iloc[:, -1])
    val_X = np.array(val_ds.iloc[:, :-1])
    val_y = np.array(val_ds.iloc[:, -1])
    test_X = np.array(test_ds.iloc[:, :-1])
    test_y = np.array(test_ds.iloc[:, -1])
    return train_X, train_y, val_X, val_y, test_X, test_y

In [6]:
train_X, train_y, val_X, val_y, test_X, test_y = split_dataset(housing_dataset)

# House Price Predictor
Polynomial equation: y = a*x + b * x^2 + c


In [7]:
def get_house_price(x, w1, w2, b):
    y_pred = x * w1 + x ** 2 * w2 + b
    return y_pred

In [8]:
w1 = np.random.randint(low=100, high=200)
w2 = np.random.randint(low=100, high=200)
b = np.random.randint(low=100, high=200)

print(w1, w2, b)

y_pred = get_house_price(train_X, w1, w2, b)
y_pred

156 101 190


array([[ 463.5956092 ],
       [ 775.06513695],
       [1031.79081156],
       [ 477.27095646],
       [ 463.5956092 ],
       [ 477.27095646],
       [ 688.75564122],
       [3602.63595449],
       [ 588.58601195],
       [ 240.79854881],
       [2158.19799637],
       [ 266.5380869 ],
       [ 332.60129244],
       [ 129.77622235],
       [ 530.99834794],
       [ 266.5380869 ],
       [ 339.25041715],
       [ 671.37443234],
       [ 156.92466671],
       [ 315.81540362],
       [ 145.09016153],
       [ 420.25671816],
       [ 578.71999347],
       [ 155.02814191],
       [ 737.9693989 ],
       [ 331.28433507],
       [ 266.5380869 ],
       [ 755.22130813],
       [ 559.30964588],
       [ 217.73975552],
       [ 472.96792225],
       [ 396.30368737],
       [ 172.12189817],
       [ 262.23961476],
       [ 372.65885448],
       [ 396.30368737],
       [ 474.17006964],
       [ 784.51063913],
       [ 266.5380869 ],
       [ 266.5380869 ],
       [ 332.60129244],
       [ 308.312

# Cost function 

In [9]:
def cost_function(x, y_true, w1, w2, b):
    _lambda = 1.1
    y_pred = get_house_price(x, w1, w2, b)
    mse = np.mean((y_pred - y_true) ** 2)
    regularizer_term = w1 ** 2 + w2 ** 2 + b # L2
    # regularizer_term = np.abs(w1) + np.abs(w2) + np.abs(b)
    return mse + _lambda * regularizer_term

In [10]:
mse = cost_function(train_X, train_y, w1, w2, b)
print(f"MSE is {mse:0.2f}  (Parameters not learned yet.)")

MSE is 325762.35  (Parameters not learned yet.)


# Gradient Decent

In [11]:
def compute_gradient(X, y_true, w1, w2, b):
    delta = 1e-9
    
    cost_1 = cost_function(X, y_true, w1, w2, b)
    cost_2 = cost_function(X, y_true, w1 + delta, w2, b)
    cost_3 = cost_function(X, y_true, w1, w2 + delta, b)
    cost_4 = cost_function(X, y_true, w1, w2, b + delta)
    
    dw1 = (cost_2 - cost_1) / delta
    dw2 = (cost_3 - cost_1) / delta
    db = (cost_4 - cost_1) / delta
    return dw1, dw2, db

# Train Model

In [12]:
def train_model(train_X, train_y, val_X, val_y, epochs=2000, lr=0.001):
    """ Initialize the parameters """
    w1 = 0
    w2 = 0
    b = 0
    
    for epoch in range(epochs):
        """ compute logits """
        loss = cost_function(train_X, train_y, w1, w2, b)
        val_loss = cost_function(val_X, val_y, w1, w2, b)
        
        """ compute gradient """
        dw1, dw2, db = compute_gradient(train_X, train_y, w1, w2, b)
        
        """ update parameters """
        w1 = w1 - lr * dw1
        w2 = w2 - lr * dw2
        b = b - lr * db
        
        if epoch % 100 == 0:
            print(f"Epoch {epoch} / {epochs} loss: {loss:.4f} val_loss: {val_loss:.4f}")
            
    return w1, w2, b
    
w1, w2, b = train_model(train_X, train_y, val_X, val_y)
train_loss = cost_function(train_X, train_y, w1, w2, b)
print(f"w1: {w1}, w2: {w2}, b: {b}, loss={train_loss:.4f}")

Epoch 0 / 2000 loss: 1.1172 val_loss: 0.3961
Epoch 100 / 2000 loss: 1.0717 val_loss: 0.4428
Epoch 200 / 2000 loss: 1.0700 val_loss: 0.4303
Epoch 300 / 2000 loss: 1.0692 val_loss: 0.4121
Epoch 400 / 2000 loss: 1.0687 val_loss: 0.3959
Epoch 500 / 2000 loss: 1.0683 val_loss: 0.3822
Epoch 600 / 2000 loss: 1.0681 val_loss: 0.3707
Epoch 700 / 2000 loss: 1.0679 val_loss: 0.3610
Epoch 800 / 2000 loss: 1.0677 val_loss: 0.3529
Epoch 900 / 2000 loss: 1.0676 val_loss: 0.3460
Epoch 1000 / 2000 loss: 1.0676 val_loss: 0.3403
Epoch 1100 / 2000 loss: 1.0675 val_loss: 0.3355
Epoch 1200 / 2000 loss: 1.0675 val_loss: 0.3314
Epoch 1300 / 2000 loss: 1.0674 val_loss: 0.3280
Epoch 1400 / 2000 loss: 1.0674 val_loss: 0.3251
Epoch 1500 / 2000 loss: 1.0674 val_loss: 0.3227
Epoch 1600 / 2000 loss: 1.0674 val_loss: 0.3206
Epoch 1700 / 2000 loss: 1.0674 val_loss: 0.3189
Epoch 1800 / 2000 loss: 1.0674 val_loss: 0.3174
Epoch 1900 / 2000 loss: 1.0674 val_loss: 0.3162
w1: 0.02548110389355429, w2: 0.07055690587343122, b:

In [13]:
test_loss = cost_function(test_X, test_y, w1, w2, b)
print(f"MSE is {test_loss}")

MSE is 1.1176467379765123


In [14]:
print(test_loss / train_loss)

1.0470813156510357


# why Train loss << Testing loss

In [None]:
""" This phenomenon is called overfitting. 
Train loss << Testing loss
So, lack of generalization.

Opposite scenario, underfitting.
Train loss >> Testing loss

Another term, generalization.

Solutions to overcome overfitting.
1. Regularization (*)
  => Add penalty to your cost function 
  => penalty on the weight value
  => L1, L2 
2. Reduce model capacity
    => Model has higher dimensional polynomial equations
    => Dropout (0.5)
    => price = w1 x x1 + w2 * x2 ^ 4 
    
""";