# Linear Regression (** TODAY : Gradient Descent with Multiple Inputs **)
### Objectives 
    1. Define the task
    2. Data Cleaning and Preprocessing
    3. Data Splitting.


In [1]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
from scipy.stats import entropy
import matplotlib.pyplot as plt

%matplotlib notebook
%matplotlib inline 
plt.style.use('../Style/deeplearning.mplstyle')

In [2]:
housing_dataset = pd.read_csv('../Datasets/housing.csv')
housing_dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
housing_dataset.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [4]:
housing_dataset = housing_dataset[[
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'furnishingstatus', 'price'
]]

In [5]:
housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,13300000
1,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,12250000
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,12250000
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,12215000
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,11410000


# Data Cleaning and Prepocessing

In [6]:
housing_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              545 non-null    int64 
 1   bedrooms          545 non-null    int64 
 2   bathrooms         545 non-null    int64 
 3   stories           545 non-null    int64 
 4   mainroad          545 non-null    object
 5   guestroom         545 non-null    object
 6   basement          545 non-null    object
 7   hotwaterheating   545 non-null    object
 8   airconditioning   545 non-null    object
 9   parking           545 non-null    int64 
 10  prefarea          545 non-null    object
 11  furnishingstatus  545 non-null    object
 12  price             545 non-null    int64 
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [7]:
numerical_cols = housing_dataset.select_dtypes(include='number').columns
numerical_cols

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price'], dtype='object')

In [8]:
categorical_cols = housing_dataset.select_dtypes(include='object').columns
categorical_cols

Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')

# Standardization 

In [9]:
mean = housing_dataset[numerical_cols].mean()
mean

area         5.150541e+03
bedrooms     2.965138e+00
bathrooms    1.286239e+00
stories      1.805505e+00
parking      6.935780e-01
price        4.766729e+06
dtype: float64

In [10]:
std = housing_dataset[numerical_cols].std()
std

area         2.170141e+03
bedrooms     7.380639e-01
bathrooms    5.024696e-01
stories      8.674925e-01
parking      8.615858e-01
price        1.870440e+06
dtype: float64

In [11]:
housing_dataset[numerical_cols] = (housing_dataset[numerical_cols] - mean) / std

In [12]:
housing_dataset.value_counts(["furnishingstatus"])

furnishingstatus
semi-furnished      227
unfurnished         178
furnished           140
Name: count, dtype: int64

In [13]:
housing_dataset[categorical_cols] = housing_dataset[categorical_cols].apply(
    lambda col: pd.Categorical(col).codes
)
housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,1.045766,1.402131,1.420507,1.376952,1,0,0,0,1,1.516299,1,0,4.562174
1,1.755397,1.402131,5.400847,2.5297,1,0,0,0,1,2.67695,0,0,4.000809
2,2.216196,0.047235,1.420507,0.224204,1,0,1,0,0,1.516299,1,1,4.000809
3,1.08263,1.402131,1.420507,0.224204,1,0,1,0,1,2.67695,1,0,3.982096
4,1.045766,1.402131,-0.569663,0.224204,1,1,1,0,1,1.516299,0,0,3.551716


In [14]:
"""
mainroad: yes, no => 0, 1
furnishingstatus: furnished, semifurnished, notfurnished => 0, 1, 2
"""
housing_dataset.value_counts(['mainroad'])

mainroad
1           468
0            77
Name: count, dtype: int64

In [15]:
housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,1.045766,1.402131,1.420507,1.376952,1,0,0,0,1,1.516299,1,0,4.562174
1,1.755397,1.402131,5.400847,2.5297,1,0,0,0,1,2.67695,0,0,4.000809
2,2.216196,0.047235,1.420507,0.224204,1,0,1,0,0,1.516299,1,1,4.000809
3,1.08263,1.402131,1.420507,0.224204,1,0,1,0,1,2.67695,1,0,3.982096
4,1.045766,1.402131,-0.569663,0.224204,1,1,1,0,1,1.516299,0,0,3.551716


In [16]:
len(housing_dataset.columns)

13

# Split Data

In [17]:
seed = 142
np.random.seed(seed)

In [18]:
"""
Split the dataset into Train, Test and Validation

Training dataset (60%) => For training the model
Validation dataset (20%) => For validating the performance during training.
Testing dataset (20%) => Entirely for testing. Model never seen during training. 
"""

def split_dataset(dataset, training_ratio=0.6, val_ratio=0.2):
    no_of_examples = len(dataset)
    
    random_indices = np.random.permutation(no_of_examples)
    
    train_size = int(no_of_examples * training_ratio)
    val_size = int(no_of_examples * val_ratio)
    
    train_indices = random_indices[:train_size]
    val_indices = random_indices[train_size:train_size + val_size]
    test_indices = random_indices[train_size + val_size:]
    
    train_ds = dataset.iloc[train_indices]
    val_ds = dataset.iloc[val_indices]
    test_ds = dataset.iloc[test_indices]
    
    train_X = train_ds.iloc[:, :-1]
    train_y = train_ds.iloc[:, -1]
    val_X = val_ds.iloc[:, :-1]
    val_y = val_ds.iloc[:, -1]
    test_X = test_ds.iloc[:, :-1]
    test_y = test_ds.iloc[:, -1]
    return train_X, train_y, val_X, val_y, test_X, test_y
    
    
train_X, train_y, val_X, val_y, test_X, test_y = split_dataset(housing_dataset)

# Multiple variable House Price Prediction

In [19]:
"""
y_pred = w * x + b

x = [x1, x2, x3,.....x12]
w = [w1, w2, w3,.....w12]

y_pred = w1 * x1 + w2 * x2 + .... + w12 * x12 + b
       = np.dot(w, x) + b
       
y_pred = 0.001 * x1 + 32 * x2 + b
"""

def get_house_price(X, w, b):
    y_pred = np.dot(X, w) + b
    return y_pred

In [20]:
w = np.random.randint(low=100, high=200, size=train_X.columns.size)
b = np.random.randint(low=100, high=200)

print(f"w = {w}, b = {b}")
y_pred = get_house_price(train_X, w, b)

w = [162 196 181 142 112 175 150 146 139 165 127 136], b = 155


# KL Divergence

In [21]:
def kl_divergence(__y_true, __y_pred):
    hist_true, __ = np.histogram(__y_true, bins=50, density=True)
    hist_pred, __ = np.histogram(__y_pred, bins=50, density=True)
    return entropy(hist_true + 1e-10, hist_pred + 1e-10)

# Cost Function

In [22]:
def cost_function(x, y_true, w, b):
    y_pred = get_house_price(x, w, b)
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

mse = cost_function(train_X, train_y, w, b)
print(f"MSE is {mse:0.2f}  (Parameters not learned yet.)")

MSE is 680931.10  (Parameters not learned yet.)


# Gradient Descent
- Compute cost (done)
- Compute gradients
- Update parameters

In [23]:
def compute_gradient(X, y_true, w, b):
    delta = 1e-9
    cost_1 = cost_function(X, y_true, w, b)
    cost_2 = cost_function(X, y_true, w + delta, b)
    cost_3 = cost_function(X, y_true, w, b + delta)
    dw = (cost_2 - cost_1) / delta
    db = (cost_3 - cost_1) / delta
    return dw, db

# Train Model

In [24]:
""" 
In each epoch there are forward propagation and backward propagation
forward propagation: Compute logits, gradients
backward propagation: Update the parameters
"""

def train_model(train_X, train_y, val_X, val_y, epochs=12000, lr=0.0001):
    """ Initialize parameters """
    w = np.zeros(train_X.columns.size) # [0, 0, 0, ...0]
    b = 0
    
    for epoch in range(epochs):
        """ Compute logits / loss """
        loss = cost_function(train_X, train_y, w, b)
        val_loss = cost_function(val_X, val_y, w, b)
        
        """ Compute gradients """
        dw, db = compute_gradient(train_X, train_y, w, b)
        
        """ Update parameters """
        w = w - lr * dw
        b = b - lr * db

        if epoch % 1000 == 0:
            print(f"epoch:{epoch}/{epochs}, loss:{loss:.3f}, val_loss:{val_loss:0.3f}")
    
    return w, b

w, b = train_model(train_X, train_y, val_X, val_y)
print("Weight:", w, "bias:", b)

epoch:0/12000, loss:1.030, val_loss:0.686
epoch:1000/12000, loss:0.630, val_loss:0.535
epoch:2000/12000, loss:0.592, val_loss:0.500
epoch:3000/12000, loss:0.561, val_loss:0.470
epoch:4000/12000, loss:0.537, val_loss:0.447
epoch:5000/12000, loss:0.517, val_loss:0.428
epoch:6000/12000, loss:0.502, val_loss:0.413
epoch:7000/12000, loss:0.489, val_loss:0.401
epoch:8000/12000, loss:0.479, val_loss:0.392
epoch:9000/12000, loss:0.471, val_loss:0.385
epoch:10000/12000, loss:0.465, val_loss:0.379
epoch:11000/12000, loss:0.459, val_loss:0.374
Weight: [0.19028871 0.19028871 0.19028871 0.19028871 0.19028871 0.19028871
 0.19028871 0.19028871 0.19028871 0.19028871 0.19028871 0.19028871] bias: -0.4600720282688098


# Evalute

In [25]:
test_loss = cost_function(test_X, test_y, w, b)
print(f"MSE is {test_loss}")

MSE is 0.5207415760433463


# KL Divergence (Optional)

In [26]:
print(f"KL divergence on train dataset: {kl_divergence(np.array(train_y), get_house_price(train_X, w, b))}")
print(f"KL divergence on validation dataset: {kl_divergence(np.array(val_y), get_house_price(val_X, w, b))}")
print(f"KL divergence on test dataset: {kl_divergence(np.array(test_y), get_house_price(test_X, w, b))}")

KL divergence on train dataset: 0.9057504078941923
KL divergence on validation dataset: 2.826824154089703
KL divergence on test dataset: 3.932792128969662


In [27]:
""" 
The phenomenon where the training performance is very impressive, but in testing not enough impressive.

Overfitting => Your model learned the training set well, not the testing set
To overcome you need to regularize (SEE YOU IN NEXT CLASS) the model parameters for Generalization.
"""

' \nThe phenomenon where the training performance is very impressive, but in testing not enough impressive.\n\nOverfitting => Your model learned the training set well, not the testing set\nTo overcome you need to regularize (SEE YOU IN NEXT CLASS) the model parameters for Generalization.\n'