In [367]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
from scipy.stats import entropy
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline


%matplotlib notebook
plt.style.use(
    'deeplearning.mplstyle'
)

In [368]:
""" Define the task """
housing_dataset = pd.read_csv( "housing.csv")
housing_dataset.head(5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [369]:
housing_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [370]:
housing_dataset.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [371]:
housing_dataset = housing_dataset[[
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'furnishingstatus', 'price'
]]

housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,13300000
1,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,12250000
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,12250000
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,12215000
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,11410000


In [372]:
numerical_cols=housing_dataset.select_dtypes(include='number').columns
numerical_cols

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price'], dtype='object')

In [373]:
categorical_cols=housing_dataset.select_dtypes(include='object').columns
categorical_cols

Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [374]:
mean=housing_dataset[numerical_cols].mean()
std=housing_dataset[numerical_cols].std()


In [375]:
housing_dataset[numerical_cols]=(housing_dataset[numerical_cols]-mean)/std
housing_dataset[categorical_cols]=housing_dataset[categorical_cols].apply(
    lambda col:pd.Categorical(col).codes
)
housing_dataset[categorical_cols].head()

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,1,0,0,0,1,1,0
1,1,0,0,0,1,0,0
2,1,0,1,0,0,1,1
3,1,0,1,0,1,1,0
4,1,1,1,0,1,0,0


In [376]:
housing_dataset['furnishingstatus'].value_counts()

furnishingstatus
1    227
2    178
0    140
Name: count, dtype: int64

In [377]:
housing_dataset.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price
0,1.045766,1.402131,1.420507,1.376952,1,0,0,0,1,1.516299,1,0,4.562174
1,1.755397,1.402131,5.400847,2.5297,1,0,0,0,1,2.67695,0,0,4.000809
2,2.216196,0.047235,1.420507,0.224204,1,0,1,0,0,1.516299,1,1,4.000809
3,1.08263,1.402131,1.420507,0.224204,1,0,1,0,1,2.67695,1,0,3.982096
4,1.045766,1.402131,-0.569663,0.224204,1,1,1,0,1,1.516299,0,0,3.551716


In [378]:
seed=42
np.random.seed(seed)
def split_dataset(dataset=housing_dataset,test_ratio=0.6,val_ratio=0.2):
    number_of_rows=len(dataset)
    random_indices=np.random.permutation(number_of_rows)
    
    train_size=int(test_ratio*number_of_rows)
    val_size=int(val_ratio*number_of_rows)
    
    train_indices=random_indices[0:train_size]
    val_indices=random_indices[train_size:train_size+val_size]
    test_indices=random_indices[train_size+val_size:]

    train_ds=dataset.iloc[train_indices]
    val_ds=dataset.iloc[val_indices]
    test_ds=dataset.iloc[test_indices]
    
    train_x=train_ds.iloc[:,:-1]
    train_y=train_ds.iloc[:,-1]
    val_x=val_ds.iloc[:,:-1]
    val_y=val_ds.iloc[:,-1]
    test_x=test_ds.iloc[:,:-1]
    test_y=test_ds.iloc[:,-1]
    return train_x,train_y,val_x,val_y,test_x,test_y
train_x,train_y,val_x,val_y,test_x,test_y=split_dataset(housing_dataset)


In [379]:
print(len(train_x))

327


In [380]:
train_x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
316,0.34535,1.402131,1.420507,0.224204,0,0,1,0,0,0.355649,0,2
77,0.62183,0.047235,1.420507,1.376952,1,0,0,0,1,-0.805002,1,0
360,-0.511737,-1.307661,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,1
90,-0.069369,0.047235,-0.569663,0.224204,1,0,0,0,1,-0.805002,0,1
493,-0.548601,0.047235,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,0


In [401]:
val_x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
332,0.16103,1.402131,-0.569663,0.224204,1,1,1,0,0,-0.805002,0,1
294,-0.530169,1.402131,1.420507,0.224204,0,0,0,0,0,-0.805002,0,1
417,-0.696057,-1.307661,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,0
384,-0.299769,-1.307661,-0.569663,-0.928544,0,0,0,0,0,-0.805002,0,0
438,-0.299769,-1.307661,-0.569663,-0.928544,0,0,0,0,0,-0.805002,0,1


In [381]:
def get_house_price(x,w,b):
    y_pred=np.dot(x,w)+b
    return y_pred

In [382]:
def cost_function(x,y_true,w,b):
    y_pred=get_house_price(x,w,b)
    mse=np.mean((y_true-y_pred)**2)
    return mse

In [383]:
np.random.seed(55)
w=np.linspace(100,200,len(train_x.columns))
b=np.random.randint(100,200)
w
b

177

In [384]:
w=np.zeros(len(train_x.columns))
b=0

In [385]:
print(w)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [386]:
def compute_gradient(x,y_true,w,b):
    delta=1e-9
    cost1=cost_function(x,y_true,w,b)
    cost2=cost_function(x,y_true,w+delta,b)
    cost3=cost_function(x,y_true,w,b+delta)
    dw=(cost2-cost1)/delta
    db=(cost3-cost1)/delta
    return dw,db



In [387]:
mse=cost_function(train_x,train_y,w,b)
print(f"{mse:0.2f}")

1.06


In [388]:
learning_rate=0.0001
for epoch in range(10000):
    dw,db=compute_gradient(train_x,train_y,w,b)
    lost=cost_function(train_x,train_y,w,b)

    w=w-learning_rate*dw
    b=b-learning_rate*db
    if epoch%1000==0:
        print(f"After {epoch} iterations loss is={(lost):0.2f}")

After 0 iterations loss is=1.06
After 1000 iterations loss is=0.67
After 2000 iterations loss is=0.63
After 3000 iterations loss is=0.59
After 4000 iterations loss is=0.56
After 5000 iterations loss is=0.54
After 6000 iterations loss is=0.52
After 7000 iterations loss is=0.51
After 8000 iterations loss is=0.50
After 9000 iterations loss is=0.49


In [389]:
print(w,b)

[0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675
 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675] -0.44733748718606137


In [390]:
(train_x.columns)

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea',
       'furnishingstatus'],
      dtype='object')

In [391]:
y_pred=get_house_price(train_x,w,b)
x=housing_dataset.loc[train_x.index.to_list()]
x['predicted_price']=y_pred
x


Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price,predicted_price
316,0.345350,1.402131,1.420507,0.224204,0,0,1,0,0,0.355649,0,2,-0.377841,0.808684
77,0.621830,0.047235,1.420507,1.376952,1,0,0,0,1,-0.805002,1,0,1.006860,0.606480
360,-0.511737,-1.307661,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,1,-0.564963,-0.842433
90,-0.069369,0.047235,-0.569663,0.224204,1,0,0,0,1,-0.805002,0,1,0.894587,-0.107190
493,-0.548601,0.047235,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,0,-1.051480,-0.783235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,-0.779001,1.402131,-0.569663,0.224204,1,0,0,0,1,-0.805002,0,1,-0.405910,0.012917
532,-0.990968,-1.307661,-0.569663,-0.928544,0,0,0,0,0,-0.805002,0,2,-1.407011,-0.931635
485,-0.700665,-1.307661,-0.569663,-0.928544,1,0,0,0,0,-0.805002,0,2,-1.014055,-0.691462
487,0.114950,1.402131,-0.569663,0.224204,1,0,0,0,0,-0.805002,0,2,-1.014055,0.179314


In [392]:
print(type(x['predicted_price']))

<class 'pandas.core.series.Series'>


In [393]:
print(len(y_pred))

327


In [394]:
len(housing_dataset.loc[train_x.index.to_list()])

327

In [395]:
def kl_divergence(y_true,y_pred):
    price_hist,_=np.histogram(x['price'],bins=50,density=True)
    price_pred_hist,_=np.histogram(x['predicted_price'],bins=50,density=True)
    kl_divergence=entropy(price_hist+1e-10,price_pred_hist+1e-10)
    return (kl_divergence)

In [396]:
print(w,b)

[0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675
 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675 0.18613675] -0.44733748718606137


In [397]:
print(f"KL divergence of train dataset: {kl_divergence(x['price'],x['predicted_price'])}")
print(f"KL divergence of validation dataset: {kl_divergence(housing_dataset.iloc[val_y.index.to_list()]['price'],get_house_price(val_x,w,b))}")

KL divergence of train dataset: 1.2066097499540789
KL divergence of validation dataset: 1.2066097499540789


In [398]:
print(f"KL divergence on train dataset: {kl_divergence(np.array(train_y), get_house_price(train_x, w, b))}")
print(f"KL divergence on validation dataset: {kl_divergence(np.array(val_y), get_house_price(val_x, w, b))}")
print(f"KL divergence on test dataset: {kl_divergence(np.array(test_y), get_house_price(test_x, w, b))}")

KL divergence on train dataset: 1.2066097499540789
KL divergence on validation dataset: 1.2066097499540789
KL divergence on test dataset: 1.2066097499540789
