In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from utils import set_pretty_prints, load_dataset

In [None]:
set_pretty_prints()

In [None]:
df = load_dataset('imobiliare.ro')

In [None]:
df

In [None]:
# Regression targets: the absolute price and the "Price/Surface" ratio column.
y_price = df['Price']
y_sqmp = df["Price/Surface"]

# TODO: select viable features
# Features are picked positionally: columns START (inclusive) up to END (exclusive).
START, END = 1, 9
X = df.iloc[:, START:END]


In [None]:
X

In [None]:
y_price

## Exploration

In [None]:
# Histogram of the room-count feature, drawn through the Axes returned by pandas.
field, x_label = 'nr cam', 'Nr rooms'
title = 'Distribution of nr of rooms per apartment'
ax = X[field].hist(bins=20)
ax.set_xlabel(x_label)
ax.set_title(title)

In [None]:
# TODO: Analyse a few more features
# Histogram of the surface feature, drawn through the Axes returned by pandas.
field, x_label = 'mp', 'Square meters per property'
title = 'Distribution of sqm per property'
ax = X[field].hist(bins=20)
ax.set_xlabel(x_label)
ax.set_title(title)

In [None]:
# TODO: Analyse target distribution
target, x_label = y_price, 'Price'
title = 'Distribution of price'
# Draw on the current Axes explicitly instead of going through the pyplot state machine.
ax = plt.gca()
ax.hist(target, bins=50)
ax.set_title(title)
ax.set_xlabel(x_label)

In [None]:
# TODO: Re-display the target distribution
target, x_label = y_price, "Price"
title = "Distribution of price in log scale"
# The histogram option is kept as a (name, value) pair and forwarded as a keyword argument.
plot_param, plot_param_value = 'log', True
kwargs = dict([(plot_param, plot_param_value)])
ax = plt.gca()
ax.hist(target, bins=50, **kwargs)
ax.set_title(title)
ax.set_xlabel(x_label)

In [None]:
X.corr()

In [None]:
new_corr_features = ['nr cam', 'mp', 'parter', 'et1-2', 'et3+','etaj max', 'typ_decom', 'bloc nou', 'Price']

In [None]:
df_new = df[new_corr_features]

In [None]:
df_new.corr()

## Modelling
- further process X data maybe
- construct the normal equation and determine the model coefficients: $w = (X^\top X)^{-1} X^\top y$ (intuition: $y = Xw$, so we solve for $w$ — informally "$w = y / X$")
- validate results (how, when)

`f(X) = y = X[0]*w[0] + X[1]*w[1] + ... + X[N]*w[N] + X[N+1]*w[N+1]`, where `X[N+1] == 1` (the constant input whose weight `w[N+1]` is the bias)

In [None]:
X.mean()

In [None]:
X.min()

In [None]:
X.std()

In [None]:
X.max()

In [None]:
np_X = X.values

In [None]:
np_X[:20]

In [None]:
np_X.mean(axis=0)

In [None]:
# Standardise every feature column: subtract its mean and divide by its standard deviation.
col_mean = np_X.mean(axis=0)
col_std = np_X.std(axis=0)
np_X_n = (np_X - col_mean) / col_std

In [None]:
np_X_n[:20]

In [None]:
np_y = y_price.values
np_y[:20]

In [None]:

# Min-max scale the target: the smallest price maps to 0 and the largest to 1.
y_lo, y_hi = np_y.min(), np_y.max()
np_y_n = (np_y - y_lo) / (y_hi - y_lo)
np_y_n[:20]

In [None]:
np_y.min()


In [None]:
np_y.max()

In [None]:
y_norm_sub = np_y.min()
y_norm_div = np_y.max() - np_y.min()
y_test = np_y_n * y_norm_div + y_norm_sub
y_test[:20]

In [None]:
# Normal equation w = pinv(X^T X) X^T y on the raw features and raw prices.
np_weights = np.linalg.pinv(np_X.T @ np_X) @ np_X.T @ np_y

# Same closed form on the standardised features and the min-max scaled prices.
np_weights_n = np.linalg.pinv(np_X_n.T @ np_X_n) @ np_X_n.T @ np_y_n


In [None]:
np_weights

In [None]:
np_weights_n

In [None]:
# Predictions of the raw-data model (same units as np_y).
np_y_preds = np_X @ np_weights

# Predictions of the normalized-data model (same scale as np_y_n).
np_y_preds_n = np_X_n @ np_weights_n

In [None]:
np_y_preds[:20]

In [None]:
# Raw-data model predictions on their own figure.
plt.figure()
ax_raw = plt.gca()
ax_raw.hist(np_y_preds)
ax_raw.set_title('Raw model predictions')
# Normalized-data model predictions on a second figure.
plt.figure()
ax_norm = plt.gca()
ax_norm.hist(np_y_preds_n)
ax_norm.set_title('Normed data model predictions')

### One more model before testing results

Let's further improve the model by adding a bias term.

In [None]:
# Column vector of ones with one row per sample of the normalized feature matrix.
n_samples = np_X_n.shape[0]
ones = np.ones((n_samples, 1))
ones[:20]

In [None]:
# Append the ones column as the last feature; its weight plays the role of the bias.
np_X_nb = np.hstack((np_X_n, ones))
np_X_nb[:20]

In [None]:
# Normal equation w = pinv(X^T X) X^T y on the bias-augmented, normalized design matrix.
# The last weight multiplies the ones column, i.e. it is the bias.
np_weights_nb = np.linalg.pinv(np_X_nb.T.dot(np_X_nb)).dot(np_X_nb.T).dot(np_y_n)
np_weights_nb

In [None]:
# Predictions of the normalized + bias model, on the same scale as np_y_n.
# Requires np_weights_nb to hold the fitted weights for np_X_nb.
np_y_preds_nb = np_X_nb.dot(np_weights_nb)
np_y_preds_nb

In [None]:
plt.hist(np_y_preds_nb, bins=50)

In [None]:
# Express every model's predictions in price units so they can be compared with y_price.
# The raw model already predicts prices (the previous code referenced an undefined `np_y_pred`).
np_y_pred_price = np_y_preds
# The normalized models predict the min-max scaled target, so invert that scaling.
np_y_pred_n_price = np_y_preds_n * y_norm_div + y_norm_sub
np_y_pred_nb_price = np_y_preds_nb * y_norm_div + y_norm_sub

### Now let's prepare some friendly qualitative analysis outputs

Raw model

In [None]:
# Side-by-side table of true prices and raw-model predictions (rounded to whole units).
df_result_raw = pd.DataFrame(
    {
        'GOLD' : y_price,
        'PRED' : np_y_pred_price.round(0),
    }
)
# Only the last expression of a cell is rendered, so a bare `.head(10)` before `.tail(10)`
# is silently dropped; display both ends explicitly.
display(df_result_raw.head(10), df_result_raw.tail(10))

Normed data model

In [None]:
# Side-by-side table of true prices and normalized-model predictions (rounded to whole units).
df_result_n = pd.DataFrame(
    {
        'GOLD' : y_price,
        'PRED' : np_y_pred_n_price.round(0),
    }
)
# Only the last expression of a cell is rendered, so a bare `.head(10)` before `.tail(10)`
# is silently dropped; display both ends explicitly.
display(df_result_n.head(10), df_result_n.tail(10))

Normed & bias added

In [None]:

# Side-by-side table of true prices and normalized + bias model predictions.
df_result_nb = pd.DataFrame(
    {
        'GOLD' : y_price,
        'PRED' : np_y_pred_nb_price.round(0),
    }
)
# The previous code called `.head(10)` twice; show the first and the last rows instead.
# Only the last expression of a cell is rendered, so display both explicitly.
display(df_result_nb.head(10), df_result_nb.tail(10))

Now let's look at a quantitative analysis of the results.

In [None]:
# Absolute error of the predictions, in price units.
# NOTE(review): the model to score was left open (`y_price - None` raises TypeError);
# the normalized + bias model is assumed here — confirm.
abs_err = np.abs(y_price - np_y_pred_nb_price)
abs_err

In [None]:
proc_err = abs_err / y_price
proc_err = proc_err * 100

In [None]:
# Result table: true price, predicted price and percentage error per row.
df_result = pd.DataFrame(
    {
        'GOLD' : y_price,
        # NOTE(review): `np_y_pred2_price` was never defined (NameError); the
        # normalized + bias model's price predictions are assumed here — confirm.
        'PRED' : np_y_pred_nb_price.round(0),
        'ERR%' : proc_err.round(2)
    }
)
df_result.head(20)

In [None]:
df_result.tail(20)

In [None]:
proc_err.mean()

In [None]:
def train_neq(inputs, gold):
    """Fit linear-regression weights in closed form with the normal equation.

    Computes ``pinv(inputs^T inputs) @ inputs^T @ gold``.

    Args:
        inputs: 2-D design matrix with one row per sample.
        gold: target values, one per row of ``inputs``.

    Returns:
        The weights produced by the matrix products above.
    """
    inputs_t = inputs.T
    gram_pinv = np.linalg.pinv(inputs_t @ inputs)
    return gram_pinv @ inputs_t @ gold

def evaluate(theta, inputs, gold, y_div, y_sub, name=""):
    """Score a linear model and print a per-sample comparison table.

    Predictions (``inputs.dot(theta)``) and ``gold`` are both mapped through
    ``value * y_div + y_sub`` before the relative error is computed.

    Args:
        theta: weight vector of the linear model.
        inputs: design matrix the weights are applied to.
        gold: target values on the same scale as the model output.
        y_div: multiplier applied to predictions and targets.
        y_sub: offset added to predictions and targets.
        name: label printed above the result table.

    Returns:
        Mean relative error as a fraction (0.25 means 25%).
    """
    # Map predictions and targets through the same affine transform.
    _y_vals = inputs.dot(theta) * y_div + y_sub
    _y_true = gold * y_div + y_sub

    # Relative error per sample, as a fraction of the true value.
    prc_err = np.abs(_y_true - _y_vals) / _y_true
    overall = prc_err.mean()

    df_result = pd.DataFrame(
        {
            'GOLD' : _y_true,
            'PRED' : _y_vals.round(0),
            # The column is labelled in percent, so scale the fraction accordingly
            # (previously the raw fraction was shown under the 'ERR%' header).
            'ERR%' : (prc_err * 100).round(2),
        }
    )
    print('Results for', name)
    print(df_result.head(20))
    print(df_result.tail(20))
    print("Overall error: {:.1f}%".format(overall * 100))
    return overall

# Now for a more correct and real-life approach
We will now take the pre-processed data and perform a train-test split. There is no need for a train-dev-test split yet, because the normal equation has no iterative training process that a dev set could be used to monitor.

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every metric computed from it, is reproducible
# across kernel restarts.
SPLIT_SEED = 42
x_trn, x_tst, y_trn, y_tst = train_test_split(
    np_X_nb, np_y_n, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
theta = train_neq(
    inputs=x_trn,
    gold=y_trn
)

In [None]:
theta

In [None]:
evaluate(
    theta=theta,
    inputs=x_trn,
    gold=y_trn,
    y_div=y_norm_div,
    y_sub=y_norm_sub,
    name='TRAIN',
)

In [None]:
evaluate(
    theta=theta,
    inputs=x_tst,
    gold=y_tst,
    y_div=y_norm_div,
    y_sub=y_norm_sub,
    name='TEST',
)

# Simple Neural model


In [None]:
import torch as th

class SimpleLinerRealEstateModel(th.nn.Module):
    """Regressor with one hidden layer: Linear -> ReLU -> Linear(1).

    Args:
        n_feats: number of input features per sample.
        n_hid1: width of the hidden layer.
    """

    def __init__(self, n_feats, n_hid1=32):
        super().__init__()
        self.hidden1 = th.nn.Linear(n_feats, n_hid1)
        self.act1 = th.nn.ReLU()
        self.readout = th.nn.Linear(n_hid1, 1)

    def forward(self, inputs):
        """Run the forward pass and return one output value per input row."""
        # The previous placeholder returned None, so the model could not be used.
        th_x = self.hidden1(inputs)
        th_x = self.act1(th_x)
        th_out = self.readout(th_x)
        return th_out


In [None]:
model = SimpleLinerRealEstateModel(8, 256)
model

In [None]:
x_trn.shape

In [None]:
x_tst.shape

### Introducing "dev" dataset
Now we will have a training process so we need a dev dataset

In [None]:
# Fraction of the held-out rows used as the dev set; the remaining rows form the final test set.
# NOTE(review): this was left as None, which makes `int(n * None)` raise; 0.5 is an
# assumed default — adjust as needed.
DEV_PRC = 0.5
DEV_SIZE = int(x_tst.shape[0] * DEV_PRC)
x_dev = x_tst[:DEV_SIZE,:]
x_test = x_tst[DEV_SIZE:,:]

Now we convert the data to tensors, dropping the last column (the constant ones/bias input), because `th.nn.Linear` layers learn their own bias.

In [None]:

th_x_trn = th.tensor(x_trn[:,:-1], dtype=th.float32)
th_x_dev = th.tensor(x_dev[:,:-1], dtype=th.float32)
th_x_test = th.tensor(x_test[:,:-1], dtype=th.float32)

In [None]:
# Split the targets at the same boundary as x_dev / x_test (DEV_SIZE). The previous
# hard-coded 1000 misaligned features and targets whenever DEV_SIZE != 1000.
y_dev = y_tst[:DEV_SIZE].reshape(-1,1)
y_test = y_tst[DEV_SIZE:].reshape(-1,1)
# Column-vector targets, matching the (N, 1) shape of the model output.
y_trn = y_trn.reshape(-1,1)
y_trn[:10]

In [None]:
# Targets as float32 tensors, the same dtype used for the feature tensors.
th_y_trn = th.tensor(y_trn, dtype=th.float32)
th_y_dev = th.tensor(y_dev, dtype=th.float32)
th_y_test = th.tensor(y_test, dtype=th.float32)
th_y_trn[:20]

### Model training data feed
Now let's prepare the internal mechanics for feeding data to the model during training.

In [None]:
# Mini-batch size used by the training DataLoader.
# NOTE(review): BATCH_SIZE was not defined in any earlier cell (NameError on a fresh
# kernel); 32 is an assumed default — adjust as needed.
BATCH_SIZE = 32
th_ds = th.utils.data.TensorDataset(th_x_trn, th_y_trn)
th_dl = th.utils.data.DataLoader(th_ds, batch_size=BATCH_SIZE)

In [None]:
for th_x_batch, th_y_batch in th_dl:
    break
th_x_batch

Re-writing evaluation function

In [None]:

def th_evaluate(m, th_inputs, gold, y_div, y_sub, name="", verbose=False):
    """Score a torch model and optionally print a per-sample comparison table.

    The model is switched to eval mode for inference and back to train mode
    before returning. Predictions and ``gold`` are both mapped through
    ``value * y_div + y_sub`` before the relative error is computed.

    Args:
        m: torch module called as ``m(th_inputs)``.
        th_inputs: input tensor passed to the model.
        gold: numpy array of targets on the same scale as the model output.
        y_div: multiplier applied to predictions and targets.
        y_sub: offset added to predictions and targets.
        name: label printed above the table when ``verbose`` is true.
        verbose: when true, print the first and last 20 rows of the table.

    Returns:
        Mean relative error as a fraction (0.25 means 25%).
    """
    m.eval()
    with th.no_grad():
        # Forward pass without gradient tracking; convert to numpy so the result
        # combines with the numpy `gold` array below. (Was a `None` placeholder.)
        _y_pred = m(th_inputs).detach().cpu().numpy()

    _y_vals = _y_pred * y_div + y_sub

    _y_true = gold * y_div + y_sub

    res_err = np.abs(_y_true - _y_vals)
    prc_err = res_err / _y_true

    overall = prc_err.mean()
    if verbose:
        df_result = pd.DataFrame(
            {
            'GOLD' : _y_true.ravel(),
            'PRED' : _y_vals.ravel().round(0),
            # The column is labelled in percent, so scale the fraction accordingly.
            'ERR%' : (prc_err.ravel() * 100).round(2)
            }
        )
        print('Results for', name)
        print(df_result.head(20))
        print(df_result.tail(20))
    m.train()
    return overall

In [None]:
loss_func = th.nn.MSELoss()
# optimizer: weights = weights - alpha * grads # alpha << 1
opt = th.optim.Adam(model.parameters(), lr=5e-5)
opt

In [None]:
# With DEBUG on, each epoch only prints the gradient of the first parameter for the
# first batch and skips the weight update; set it to False to actually train.
DEBUG = True
# NOTE(review): TOTAL_NR_EPOCHS was not defined in any earlier cell (NameError on a
# fresh kernel); 100 is an assumed upper bound — early stopping below may end sooner.
TOTAL_NR_EPOCHS = 100
# re-init model
model = SimpleLinerRealEstateModel(8, 256)
# The optimizer must be rebuilt for the fresh model: an optimizer created earlier holds
# the parameters of the previous model instance, so stepping it would never update this one.
opt = th.optim.Adam(model.parameters(), lr=5e-5)
# Start from +inf so the first evaluated epoch always counts as the best so far.
best_dev_err = float('inf')
wait_time = 0
max_nr_of_succesive_fails = 2
for epoch in range(TOTAL_NR_EPOCHS):
    for th_x_batch, th_y_batch in th_dl:
        # compute current inferred values with forward prop
        th_y_hat = model(th_x_batch)
        # compute loss (compare results with actual truth)
        th_loss = loss_func(input=th_y_hat, target=th_y_batch) #((th_y_hat - th_y_batch)**2).mean()
        # nullify the gradients
        opt.zero_grad()
        # compute loss 1st derivative wrt all model weights (grads)
        th_loss.backward()

        if DEBUG:
            th_param = next(model.parameters())
            print(th_param.grad)
            break

        # apply gradients to weights with a hopefully smart approach
        opt.step()
    # now we evaluate on TRAIN and DEV to see how good we are
    th_evaluate(
        m=model,
        th_inputs=th_x_trn,
        gold=y_trn,
        y_div=y_norm_div,
        y_sub=y_norm_sub,
        verbose=False,
        name='TRAIN @ Epoch {}'.format(epoch)
    )
    dev_err = th_evaluate(
        m=model,
        th_inputs=th_x_dev,
        gold=y_dev,
        y_div=y_norm_div,
        y_sub=y_norm_sub,
        verbose=False,
        name='DEV @ Epoch {}'.format(epoch)
    )
    # Early stopping: stop once the dev error has failed to improve for more than
    # max_nr_of_succesive_fails consecutive epochs.
    if best_dev_err > dev_err:
        best_dev_err = dev_err
        wait_time = 0
        print("BEST MODEL @ Epoch {}".format(epoch))
    else:
        wait_time += 1
        if wait_time > max_nr_of_succesive_fails:
            print("STOP TRAINING !")
            break

if not DEBUG: 
    # finally we evaluate on TEST
    th_evaluate(
        m=model,
        th_inputs=th_x_test,
        gold=y_test,
        y_div=y_norm_div,
        y_sub=y_norm_sub,
        verbose=True,
        name='Final TEST'
    )


In [None]:
import torch as th

class BetterLinerRealEstateModel(th.nn.Module):
    """Configurable fully connected regressor with a single-output readout.

    Each entry of ``layers`` adds a ``Linear`` layer of that width followed by
    a fresh instance of ``activation``; a final ``Linear(..., 1)`` produces the
    output.

    Args:
        n_input_feats: number of input features per sample.
        layers: hidden layer widths, in order (only read, never mutated).
        activation: callable returning a new activation module.
    """

    def __init__(self, n_input_feats, layers=[32], activation=th.nn.ReLU):
        super().__init__()
        # Consecutive pairs of widths give the (in, out) size of every hidden layer.
        widths = [n_input_feats, *layers]
        self.layers = th.nn.ModuleList()
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            self.layers.extend([th.nn.Linear(fan_in, fan_out), activation()])
        self.readout = th.nn.Linear(widths[-1], 1)

    def forward(self, inputs):
        """Apply every hidden module in order, then the readout layer."""
        hidden = inputs
        for block in self.layers:
            hidden = block(hidden)
        return self.readout(hidden)

test_model = BetterLinerRealEstateModel(8, [100, 10], activation=th.nn.Sigmoid)
test_model

### Even closer to production grade experiments: model factories
Now we prepare a basic model factory

In [None]:
def get_model_and_optimizer(layers, activation, opt_class, lr):
    """Build a ``BetterLinerRealEstateModel`` and an optimizer bound to it.

    Args:
        layers: hidden layer widths forwarded to the model.
        activation: activation factory forwarded to the model.
        opt_class: optimizer class, called as ``opt_class(params, lr=lr)``.
        lr: learning rate passed to the optimizer.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    # The model is always built for 8 input features.
    net = BetterLinerRealEstateModel(n_input_feats=8, layers=layers, activation=activation)
    optimizer = opt_class(net.parameters(), lr=lr)
    return net, optimizer

#############################
# Example model: two hidden layers with ReLU activations, optimised with plain SGD.
# (The previous `layers=None, activation=None` placeholders raise, because the model
# constructor iterates over `layers` and calls `activation()`.)
#############################
test_model, test_opt = get_model_and_optimizer(layers=[64, 16], activation=th.nn.ReLU, opt_class= th.optim.SGD, lr=0.01)
test_model

### Grid search
Next step is grid searching