In [6]:

# from lab_utils_uni import plt_house_x, plt_contour_wgrad, plt_divergence, plt_gradients
import math
import copy
import numpy as np
import matplotlib.pyplot as plot
# Apply the matplotlib style sheet stored in the project's utils_plot folder to all later figures.
plot.style.use('./utils_plot/deeplearning.mplstyle')

Setting up a Python virtual environment "the best way": 
https://opensource.com/article/19/6/python-virtual-environments-mac
Docs: 
https://docs.python-guide.org/dev/virtualenvs/#virtualenvironments-ref
Getting started in Jupyter (installing Python packages from a notebook): 
https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

In [7]:
def compute_cost(x, y, w, b): 
    """
    Computes the squared-error cost function J(w, b) for univariate linear regression:

        f_wb(x[i]) = w * x[i] + b
        J(w, b)    = 1 / (2 * m) * sum_{i=0}^{m-1} (f_wb(x[i]) - y[i]) ** 2

    ndarray: N-dimensional array: https://numpy.org/doc/stable/reference/arrays.ndarray.html

    Args: 
        x (ndarray (m,)):       data, m examples
        y (ndarray (m,)):       target values
        w, b (scalar):          model parameters
    
    Returns: 
        total_cost (float):     the cost of using w, b as the parameters for linear regression 
                                to fit the data points in x and y 

    Raises: 
        ValueError:             if x holds no training examples (the cost would be 0 / 0)
    """
    # number of the training examples
    m = x.shape[0]
    if m == 0:
        raise ValueError("compute_cost needs at least one training example")

    cost_sum = 0
    for i in range(m): 
        # given a training example, and parameters w, b, calculate prediction 
        # (prediction is the value of the function f_wb)
        f_wb = w * x[i] + b  # f_wb(x[i]) = w * x[i] + b

        # the difference b/w the prediction of the tested model (f_wb), 
        # and the actual value for the given example (y[i])
        diff = f_wb - y[i]  # f_wb(x[i]) - y[i]

        # loss is the cost for the given training example
        # because diff can be a negative number, it needs to be squared, 
        # and because it's squared, the loss increases rapidly when w is too small or too large
        loss = diff ** 2  # (f_wb(x[i]) - y[i]) ** 2

        # update the cost sum with the value of loss
        cost_sum = cost_sum + loss  # sum_{i=0}^{m-1} (f_wb(x[i]) - y[i]) ** 2

    # calculate the total cost for the given model parameters - J(w, b)
    # NOTE: the divisor is (2 * m); writing `1 / 2 * m` would evaluate to m / 2 instead
    total_cost = cost_sum / (2 * m)  # J(w, b) = 1 / (2m) * sum_{i=0}^{m-1} (f_wb(x[i]) - y[i]) ** 2
    
    return total_cost



How to automate the process of finding the optimal w and b parameters?

In [8]:
def compute_gradient(x, y, w, b):
    """
    Computes the gradient of the squared-error cost J(w, b) for univariate linear regression:

        dJ/dw = 1 / m * sum_{i=0}^{m-1} (f_wb(x[i]) - y[i]) * x[i]
        dJ/db = 1 / m * sum_{i=0}^{m-1} (f_wb(x[i]) - y[i])
    
    Args: 
        x (ndarray (m,)):       data, m examples
        y (ndarray (m,)):       target values
        w, b (scalar):          model parameters
    Returns: 
        dj_dw (scalar):         the gradient of the cost w.r.t. the parameter w
        dj_db (scalar):         the gradient of the cost w.r.t. the parameter b
    Raises: 
        ValueError:             if x holds no training examples (the mean would be 0 / 0)
    """
    m = x.shape[0]
    if m == 0:
        raise ValueError("compute_gradient needs at least one training example")

    dj_dw = 0
    dj_db = 0

    for i in range(m): 
        f_wb = w * x[i] + b

        # prediction error for this example, shared by both partial derivatives
        error = f_wb - y[i]

        dj_dw_i = error * x[i]  # (f_wb(x[i]) - y[i]) * x[i]
        dj_db_i = error  # (f_wb(x[i]) - y[i])

        # accumulate each per-example term into its OWN total
        # (the w-term carries the extra x[i] factor, the b-term does not)
        dj_dw += dj_dw_i
        dj_db += dj_db_i

    dj_dw = dj_dw / m  # 1 / m * sum_{i=0}^{m-1} (f_wb(x[i]) - y[i]) * x[i]
    dj_db = dj_db / m  # 1 / m * sum_{i=0}^{m-1} (f_wb(x[i]) - y[i])

    return dj_dw, dj_db


In [None]:
def gradient_descent(x, y, w_initial, b_initial, alpha, number_of_iterations, cost_function, gradient_function): 
    """
    Fits w, b by running a fixed number of gradient descent steps.

    Each step asks gradient_function for the current gradient and moves both
    parameters against it, scaled by the learning rate:

        w = w - alpha * dJ/dw
        b = b - alpha * dJ/db

    Args: 
        x (ndarray (m,)):               data, m examples
        y (ndarray (m,)):               target values
        w_initial, b_initial (scalar):  starting values of the model parameters
        alpha (float):                  learning rate
        number_of_iterations (int):     number of gradient steps to take
        cost_function:                  callback to produce cost; accepted but not
                                        called by this implementation
        gradient_function:              callback called as gradient_function(x, y, w, b),
                                        returning the pair (dj_dw, dj_db)

    Returns: 
        w, b (scalar):                  parameter values after the last step

    NOTE(review): no cost / parameter history is recorded or returned here - only (w, b).
    """
    w, b = w_initial, b_initial

    for _ in range(number_of_iterations): 
        grad_w, grad_b = gradient_function(x, y, w, b)

        # both new values are computed from the gradient of the SAME (old) w, b
        w, b = w - alpha * grad_w, b - alpha * grad_b

    return w, b


Because the original data, stored as JSON, has many more features than we need at the moment,
we need to reduce it: 
- from the list of features of each home, keep only the price, area and id
- filter out the homes that have no price or no area
- parse the values to get integers/floats

> The process of encoding JSON is usually called serialization. 
This term refers to the transformation of data into a series of bytes (hence "serial") to be stored or 
transmitted across a network. 
- encoding -> writing, serializing (to a file, to HTTP)
- decoding -> reading, deserializing (into memory)

In [81]:
import json
import re

# https://stackoverflow.com/questions/47060035/python-parse-json-array
# Read the scraped listings. The `with` block closes the file handle as soon as
# parsing finishes (or fails), instead of leaving it open for the life of the kernel.
# The encoding is pinned to UTF-8 so the result does not depend on the platform default.
with open('../../output/2023-01-16T21-04-01.949Z.json', encoding='utf-8') as input_file:
    parsed_json = json.load(input_file)
store_list = []


def is_id(feature):
    """Tell whether a feature dict is the listing's reference / announcement-date entry."""
    label = feature["label"]
    return bool(label == "riferimento e Data annuncio")

def is_price(feature): 
    """Tell whether a feature dict is the listing's price ("prezzo") entry."""
    label = feature["label"]
    return bool(label == "prezzo")


def is_area(feature):
    """Tell whether a feature dict is the listing's surface-area ("superficie") entry."""
    label = feature["label"]
    return bool(label == "superficie")


class RealEstate:
    """
    Plain record for one listing: an identifier, a price and a surface area.

    All three fields default to None (class-level defaults) until the matching
    setter stores a value on the instance.
    """

    # class-level defaults, shadowed per instance by the setters below
    id = price = area = None

    # --- getters ---------------------------------------------------------
    def get_id(self):
        """Return the stored identifier (None if never set)."""
        return self.id

    def get_price(self): 
        """Return the stored price (None if never set)."""
        return self.price

    def get_area(self): 
        """Return the stored area (None if never set)."""
        return self.area

    # --- setters ---------------------------------------------------------
    def set_id(self, id):
        """Store the identifier on this instance."""
        self.id = id

    def set_price(self, price):
        """Store the price on this instance."""
        self.price = price

    def set_area(self, area): 
        """Store the area on this instance."""
        self.area = area

# https://stackoverflow.com/questions/4289331/how-to-extract-numbers-from-a-string-in-python
# Finds the first number-like token in a string: digits with embedded dots/commas
# ("15.000", "85,5"), a leading-dot fraction (".5"), or a plain run of digits ("7").
# Written as a raw string so that "\d" reaches the regex engine untouched instead of
# being an invalid string escape (which Python warns about).
p = r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'


def _parse_int(value):
    """
    Extract the first number found in `value` and return it as an int.

    Dots inside the number are dropped as thousands separators ("15.000" -> 15000).
    Anything from the first comma onwards is discarded ("15.000,50" -> 15000), so a
    trailing or decimal comma no longer makes int() fail.
    NOTE(review): this assumes Italian number formatting (dot = thousands,
    comma = decimals) - confirm against the scraped data.

    Args:
        value (str | None):     raw text of a feature value

    Returns:
        int | None:             the parsed number, or None when `value` is None
                                or contains no digits
    """
    if value is None:
        return None

    match = re.search(p, value)
    if match is None:
        return None

    # every alternative of `p` yields at least one digit before any comma,
    # so the text handed to int() is never empty
    integer_part = match[0].split(",", 1)[0]
    return int(integer_part.replace(".", ""))


def parse_price(value): 
    """Return the first number in a price string as an int, or None if there is none."""
    return _parse_int(value)


def parse_area(value):
    """Return the first number in an area string as an int, or None if there is none."""
    return _parse_int(value)

# Build one RealEstate record per scraped listing, keeping only id, price and area.
# NOTE(review): listings that end up without a price or an area are still appended
# (their fields stay None); no filtering happens in this loop.
items = []
# https://blog.teclado.com/destructuring-in-python/
for item in parsed_json:
    real_estate = RealEstate()

    for feature in item["features"]:
        # the three labels are distinct strings, so at most one branch applies per feature
        if is_id(feature): 
            # id could be also parsed, to get the date out
            real_estate.set_id(feature["value"])
        elif is_price(feature):  
            real_estate.set_price(parse_price(feature["value"]))
        elif is_area(feature):
            real_estate.set_area(parse_area(feature["value"]))

    items.append(real_estate)


# Dump every record: id, price and area, each on its own line.
print("ITEMS: ")
for item in items:
    print(item.get_id(), item.get_price(), item.get_area(), sep="\n")


ITEMS: 
EK-100272240 - 05/01/2023
15000
120
40080439 - 20/06/2022
22000
60
60541811 - 02/12/2022
25000
150
La Volpe e l'Uva - 18/06/2021
28000
70
321778 - 15/01/2023
33000
182
EK-55292876 - 10/01/2023
34000
100
TSS000 - 03/11/2022
35000
50
40043031 - 07/11/2022
35000
25
EK-100271526 - 05/01/2023
35000
130
EK-95288164 - 23/10/2022
35000
80
EK-89845513 - 05/05/2022
38000
122
60557692 - 06/12/2022
39000
10
EK-100271408 - 05/01/2023
39000
175
EK-88290511 - 27/05/2021
40000
114
40405961 - 02/12/2022
43000
120
EK-96247952 - 07/12/2022
45000
180
EK-77155860 - 14/04/2022
48000
150
VL ITTIRI SS 57 - 11/01/2023
50000
107
EK-97685594 - 05/08/2022
50000
50
EK-96386508 - 28/12/2022
50000
130
60659004 - 31/10/2022
55000
50
La Corte Rosa - 04/01/2023
55000
137
60579065 - 28/07/2022
59000
89
EK-98912560 - 18/10/2022
60000
108
EK-97688978 - 03/11/2022
60000
173
EK-99220190 - 04/11/2022
60000
168
60622858 - 03/11/2022
64000
80
60621836 - 04/01/2023
64000
80
18538758 - 01/09/2021
65000
121
40293092 - 28/

Now that we have only the data we need, extracted from the JSON, let's convert it into a Python array of floats: 

In [10]:
# Load our data set: two training examples, one input value and one target value each
x_training_examples = np.asarray([1.0, 2.0], dtype=np.float64)      # stores features
y_training_examples = np.asarray([300.0, 500.0], dtype=np.float64)  # stores targets
