In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Leer el archivo e ingresarlo en un numpy array

In [14]:
# Load the King County housing table and pull out the single feature and the
# target as independent, writable column vectors of shape (rows, 1).
df = pd.read_csv('kc_house_data.csv')
x = df[['sqft_living']].to_numpy(copy=True)
y = df[['price']].to_numpy(copy=True)

## 2. Implementar el descenso del gradiente 

Computar el algoritmo

In [15]:
def linear_cost(X, y, theta):
    """Return the halved mean squared error of the linear model ``X @ theta``.

    Args:
        X: Design matrix, one row per sample.
        y: Target values, shaped to match ``X @ theta``.
        theta: Model parameters.

    Returns:
        The sum of squared residuals divided by ``2 * len(X)``.
    """
    residuals = X @ theta - y
    sample_count = len(X)
    return (residuals ** 2).sum() / (2 * sample_count)

def linear_cost_gradient(X, y, theta):
    """Return the gradient of the halved mean squared error w.r.t. ``theta``.

    The matching cost is ``sum((X @ theta - y) ** 2) / (2 * m)`` with
    ``m = len(X)``. Differentiating cancels the factor 2, so the gradient is
    ``X.T @ (X @ theta - y) / m``. Dividing by ``2 * m`` (as was done before)
    yields only half of the true gradient.

    Args:
        X: Design matrix, one row per sample.
        y: Target values, shaped to match ``X @ theta``.
        theta: Model parameters.

    Returns:
        Array with the same shape as ``theta`` holding the partial derivatives.
    """
    residuals = X @ theta - y
    return (X.T @ residuals) / len(X)

def gradient_descent(
    X, y, theta_0,
    cost_function, cost_function_gradient,
    learning_rate, threshold,
    max_iter=1000
):
    """Minimise ``cost_function`` with fixed-step batch gradient descent.

    Iteration stops as soon as the gradient norm is no longer greater than
    ``threshold`` or after ``max_iter`` updates, whichever comes first.

    Args:
        X: Design matrix forwarded to the cost and gradient callables.
        y: Targets forwarded to the cost and gradient callables.
        theta_0: Initial parameters. It is copied and never modified.
        cost_function: Callable ``(X, y, theta) -> cost``.
        cost_function_gradient: Callable ``(X, y, theta) -> gradient``.
        learning_rate: Step size applied to the gradient.
        threshold: Gradient-norm value at or below which iteration stops.
        max_iter: Maximum number of parameter updates.

    Returns:
        Tuple ``(theta, costs, thetas)``: the final parameters, the cost
        after every update, and a copy of the parameters after every update.
    """
    # Private float copy: the caller's starting point must stay untouched so
    # it can be reused for other runs.
    theta = np.array(theta_0, dtype=float)
    costs = []
    thetas = []

    for _ in range(max_iter):
        # Evaluate the gradient once per iteration and reuse it for both the
        # stopping test and the update.
        gradient = cost_function_gradient(X, y, theta)
        if not np.linalg.norm(gradient) > threshold:
            break
        theta = theta - learning_rate * gradient
        # Record the actual cost value (not a parameter vector).
        costs.append(cost_function(X, y, theta))
        thetas.append(theta.copy())

    return theta, costs, thetas


## 3. Determinar el mejor modelo con Cross Validation

In [39]:
# Shuffle features and targets with ONE shared permutation. Shuffling x and y
# independently would pair every living area with a random price and destroy
# the relationship the regression is supposed to learn.
shuffled_order = np.random.permutation(len(x))
x = x[shuffled_order]
y = y[shuffled_order]

In [40]:
# Cumulative split boundaries: r1 is 70% of the rows, and r2 and r3 each add a
# further 15% of the rows on top of the previous boundary.
total_rows = x.shape[0]
held_out_rows = round(total_rows*0.15)
r1 = round(total_rows*0.70)
r2 = r1 + held_out_rows
r3 = r2 + held_out_rows

print(
    'Rango del train: 0 -', r1,
    ',rango del CV: ', r1+1, ' - ', r2,
    ', rango del test: ', r2+1, ' - ', total_rows,
)


Rango del train: 0 - 15129 ,rango del CV:  15130  -  18371 , rango del test:  18372  -  21613


Porción de train

In [41]:
# Training design matrix: a leading column of ones (intercept term) followed
# by the first r1 feature rows.
intercept_column = np.ones((r1, 1))
X_train = np.hstack((intercept_column, x[:r1]))
y_train = y[:r1]

# Random starting parameters, one per column of the design matrix.
m, n = X_train.shape
theta_0 = np.random.rand(n, 1)

Valores para manipular en el descenso del gradiente

In [71]:
# Candidate hyperparameter pairs, each laid out as (learning_rate, threshold).
v1 = (1e-7, 1e-2)
v2 = (1e-10, 1e-4)

Probar con distintos valores

In [72]:
# Fit on the training split with the v1 hyperparameters.
# A copy of theta_0 is handed to the optimizer so the shared starting point
# cannot be altered by this run; every hyperparameter setting then starts from
# the same initial parameters and the runs stay comparable.
r_theta, costs, thetas = gradient_descent(
    X_train, y_train, theta_0.copy(),
    linear_cost,
    linear_cost_gradient,
    learning_rate=v1[0],
    threshold=v1[1]
)

In [73]:
# Fit on the training split with the v2 hyperparameters.
# A copy of theta_0 is handed to the optimizer so the shared starting point
# cannot be altered by this run.
# NOTE(review): r_theta, costs and thetas are reassigned here, so the values
# produced by any earlier run are no longer reachable under these names.
r_theta, costs, thetas = gradient_descent(
    X_train, y_train, theta_0.copy(),
    linear_cost,
    linear_cost_gradient,
    learning_rate=v2[0],
    threshold=v2[1]
)

## Verificación con cross validation

Porción de CV (filas entre r1 y r2)

In [74]:
# Model 1 (hyperparameters v1) on the rows between the r1 and r2 boundaries.
# Slice ends are exclusive, so x[r1:r2] already stops at row r2 - 1; the
# earlier x[r1:r2-1] silently dropped the last row of this split.
# NOTE(review): this fits a fresh theta on the held-out rows instead of scoring
# the theta learned on X_train with linear_cost — confirm that is intended.
held_out_x = x[r1:r2]
X_test = np.hstack(
    (
        # Size the intercept column from the slice itself so both always agree.
        np.ones((len(held_out_x), 1)),
        held_out_x,
    )
)
m, n = X_test.shape
theta_0 = np.random.rand(n, 1)
y_test = y[r1:r2]

r_theta, costs, thetas = gradient_descent(
    X_test, y_test, theta_0,
    linear_cost,
    linear_cost_gradient,
    learning_rate=v1[0],
    threshold=v1[1]
)

In [75]:
# Model 2 (hyperparameters v2) on the rows between the r1 and r2 boundaries.
# Slice ends are exclusive, so x[r1:r2] already stops at row r2 - 1; the
# earlier x[r1:r2-1] silently dropped the last row of this split.
# NOTE(review): this fits a fresh theta on the held-out rows instead of scoring
# the theta learned on X_train with linear_cost — confirm that is intended.
held_out_x = x[r1:r2]
X_test = np.hstack(
    (
        # Size the intercept column from the slice itself so both always agree.
        np.ones((len(held_out_x), 1)),
        held_out_x,
    )
)
m, n = X_test.shape
theta_0 = np.random.rand(n, 1)
y_test = y[r1:r2]

r_theta, costs, thetas = gradient_descent(
    X_test, y_test, theta_0,
    linear_cost,
    linear_cost_gradient,
    learning_rate=v2[0],
    threshold=v2[1]
)

Porción de test (filas entre r2 y r3)

In [76]:
# Model 1 (hyperparameters v1) on the rows between the r2 and r3 boundaries.
# Slice ends are exclusive, so x[r2:r3] already stops at row r3 - 1; the
# earlier x[r2:r3-1] silently dropped the last row of this split.
# NOTE(review): this fits a fresh theta on the held-out rows instead of scoring
# the theta learned on X_train with linear_cost — confirm that is intended.
held_out_x = x[r2:r3]
X_test = np.hstack(
    (
        # Size the intercept column from the slice itself: if rounding pushes
        # r3 past the end of x, the slice is shorter than r3 - r2.
        np.ones((len(held_out_x), 1)),
        held_out_x,
    )
)
m, n = X_test.shape
theta_0 = np.random.rand(n, 1)
y_test = y[r2:r3]

r_theta, costs, thetas = gradient_descent(
    X_test, y_test, theta_0,
    linear_cost,
    linear_cost_gradient,
    learning_rate=v1[0],
    threshold=v1[1]
)

In [77]:
# Model 2 (hyperparameters v2) on the rows between the r2 and r3 boundaries.
# Slice ends are exclusive, so x[r2:r3] already stops at row r3 - 1; the
# earlier x[r2:r3-1] silently dropped the last row of this split.
# NOTE(review): this fits a fresh theta on the held-out rows instead of scoring
# the theta learned on X_train with linear_cost — confirm that is intended.
held_out_x = x[r2:r3]
X_test = np.hstack(
    (
        # Size the intercept column from the slice itself: if rounding pushes
        # r3 past the end of x, the slice is shorter than r3 - r2.
        np.ones((len(held_out_x), 1)),
        held_out_x,
    )
)
m, n = X_test.shape
theta_0 = np.random.rand(n, 1)
y_test = y[r2:r3]

# This cell is model 2, so it must run with the v2 pair; it previously reused
# v1 and therefore just repeated the model 1 configuration.
r_theta, costs, thetas = gradient_descent(
    X_test, y_test, theta_0,
    linear_cost,
    linear_cost_gradient,
    learning_rate=v2[0],
    threshold=v2[1]
)