In [0]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import random
from copy import deepcopy

In [125]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# For all algorithms, the model has to be made to predict housing prices
# While taking lot size and bedrooms as 2 features

dataset = "gdrive/My Drive/Deep Learning/Assignment_3/dataset.csv"

with open(dataset, 'r') as csvfile :
    csvreader = csv.reader(csvfile)
    fields = next(csvreader)
    rows = []
    for row in csvreader :
        rows.append(row)
        
rows = np.array(rows)

In [0]:
Y = rows[: , 1]
Y = np.reshape(Y, (Y.shape[0], 1))
Y = Y.astype(np.float)

In [0]:
X1 = rows[:, 2]
X1 = np.reshape(X1, (X1.shape[0], 1))
X1 = X1.astype(np.float)
X2 = rows[: ,3]
X2 = np.reshape(X2, (X2.shape[0], 1))
X2 = X2.astype(np.float)

X = np.hstack((X1, X2))

In [0]:
# roughly divided into 70 : 15 : 10

training_samples = 400
validation_samples = 73
testing_samples = 73

X_training_set = np.ndarray(shape=(400, 2), dtype=np.float32)
X_validation_set = np.ndarray(shape=(73, 2), dtype=np.float32)
X_test_set = np.ndarray(shape=(73, 2), dtype=np.float32)

Y_training_set = np.ndarray(shape=(400, 1), dtype=np.float32)
Y_validation_set = np.ndarray(shape=(73, 1), dtype=np.float32)
Y_test_set = np.ndarray(shape=(73, 1), dtype=np.float32)

indices = np.arange(546)
random.shuffle(indices)

for i in range(0, training_samples) :
    X_training_set[i] = X[indices[i]]
    Y_training_set[i] = Y[indices[i]]
    
mu = np.mean(X_training_set, axis=0)
sigma = np.std(X_training_set, axis=0)

X_training_set = (X_training_set - mu) / sigma
ones_extra = np.ones((400, 1))
X_training_set = np.hstack((ones_extra, X_training_set))
    
for i in range(training_samples, training_samples + validation_samples) :
    X_validation_set[i - training_samples] = X[indices[i]]
    Y_validation_set[i - training_samples] = Y[indices[i]]

X_validation_set = (X_validation_set - mu) / sigma
ones_extra = np.ones((73, 1))
X_validation_set = np.hstack((ones_extra, X_validation_set))

for i in range(training_samples + validation_samples, 546) :
    X_test_set[i - (training_samples + validation_samples)] = X[indices[i]]
    Y_test_set[i - (training_samples + validation_samples)] = Y[indices[i]]
    
X_test_set = (X_test_set - mu) / sigma
ones_extra = np.ones((73, 1))
X_test_set = np.hstack((ones_extra, X_test_set))

In [0]:
def cost_function(X, Y, theta) :
    A = np.dot(X, theta) - Y
    J1 = np.dot(np.transpose(A), A)
    J = J1 / (2 * X.shape[0])
    
    return J

def cost_function_grad_one(X, Y, W, index) : 
    scalar_val = (np.dot(np.transpose(W), np.transpose(X[index, :])) - Y[index][0])
    x = np.transpose(X[index, :]) * scalar_val
    return x
    
def cost_function_grad(X, Y, W) :
    return (np.dot(np.transpose(X), (np.dot(X, W) - Y)))/float(X.shape[0])

In [0]:
cost_epsilon = 1e-8

In [0]:
def check_if_terminate(X, Y, theta, old_theta) :
    old_cost = cost_function(X, Y, old_theta)
    new_cost = cost_function(X, Y, theta)
    
#     print(old_theta, theta)
    
#     print(old_cost, new_cost)
    return np.abs(old_cost - new_cost) <= cost_epsilon

In [0]:
def batch_gradient_descent(X, Y, alpha, theta) :
    
    theta = theta - alpha * cost_function_grad(X, Y, theta)
    
    return theta

In [0]:
def stochastic_grad_descent(X, Y, alpha, theta) :
    indices = np.arange(X.shape[0])
    random.shuffle(indices)
            
    for j in range(X.shape[0]) :
        index = indices[j]
        old_theta = deepcopy(theta)
        delta_theta = cost_function_grad_one(X, Y, theta, index)
        delta_theta = np.reshape(delta_theta, (delta_theta.shape[0], 1))
        theta = theta - alpha * delta_theta
        
#         print(old_theta.shape, theta.shape, delta_theta.shape)
        
        if(check_if_terminate(X, Y, theta, old_theta) == True) :
            break
            
    return theta

In [0]:
# apply this on the current mini-batch

def gradient_descent_with_momentum(X, Y, alpha, theta, beta_1, vdtheta, batch_num) : #batch_num >= 1
    
    vdtheta = beta_1 * vdtheta + (1 - beta_1) * cost_function_grad(X, Y, theta)
#     vdtheta = vdtheta / (1 - np.power(beta_1, batch_num))
    theta = theta - alpha * vdtheta
    
    return (vdtheta, theta)

In [0]:
# apply this on current mini-batch

def RMSprop(X, Y, alpha, theta, beta_2, sdtheta, epsilon) :
    
    cfg = cost_function_grad(X, Y, theta)
    
    sdtheta = beta_2 * sdtheta + (1 - beta_2) * np.power(cfg, 2)
    theta = theta - (alpha * cfg) / (np.sqrt(sdtheta) + epsilon)
    
    return (sdtheta, theta)

In [0]:
# apply on current mini-batch

def ADAM(X, Y, alpha, theta, beta_1, beta_2, epsilon, vdtheta, sdtheta, batch_num) : # batch_num >= 1
    
    vdtheta = beta_1 * vdtheta + (1 - beta_1) * cost_function_grad(X, Y, theta)
    sdtheta = beta_2 * sdtheta + (1 - beta_2) * np.power(cost_function_grad(X, Y, theta), 2)
    
    vdtheta_corrected = vdtheta / (1 - np.power(beta_1, batch_num))
    sdtheta_corrected = sdtheta / (1 - np.power(beta_2, batch_num))
    
    theta = theta - (alpha * vdtheta_corrected) / (np.sqrt(sdtheta_corrected) + epsilon) 
    
    return (vdtheta, sdtheta, theta)

In [0]:
alpha = 0.001
theta = np.random.randn(3, 1)
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-8
mini_batch_size = 20

In [139]:
# applying batch gradient descent 

num_iterations = 0
theta = np.random.randn(3, 1)

while(True) :
    num_iterations += 1
    
    old_theta = deepcopy(theta)
    theta = batch_gradient_descent(X_training_set, Y_training_set, alpha, theta)
    
    if(check_if_terminate(X_training_set, Y_training_set, theta, old_theta) == True) :
        break
        
print("Total number of iterations: ", num_iterations)
print("Cost value: ", cost_function(X_training_set, Y_training_set, theta))

Total number of iterations:  15911
Cost value:  [[2.18646144e+08]]


In [140]:
# applying stochastic gradient descent
alpha = 0.0001
cost_epsilon = 1e-5
theta = np.random.randn(3, 1)
num_iterations = 0

while(True) :
    num_iterations += 1
    
    old_theta = deepcopy(theta)
    theta = stochastic_grad_descent(X_training_set, Y_training_set, alpha, theta)
    
    if(check_if_terminate(X_training_set, Y_training_set, theta, old_theta) == True) :
        break
    if(num_iterations % 100 == 0) :
        print(num_iterations, cost_function(X_training_set, Y_training_set, old_theta), cost_function(X_training_set, Y_training_set, theta))
        
print("Total number of iterations: ", num_iterations)
print("Cost value: ", cost_function(X_training_set, Y_training_set, theta))

100 [[2.19502753e+08]] [[2.19436122e+08]]
200 [[2.18646383e+08]] [[2.18646361e+08]]
300 [[2.18646157e+08]] [[2.18646157e+08]]


KeyboardInterrupt: ignored

In [141]:
# applying gradient descent with momentum

alpha = 1e-3
cost_epsilon = 1e-5
theta = np.random.randn(3, 1)
num_iterations = 0

print(theta)

while(True) :
    num_iterations += 1
    
    vdtheta = 0
    
    for j in range(20) :
        
        X_train_mini_batch = X_training_set[j * 20 : (j + 1) * 20, :]
        Y_train_mini_batch = Y_training_set[j * 20 : (j + 1) * 20, :]          
        old_theta = deepcopy(theta)
        
        (vdtheta, theta) = gradient_descent_with_momentum(X_train_mini_batch, Y_train_mini_batch, alpha, theta, beta_1, vdtheta, j + 1)
        
        if(check_if_terminate(X_train_mini_batch, Y_train_mini_batch, theta, old_theta) == True) :
            break
            
    if(check_if_terminate(X_train_mini_batch, Y_train_mini_batch, theta, old_theta) == True) :
        break
        
    if(num_iterations % 100 == 0) :
        print(num_iterations, cost_function(X_train_mini_batch, Y_train_mini_batch, old_theta), cost_function(X_train_mini_batch, Y_train_mini_batch, theta))
        
                
print("Total number of iterations: ", num_iterations)
print("Cost value: ", cost_function(X_training_set, Y_training_set, theta))

[[ 0.64313049]
 [-0.04865786]
 [ 1.18019162]]
100 [[3.43169663e+08]] [[3.42767502e+08]]
200 [[1.50767301e+08]] [[1.50747014e+08]]
300 [[1.43798932e+08]] [[1.43795424e+08]]
400 [[1.46627412e+08]] [[1.46619273e+08]]
500 [[1.47979417e+08]] [[1.47968859e+08]]
600 [[1.48443413e+08]] [[1.48431999e+08]]
700 [[1.48591227e+08]] [[1.48579533e+08]]
800 [[1.48637308e+08]] [[1.48625524e+08]]
900 [[1.4865158e+08]] [[1.48639768e+08]]
1000 [[1.48655994e+08]] [[1.48644173e+08]]
1100 [[1.4865736e+08]] [[1.48645535e+08]]
1200 [[1.48657783e+08]] [[1.48645957e+08]]
1300 [[1.48657914e+08]] [[1.48646089e+08]]
1400 [[1.48657955e+08]] [[1.48646129e+08]]
1500 [[1.48657968e+08]] [[1.48646142e+08]]
1600 [[1.48657972e+08]] [[1.48646146e+08]]
1700 [[1.48657973e+08]] [[1.48646147e+08]]
1800 [[1.48657974e+08]] [[1.48646148e+08]]
1900 [[1.48657974e+08]] [[1.48646148e+08]]
2000 [[1.48657974e+08]] [[1.48646148e+08]]
2100 [[1.48657974e+08]] [[1.48646148e+08]]
2200 [[1.48657974e+08]] [[1.48646148e+08]]
2300 [[1.48657974e+

KeyboardInterrupt: ignored

In [145]:
# applying RMS prop

alpha = 1e-3
cost_epsilon = 1e-5
theta = np.random.randn(3, 1)
num_iterations = 0

while(True) :
    num_iterations += 1
    
    sdtheta = 0
    
    for j in range(20) :
        
        X_train_mini_batch = X_training_set[j * 20 : (j + 1) * 20, :]
        Y_train_mini_batch = Y_training_set[j * 20 : (j + 1) * 20, :]          
        old_theta = deepcopy(theta)
        
        (sdtheta, theta) =  RMSprop(X_train_mini_batch, Y_train_mini_batch, alpha, theta, beta_2, sdtheta, epsilon)
        
        if(check_if_terminate(X_train_mini_batch, Y_train_mini_batch, theta, old_theta) == True) :
            break
            
    if(check_if_terminate(X_train_mini_batch, Y_train_mini_batch, theta, old_theta) == True) :
        break
        
    if(num_iterations % 100 == 0) :
        print(num_iterations, cost_function(X_train_mini_batch, Y_train_mini_batch, old_theta), cost_function(X_train_mini_batch, Y_train_mini_batch, theta))
        
                
print("Total number of iterations: ", num_iterations)
print("Cost value: ", cost_function(X_training_set, Y_training_set, theta))

100 [[2.76606914e+09]] [[2.76606814e+09]]
200 [[2.76384778e+09]] [[2.76384679e+09]]
300 [[2.76162771e+09]] [[2.76162671e+09]]
400 [[2.75940892e+09]] [[2.75940792e+09]]
500 [[2.7571914e+09]] [[2.75719041e+09]]
600 [[2.75497517e+09]] [[2.75497417e+09]]
700 [[2.75276021e+09]] [[2.75275922e+09]]
800 [[2.75054654e+09]] [[2.75054554e+09]]
900 [[2.74833414e+09]] [[2.74833315e+09]]
1000 [[2.74612302e+09]] [[2.74612203e+09]]
1100 [[2.74391318e+09]] [[2.74391218e+09]]
1200 [[2.74170461e+09]] [[2.74170362e+09]]
1300 [[2.73949732e+09]] [[2.73949633e+09]]
1400 [[2.7372913e+09]] [[2.73729031e+09]]
1500 [[2.73508656e+09]] [[2.73508557e+09]]
1600 [[2.73288309e+09]] [[2.7328821e+09]]
1700 [[2.7306809e+09]] [[2.73067991e+09]]
1800 [[2.72847998e+09]] [[2.72847899e+09]]
1900 [[2.72628033e+09]] [[2.72627935e+09]]
2000 [[2.72408196e+09]] [[2.72408097e+09]]
2100 [[2.72188486e+09]] [[2.72188387e+09]]
2200 [[2.71968902e+09]] [[2.71968804e+09]]
2300 [[2.71749446e+09]] [[2.71749348e+09]]
2400 [[2.71530117e+09]] 

KeyboardInterrupt: ignored

In [148]:
# applying ADAM
alpha = 1e-3
cost_epsilon = 1e-5
theta = np.random.randn(3, 1)
num_iterations = 0

while(True) :
    num_iterations += 1
    
    vdtheta = 0
    sdtheta = 0
    
    for j in range(20) :
        
        X_train_mini_batch = X_training_set[j * 20 : (j + 1) * 20, :]
        Y_train_mini_batch = Y_training_set[j * 20 : (j + 1) * 20, :]          
        old_theta = deepcopy(theta)
        
        (vdtheta, sdtheta, theta) = ADAM(X_train_mini_batch, Y_train_mini_batch, alpha, theta, beta_1, beta_2, epsilon, vdtheta, sdtheta, j + 1)
        
        if(check_if_terminate(X_train_mini_batch, Y_train_mini_batch, theta, old_theta) == True) :
            break
            
    if(check_if_terminate(X_train_mini_batch, Y_train_mini_batch, theta, old_theta) == True) :
        break
        
    if(num_iterations % 100 == 0) :
        print(num_iterations, cost_function(X_train_mini_batch, Y_train_mini_batch, old_theta), cost_function(X_train_mini_batch, Y_train_mini_batch, theta))
        
                
print("Total number of iterations: ", num_iterations)
print("Cost value: ", cost_function(X_training_set, Y_training_set, theta))



100 [[2.76834799e+09]] [[2.76834789e+09]]
200 [[2.76817162e+09]] [[2.76817151e+09]]
300 [[2.76799525e+09]] [[2.76799514e+09]]
400 [[2.76781889e+09]] [[2.76781878e+09]]
500 [[2.76764253e+09]] [[2.76764242e+09]]
600 [[2.76746618e+09]] [[2.76746607e+09]]
700 [[2.76728984e+09]] [[2.76728973e+09]]
800 [[2.76711351e+09]] [[2.7671134e+09]]
900 [[2.76693718e+09]] [[2.76693707e+09]]
1000 [[2.76676086e+09]] [[2.76676075e+09]]
1100 [[2.76658455e+09]] [[2.76658444e+09]]
1200 [[2.76640825e+09]] [[2.76640814e+09]]
1300 [[2.76623195e+09]] [[2.76623184e+09]]
1400 [[2.76605566e+09]] [[2.76605555e+09]]
1500 [[2.76587938e+09]] [[2.76587927e+09]]
1600 [[2.7657031e+09]] [[2.76570299e+09]]
1700 [[2.76552683e+09]] [[2.76552673e+09]]
1800 [[2.76535057e+09]] [[2.76535046e+09]]
1900 [[2.76517432e+09]] [[2.76517421e+09]]
2000 [[2.76499807e+09]] [[2.76499796e+09]]
2100 [[2.76482183e+09]] [[2.76482173e+09]]
2200 [[2.7646456e+09]] [[2.76464549e+09]]
2300 [[2.76446938e+09]] [[2.76446927e+09]]
2400 [[2.76429316e+09]]

KeyboardInterrupt: ignored