In [None]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import scipy.sparse.linalg
import matplotlib.pyplot as plt

# The project root is taken to be the parent of the current working directory.
ROOT_PATH = os.path.normpath(
    os.path.join(os.getcwd(), os.pardir)
)

# Seed NumPy's global RNG so the random shuffles further down are repeatable.
np.random.seed(42)

ROOT_PATH  # echo the resolved path as the cell output

In [None]:
# Load the data: try Google Drive first (works when running on Colab),
# otherwise fall back to the local copy under <ROOT_PATH>/data/.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    data = pd.read_csv("/content/drive/MyDrive/data.csv")
except Exception:
    # `except Exception` (instead of a bare `except:`) keeps the best-effort
    # fallback for a missing Colab module, a failed mount or a missing file,
    # while letting KeyboardInterrupt / SystemExit propagate so the cell can
    # still be interrupted.
    data = pd.read_csv(os.path.join(ROOT_PATH, "data", "data.csv"))

print("Shape of data {}".format(data.shape))

In [None]:
# Work on a plain NumPy array from here on.
data = np.array(data)

# Column 0 holds the labels ...
Y = data[:, 0]

# ... and the remaining columns hold the samples; transpose so that every
# sample becomes one column of X.
X = data[:, 1:].T

In [None]:
def data_splits(X, Y, N_train):
    """Randomly partition a column-wise dataset into a train and a test part.

    Parameters
    ----------
    X : array whose second axis (axis 1) indexes the samples.
    Y : array of labels, indexed along its first axis by sample.
    N_train : number of samples assigned to the train split.

    Returns
    -------
    ((X_train, Y_train), (X_test, Y_test))
        The first ``N_train`` samples of a random permutation form the train
        split; all remaining samples form the test split.

    The permutation is drawn from NumPy's global random state
    (``np.random.shuffle``), so results depend on the current seed.
    """
    n_samples = X.shape[1]

    # Random ordering of the sample positions 0 .. n_samples-1.
    permutation = np.arange(n_samples)
    np.random.shuffle(permutation)

    # Cut the permutation once: head -> train, tail -> test.
    train_cols, test_cols = permutation[:N_train], permutation[N_train:]

    # Index samples (columns of X) and labels with the same positions so
    # that they stay aligned.
    train_part = (X[:, train_cols], Y[train_cols])
    test_part = (X[:, test_cols], Y[test_cols])
    return train_part, test_part


def get_data_from_index(X, Y, indexes):
    """Keep only the samples whose label is one of ``indexes``.

    Parameters
    ----------
    X : array whose second axis (axis 1) indexes the samples.
    Y : array of labels aligned with the columns of ``X``.
    indexes : iterable of label values to keep.

    Returns
    -------
    (X_selected, Y_selected)
        The selected samples and labels, grouped label by label in the order
        given by ``indexes`` (all samples of the first label, then all
        samples of the second one, and so on).
    """
    # One boolean mask per requested label, marking the matching samples.
    label_masks = [(Y == label) for label in indexes]

    # Gather the matching columns / labels per label and glue the pieces
    # together along the sample axis.
    selected_X = np.concatenate([X[:, mask] for mask in label_masks], axis=1)
    selected_Y = np.concatenate([Y[mask] for mask in label_masks])

    return selected_X, selected_Y

In [None]:
# Train on 80% of the samples and hold out the remainder for testing.
train_split = 0.8
N_train = round(train_split * X.shape[1])

# Randomly partition both the samples and the labels into the two splits.
(X_train, Y_train), (X_test, Y_test) = data_splits(X=X, Y=Y, N_train=N_train)

In [None]:
# Restrict both splits to the chosen digits.
indeces = [0, 6]
X_test, Y_test = get_data_from_index(X_test, Y_test, indeces)
X_train, Y_train = get_data_from_index(X_train, Y_train, indeces)

In [None]:
def SGD(loss, grad_l, w0, data, batch_size, n_epochs, alpha=1e-3, shuffle=True):
    """Minimise ``loss`` with mini-batch stochastic gradient descent.

    Parameters
    ----------
    loss : callable ``loss(w, X, Y)`` returning the objective value.
    grad_l : callable ``grad_l(w, X, Y)`` returning the gradient w.r.t. ``w``.
    w0 : initial weights.
    data : tuple ``(X, Y)``. Samples are the columns of ``X`` (axis 1), as in
        the rest of this notebook; ``Y`` holds one label per sample.
    batch_size : number of samples per mini-batch (must be positive). The
        last batch of an epoch may be smaller.
    n_epochs : number of full passes over the data.
    alpha : step size (learning rate). Defaults to ``1e-3``.
    shuffle : if True (default), visit the samples in a fresh random order
        every epoch, drawn from NumPy's global random state.

    Returns
    -------
    w : list of all iterates, starting with ``w0`` (one entry per update).
    loss_val : full-data loss at the end of each epoch.
    grads_val : full-data gradient at the end of each epoch.
    err_val : squared 2-norm of the full-data gradient at the end of each epoch.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    X, Y = data
    # Number of samples. `len(data)` would be 2 (the length of the (X, Y)
    # tuple), which is why the sample count is read from X instead.
    n_samples = X.shape[1]

    w = [w0]
    loss_val = []
    grads_val = []
    err_val = []

    for _ in range(n_epochs):
        # New visiting order each epoch; without it, data that is grouped by
        # label would yield single-class mini-batches.
        if shuffle:
            order = np.random.permutation(n_samples)
        else:
            order = np.arange(n_samples)

        for start in range(0, n_samples, batch_size):
            batch_idx = order[start:start + batch_size]
            # Samples live on axis 1 of X, labels on axis 0 of Y.
            batch_X = X[:, batch_idx]
            batch_Y = Y[batch_idx]
            # Step from the *current* iterate, using the same (w, X, Y)
            # argument order as the full-data gradient call below.
            w.append(w[-1] - alpha * grad_l(w[-1], batch_X, batch_Y))

        # End-of-epoch diagnostics on the whole dataset.
        loss_val.append(loss(w[-1], X, Y))
        grads_val.append(grad_l(w[-1], X, Y))
        err_val.append(np.linalg.norm(grads_val[-1]) ** 2)

    return w, loss_val, grads_val, err_val