In [15]:
import numpy as np
import pandas as pd
from numpy import random

In [16]:
def get_csv_data(file_name):
    np_data_frame = pd.read_csv(file_name)
    return np.array(np_data_frame.values)

In [17]:
def mse(y, y_pred):
    return np.sum((y - y_pred)**2)/len(y_pred)


def rmse(y, y_pred):
    return np.sqrt(mse(y, y_pred))


def r2(y, y_pred):
    nom=np.sum((y - y_pred)**2)
    denom=np.sum((y - np.mean(y, axis=0))**2)
    return 1 - nom/denom

In [18]:
def split_to_folds(data, k):
    k_folds = np.array_split(data, k)
    
    x_data_folds = []
    y_data_folds = []
    for i in range(k):
        x_data_folds.append(k_folds[i][:, :-1])
        y_data_folds.append(k_folds[i][:, -1].reshape((-1, 1)))
    
    return x_data_folds, y_data_folds


def concat(arr_to_concat):
    concatenated = np.empty((0, arr_to_concat[0].shape[1]), int)
    
    for i in range (len(arr_to_concat)):
        concatenated = np.concatenate((concatenated, arr_to_concat[i]))
    
    return concatenated


def mean(data, _axis=0):
    return np.mean(data, axis=_axis)


def std(data, _axis=0):
    return np.std(data, axis=_axis)


def normalize_data(data):
    _mean = mean(data)
    _std = std(data)
    # revome division by zero
    np.place(_std, _std==0, 1)
    
    return (data - _mean) / _std

In [19]:
def create_batch(data, i, batch_size):
    offset = i * batch_size
    if offset + batch_size >= data.shape[0]:
        return data[offset : data.shape[0], :]
    else:
        return data[offset : offset + batch_size, :]    


def linear_regression(X, coeff, bias):
    return X.dot(coeff) + bias


def mini_batch_grad_desc(X, Y, learn_rate=0.001, epoch_numb=10, batch_size=800):
    coeff = np.random.sample((X.shape[1], 1))
    bias = 0
    
    rmse_arr = []
    r2_arr = []
    
    for epoch in range(epoch_numb):
        seed = np.random.randint(0, 10000)
        
        # shuffle data before batch creating
        np.random.seed(seed)
        np.random.shuffle(X)
        np.random.seed(seed)
        np.random.shuffle(Y)
        
        number_of_batches = X.shape[0] // batch_size
        if number_of_batches % batch_size != 0:
            # 1 batch will have lower size if there are some train rows left
            number_of_batches += 1
        
        for i in range(number_of_batches):
            X_batch = create_batch(X, i, batch_size)
            Y_batch = create_batch(Y, i, batch_size)
            
            #print(X_batch.shape)
            #print(Y_batch.shape)
            
            batch_actual_size = X_batch.shape[0]
            
            # predict Y
            Y_predicted = linear_regression(X_batch, coeff, bias)
            error = Y_batch - Y_predicted
            
            # upd values using grad descent
            coeff = coeff + learn_rate*2*((X_batch.T).dot(error)) / batch_actual_size
            bias = bias + learn_rate*2*np.sum(error) / batch_actual_size
            
            #print(error)
            
            _rmse = rmse(Y_batch, Y_predicted)
            _r2 = r2(Y_batch, Y_predicted)
            
            #print("rmse " + str(epoch) + "_" + str(i) + ":", _rmse)
            #print("r2 " + str(epoch) + "_" + str(i) + ":", _r2)
            
            rmse_arr.append(_rmse)
            r2_arr.append(_r2)
        
#     print("rmse:", rmse_arr)
#     print("r2:", r2_arr)
    return coeff, bias

In [24]:
train_data_f_name = "./dataset/facebook_comment_volume.csv"

# get data for training
train_data = get_csv_data(train_data_f_name)
# print(np.shape(train_data))

# shuffle it
np.random.shuffle(train_data)

# and make 5 folds
x_folds, y_folds = split_to_folds(train_data, 5)

# print(np.shape(x_train_folds[0]))
# print(x_train_folds[0])
# print(np.shape(y_train_folds[0]))
# print(y_train_folds[0])

coeff_arr = []
bias_arr = []

rmse_train_arr = []
r2_train_arr = []
rmse_test_arr = []
r2_test_arr = []

for i in range(len(x_folds)):
    # make 1 test fold
    x_test = x_folds[i]
    y_test = y_folds[i]
    
    # and 4 train ones
    x_train_folds = np.delete(x_folds, i)
    y_train_folds = np.delete(y_folds, i)
    
    # concatenate 4 train folds into 1
    x_train = concat(x_train_folds)
    y_train = concat(y_train_folds)
    
#     print(np.shape(x_train))
#     print(np.shape(y_train))
    
    # normalize train and test data
    x_test = normalize_data(x_test)
    x_train = normalize_data(x_train)
    
#     print(x_test)
#     print(np.shape(x_test))
#     print(x_test.shape[0])
#     print(x_train)
#     print(np.shape(x_train))

    trained_coeff, trained_bias = mini_batch_grad_desc(x_train, y_train)
    
#     print(trained_coeff)
#     print(trained_bias)
    
    coeff_arr.insert(i, trained_coeff)
    bias_arr.insert(i, trained_bias)
    
    y_train_predicted = linear_regression(x_train, trained_coeff, trained_bias)
    y_test_predicted = linear_regression(x_test, trained_coeff, trained_bias)
    
    # calculate metrics using trained coefficients for logging
    rmse_train_arr.insert(i, rmse(y_train, y_train_predicted))
    r2_train_arr.insert(i, r2(y_train, y_train_predicted))
    rmse_test_arr.insert(i, rmse(y_test, y_test_predicted))
    r2_test_arr.insert(i, r2(y_test, y_test_predicted))
    
    print(str(i) + ":", r2_test_arr[i],rmse_test_arr[i],r2_train_arr[i],rmse_train_arr[i])


0: 0.23302655715568388 31.95928776319663 0.31200124951976993 29.23013567662983
1: 0.23104437443289783 25.129474371178603 0.3071534010301822 30.801062730241537
2: 0.31045210058654205 31.050240989346964 0.2906362933777846 29.481633410758146
3: 0.3080280341696481 33.630573479710755 0.2861701892274785 28.851709899756163
4: 0.34652567735258055 26.963717511256615 0.28017067437738685 30.551389960444858


# Beautified output of calculated metrics

In [25]:
frame_data = {}
for i in range(5):
    frame_data[str(i + 1)] = [rmse_test_arr[i], r2_test_arr[i], rmse_train_arr[i], r2_train_arr[i], bias_arr[i]]

# print(data)

frame_data['E'] = [np.mean(rmse_test_arr), np.mean(r2_test_arr), np.mean(rmse_train_arr), np.mean(r2_train_arr), np.mean(bias_arr)]
frame_data['SD'] = [np.std(rmse_test_arr),np.std(r2_test_arr),np.std(rmse_train_arr),np.std(r2_train_arr), np.std(bias_arr)]
 
df1 = pd.DataFrame(frame_data, index =['test RMSE', 'test R2','train RMSE','train R2', 'bias']) 
df2 = pd.DataFrame(
    np.concatenate(
        (np.hstack(coeff_arr), 
         np.mean(coeff_arr, axis=0),
         np.std(coeff_arr, axis=0)),
        axis=1),
    columns=['1', '2', '3','4','5','E','SD'])

df = pd.concat([df1, df2], axis=0)
df

Unnamed: 0,1,2,3,4,5,E,SD
test RMSE,31.959288,25.129474,31.050241,33.630573,26.963718,29.746659,3.185687
test R2,0.233027,0.231044,0.310452,0.308028,0.346526,0.285815,0.045984
train RMSE,29.230136,30.801063,29.481633,28.85171,30.55139,29.783186,0.760349
train R2,0.312001,0.307153,0.290636,0.28617,0.280171,0.295226,0.012275
bias,4.053833,4.222569,4.090705,4.029523,4.103351,4.099996,0.066662
0,-0.185145,0.20699,0.150106,-0.287481,0.148342,0.006562,0.202035
1,-0.535028,-0.244802,-0.281763,0.002218,-0.15619,-0.243113,0.175758
2,-0.039962,-0.331414,-0.340516,0.02987,-0.029738,-0.142352,0.1599
3,0.130316,0.525466,0.424846,0.214428,0.340961,0.327203,0.141785
4,0.237178,-0.184321,0.001074,-0.09595,0.246647,0.040925,0.174296
