In [172]:
import numpy as np
import pandas as pd
import random as std_random
from numpy import random

In [173]:
def get_csv_data(file_name):
    np_data_frame = pd.read_csv(file_name)
    return np.array(np_data_frame.values)

In [174]:
def mse(y, y_pred):
    return np.sum((y - y_pred)**2)/len(y_pred)


def rmse(y, y_pred):
    return np.sqrt(mse(y, y_pred))


def r2(y, y_pred):
    nom=np.sum((y - y_pred)**2)
    denom=np.sum((y - np.mean(y, axis=0))**2)
    return 1 - nom/denom

In [175]:
def split_to_folds(data, k):
    k_folds = np.array_split(data, k)
    
    x_data_folds = []
    y_data_folds = []
    for i in range(k):
        x_data_folds.append(k_folds[i][:, :-1])
        y_data_folds.append(k_folds[i][:, -1].reshape((-1, 1)))
    
    return x_data_folds, y_data_folds


def concat(arr_to_concat):
    concatenated = np.empty((0, arr_to_concat[0].shape[1]), int)
    
    for i in range (len(arr_to_concat)):
        concatenated = np.concatenate((concatenated, arr_to_concat[i]))
    
    return concatenated


def mean(data, _axis=0):
    return np.mean(data, axis=_axis)


def std(data, _axis=0):
    return np.std(data, axis=_axis)


def normalize_data(data):
    _mean = mean(data)
    _std = std(data)
    # revome division by zero
    np.place(_std, _std==0, 1)
    
    return (data - _mean) / _std

In [180]:
def create_batch(data, i, batch_size):
    offset = i * batch_size
    if offset + batch_size >= data.shape[0]:
        return data[offset : data.shape[0], :]
    else:
        return data[offset : offset + batch_size, :]    


def linear_regression(X, coeff, bias):
    return X.dot(coeff) + bias


def mini_batch_grad_desc(X, Y, learn_rate=0.001, epoch_numb=10, batch_size=800):
    coeff = np.random.sample((X.shape[1], 1))
    bias = 0
    
    rmse_arr = []
    r2_arr = []
    
    for epoch in range(epoch_numb):
        seed = std_random.randint(1, 10000)
        
        # shuffle before batch creating
        std_random.Random(seed).shuffle(X)
        std_random.Random(seed).shuffle(Y)
        
        number_of_batches = X.shape[0] // batch_size
        if number_of_batches % batch_size != 0:
            # 1 batch will have lower size if there are some train rows left
            number_of_batches += 1
        
        for i in range(number_of_batches):
            X_batch = create_batch(X, i, batch_size)
            Y_batch = create_batch(Y, i, batch_size)
            
            #print(X_batch.shape)
            #print(Y_batch.shape)
            
            batch_actual_size = X_batch.shape[0]
            
            # predict Y
            Y_predicted = linear_regression(X_batch, coeff, bias)
            error = Y_batch - Y_predicted
            
            # upd values using grad descent
            coeff = coeff + learn_rate*2*((X_batch.T).dot(error)) / batch_actual_size
            bias = bias + learn_rate*2*np.sum(error) / batch_actual_size
            
            #print(error)
            
            _rmse = rmse(Y_batch, Y_predicted)
            _r2 = r2(Y_batch, Y_predicted)
            
            #print("rmse " + str(epoch) + "_" + str(i) + ":", _rmse)
            #print("r2 " + str(epoch) + "_" + str(i) + ":", _r2)
            
            rmse_arr.append(_rmse)
            r2_arr.append(_r2)
        
#     print("rmse:", rmse_arr)
#     print("r2:", r2_arr)
    return coeff, bias

In [181]:
train_data_f_name = "./dataset/facebook_comment_volume.csv"

# get data for training
train_data = get_csv_data(train_data_f_name)
# print(np.shape(train_data))

# shuffle it
np.random.shuffle(train_data)

# and make 5 folds
x_folds, y_folds = split_to_folds(train_data, 5)

# print(np.shape(x_train_folds[0]))
# print(x_train_folds[0])
# print(np.shape(y_train_folds[0]))
# print(y_train_folds[0])

coeff_arr = []
bias_arr = []

rmse_train_arr = []
r2_train_arr = []
rmse_test_arr = []
r2_test_arr = []

for i in range(len(x_folds)):
    # make 1 test fold
    x_test = x_folds[i]
    y_test = y_folds[i]
    
    # and 4 train ones
    x_train_folds = np.delete(x_folds, i)
    y_train_folds = np.delete(y_folds, i)
    
    # concatenate 4 train folds into 1
    x_train = concat(x_train_folds)
    y_train = concat(y_train_folds)
    
#     print(np.shape(x_train))
#     print(np.shape(y_train))
    
    # normalize train and test data
    x_test = normalize_data(x_test)
    x_train = normalize_data(x_train)
    
#     print(x_test)
#     print(np.shape(x_test))
#     print(x_test.shape[0])
#     print(x_train)
#     print(np.shape(x_train))

    trained_coeff, trained_bias = mini_batch_grad_desc(x_train, y_train)
    
#     print(trained_coeff)
#     print(trained_bias)
    
    coeff_arr.insert(i, trained_coeff)
    bias_arr.insert(i, trained_bias)
    
    y_train_predicted = linear_regression(x_train, trained_coeff, trained_bias)
    y_test_predicted = linear_regression(x_test, trained_coeff, trained_bias)
    
    # calculate metrics using trained coefficients for logging
    rmse_train_arr.insert(i, rmse(y_train, y_train_predicted))
    r2_train_arr.insert(i, r2(y_train, y_train_predicted))
    rmse_test_arr.insert(i, rmse(y_test, y_test_predicted))
    r2_test_arr.insert(i, r2(y_test, y_test_predicted))
    
    print(str(i) + ":", r2_test_arr[i],rmse_test_arr[i],r2_train_arr[i],rmse_train_arr[i])


0: 0.1836515217735737 35.838480663692 0.9191186454256817 4.177667226720612
1: 0.16692428447171803 31.909535167837277 0.8272816740843763 4.285313334219525
2: 0.2829324585805063 27.136391336340253 0.6752994486597668 5.167139991561769
3: 0.298954232137738 28.423873147148708 0.3880165234148505 4.948019009233205
4: 0.21035695760727613 32.334626814068436 0.7439804945723925 3.6211990282736095


# Beautified output of calculated metrics

In [183]:
frame_data = {}
for i in range(5):
    frame_data[str(i + 1)] = [rmse_test_arr[i], r2_test_arr[i], rmse_train_arr[i], r2_train_arr[i], bias_arr[i]]

# print(data)

frame_data['E'] = [np.mean(rmse_test_arr), np.mean(r2_test_arr), np.mean(rmse_train_arr), np.mean(r2_train_arr), np.mean(bias_arr)]
frame_data['SD'] = [np.std(rmse_test_arr),np.std(r2_test_arr),np.std(rmse_train_arr),np.std(r2_train_arr), np.std(bias_arr)]
 
df1 = pd.DataFrame(frame_data, index =['test RMSE', 'test R2','train RMSE','train R2', 'bias']) 
df2 = pd.DataFrame(
    np.concatenate(
        (np.hstack(coeff_arr), 
         np.mean(coeff_arr, axis=0),
         np.std(coeff_arr, axis=0)),
        axis=1),
    columns=['1', '2', '3','4','5','E','SD'])

df = pd.concat([df1, df2], axis=0)
df

Unnamed: 0,1,2,3,4,5,E,SD
test RMSE,35.838481,31.909535,27.136391,28.423873,32.334627,31.128581,3.082227
test R2,0.183652,0.166924,0.282932,0.298954,0.210357,0.228564,0.053026
train RMSE,4.177667,4.285313,5.16714,4.948019,3.621199,4.439868,0.556776
train R2,0.919119,0.827282,0.675299,0.388017,0.74398,0.710739,0.18084
bias,3.032512,3.618544,3.692879,3.348026,3.558849,3.450162,0.238302
0,-0.32967,0.376626,0.259049,-0.173634,0.190689,0.064612,0.269545
1,-0.589433,-0.879957,-0.420749,0.136868,-0.553072,-0.461269,0.334533
2,0.161469,0.357536,0.486341,0.292659,0.649643,0.38953,0.167003
3,0.272374,-0.100163,-0.159344,0.197867,0.237701,0.089687,0.181684
4,2.157206,1.437103,1.415163,0.264105,0.143418,1.083399,0.767208
