In [1]:
import pandas as pd
from scipy import stats
import random
import math


In [23]:
# Load the height/weight sample; the path is relative to the notebook's working directory.
df = pd.read_csv("Data/Data/HeightWeight20.csv")
# Print the whole frame as plain text to eyeball the raw values.
print(df)

        Height      Weight
0   187.571423  109.720985
1   174.706036   73.622732
2   188.239668   96.497550
3   182.196685   99.809504
4   177.499762   93.598619
5   170.822660   69.042216
6   174.714106   83.428219
7   173.605229   76.190352
8   170.228132   79.800187
9   161.179495   70.941642
10  160.475926   64.077173
11  159.097154   46.653553
12  157.677693   62.909437
13  152.477302   44.310238
14  150.109555   50.135381
15  168.078537   62.041159
16  170.350573   77.504315
17  162.224700   58.275377
18  175.346978   74.322166
19  157.338384   51.550324


In [106]:
def split_data(df : pd.DataFrame, seed : int, train_frac : float = 0.5):
    """Shuffle ``df`` reproducibly and cut it into a train part and a test part.

    Args:
        df: Frame whose rows are to be divided.
        seed: Passed to ``DataFrame.sample`` as ``random_state`` so the same
            seed always yields the same shuffle.
        train_frac: Share of the rows assigned to the training part; the row
            count is rounded down with ``math.floor``.

    Returns:
        A ``(train_df, test_df)`` tuple. The training part is the leading
        slice of the shuffled frame; the test part is the remainder with its
        index renumbered from zero.
    """
    # Number of leading shuffled rows that belong to the training part.
    row_count = len(df)
    cut = math.floor(row_count * train_frac)

    # frac=1.0 returns every row in random order; ignore_index renumbers them 0..n-1.
    shuffled = df.sample(frac=1.0, random_state=seed, ignore_index=True)

    return shuffled.iloc[:cut], shuffled.iloc[cut:].reset_index(drop=True)

def CalculateMSE(x_positions, y_positions, w0, w1):
    """Return the mean squared error of the line ``w0 + w1 * x`` against the targets.

    Values are paired strictly by position, so a pandas Series whose index is
    not ``0..n-1`` (for example a slice of a shuffled frame that was not
    re-indexed) is treated exactly like a plain list.

    Args:
        x_positions: Iterable of predictor values.
        y_positions: Iterable of observed target values, same length as
            ``x_positions``.
        w0: Intercept of the line.
        w1: Slope of the line.

    Returns:
        The mean of ``(w0 + w1 * x - y) ** 2`` over all pairs.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # list() walks the values in order. Indexing a Series with [i] would look up
    # the *label* i instead, which fails or mispairs rows on a non-default index.
    xs = list(x_positions)
    ys = list(y_positions)
    if len(xs) != len(ys):
        raise ValueError("x_positions and y_positions must have the same length")
    if not xs:
        raise ValueError("cannot compute MSE of empty input")

    squared_error = 0
    for x, y in zip(xs, ys):
        squared_error += (w0 + w1 * x - y) ** 2
    return squared_error / len(xs)

def CalculateMAE(x_positions, y_positions, w0, w1):
    """Return the mean absolute error of the line ``w0 + w1 * x`` against the targets.

    Values are paired strictly by position, so a pandas Series whose index is
    not ``0..n-1`` is treated exactly like a plain list.

    Args:
        x_positions: Iterable of predictor values.
        y_positions: Iterable of observed target values, same length as
            ``x_positions``.
        w0: Intercept of the line.
        w1: Slope of the line.

    Returns:
        The mean of ``abs(w0 + w1 * x - y)`` over all pairs.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # list() walks the values in order. Indexing a Series with [i] would look up
    # the *label* i instead, which fails or mispairs rows on a non-default index.
    xs = list(x_positions)
    ys = list(y_positions)
    if len(xs) != len(ys):
        raise ValueError("x_positions and y_positions must have the same length")
    if not xs:
        raise ValueError("cannot compute MAE of empty input")

    absolute_error = 0
    for x, y in zip(xs, ys):
        absolute_error += abs(w0 + w1 * x - y)
    return absolute_error / len(xs)

def CalculateRAE(x_positions, y_positions, w0, w1):
    """Return the relative absolute error of the line ``w0 + w1 * x``.

    This is the total absolute error of the line divided by the total absolute
    error of always predicting the mean of ``y_positions``.

    Values are paired strictly by position, so a pandas Series whose index is
    not ``0..n-1`` is treated exactly like a plain list.

    Args:
        x_positions: Iterable of predictor values.
        y_positions: Iterable of observed target values, same length as
            ``x_positions``.
        w0: Intercept of the line.
        w1: Slope of the line.

    Returns:
        ``sum(|y - (w0 + w1 * x)|) / sum(|y - mean(y)|)``.

    Raises:
        ValueError: If the inputs are empty, differ in length, or every target
            equals the mean (the denominator would be zero).
    """
    # list() walks the values in order. Indexing a Series with [i] would look up
    # the *label* i instead, which fails or mispairs rows on a non-default index.
    xs = list(x_positions)
    ys = list(y_positions)
    if len(xs) != len(ys):
        raise ValueError("x_positions and y_positions must have the same length")
    if not ys:
        raise ValueError("cannot compute RAE of empty input")

    y_bar = sum(ys) / len(ys)

    err_from_model = 0
    err_from_avg = 0
    for x, y in zip(xs, ys):
        err_from_model += abs(y - (w0 + w1 * x))
        err_from_avg += abs(y - y_bar)

    if err_from_avg == 0:
        raise ValueError("RAE is undefined when all target values are identical")
    return err_from_model / err_from_avg

def CalculateRRSE(x_positions, y_positions, w0, w1):
    """Return the root relative squared error of the line ``w0 + w1 * x``.

    This is the square root of the line's total squared error divided by the
    total squared error of always predicting the mean of ``y_positions``.

    Values are paired strictly by position, so a pandas Series whose index is
    not ``0..n-1`` is treated exactly like a plain list.

    Args:
        x_positions: Iterable of predictor values.
        y_positions: Iterable of observed target values, same length as
            ``x_positions``.
        w0: Intercept of the line.
        w1: Slope of the line.

    Returns:
        ``sqrt(sum((y - (w0 + w1 * x))**2) / sum((y - mean(y))**2))``.

    Raises:
        ValueError: If the inputs are empty, differ in length, or every target
            equals the mean (the denominator would be zero).
    """
    # list() walks the values in order. Indexing a Series with [i] would look up
    # the *label* i instead, which fails or mispairs rows on a non-default index.
    xs = list(x_positions)
    ys = list(y_positions)
    if len(xs) != len(ys):
        raise ValueError("x_positions and y_positions must have the same length")
    if not ys:
        raise ValueError("cannot compute RRSE of empty input")

    y_bar = sum(ys) / len(ys)

    err_from_model = 0
    err_from_avg = 0
    for x, y in zip(xs, ys):
        err_from_model += (y - (w0 + w1 * x)) ** 2
        err_from_avg += (y - y_bar) ** 2

    if err_from_avg == 0:
        raise ValueError("RRSE is undefined when all target values are identical")
    return (err_from_model / err_from_avg) ** (1/2)

# Disabled draft of a k-fold splitter, kept as a bare string literal so it never executes.
# Because the literal is the last expression in this cell, Jupyter echoes its repr as the cell output.
# NOTE(review): the remainder / add_lower / add_upper bookkeeping appears to produce uneven folds
# even when the row count divides evenly (e.g. 20 rows into 5 folds) — verify before re-enabling.
"""def split_data_to_fold(df : pd.DataFrame, seed : int, folds : int):

    data_per_fold = df.shape[0] / folds

    sample_df = df.sample(frac=1.0, random_state=seed, ignore_index=True)
    dfs = []
    remainder = 0
    add_lower, add_upper = 0, 0

    for i in range(folds-1):

        if remainder >= 1:
            remainder = 0
            add_lower = 1

        remainder += data_per_fold

        if remainder >= 1:
            add_upper = 1

        dfs.append(sample_df.iloc[i * math.floor(data_per_fold) + add_lower: (i+1) * math.floor(data_per_fold) + add_upper])

        if i < folds-2:
            add_lower, add_upper = 0, 0
        
    dfs.append(sample_df.iloc[(folds-1)*math.floor(data_per_fold) + add_upper:])

    return dfs"""

'def split_data_to_fold(df : pd.DataFrame, seed : int, folds : int):\n\n    data_per_fold = df.shape[0] / folds\n\n    sample_df = df.sample(frac=1.0, random_state=seed, ignore_index=True)\n    dfs = []\n    remainder = 0\n    add_lower, add_upper = 0, 0\n\n    for i in range(folds-1):\n\n        if remainder >= 1:\n            remainder = 0\n            add_lower = 1\n\n        remainder += data_per_fold\n\n        if remainder >= 1:\n            add_upper = 1\n\n        dfs.append(sample_df.iloc[i * math.floor(data_per_fold) + add_lower: (i+1) * math.floor(data_per_fold) + add_upper])\n\n        if i < folds-2:\n            add_lower, add_upper = 0, 0\n        \n    dfs.append(sample_df.iloc[(folds-1)*math.floor(data_per_fold) + add_upper:])\n\n    return dfs'

Holdout

In [98]:
# Single holdout split: shuffle with a fixed seed and send half of the rows to each part.
train_df, test_df = split_data(df, 11010, 0.5)
# Show both halves so the split can be checked by eye.
print(train_df)
print(test_df)

       Height      Weight
0  170.822660   69.042216
1  152.477302   44.310238
2  174.714106   83.428219
3  174.706036   73.622732
4  170.350573   77.504315
5  160.475926   64.077173
6  173.605229   76.190352
7  187.571423  109.720985
8  162.224700   58.275377
9  150.109555   50.135381
       Height     Weight
0  182.196685  99.809504
1  168.078537  62.041159
2  175.346978  74.322166
3  170.228132  79.800187
4  188.239668  96.497550
5  157.338384  51.550324
6  161.179495  70.941642
7  157.677693  62.909437
8  177.499762  93.598619
9  159.097154  46.653553


In [92]:
# Fit the line Weight ~ intercept + slope * Height on the training rows only.
# r, p and se are unpacked here but not used in this cell.
slope, intercept, r, p, se = stats.linregress(train_df["Height"], train_df["Weight"])
print(slope, intercept)

1.5493022523574467 -189.19619910355522


In [93]:
# RMSE on the held-out rows: square root of the MSE. Argument order is (x, y, w0=intercept, w1=slope).
print(CalculateMSE(test_df["Height"], test_df["Weight"], intercept, slope)**(1/2))

7.819720132105899


In [110]:
# Number of independent holdout repetitions. Used both as the loop bound and as the
# divisor for every average below, so the two can never drift apart.
N_HOLDOUT_RUNS = 20

# Running totals over all repetitions; they are divided by N_HOLDOUT_RUNS only when printed.
avg_rmse = 0
avg_mae = 0
avg_r = 0
avg_rae = 0
avg_rrse = 0

for i in range(N_HOLDOUT_RUNS):
    # The loop index doubles as the shuffle seed, so every repetition uses a different 50/50 split.
    train_df, test_df = split_data(df, i, 0.5)
    # Fit on the training half ...
    slope, intercept, r, p, se = stats.linregress(train_df["Height"], train_df["Weight"])
    # ... and score the fitted line on the held-out half.
    rmse = CalculateMSE(test_df["Height"], test_df["Weight"], intercept, slope)**(1/2)
    mae = CalculateMAE(test_df["Height"], test_df["Weight"], intercept, slope)
    rae = CalculateRAE(test_df["Height"], test_df["Weight"], intercept, slope)
    rrse = CalculateRRSE(test_df["Height"], test_df["Weight"], intercept, slope)

    avg_rmse += rmse
    avg_mae += mae
    # NOTE(review): r is returned by linregress for the *training* rows, whereas the other
    # metrics are computed on the test rows — confirm this is the intended correlation.
    avg_r += r
    avg_rae += rae
    avg_rrse += rrse

print(f"Average RMSE : {avg_rmse / N_HOLDOUT_RUNS}")
print(f"Average MAE : {avg_mae / N_HOLDOUT_RUNS}")
print(f"Average Correlation coefficient : {avg_r / N_HOLDOUT_RUNS}")
print(f"Average RAE : {avg_rae / N_HOLDOUT_RUNS}")
print(f"Average RRSE : {avg_rrse / N_HOLDOUT_RUNS}")

Average RMSE : 7.945404717120975
Average MAE : 7.191794079746131
Average Correlation coefficient : 0.9294227921780667
Average RAE : 0.5325480941900034
Average RRSE : 0.4691923249023392
