In [11]:
import pandas as pd
from scipy import stats
import random
import math
import numpy as np


In [70]:
# Load the 100-row height/weight sample. Later cells read it through the
# notebook-global name `df`.
data_path = "Data/Data/HeightWeight100.csv"
df = pd.read_csv(data_path)
print(df)

        Height      Weight
0   187.571423  109.720985
1   174.706036   73.622732
2   188.239668   96.497550
3   182.196685   99.809504
4   177.499762   93.598619
..         ...         ...
95  168.078537   62.041159
96  170.350573   77.504315
97  162.224700   58.275377
98  175.346978   74.322166
99  157.338384   51.550324

[100 rows x 2 columns]


In [4]:
def split_data(df : pd.DataFrame, seed : int, train_frac : float = 0.5):
    """Shuffle ``df`` reproducibly and cut it into a train part and a test part.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose rows are to be split.
    seed : int
        Passed to ``DataFrame.sample`` as ``random_state`` so the shuffle is
        repeatable.
    train_frac : float, default 0.5
        Fraction of rows for the train part; the row count is rounded down.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train_df, test_df)``. Both are indexed from 0: the shuffled frame is
        re-indexed by ``ignore_index=True`` and the test part is reset again.
    """
    # Shuffle every row once; ignore_index=True renumbers the result 0..n-1.
    shuffled = df.sample(frac=1.0, random_state=seed, ignore_index=True)

    # Number of leading shuffled rows that make up the train part.
    n_train = math.floor(len(df) * train_frac)

    head = shuffled.iloc[:n_train]
    tail = shuffled.iloc[n_train:].reset_index(drop=True)
    return (head, tail)

def CalculateMSE(x_positions, y_positions, w0, w1):
    """Mean squared error of the line ``w0 + w1*x`` against the observed ``y``.

    Observations are paired by position, so pandas Series are accepted
    whatever their index labels are (the index is ignored).

    Parameters
    ----------
    x_positions, y_positions : sequence of numbers
        Predictor and target values; must have the same non-zero length.
    w0 : float
        Intercept of the line.
    w1 : float
        Slope of the line.

    Returns
    -------
    numpy.float64
        ``mean((w0 + w1*x - y)**2)``.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    # np.asarray drops any pandas index. Indexing a Series with [i] is a label
    # lookup, which fails when the index is not exactly 0..n-1.
    x = np.asarray(x_positions, dtype=float)
    y = np.asarray(y_positions, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x_positions and y_positions must have the same length")
    if x.size == 0:
        raise ValueError("cannot compute MSE of zero observations")

    residuals = w0 + w1 * x - y
    return np.mean(residuals ** 2)

def CalculateMAE(x_positions, y_positions, w0, w1):
    """Mean absolute error of the line ``w0 + w1*x`` against the observed ``y``.

    Observations are paired by position, so pandas Series are accepted
    whatever their index labels are (the index is ignored).

    Parameters
    ----------
    x_positions, y_positions : sequence of numbers
        Predictor and target values; must have the same non-zero length.
    w0 : float
        Intercept of the line.
    w1 : float
        Slope of the line.

    Returns
    -------
    numpy.float64
        ``mean(|w0 + w1*x - y|)``.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    # np.asarray drops any pandas index. Indexing a Series with [i] is a label
    # lookup, which fails when the index is not exactly 0..n-1.
    x = np.asarray(x_positions, dtype=float)
    y = np.asarray(y_positions, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x_positions and y_positions must have the same length")
    if x.size == 0:
        raise ValueError("cannot compute MAE of zero observations")

    residuals = w0 + w1 * x - y
    return np.mean(np.abs(residuals))

def CalculateRAE(x_positions, y_positions, w0, w1):
    """Relative absolute error of the line ``w0 + w1*x`` against ``y``.

    This is the total absolute error of the line divided by the total absolute
    error of always predicting the mean of ``y``. Observations are paired by
    position, so pandas Series are accepted whatever their index labels are.

    Parameters
    ----------
    x_positions, y_positions : sequence of numbers
        Predictor and target values; must have the same non-zero length.
    w0 : float
        Intercept of the line.
    w1 : float
        Slope of the line.

    Returns
    -------
    numpy.float64
        ``sum(|y - (w0 + w1*x)|) / sum(|y - mean(y)|)``. When every ``y``
        equals the mean the denominator is zero and the result follows NumPy
        floating-point division (``inf`` or ``nan`` with a RuntimeWarning).

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    # np.asarray drops any pandas index. Indexing a Series with [i] is a label
    # lookup, which fails when the index is not exactly 0..n-1.
    x = np.asarray(x_positions, dtype=float)
    y = np.asarray(y_positions, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x_positions and y_positions must have the same length")
    if y.size == 0:
        raise ValueError("cannot compute RAE of zero observations")

    err_from_model = np.abs(y - (w0 + w1 * x)).sum()
    err_from_avg = np.abs(y - y.mean()).sum()
    return err_from_model / err_from_avg

def CalculateRRSE(x_positions, y_positions, w0, w1):
    """Root relative squared error of the line ``w0 + w1*x`` against ``y``.

    This is the square root of the line's total squared error divided by the
    total squared error of always predicting the mean of ``y``. Observations
    are paired by position, so pandas Series are accepted whatever their index
    labels are.

    Parameters
    ----------
    x_positions, y_positions : sequence of numbers
        Predictor and target values; must have the same non-zero length.
    w0 : float
        Intercept of the line.
    w1 : float
        Slope of the line.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((y - (w0 + w1*x))**2) / sum((y - mean(y))**2))``. When every
        ``y`` equals the mean the denominator is zero and the result follows
        NumPy floating-point division (``inf`` or ``nan`` with a
        RuntimeWarning).

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    # np.asarray drops any pandas index. Indexing a Series with [i] is a label
    # lookup, which fails when the index is not exactly 0..n-1.
    x = np.asarray(x_positions, dtype=float)
    y = np.asarray(y_positions, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x_positions and y_positions must have the same length")
    if y.size == 0:
        raise ValueError("cannot compute RRSE of zero observations")

    err_from_model = ((y - (w0 + w1 * x)) ** 2).sum()
    err_from_avg = ((y - y.mean()) ** 2).sum()
    return (err_from_model / err_from_avg) ** (1/2)

Holdout

In [5]:
# One reproducible 50/50 hold-out split (fixed seed 11010), printed for inspection.
# NOTE(review): the recorded output below shows 10 + 10 rows, so `df` apparently
# held a 20-row frame when this cell last ran. Confirm which dataset is intended.
train_df, test_df = split_data(df, seed=11010, train_frac=0.5)
for part in (train_df, test_df):
    print(part)

       Height      Weight
0  170.822660   69.042216
1  152.477302   44.310238
2  174.714106   83.428219
3  174.706036   73.622732
4  170.350573   77.504315
5  160.475926   64.077173
6  173.605229   76.190352
7  187.571423  109.720985
8  162.224700   58.275377
9  150.109555   50.135381
       Height     Weight
0  182.196685  99.809504
1  168.078537  62.041159
2  175.346978  74.322166
3  170.228132  79.800187
4  188.239668  96.497550
5  157.338384  51.550324
6  161.179495  70.941642
7  157.677693  62.909437
8  177.499762  93.598619
9  159.097154  46.653553


In [6]:
# Least-squares fit of Weight on Height using the training part only.
# r, p and se are unpacked too, so those names exist in the notebook namespace.
fit = stats.linregress(train_df["Height"], train_df["Weight"])
slope, intercept, r, p, se = fit
print(slope, intercept)

1.5493022523574467 -189.19619910355522


In [7]:
# RMSE on the held-out part: square root of the test-set MSE of the fitted line.
# CalculateMSE takes the intercept (w0) before the slope (w1).
test_mse = CalculateMSE(test_df["Height"], test_df["Weight"], intercept, slope)
print(test_mse**(1/2))

7.819720132105899


In [63]:
def holdout(df : pd.DataFrame, seed_amt : int, train_frac : float,
            x_col : str = "Height", y_col : str = "Weight"):
    """Repeated hold-out evaluation of a simple linear regression.

    For each seed ``0 .. seed_amt-1`` the frame is split with ``split_data``,
    a line is fitted to the train part with ``scipy.stats.linregress`` and the
    error metrics are computed on the test part. The per-seed values are then
    averaged.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing the predictor and target columns.
    seed_amt : int
        Number of repetitions; seeds ``0 .. seed_amt-1`` are used.
    train_frac : float
        Fraction of rows used for fitting in each repetition.
    x_col, y_col : str
        Names of the predictor and target columns. The defaults keep the
        original Height -> Weight behaviour.

    Returns
    -------
    dict
        ``"RMSE"``, ``"MAE"``, ``"RAE"``, ``"RRSE"``: mean test-set metric over
        all seeds. ``"r"``: mean correlation coefficient reported by
        ``linregress``, i.e. of the *training* fit. ``"STD"``: ``np.std`` of
        the per-seed RMSE values (NumPy default ``ddof=0``).

    Raises
    ------
    ValueError
        If ``seed_amt`` is smaller than 1, since there is nothing to average.
    """
    if seed_amt < 1:
        raise ValueError("seed_amt must be at least 1")

    rmse_list, mae_list, r_list, rae_list, rrse_list = [], [], [], [], []

    for seed in range(seed_amt):
        train_df, test_df = split_data(df, seed, train_frac)
        slope, intercept, r, _p, _se = stats.linregress(train_df[x_col], train_df[y_col])

        # The metric helpers take the intercept (w0) before the slope (w1).
        x_test, y_test = test_df[x_col], test_df[y_col]
        rmse_list.append(CalculateMSE(x_test, y_test, intercept, slope)**(1/2))
        mae_list.append(CalculateMAE(x_test, y_test, intercept, slope))
        r_list.append(r)
        rae_list.append(CalculateRAE(x_test, y_test, intercept, slope))
        rrse_list.append(CalculateRRSE(x_test, y_test, intercept, slope))

    return {"RMSE" : np.mean(rmse_list), "MAE" : np.mean(mae_list),
            "r" : np.mean(r_list), "RAE" : np.mean(rae_list),
            "RRSE" : np.mean(rrse_list), "STD" : np.std(rmse_list)}

In [74]:
# Sweep the train fraction from 10% to 90% on the 20-row sample; each setting is
# averaged over 100 seeded hold-out splits.
# Note: this rebinds the notebook-global `df`.
df = pd.read_csv("Data/Data/HeightWeight20.csv")
for i in range(1, 10):
    print(f"-----------------{i*10}%-------------------")
    holdout_result = holdout(df, seed_amt=100, train_frac=i/10)
    # holdout_result['r'] (mean correlation coefficient) is available but is
    # deliberately left out of the report.
    print(
        f"Average RMSE : {holdout_result['RMSE']:.4f}",
        f"Standard Deviation: {holdout_result['STD']:.4f}",
        f"Average MAE : {holdout_result['MAE']:.4f}",
        f"Average RAE : {holdout_result['RAE']:.4f}",
        f"Average RRSE : {holdout_result['RRSE']:.4f}",
        sep="\n",
    )

-----------------10%-------------------
Average RMSE : 27.5366
Standard Deviation: 57.4182
Average MAE : 23.6873
Average RAE : 1.7251
Average RRSE : 1.6088
-----------------20%-------------------
Average RMSE : 9.2948
Standard Deviation: 2.5371
Average MAE : 7.9804
Average RAE : 0.5696
Average RRSE : 0.5322
-----------------30%-------------------
Average RMSE : 8.4142
Standard Deviation: 1.8084
Average MAE : 7.4068
Average RAE : 0.5295
Average RRSE : 0.4842
-----------------40%-------------------
Average RMSE : 8.1178
Standard Deviation: 1.3866
Average MAE : 7.2618
Average RAE : 0.5366
Average RRSE : 0.4811
-----------------50%-------------------
Average RMSE : 7.8889
Standard Deviation: 1.2278
Average MAE : 7.1124
Average RAE : 0.5291
Average RRSE : 0.4709
-----------------60%-------------------
Average RMSE : 7.7964
Standard Deviation: 1.1417
Average MAE : 7.1255
Average RAE : 0.5416
Average RRSE : 0.4783
-----------------70%-------------------
Average RMSE : 7.5731
Standard Deviatio

In [75]:
# Sweep the train fraction from 10% to 90% on the 100-row sample; each setting
# is averaged over 100 seeded hold-out splits.
# Note: this rebinds the notebook-global `df`.
df = pd.read_csv("Data/Data/HeightWeight100.csv")
for i in range(1, 10):
    print(f"-----------------{i*10}%-------------------")
    holdout_result = holdout(df, seed_amt=100, train_frac=i/10)
    # holdout_result['r'] (mean correlation coefficient) is available but is
    # deliberately left out of the report.
    print(
        f"Average RMSE : {holdout_result['RMSE']:.4f}",
        f"Standard Deviation: {holdout_result['STD']:.4f}",
        f"Average MAE : {holdout_result['MAE']:.4f}",
        f"Average RAE : {holdout_result['RAE']:.4f}",
        f"Average RRSE : {holdout_result['RRSE']:.4f}",
        sep="\n",
    )

-----------------10%-------------------
Average RMSE : 6.3293
Standard Deviation: 0.5321
Average MAE : 5.0733
Average RAE : 0.4204
Average RRSE : 0.4347
-----------------20%-------------------
Average RMSE : 6.0292
Standard Deviation: 0.3498
Average MAE : 4.8210
Average RAE : 0.4005
Average RRSE : 0.4152
-----------------30%-------------------
Average RMSE : 5.9283
Standard Deviation: 0.3419
Average MAE : 4.7346
Average RAE : 0.3942
Average RRSE : 0.4083
-----------------40%-------------------
Average RMSE : 5.8755
Standard Deviation: 0.3515
Average MAE : 4.6947
Average RAE : 0.3926
Average RRSE : 0.4063
-----------------50%-------------------
Average RMSE : 5.8319
Standard Deviation: 0.4027
Average MAE : 4.6482
Average RAE : 0.3905
Average RRSE : 0.4047
-----------------60%-------------------
Average RMSE : 5.8103
Standard Deviation: 0.4989
Average MAE : 4.6349
Average RAE : 0.3913
Average RRSE : 0.4057
-----------------70%-------------------
Average RMSE : 5.8208
Standard Deviation: 

In [76]:
# Repeat the 20-row experiment on 100 different 20-row samples drawn from the
# full dataset, then average the hold-out summaries per train fraction.
full_df = pd.read_csv("Data/Data/HeightWeight.csv")

for i in range(1, 10):
    # Running totals of each summary value returned by holdout().
    totals = {"RMSE": 0, "STD": 0, "MAE": 0, "r": 0, "RAE": 0, "RRSE": 0}
    print(f"-----------------{i*10}%-------------------")

    for seed in range(100):
        # A fresh 20-row sample per seed, each evaluated with 100 hold-out splits.
        sample_20_df = full_df.sample(n=20, random_state=seed, ignore_index=True)
        holdout_result = holdout(sample_20_df, 100, i/10)
        for metric in totals:
            totals[metric] += holdout_result[metric]

    # The divisor 100 is the number of samples drawn in the loop above.
    avg_rmse = totals["RMSE"] / 100
    avg_std = totals["STD"] / 100
    avg_mae = totals["MAE"] / 100
    avg_r = totals["r"] / 100  # computed but deliberately not printed
    avg_rae = totals["RAE"] / 100
    avg_rrse = totals["RRSE"] / 100

    print(
        f"Average RMSE : {avg_rmse:.4f}",
        f"Standard Deviation: {avg_std:.4f}",
        f"Average MAE : {avg_mae:.4f}",
        f"Average RAE : {avg_rae:.4f}",
        f"Average RRSE : {avg_rrse:.4f}",
        sep="\n",
    )

-----------------10%-------------------
Average RMSE : 61.8519
Standard Deviation: 375.0051
Average MAE : 51.0554
Average RAE : 4.5010
Average RRSE : 4.6062
-----------------20%-------------------
Average RMSE : 7.7745
Standard Deviation: 3.4092
Average MAE : 6.3888
Average RAE : 0.5496
Average RRSE : 0.5653
-----------------30%-------------------
Average RMSE : 6.6320
Standard Deviation: 1.5479
Average MAE : 5.4435
Average RAE : 0.4742
Average RRSE : 0.4880
-----------------40%-------------------
Average RMSE : 6.2348
Standard Deviation: 1.1339
Average MAE : 5.1183
Average RAE : 0.4518
Average RRSE : 0.4649
-----------------50%-------------------
Average RMSE : 6.0118
Standard Deviation: 1.0651
Average MAE : 4.9445
Average RAE : 0.4473
Average RRSE : 0.4592
-----------------60%-------------------
Average RMSE : 5.8405
Standard Deviation: 1.1548
Average MAE : 4.8252
Average RAE : 0.4524
Average RRSE : 0.4624
-----------------70%-------------------
Average RMSE : 5.7076
Standard Deviati