In [28]:
import math
import pandas as pd
from scipy import stats

In [53]:
def split_data_to_fold(df : pd.DataFrame, seed : int, folds : int):
    """Shuffle ``df`` and cut it into ``folds`` consecutive, near-equal pieces.

    Args:
        df: Frame whose rows are to be distributed across folds.
        seed: Passed to ``DataFrame.sample`` as ``random_state`` so the
            shuffle is reproducible.
        folds: Number of pieces to produce.

    Returns:
        A list of ``folds`` DataFrames. Each is an ``iloc`` slice of the
        shuffled frame, so its index labels continue from the previous
        fold rather than restarting at 0. When the row count is not
        divisible by ``folds``, the leading folds each hold one extra row.
    """
    base_size, extra = divmod(df.shape[0], folds)
    # The first `extra` folds absorb the leftover rows, one apiece.
    fold_sizes = [base_size + 1 if k < extra else base_size for k in range(folds)]

    # frac=1.0 samples every row, i.e. a full shuffle; ignore_index relabels 0..n-1.
    shuffled = df.sample(frac=1.0, random_state=seed, ignore_index=True)

    parts = []
    offset = 0
    for size in fold_sizes:
        stop = offset + size
        parts.append(shuffled.iloc[offset:stop])
        offset = stop

    return parts

def CalculateRMSE(x_positions, y_positions, w0, w1):
    """Root-mean-square error of the line ``w0 + w1 * x`` against observed ``y``.

    Args:
        x_positions: Sized iterable of x values (list, array, pandas Series, ...).
        y_positions: Sized iterable of observed y values, same length as
            ``x_positions``.
        w0: Intercept of the line.
        w1: Slope of the line.

    Returns:
        ``sqrt(mean((w0 + w1*x - y) ** 2))`` over the paired points.

    Raises:
        ValueError: If there are no data points or the two inputs differ
            in length.
    """
    n = len(x_positions)
    if n == 0:
        raise ValueError("CalculateRMSE requires at least one data point")
    if len(y_positions) != n:
        raise ValueError("x_positions and y_positions must have the same length")

    # Pair the values positionally. Subscripting with 0..n-1 is a *label*
    # lookup on a pandas Series and breaks when the index does not start at 0
    # (e.g. a slice of a larger frame); iterating yields values in order.
    squared_error = sum(
        (w0 + w1 * x - y) ** 2 for x, y in zip(x_positions, y_positions)
    )
    return (squared_error / n) ** 0.5

def train(df):
    """Fit a least-squares line predicting ``Weight`` from ``Height``.

    Args:
        df: DataFrame providing ``"Height"`` and ``"Weight"`` columns.

    Returns:
        ``(slope, intercept)`` of the fitted line, in that order.
    """
    # linregress also reports r, p-value and stderr; only the line itself is needed.
    fit = stats.linregress(df["Height"], df["Weight"])
    return fit.slope, fit.intercept

def cross_validation(fold_data):
    """Run k-fold cross-validation and return the held-out RMSE of each fold.

    For every fold in turn, a line is fitted with ``train`` on the
    concatenation of all *other* folds and scored with ``CalculateRMSE`` on
    the held-out fold.

    Args:
        fold_data: Sequence of DataFrames, each with ``"Height"`` and
            ``"Weight"`` columns (e.g. the output of ``split_data_to_fold``).
            The frames are not modified.

    Returns:
        A list with one RMSE value per fold, in fold order.
    """
    RMSE = []
    for i, fold in enumerate(fold_data):
        # Work on a re-indexed copy. The previous in-place reset_index added
        # an "index" column to the caller's fold on every call, so repeated
        # runs on the same folds kept growing them and eventually raised.
        test_df = fold.reset_index(drop=True)
        train_folds = [other for j, other in enumerate(fold_data) if j != i]
        slope, intercept = train(pd.concat(train_folds, ignore_index=True))
        # CalculateRMSE takes (x, y, w0, w1): intercept first, then slope.
        RMSE.append(
            CalculateRMSE(test_df["Height"], test_df["Weight"], intercept, slope)
        )
    return RMSE

In [62]:
# Load the height/weight dataset from a path relative to the notebook.
df = pd.read_csv("data/HeightWeight20.csv")
# Shuffle with seed 2 and split into 4 folds.
fold_df = split_data_to_fold(df,2,4)
# One held-out RMSE per fold.
RMSE_list = cross_validation(fold_df)
print(RMSE_list)



9.732602697813538
6.222517661517613
7.316986352177761
5.767329906413159


1
[2, 3, 4]
-------
2
[1, 3, 4]
-------
3
[1, 2, 4]
-------
4
[1, 2, 3]
-------
