In [None]:
import torch 
from torch.utils.data import Dataset 
import numpy as np 
import pandas as pd
import os 
import dill as pickle


In [None]:
# --- Notebook configuration -------------------------------------------------

# Input CSV holding the raw data.
# NOTE(review): the original note said this must be an absolute path, but the
# value below is relative -- confirm which one is intended.
raw_data_path = "Data/Raw/heart.csv"

# Directory the pickled train/test datasets are written to.
path_to_save_dir = "Data/Datasets/Heart"

# 0-based position of the label column (example value; adjust per dataset).
labels_col = 13

# Fraction of rows used for training; the remainder is the test fraction.
train_percent = 0.7
test_percent = 1 - train_percent


In [None]:
class TrainDataSet(Dataset):
    """Training split: a random subset of the rows of a numeric CSV file.

    The CSV is read with every column parsed as float32 and the columns are
    renamed to the integers ``0 .. n_cols - 1``. A random permutation of the
    row positions is drawn and its first ``int(n_rows * train_percent)``
    entries become the training rows.

    Attributes:
        rows: 1-D integer array with the positional indices of the selected
            rows (pass it to ``TestDataSet`` to obtain the complement).
        x: float32 tensor of shape ``(n_samples, n_cols - 1)`` with every
            column except ``labels_col``.
        y: float32 tensor of shape ``(n_samples, 1)`` with the label column.
        n_samples: number of rows in this split.

    Args:
        raw_data_path: path of the CSV file to read.
        train_percent: fraction of rows to sample, between 0 and 1 inclusive.
        labels_col: integer position of the label column after renaming.
        seed: optional seed for the shuffle. ``None`` (the default) keeps the
            previous behaviour and draws from NumPy's global random state;
            any other value makes the split reproducible without touching
            the global state.

    Raises:
        ValueError: if ``train_percent`` is not within ``[0, 1]``.
    """

    def __init__(self, raw_data_path, train_percent, labels_col, seed=None):
        # Reject fractions that would silently produce a nonsensical slice
        # (e.g. a negative value turns into a negative slice bound).
        if not 0 <= train_percent <= 1:
            raise ValueError(
                "train_percent must be between 0 and 1, got {!r}".format(train_percent)
            )

        data = pd.read_csv(raw_data_path, dtype=np.float32)
        data.columns = list(range(data.shape[1]))  # columns are now 0 .. n_cols - 1
        print(data.shape)

        n_rows = data.shape[0]
        n_train = int(n_rows * train_percent)
        if seed is None:
            shuffled = np.random.permutation(n_rows)
        else:
            # Private generator: reproducible and leaves np.random untouched.
            shuffled = np.random.RandomState(seed).permutation(n_rows)
        self.rows = shuffled[:n_train]

        feature_cols = data.columns != labels_col  # boolean mask over columns
        self.x = torch.tensor(
            data.iloc[self.rows, feature_cols].values, dtype=torch.float32
        )
        # unsqueeze(1) turns the label vector into an (n_samples, 1) column.
        self.y = torch.tensor(
            data[labels_col].iloc[self.rows].values, dtype=torch.float32
        ).unsqueeze(1)
        self.n_samples = len(self.x)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of rows in this split."""
        return self.n_samples
    
class TestDataSet(Dataset):
    """Held-out split: every CSV row that is not listed in ``train_rows``.

    The CSV is read with all columns as float32 and the columns are renamed
    to ``0 .. n_cols - 1``. Rows are kept in their original file order.

    Args:
        raw_data_path: path of the CSV file to read.
        test_percent: accepted for signature compatibility but never read;
            the size of the split is determined solely by ``train_rows``.
        labels_col: integer position of the label column after renaming.
        train_rows: positional indices of the rows to exclude.
    """

    def __init__(self, raw_data_path, test_percent, labels_col, train_rows):
        frame = pd.read_csv(raw_data_path, dtype=np.float32)
        n_rows, n_cols = frame.shape
        frame.columns = list(range(n_cols))  # integer column labels 0 .. n_cols - 1

        # Start with every row selected, then switch off the training rows.
        keep = np.full(n_rows, True)
        keep[train_rows] = False
        held_out = frame.iloc[keep]

        is_feature = held_out.columns != labels_col
        features = held_out.loc[:, is_feature].values
        self.x = torch.tensor(features, dtype=torch.float32)

        labels = held_out[labels_col].values
        # Labels are stored as a column vector of shape (n_samples, 1).
        self.y = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

        self.n_samples = self.x.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = (self.x[index], self.y[index])
        return sample

    def __len__(self):
        """Return the number of held-out rows."""
        return self.n_samples


In [None]:
def save_dataset(dataset, path_to_save_dir, file_name):
    """Pickle ``dataset`` to ``<path_to_save_dir>/<file_name>``.

    The target directory (including missing parents) is created when it does
    not exist yet, so the function can be called without preparing the
    directory first. An existing file with the same name is overwritten.

    Args:
        dataset: object to serialize with ``pickle.dump`` (``pickle`` is the
            name this file binds to ``dill``).
        path_to_save_dir: directory that receives the file.
        file_name: name of the file inside ``path_to_save_dir``.
    """
    # An empty directory string means "current directory"; os.makedirs('')
    # would raise, so only create the directory when one was actually given.
    if path_to_save_dir:
        os.makedirs(path_to_save_dir, exist_ok=True)

    target_path = os.path.join(path_to_save_dir, file_name)
    with open(target_path, 'wb') as f:
        pickle.dump(dataset, f)


In [None]:
# Build the train/test splits from the raw CSV and pickle them under
# path_to_save_dir.
train_set = TrainDataSet(raw_data_path, train_percent, labels_col)
# The second argument is the test fraction, so pass test_percent (the original
# passed train_percent here). The test rows are the ones train_set.rows left out.
test_set = TestDataSet(raw_data_path, test_percent, labels_col, train_set.rows)

# exist_ok=False raises FileExistsError when the directory already exists, so
# re-running this cell does not overwrite a previously saved split.
# NOTE(review): presumably intentional -- confirm, since it makes a re-run fail.
os.makedirs(path_to_save_dir, exist_ok=False)

save_dataset(train_set, path_to_save_dir, 'train.pkl')
save_dataset(test_set, path_to_save_dir, 'test.pkl')

In [None]:
#test