In [1]:
import pandas as pd
from tqdm import tqdm
import os


class DatasetLoader():
    """Load a dataset that is stored as CSV files under per-split sub-directories.

    The directory passed to the constructor is expected to contain ``train``,
    ``val`` and ``test`` sub-directories. Every file found (recursively) below
    a split directory is treated as a CSV file belonging to that split.

    Attributes:
        dataset_directory_path: Root directory that holds the split folders.
        splits_csv_list: Dict mapping ``"train"``/``"val"``/``"test"`` to the
            list of file paths discovered for that split.
    """

    def __init__(self, dataset_directory_path="./12.12.-splits/") -> None:
        """Remember the dataset root and index the files of every split.

        Args:
            dataset_directory_path: Root directory containing the ``train``,
                ``val`` and ``test`` sub-directories.
        """
        self.dataset_directory_path = dataset_directory_path
        self.splits_csv_list = self._get_list_of_files_splitted(
            self.dataset_directory_path
        )

    def _get_list_of_files_splitted(self, data_dir: str):
        """Collect the paths of all files below each split directory.

        Args:
            data_dir: Root directory containing the split sub-directories.

        Returns:
            Dict with the keys ``"train"``, ``"val"`` and ``"test"``; each value
            is a sorted list of file paths. A missing split directory yields an
            empty list because ``os.walk`` produces nothing for it.
        """
        dic_of_files = {}
        for split in ["train", "val", "test"]:
            split_data_dir = os.path.join(data_dir, split)
            # os.walk reports entries in filesystem order, which can differ
            # between machines; sorting keeps the row order of the loaded data
            # reproducible.
            dic_of_files[split] = sorted(
                os.path.join(root, name)
                for root, _dirs, files in os.walk(split_data_dir, topdown=False)
                for name in files
            )
        return dic_of_files

    def _get_X_Y(
        self,
        csv_list: list
    ):
        """Read every CSV in ``csv_list`` and stack them into features and labels.

        Args:
            csv_list: Paths of the CSV files to read. Each file must contain a
                ``teamVictory`` column, which is used as the label.

        Returns:
            Tuple ``(X, Y)`` of DataFrames sharing a fresh 0..n-1 index. ``X``
            holds every column except ``teamVictory`` (and except
            ``Unnamed: 0`` when present); ``Y`` is a single-column DataFrame
            named ``teamVictory``.

        Raises:
            ValueError: If ``csv_list`` contains no files.
        """
        feature_parts = []
        label_parts = []
        for csv_path in tqdm(csv_list):
            df_raw = pd.read_csv(csv_path)
            feature_parts.append(df_raw.drop('teamVictory', axis=1))
            label_parts.append(df_raw['teamVictory'])

        if not feature_parts:
            raise ValueError("No CSV files to load: csv_list is empty")

        # Concatenate once at the end: concatenating inside the loop copies all
        # previously accumulated rows on every iteration.
        all_X = pd.concat(feature_parts, ignore_index=True)
        all_y = pd.concat(label_parts, ignore_index=True)

        # 'Unnamed: 0' is the column pandas creates when a CSV was saved with
        # its index; tolerate files that were written without it.
        all_X = all_X.drop(columns=['Unnamed: 0'], errors='ignore')

        # Labels are returned as a one-column DataFrame, not as a Series.
        return all_X, all_y.to_frame()

    def load_datasets(self, split="train"):
        """Load all CSV files of one split.

        Args:
            split: One of ``"train"``, ``"val"`` or ``"test"``.

        Returns:
            Tuple ``(X, Y)`` as produced by ``_get_X_Y``.

        Raises:
            KeyError: If ``split`` is not one of the indexed splits.
            ValueError: If the split contains no files.
        """
        csv_list = self.splits_csv_list[split]
        print(f"Loading {len(csv_list)} files from {split} split")
        return self._get_X_Y(csv_list=csv_list)


# Index the train/val/test split folders found under the dataset directory.
dl = DatasetLoader(
    dataset_directory_path="./datasets/12.12.-splits",
)

In [2]:
# Load each split; every call returns a (features, labels) pair of DataFrames
# and prints how many files were read for that split.
X_train, Y_train = dl.load_datasets(split="train")
X_val,   Y_val   = dl.load_datasets(split="val")
X_test,  Y_test  = dl.load_datasets(split="test")

Loading 87 files from train split


100%|██████████| 87/87 [00:03<00:00, 27.81it/s]


Loading 5 files from val split


100%|██████████| 5/5 [00:00<00:00, 61.33it/s]


Loading 5 files from test split


100%|██████████| 5/5 [00:00<00:00, 60.10it/s]
