In [12]:
import os
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm

# Configuration

In [5]:
# Notebook-wide settings, collected in one place so they are easy to tune.
config = dict(
    data_root='./data',  # directory handed to TitanicDataset as `data_root`
    device='mps',        # device string; not consumed in the cells shown here — presumably passed to torch later
)

# Dataset

In [14]:
from torch.utils.data import Dataset

In [38]:
class TitanicDataset(Dataset):
    """Titanic passenger table exposed as a map-style PyTorch ``Dataset``.

    The CSV (``train.csv`` when ``training`` is true, otherwise ``test.csv``)
    is read once from ``data_root`` and converted once into a dense
    ``float64`` feature matrix, so ``__getitem__`` is a cheap row lookup
    instead of re-encoding whole columns on every access.

    Feature layout (10 values per sample)::

        [pclass, female, male, age, sibsp, parch, fare,
         embarked(C), embarked(Q), embarked(S)]

    Args:
        data_root: Directory containing ``train.csv`` / ``test.csv``.
        training: If true, load ``train.csv`` and expose the ``Survived``
            column as the label; otherwise load ``test.csv`` and return
            ``label=None``.
        transform: Optional callable applied to the sample dict; its return
            value is what ``__getitem__`` returns.

    Attributes:
        dataset: The raw ``pandas.DataFrame`` as read from disk. Features are
            cached at construction time, so later edits to this frame are not
            reflected in the samples.
    """

    # Fixed vocabularies keep the one-hot width and column order identical for
    # every split, even when a category is absent from the loaded file.
    # A value outside the vocabulary (including a missing value) encodes as
    # all zeros.
    SEX_CATEGORIES = ('female', 'male')
    EMBARKED_CATEGORIES = ('C', 'Q', 'S')

    def __init__(self, data_root, training=False, transform=None):
        self.data_root = data_root
        self.training = training
        self.transform = transform

        file_name = 'train.csv' if training else 'test.csv'
        self.dataset = pd.read_csv(os.path.join(data_root, file_name))

        # Encode everything once up front; rows are then served positionally.
        self._features = self._build_features(self.dataset)
        self._labels = (
            self.dataset['Survived'].to_numpy(dtype=np.float64)
            if training else None
        )

    @staticmethod
    def _one_hot(column, categories):
        """Return an ``(n, len(categories))`` float64 one-hot matrix for ``column``."""
        categorical = pd.Categorical(column, categories=list(categories))
        return pd.get_dummies(categorical, dtype=np.float64).to_numpy(dtype=np.float64)

    @classmethod
    def _build_features(cls, frame):
        """Assemble the ``(n, 10)`` float64 feature matrix from the raw frame."""
        def numeric(column_name):
            return frame[column_name].to_numpy(dtype=np.float64).reshape(-1, 1)

        # Missing ages are replaced with the sentinel -1.
        age = frame['Age'].fillna(-1).to_numpy(dtype=np.float64).reshape(-1, 1)

        # NOTE(review): unlike Age, Fare has no missing-value guard, so a
        # missing fare in the CSV would surface as NaN in the feature vector.
        # Confirm whether an imputation/sentinel is wanted here.
        return np.hstack([
            numeric('Pclass'),
            cls._one_hot(frame['Sex'], cls.SEX_CATEGORIES),
            age,
            numeric('SibSp'),
            numeric('Parch'),
            numeric('Fare'),
            cls._one_hot(frame['Embarked'], cls.EMBARKED_CATEGORIES),
        ])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Returns:
            ``{'feature': ndarray of shape (10,), 'label': 0-d float64 ndarray
            or None}``, or whatever ``transform`` returns for that dict.

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        # Hand out a copy so a transform cannot mutate the cached matrix.
        feature = self._features[idx].copy()

        if self._labels is not None:
            label = np.array(self._labels[idx], dtype=np.float64)
        else:
            label = None

        # feature: [pclass, female, male, age, sibsp, parch, fare, embarked(C), embarked(Q), embarked(S)]
        sample = {'feature': feature, 'label': label}
        if self.transform:
            return self.transform(sample)
        return sample

In [41]:
# Smoke check: load the test split and inspect one sample.
dataset = TitanicDataset(training=False, data_root=config['data_root'])
dataset[3]

{'feature': array([ 3.    ,  0.    ,  1.    , 27.    ,  0.    ,  0.    ,  8.6625,
         0.    ,  0.    ,  1.    ]),
 'label': None}

# Data Loader

# Network

# Loss

# Metrics

# Trainer

# Train