In [1]:
import os, sys

import numpy as np
import pandas as pd
from easydict import EasyDict

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import (
    DataLoader
    , Dataset
    , SubsetRandomSampler
)

# Modularity in Deep Learning Projects

To make our code more understandable, we should decompose our huge blobs of modeling code into a collection of smaller, easy-to-read code files. Further, we can organize these smaller files into several categories, namely:

* Data Loader
* Graphs
    + Models
    + Loss Layers
* Utils
* Config

To piece these separate components together, we use so-called **Agents**. In plain English, or in computer science terminology, an Agent plays the role of the main function (actually a main "object" rather than a function, as explained later). To conduct experiments, we simply instantiate the corresponding Agent in a driver function and call its methods.

In short, we are able to sort out the dependencies between code files in a hierarchical manner, which makes it easier for others to approach our project. This is **NOT** my own idea. The project template that I find really satisfying can be found [here](https://github.com/moemen95/PyTorch-Project-Template#tutorials) (please give them a star :D).

Although their GitHub repo already contains everything needed to thoroughly explain the idea mentioned above, the code files are scattered across their own folders, which makes referencing files back and forth annoying. In this tutorial, I will try to give a vanilla version and put all of the modules in a single file (note that some of the features may get lost). The modeling project in this notebook is a *Binary Classification* problem using a *Logistic Regression* model based on the *Breast Cancer* dataset.

## Data Loader

Generally speaking, Data Loaders convert raw data files, no matter the format, into tensors for training. For `PyTorch` projects, we should learn to use the `DataLoader` and `Dataset` abstractions to help us organize the datasets. A more detailed introduction to these two objects is given in another tutorial. Here I simply copy the code for loading the `Breast Cancer` dataset from that tutorial.

In a word, we use a `Dataset` to read in the raw data file and convert it into *a set of Records*, e.g. a pair of predictors and the corresponding target stored in a tuple. We also define how the dataset can be sliced. A `DataLoader` is then created on top of a given dataset, and it gives us a uniform interface to access and operate on the underlying dataset during the training session.

So the general workflow is to instantiate a `DataLoader`, and, during the instantiation, it (1) creates a `Dataset` object, (2) shuffles the dataset, (3) "splits" the set according to certain configuration, and (4) defines iteration scheme, such as batch size. 



In [None]:
class BreastCancerDataset(Dataset):
    """Breast-cancer records loaded from a header-less CSV file.

    The file is read with its first column as the row index. The next
    column holds the diagnosis label and every remaining column is treated
    as a numeric predictor.

    Attributes:
        df: The loaded ``DataFrame`` with the label column recoded to numbers.
        factor: Float tensor of predictors, one row per record.
        target: Float tensor of targets with a single column.
    """

    # Recode the targets such that:
    #   M(alignant) == 1
    #   B(enign) == 0
    LABEL_CODES = {'M': 1, 'B': 0}

    def __init__(self, path):
        """Read the CSV at ``path`` and build the predictor/target tensors.

        Args:
            path: Location of the CSV file (anything ``pd.read_csv`` accepts).

        Raises:
            ValueError: If the label column holds a value that is neither a
                known label ('M'/'B') nor already numeric.
        """

        # Let's just use pandas to read csv data
        self.df = pd.read_csv(
            path
            , header=None  # This file contains no header
            , index_col=0  # First column is an index column
        )

        # After `index_col=0` the label is the first remaining column.
        label_col = self.df.columns[0]
        labels = self.df[label_col]

        # Recode explicitly instead of relying on `DataFrame.replace` to
        # downcast an object column to numbers: that implicit downcast is
        # deprecated in recent pandas, and without it `torch.tensor` would
        # receive an object array. Values that are not known labels pass
        # through untouched so already-numeric targets keep working.
        recoded = labels.map(
            lambda raw_label: self.LABEL_CODES.get(raw_label, raw_label)
        )
        try:
            self.df[label_col] = pd.to_numeric(recoded)
        except (ValueError, TypeError) as exc:
            raise ValueError(
                "Label column contains values other than "
                f"{sorted(self.LABEL_CODES)} or numbers"
            ) from exc

        # Split X, Y and convert to tensors
        self.factor = torch.tensor(
            self.df.iloc[:, 1:].to_numpy(), dtype=torch.float
        )
        self.target = torch.tensor(
            self.df.iloc[:, :1].to_numpy(), dtype=torch.float
        )
        return

    def __len__(self):
        """Return the number of records (rows of the target tensor)."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return ``(predictors, target)`` for ``idx``, predictors first."""
        return self.factor[idx], self.target[idx]


class BreastCancerDataLoader(object):
    """Builds train and validation ``DataLoader`` objects over one dataset.

    Config keys read:
        path_dir, path_file: Joined with ``os.path.join`` to locate the CSV.
        pct_train: Fraction of the samples assigned to the training split;
            must lie in ``[0, 1]``.
        batch_size: Batch size used by both loaders.
        seed: Optional. When present (and not ``None``) it seeds a dedicated
            ``torch.Generator`` used for the split and for both samplers, so
            the split and batch order are repeatable. When absent, the
            global torch RNG is used.

    Attributes:
        config: The configuration wrapped in an ``EasyDict``.
        dataset: The underlying ``BreastCancerDataset``.
        idxs_train, idxs_valid: Numpy arrays of sample indices per split.
        loader_train, loader_valid: ``DataLoader`` objects for each split.

    Raises:
        ValueError: If ``pct_train`` is outside ``[0, 1]``.
    """

    def __init__(self, config):
        self.config = EasyDict(config)

        # Reject fractions that would silently yield a wrong or reversed
        # split (negative values slice from the end, values > 1 empty the
        # validation set).
        pct_train = self.config.pct_train
        if not 0 <= pct_train <= 1:
            raise ValueError(
                f"pct_train must be within [0, 1], got {pct_train!r}"
            )

        # Load data
        self.dataset = BreastCancerDataset(
            os.path.join(self.config.path_dir, self.config.path_file)
        )

        # Optional reproducibility: a private generator keeps the global
        # torch RNG state untouched.
        seed = self.config.get("seed")
        generator = None
        if seed is not None:
            generator = torch.Generator()
            generator.manual_seed(seed)

        # Split train test, possibly dev set
        #   1. Create indices
        #   2. Make a sampler
        #   3. Create separate data loaders, feeding both the dataset and sampler
        n_sample = len(self.dataset)
        cut_train = int(pct_train * n_sample)
        idxs_full = torch.randperm(n_sample, generator=generator).numpy()  # Shuffle

        self.idxs_train = idxs_full[:cut_train]
        self.idxs_valid = idxs_full[cut_train:]

        splr_train = SubsetRandomSampler(self.idxs_train, generator=generator)
        splr_valid = SubsetRandomSampler(self.idxs_valid, generator=generator)

        self.loader_train = DataLoader(
            self.dataset
            , sampler=splr_train
            , batch_size=self.config.batch_size
        )
        self.loader_valid = DataLoader(
            self.dataset
            , sampler=splr_valid
            , batch_size=self.config.batch_size
        )
        return

## Graph: Model and Loss Function

For this simple logistic regression demo, we don't really need a custom loss function; we simply instantiate the `BCELoss` object. Here we only define the logistic regression class.

In [None]:
class LogisticRegres(nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid.

    ``config`` must provide ``n_factor``, which is used as the input width
    of the linear layer. The layer has a single output unit and a bias.
    """

    def __init__(self, config):
        super().__init__()
        self.config = EasyDict(config)

        n_inputs = self.config.n_factor
        self.linear = nn.Linear(n_inputs, out_features=1, bias=True)

    def forward(self, x):
        """Apply the linear layer to ``x`` and squash the result with a sigmoid."""
        return self.linear(x).sigmoid()

## Combining Everything: Agent

In [None]:
class BreastCancerAgent(object):
    """Placeholder agent: accepts the two configurations but does nothing yet."""

    def __init__(self, config_data, config_model):
        # Not implemented yet: both arguments are accepted and ignored.
        pass

In [None]:
# Data settings; the keys match those read by BreastCancerDataLoader.
CONFIG_DATA = dict(
    path_dir="../data/classification/breast_cancer",
    path_file="data.csv",
    pct_train=0.7,
    batch_size=4,
)

# Model settings; `n_factor` is the input width read by LogisticRegres.
CONFIG_MODEL = dict(n_factor=30)