In [1]:
# going modular: adapting the code that we have written in Kaggle notebooks so far to be used in python scripts
# the directory structure will be discussed further down the line

# Our goal is to train pytorch model using a single line in CMD/Terminal for e.g.:
# python train.py --model Flower Image Classifier --batch_size 32 --lr 0.001 --num_epochs 2

In [1]:
# creating a directory for going modular

# necessary imports
from pathlib import Path

# using Path
GOING_MODULAR = Path("./going_modular")

# checking if directory already exists
if GOING_MODULAR.is_dir():
  print(f"{GOING_MODULAR} directory already exists...")
else:
  print(f"{GOING_MODULAR} directory does not exist, creating...")
  GOING_MODULAR.mkdir(parents=True, exist_ok=True)
    



going_modular directory does not exist, creating...


In [2]:
# installing torchmetrics since it is needed later
import pkgutil

if pkgutil.find_loader('torchmetrics') is not None:
    print("Package is installed")
else:
    print("Package is not installed, installing...")
    print()
    !pip install torchmetrics

Package is installed


In [3]:
# Download the dataset
# This cell has to run only once. 
# NO need to run every time you arrive on this notebook. 

import requests
import tarfile
import os
import shutil

# Define the URL and folder paths
url = "https://s3.amazonaws.com/content.udacity-data.com/nd089/flower_data.tar.gz"
folder_name = "flowers"
file_name = "flower_data.tar.gz"
file_path = os.path.join(folder_name, file_name)

# Remove the folder or symbolic link if it already exists (equivalent to `rm -rf flowers`)
try:
    if os.path.islink(folder_name) or os.path.isfile(folder_name):
        os.remove(folder_name)  # Remove the symbolic link or file
    elif os.path.isdir(folder_name):
        shutil.rmtree(folder_name)  # Remove the directory
    print(f"Removed existing {folder_name} folder/file/soft link, if any.")
except FileNotFoundError:
    pass  # If the file or directory does not exist, do nothing

# Create the folder
os.makedirs(folder_name)
print(f"Created folder: {folder_name}")

# Download the file
response = requests.get(url, stream=True)

# Save the file in the 'flowers' folder
with open('flower_data.tar.gz', "wb") as file:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            file.write(chunk)

print(f"Downloaded {file_name} to {folder_name}")

# Extract the file in the 'flowers' folder
if file_path.endswith("tar.gz"):
    with tarfile.open('flower_data.tar.gz', "r:gz") as tar:
        tar.extractall(path=folder_name)
        print(f"Extracted {file_name} to {folder_name}")

# Clean up by removing the tar.gz file after extraction
os.remove('flower_data.tar.gz')
print(f"Removed the downloaded tar.gz file: {file_path}")


Removed existing flowers folder/file/soft link, if any.
Created folder: flowers
Downloaded flower_data.tar.gz to flowers
Extracted flower_data.tar.gz to flowers
Removed the downloaded tar.gz file: flowers/flower_data.tar.gz


In [4]:
%%writefile going_modular/data_acquire.py
"""Module to download data in PyTorch standard image classification format.

Used for setting up data directories for downlaoding data as per
the directory structure. Contains function 'data_download' that takes
input from user for where to download the data and returns
"TRAIN_PATH" and "TEST_PATH".

Typical usage example:
  from going_modular.data_acquire import data_download
  TRAIN_PATH, TEST_PATH = data_download(path=USER_GIVEN_PATH)
"""

# necessary imports to download and load the dataset
import requests
import zipfile
import os
from pathlib import Path

# Define the URL and folder paths
url = "https://s3.amazonaws.com/content.udacity-data.com/nd089/flower_data.tar.gz"
folder_name = "flowers"
file_name = "flower_data.tar.gz"
file_path = os.path.join(folder_name, file_name)

def data_download(path: str = 'file_path'):
    """Downloads subset of FOOD 101 at location defined by <path> parameter.

    Args:
        path: A string defining the path to the download location.
          By default <path> is set to a string: 'vision_datasets'.

    Returns:
        A tuple of pathlib.PosixPath objects defining the training and
        test set locations.

        (TRAIN_PATH, TEST_PATH)
    """

    # defining the location
    FLOWER_DATASETS = Path(f"./{path}")
    #FLOWER_DATASET 

    # if the path does not exist, download and prepare dataset
    if FLOWER_DATASETS.is_dir():
        print(f"{FLOWER_DATASETS} directory already exists...")
    else:
        print(f"{FLOWER_DATASETS} directory does not exist, creating...")
        FLOWER_DATASETS.mkdir(parents=True, exist_ok=True)

    # download flower dataset
    with open('flower_data.tar.gz', "wb") as f:
        request = requests.get("https://s3.amazonaws.com/content.udacity-data.com/nd089/flower_data.tar.gz")
        print("Downloading flower images")
        f.write(request.content)

    # unzipping flower dataset
    #with zipfile.ZipFile('flower_data.tar.gz', 'r') as zip_ref:
        #print("Unzipping flower dataset...")
        #zip_ref.extractall(FLOWER_DATASETS)

    # removing the downloaded .zip
    #os.remove('flowers/flower_data.tar.gz'.zip)

    # setup train and test paths
    TRAIN_PATH = FLOWER_DATASETS / "train"
    TEST_PATH = FLOWER_DATASETS / "test"

    # returning TRAIN_PATH and TEST_PATH
    return TRAIN_PATH, TEST_PATH


Writing going_modular/data_acquire.py


In [5]:
%%writefile going_modular/data_preprocess.py
"""Module to preprocess datasets and convert to PyTorch datasets + dataloaders.

For preprocessing the downloaded dataset as required and turning it to PyTorch
compatible 'torchvision.datasets.ImageFolder' and further using this dataset to
build dataloaders using 'torch.utils.data.DataLoader'.

Typical usage example:
  from going_modular.data_preprocess import data_setup
  trainloader, testloader, classes, class_to_idx = data_setup(
                                    train_path=PATH_TO_TRAIN_DATA,
                                    test_path=PATH_TO_TEST_DATA,
                                    data_transform=SOME_TRANSFORM)
"""


# necessary imports
import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# setting number of workers for data laoding
NUM_WORKERS = os.cpu_count()

# function to return iterables over the train and test datasets
def data_setup(train_path: str,
               test_path: str,
               data_transform: transforms.Compose,
               batch_size: int = 32,
               shuffle: bool = True):
    """For making datasets and their corresponding dataloaders.

    Args:
       train_path: A string defining the path to the training dataset location.
       test_path: A string defining the path to the testing dataset location.
       data_transform: An instance of transforms.Compose.
       batch_size: An integer defining the batch size used (default: int '32').
    shuffle: Bool value for whether to shuffle training dataset (default: True)

   Returns:
       A tuple of an instance each of 'torch.utils.data.DataLoader' for
       train and test sets, list of classes in the dataset,
       dict mapping a class to its corresponding numerical label.

    (trainloader, testloader, classes, class_to_idx)
  """

    # loading datasets
    data_dir = 'flowers'
    train_dir = data_dir + '/train'
    test_dir = data_dir + '/test'
    train_set = datasets.ImageFolder(data_dir + '/train', transform=data_transform)
    test_set = datasets.ImageFolder(data_dir + '/test', transform=data_transform)

    # making dataloaders from datasets
    trainloader = torch.utils.data.DataLoader(train_path, batch_size=32, shuffle=True)
    testloader = torch.utils.data.DataLoader(test_path, batch_size=32, shuffle=True)

    # defining class list and class to label dictionary
    classes = train_set.classes
    class_to_idx = train_set.class_to_idx

    # returning dataloaders, classes, class_to_idx
    return trainloader, testloader, classes, class_to_idx


Writing going_modular/data_preprocess.py


In [6]:
%%writefile going_modular/model_builder.py
"""Module to define PyTorch model class and return its instance.

VGG16 class inherits from 'nn.Module' in order to create
a PyTorch module. '__init__' method of the class has parameters to take
the number of kernels per layer <num_hidden_units> (default: int '10')
and number of classes for the classification problem
<num_classes> (default: int '10').

Typical usage example:
  import torch
  from going_modular.model_builder import TinyVGGCNNExplainer
  model = VGG16(nn.Module)
  model.to('cuda' if torch.cuda.is_available() else 'cpu')
"""

# necessary imports
import torch
import torch.nn as nn

# creating VGG16 for modelling the data

class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU())
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(), 
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU())
        self.layer5 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer6 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU())
        self.layer7 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer8 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        
        
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(7*7*256, 25088),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(25088, 25088),
            nn.ReLU())
        self.fc2= nn.Sequential(
            nn.Linear(25088, num_classes))
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.layer6(out)
        out = self.layer7(out)
        out = self.layer8(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

Writing going_modular/model_builder.py


In [7]:
%%writefile going_modular/engine.py
# Imports here
#%matplotlib inline
#%config InlineBackend.figure_format = 'retina'

import torchvision

import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import time
import torch
from collections import OrderedDict

from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from PIL import Image

epochs = 3
steps = 0
running_loss = 0
print_every = 5
for epoch in range(epochs):
    for inputs, labels in trainloader:
        steps += 1
        # Move input and label tensors to the default device
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        logps = model.forward(inputs)
        loss = criterion(logps, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        
        if steps % print_every == 0:
            valid_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for inputs, labels in validloader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    logps = model.forward(inputs)
                    batch_loss = criterion(logps, labels)
                    
                    valid_loss += batch_loss.item()
                    
                    # Calculate accuracy
                    ps = torch.exp(logps)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
                    
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/print_every:.3f}.. "
                  f"Validation loss: {valid_loss/len(validloader):.3f}.. "
                  f"Validation accuracy: {accuracy/len(validloader):.3f}")
            running_loss = 0
            model.train()
            


Writing going_modular/engine.py


In [8]:
%%writefile train.py
"""Script to initiate the training from terminal.

This script connects all the modules in 'going_modular'. Building the complete
training pipeline + saving the trained model to './saved_models' directory.

Typical usage example (from terminal):
  !python train.py --batch_size 16 --num_hidden_units 32 --learning_rate 0.01 --epochs 3
"""

# necessary imports
import torch
import argparse
from torchvision import transforms
from going_modular.data_acquire import data_download
from going_modular.data_preprocess import data_setup
from going_modular.model_builder import TinyVGGCNNExplainer
from going_modular.engine import metrics, optim_utils, training_step, testing_step, training_eval_loop, save_model

# setting up the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# parsing arguments from terminal/CMD
ap = argparse.ArgumentParser()
ap.add_argument("-bs", "--batch_size", type=int, default=32,
                help="batch size to be used")
ap.add_argument("-nh", "--num_hidden_units", type=int, default=10,
                help="number of hidden units for TinyVGG architecture")
ap.add_argument("-lr", "--learning_rate", type=float, default=0.001,
                help="learning rate to use with the optimizer")
ap.add_argument("-e", "--epochs", type=int, default=5,
                help="number of epochs to train the model for")
args = vars(ap.parse_args())

# setting up hyperparameters
BATCH_SIZE = args["batch_size"]
NUM_HIDDEN_UNITS = args["num_hidden_units"]
LEARNING_RATE = args["learning_rate"]
EPOCHS = args["epochs"]

# setting up data transformation
dataset_transform = transforms.Compose([transforms.Resize(size=(64,64)),
                                        transforms.ToTensor()])

# downloading data, setting up datasets and dataloaders
TRAIN_PATH, TEST_PATH = data_download()
print() # for better output readability
trainloader, testloader, classes, class_to_idx = data_setup(
                                              train_path=str(TRAIN_PATH),
                                              test_path=str(TEST_PATH),
                                              data_transform=dataset_transform,
                                              batch_size=BATCH_SIZE)

# setting up model
num_classes = 100
num_epochs = 20
batch_size = 16
learning_rate = 0.001

model = VGG16(num_classes).to(device)


# Loss and optimizer
criterion = nn.NLLLoss ()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)  


# Train the model
total_step = len(train_loader)

model.to(device)

# setting up metrics, loss function and optimizer
train_eval_metrics = metrics(device)
criterion, optimizer = optim_utils(model, LEARNING_RATE)
print() # for better output readability

# starting training
results_dict = training_eval_loop(EPOCHS,
                                  model,
                                  training_step,
                                  testing_step,
                                  trainloader,
                                  testloader,
                                  criterion,
                                  train_eval_metrics[0],
                                  train_eval_metrics[1],
                                  train_eval_metrics[2],
                                  train_eval_metrics[3],
                                  optimizer,
                                  device)
print() # for better output readability

# printing results of the trained model
print(f"training loss: {results_dict['train_loss'][-1]:.3f} | training acc: {results_dict['train_acc'][-1]:.3f} | training f1-score: {results_dict['train_f1'][-1]:.3f}")
print(f"testing loss: {results_dict['test_loss'][-1]:.3f} | testing acc: {results_dict['test_acc'][-1]:.3f} | testing f1-score: {results_dict['test_f1'][-1]:.3f}")
print() # for better output readability

# saving model
save_model(model)
print() # for better output readability

Writing train.py


In [138]:
# executing the script
!python train.py

file_path directory already exists...
Downloading flower images

Traceback (most recent call last):
  File "/kaggle/working/train.py", line 82, in <module>
    for inputs, labels in trainloader:
ValueError: too many values to unpack (expected 2)
