In [84]:
%%writefile src/data_setup.py
"""
Contains functionality for creating PyTorch DataLoaders for 
LIBS benchmark classification dataset.
"""

import os
import torch
from torch.utils.data import DataLoader, TensorDataset
from load_libs_data import load_contest_train_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import numpy as np

# Default offered for the ``num_workers`` argument below (os.cpu_count() may be None).
NUM_WORKERS = os.cpu_count()

def create_dataloaders(
    train_dir: str, 
    batch_size: int, 
    num_classes: int,
    device: torch.device,
    num_workers: int=NUM_WORKERS, 
    split_rate: float=0.5,
    random_st: int=102,
    spectra_count: int=50
):
    """Creates training and validation DataLoaders.

    Loads the spectra with ``load_contest_train_dataset``, splits them into
    train/validation parts with ``train_test_split`` (stratified on the
    ``samples`` array returned by the loader), rescales them with
    ``Normalizer(norm='max')``, converts everything to tensors on ``device``
    and wraps the result in two DataLoaders.

    Args:
        train_dir: Path handed to ``load_contest_train_dataset``.
        batch_size: Number of samples per batch in both DataLoaders.
        num_classes: Currently unused; kept so existing callers keep working.
        device: Device the complete train/validation tensors are moved to.
        num_workers: Currently unused; both DataLoaders are built with the
            DataLoader default worker setting.
        split_rate: Fraction of the data held out for validation
            (forwarded as ``test_size``).
        random_st: Seed of the split (forwarded as ``random_state``).
        spectra_count: Forwarded to ``load_contest_train_dataset``.

    Returns:
        A tuple ``(train_dataloader, test_dataloader, y_train)`` where
        ``y_train`` is the tensor of training labels (after the ``- 1``
        shift below) living on ``device``.
    """
    X, y, samples = load_contest_train_dataset(train_dir, spectra_count)

    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=split_rate,
        random_state=random_st,
        stratify=samples,
        shuffle=True,
    )
    # Release the un-split data as early as possible.
    del X, y, samples

    # Shift labels down by one so they can serve as class indices
    # (presumably the stored labels are 1-based - TODO confirm).
    y_train = y_train - 1
    y_val = y_val - 1

    # Normalizer rescales every row (spectrum) on its own, so nothing is
    # learned from the training split; transform() on the validation split
    # applies the same per-row rule.
    scaler = Normalizer(norm='max')
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Convert features to float tensors; no extra channel dimension is added.
    X_train = torch.from_numpy(X_train).float()
    X_val = torch.from_numpy(X_val).float()

    # Convert the labels to integer (long) tensors.
    y_train = torch.from_numpy(np.array(y_train)).long()
    y_val = torch.from_numpy(np.array(y_val)).long()

    # Move the whole dataset to the target device up front.
    X_train = X_train.to(device)
    X_val = X_val.to(device)
    y_train = y_train.to(device)
    y_val = y_val.to(device)

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_val, y_val)

    # Only the training loader shuffles; validation order stays fixed.
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, test_dataloader, y_train


Overwriting src/data_setup.py


In [10]:
%%writefile src/model_builder.py
"""
Contains PyTorch model code to instantiate an MLP model.
"""
import torch
from torch import nn

class SimpleMLP(nn.Module):
    """Multi-layer perceptron with two ReLU hidden layers and a linear head.

    Args:
        input_shape: Length of each input vector.
        hidden_units1: Width of the first hidden layer.
        hidden_units2: Width of the second hidden layer.
        output_shape: Number of values produced per input vector.
    """
    def __init__(self, input_shape: int, hidden_units1: int, hidden_units2: int, output_shape: int) -> None:
        super().__init__()

        # input -> hidden 1 (linear followed by ReLU)
        first_block = [nn.Linear(input_shape, hidden_units1), nn.ReLU()]
        self.hidden_layer_1 = nn.Sequential(*first_block)

        # hidden 1 -> hidden 2 (linear followed by ReLU)
        second_block = [nn.Linear(hidden_units1, hidden_units2), nn.ReLU()]
        self.hidden_layer_2 = nn.Sequential(*second_block)

        # hidden 2 -> output; no activation is applied here
        self.output_layer = nn.Linear(hidden_units2, output_shape)

    def forward(self, x: torch.Tensor):
        """Runs ``x`` through both hidden layers and the output layer."""
        hidden = self.hidden_layer_1(x)
        hidden = self.hidden_layer_2(hidden)
        return self.output_layer(hidden)


Overwriting src/model_builder.py


In [83]:
%%writefile src/engine.py
"""
Contains functions for training and testing a PyTorch model.
"""
import torch

from tqdm.auto import tqdm
from typing import Dict, List, Tuple

def train_step(model: torch.nn.Module, 
               dataloader: torch.utils.data.DataLoader, 
               loss_fn: torch.nn.Module, 
               optimizer: torch.optim.Optimizer,
               device: torch.device) -> Tuple[float, float]:
  """Trains a PyTorch model for a single epoch.

  Puts the model in training mode and, for every batch of the dataloader,
  runs the forward pass, computes the loss, back-propagates and takes one
  optimizer step.

  Args:
    model: A PyTorch model to be trained.
    dataloader: A DataLoader yielding ``(X, y)`` batches to train on.
    loss_fn: A PyTorch loss function to minimize.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    device: A target device to compute on (e.g. "cuda" or "cpu").

  Returns:
    A tuple of training loss and training accuracy metrics, each averaged
    over the batches (not over the individual samples).
    In the form (train_loss, train_accuracy). For example:

    (0.1112, 0.8743)

  Raises:
    ZeroDivisionError: If the dataloader yields no batches.
  """
  # Put model in train mode
  model.train()

  # Running sums of the per-batch loss and per-batch accuracy
  train_loss, train_acc = 0, 0

  for X, y in dataloader:
    # Send data to target device
    X, y = X.to(device), y.to(device)

    # 1. Forward pass
    y_pred = model(X)

    # 2. Calculate and accumulate loss
    loss = loss_fn(y_pred, y)
    train_loss += loss.item()

    # 3. Clear gradients left over from the previous batch
    optimizer.zero_grad()

    # 4. Loss backward
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    # Predicted class = index of the largest logit. Softmax is monotonic, so
    # it is not needed here; this mirrors how test_step picks its labels.
    y_pred_class = y_pred.argmax(dim=1)
    train_acc += (y_pred_class == y).sum().item() / len(y_pred)

  # Adjust metrics to get average loss and accuracy per batch
  train_loss = train_loss / len(dataloader)
  train_acc = train_acc / len(dataloader)
  return train_loss, train_acc

def test_step(model: torch.nn.Module, 
              dataloader: torch.utils.data.DataLoader, 
              loss_fn: torch.nn.Module,
              device: torch.device) -> Tuple[float, float]:
  """Tests a PyTorch model for a single epoch.

  Turns a target PyTorch model to "eval" mode and then performs
  a forward pass on a testing dataset.

  Args:
    model: A PyTorch model to be tested.
    dataloader: A DataLoader instance for the model to be tested on.
    loss_fn: A PyTorch loss function to calculate loss on the test data.
    device: A target device to compute on (e.g. "cuda" or "cpu").

  Returns:
    A tuple of testing loss and testing accuracy metrics.
    In the form (test_loss, test_accuracy). For example:

    (0.0223, 0.8985)
  """
  # Put model in eval mode
  model.eval() 

  # Setup test loss and test accuracy values
  test_loss, test_acc = 0, 0

  # Turn on inference context manager
  with torch.no_grad():
      # Loop through DataLoader batches
      for batch, (X, y) in enumerate(dataloader):
          # Send data to target device
          X, y = X.to(device), y.to(device)

          # 1. Forward pass
          test_pred_logits = model(X)
          # print(test_pred_logits.shape)

          # 2. Calculate and accumulate loss
          loss = loss_fn(test_pred_logits, y)
          test_loss += loss.item()

          # Calculate and accumulate accuracy
          test_pred_labels = test_pred_logits.argmax(dim=1)
          test_acc += ((test_pred_labels == y).sum().item()/len(test_pred_labels))

  # Adjust metrics to get average loss and accuracy per batch 
  test_loss = test_loss / len(dataloader)
  test_acc = test_acc / len(dataloader)
  return test_loss, test_acc

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List]:
  """Runs the full train/evaluate loop for a PyTorch model.

  For every epoch the model goes through train_step() on the training
  loader and then test_step() on the test loader. The four resulting
  metrics are printed and recorded.

  Args:
    model: A PyTorch model to be trained and tested.
    train_dataloader: A DataLoader instance for the model to be trained on.
    test_dataloader: A DataLoader instance for the model to be tested on.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    loss_fn: A PyTorch loss function to calculate loss on both datasets.
    epochs: An integer indicating how many epochs to train for.
    device: A target device to compute on (e.g. "cuda" or "cpu").

  Returns:
    A dictionary with one list per metric; each list holds one value per
    epoch.
    In the form: {train_loss: [...],
                  train_acc: [...],
                  test_loss: [...],
                  test_acc: [...]} 
    For example if training for epochs=2: 
                 {train_loss: [2.0616, 1.0537],
                  train_acc: [0.3945, 0.3945],
                  test_loss: [1.2641, 1.5706],
                  test_acc: [0.3400, 0.2973]} 
  """
  # One empty history list per tracked metric, in reporting order
  metric_names = ("train_loss", "train_acc", "test_loss", "test_acc")
  results = {name: [] for name in metric_names}

  for epoch in tqdm(range(epochs)):
    # Optimise on the training data ...
    train_loss, train_acc = train_step(
        model=model,
        dataloader=train_dataloader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
    )
    # ... then score the updated model on the test data
    test_loss, test_acc = test_step(
        model=model,
        dataloader=test_dataloader,
        loss_fn=loss_fn,
        device=device,
    )

    # Report this epoch's metrics
    epoch_report = (
        f"Epoch: {epoch+1} | "
        f"train_loss: {train_loss:.4f} | "
        f"train_acc: {train_acc:.4f} | "
        f"test_loss: {test_loss:.4f} | "
        f"test_acc: {test_acc:.4f}"
    )
    print(epoch_report)

    # Record this epoch's metrics
    epoch_values = (train_loss, train_acc, test_loss, test_acc)
    for name, value in zip(metric_names, epoch_values):
      results[name].append(value)

  return results

Overwriting src/engine.py


In [12]:
%%writefile src/utils.py
"""
Contains various utility functions for PyTorch model training and saving.
"""
import torch
from pathlib import Path

def save_model(model: torch.nn.Module,
               target_dir: str,
               model_name: str):
  """Saves a PyTorch model's ``state_dict`` to a target directory.

  The filename is validated before anything is written, so a rejected
  name does not leave a newly created directory behind.

  Args:
    model: A target PyTorch model to save.
    target_dir: A directory for saving the model to. It is created
      (including parents) if it does not exist yet.
    model_name: A filename for the saved model. Should include
      either ".pth" or ".pt" as the file extension.

  Raises:
    AssertionError: If ``model_name`` does not end with ".pth" or ".pt".

  Example usage:
    save_model(model=model_0,
               target_dir="models",
               model_name="05_going_modular_tinyvgg_model.pth")
  """
  # Check the extension first: no filesystem side effects on bad input
  assert model_name.endswith((".pth", ".pt")), "model_name should end with '.pt' or '.pth'"

  # Create target directory
  target_dir_path = Path(target_dir)
  target_dir_path.mkdir(parents=True,
                        exist_ok=True)

  # Create model save path
  model_save_path = target_dir_path / model_name

  # Save the model state_dict() (weights only, not the module object)
  print(f"[INFO] Saving model to: {model_save_path}")
  torch.save(obj=model.state_dict(),
             f=model_save_path)

Overwriting src/utils.py


In [88]:
%%writefile src/train.py
"""
Trains a PyTorch model using device-agnostic code.
"""

import os
import torch
import data_setup, engine, model_builder, utils
import win32com.client


# Setup hyperparameters
NUM_EPOCHS = 10
BATCH_SIZE = 128
# INPUT_SHAPE becomes the in_features of the model's first Linear layer, so it
# has to equal the length of one feature vector produced by the dataloaders.
INPUT_SHAPE = 40000  # Modify this based on your actual input vector length
# OUTPUT_SHAPE is used both as the number of model outputs and as num_classes.
OUTPUT_SHAPE = 12
HIDDEN_UNITS1 = 1024  # Number of neurons in the first hidden layer
HIDDEN_UNITS2 = 512  # Number of neurons in the second hidden layer
LEARNING_RATE = 0.001

# Setup directories for data - modify these paths as needed
# The training file is reached through a .lnk shortcut whose target path is
# read via the WScript.Shell COM object.
# NOTE(review): win32com / WScript.Shell looks Windows-only - confirm before
# running this script on another platform.
shell = win32com.client.Dispatch("WScript.Shell")
shortcut = shell.CreateShortCut('data/contest_TRAIN.h5.lnk')
train_dir = shortcut.Targetpath

# train_dir = "data/train"
# test_dir = "data/test"   # this should be val, and also used only if there is a specific dataset for validation data.

# Setup target device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create DataLoaders with help from data_setup.py
# The third return value is bound to `classes` but is not used again in this
# script; in the data_setup.py written by this notebook it is the tensor of
# training labels, not a list of class names.
train_dataloader, test_dataloader, classes = data_setup.create_dataloaders(
    train_dir=train_dir,
    batch_size=BATCH_SIZE,
    device = device,
    num_classes = OUTPUT_SHAPE
)

# Create model with help from model_builder.py
model = model_builder.SimpleMLP(
    input_shape=INPUT_SHAPE,
    hidden_units1=HIDDEN_UNITS1,
    hidden_units2=HIDDEN_UNITS2,
    output_shape=OUTPUT_SHAPE
).to(device)

# Set loss and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=LEARNING_RATE)

# Start training with help from engine.py
# The per-epoch metrics dictionary returned by engine.train is discarded here;
# the metrics are only visible through what engine.train prints.
engine.train(model=model,
             train_dataloader=train_dataloader,
             test_dataloader=test_dataloader,
             loss_fn=loss_fn,
             optimizer=optimizer,
             epochs=NUM_EPOCHS,
             device=device)

# Save the model with help from utils.py
utils.save_model(model=model,
                 target_dir="models",
                 model_name="simple_mlp_model.pth")


Overwriting src/train.py


In [None]:
# %%writefile src/test_data.py
# """
# Contains functionality for creating PyTorch DataLoaders for 
# LIBS benchmark classification dataset.
# """

# import os
# import torch
# from torch.utils.data import DataLoader
# from load_libs_data import load_contest_train_dataset
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import Normalizer, MinMaxScaler
# from siamese_net import prepare_triplets
# import numpy as np


# NUM_WORKERS = os.cpu_count()

# def create_dataloaders(
#     train_dir: str, 
#     #test_dir: str, 
#     batch_size: int, 
#     device: torch.device,
#     num_workers: int=NUM_WORKERS, 
#     split_rate: float=0.6,
#     random_st: int=102,
#     spectra_count: int=100
#     ):
#     """Creates training and validation DataLoaders.

#     Takes in a training directory directory path and split the data
#     to train/validation. After, it turns them into PyTorch Datasets and 
#     then into PyTorch DataLoaders.

#     Args:
#     train_dir: Path to training directory.
#     batch_size: Number of samples per batch in each of the DataLoaders.
#     num_workers: An integer for number of workers per DataLoader.

#     Returns:
#     A tuple of (train_dataloader, test_dataloader, class_labels).
#     Example usage:
#         train_dataloader, test_dataloader, class_labels, wavelengths = \
#         = create_dataloaders(train_dir=path/to/train_dir,
#                                 test_dir=path/to/test_dir,
#                                 transform=some_transform,
#                                 batch_size=32,
#                                 num_workers=4)
#     """



#     X, y, samples = load_contest_train_dataset(train_dir, spectra_count)
#     wavelengths = X.columns

#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=split_rate, random_state=random_st, stratify=samples, shuffle = True)
#     del X, y, samples

#     if True:
#       scaler =  Normalizer(norm = 'max')
#       X_train = scaler.fit_transform(X_train)
#       X_val = scaler.fit_transform(X_val)

#     # Convert data to torch tensors
#     X_train = torch.from_numpy(X_train).unsqueeze(1).float() # Add extra dimension for channels
#     X_val = torch.from_numpy(X_val).unsqueeze(1).float() # Add extra dimension for channels
#     y_train = torch.from_numpy(np.array(y_train)).long()
#     y_val = torch.from_numpy(np.array(y_val)).long()

#     #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     # If available, move data to the GPU
#     X_train.to(device)
#     X_val.to(device) 
#     y_train.to(device)
#     y_val.to(device)

#     # Prepare triplets for the training, validation
#     train_triplets = prepare_triplets(X_train, y_train)
#     val_triplets = prepare_triplets(X_val, y_val)


#     # Create PyTorch DataLoader objects for the training and validation sets
#     train_dataloader = DataLoader(train_triplets, batch_size=batch_size, shuffle=True)
#     val_dataloader = DataLoader(val_triplets, batch_size=batch_size, shuffle=True)


#     return train_dataloader, val_dataloader, y_train


In [86]:
%load_ext autoreload
%reload_ext autoreload
%autoreload 2

# import funcs
# import importlib
# importlib.reload(funcs)
# from funcs import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
%%writefile src/prediction_engine.py
"""
Contains functionality for creating PyTorch DataLoaders for 
LIBS benchmark classification dataset.
"""

import os
import torch
from torch.utils.data import DataLoader
from load_libs_data import load_contest_test_dataset, load_contest_train_dataset
from sklearn.preprocessing import Normalizer, MinMaxScaler
import numpy as np


def create_dataloaders(
    test_dir: str, 
    test_labels_dir: str, 
    batch_size: int, 
    device: torch.device,
    pred_test: bool,
    ):
    """Creates the prediction DataLoader and the matching label tensor.

    Loads one dataset, rescales it with ``Normalizer(norm='max')``, converts
    it to tensors and wraps the features in an unshuffled DataLoader.

    Args:
        test_dir: Path of the dataset file. It is read with
            ``load_contest_test_dataset`` when ``pred_test`` is true and with
            ``load_contest_train_dataset`` otherwise.
        test_labels_dir: Path of a comma-separated label file; only read when
            ``pred_test`` is true.
        batch_size: Number of samples per batch in the DataLoader.
        device: Unused; kept for backward compatibility. The tensors stay on
            the CPU and the consumer is expected to move each batch to its
            device (``predict_test`` in this module does so).
        pred_test: True to load the test set plus its label file, False to
            load the training set together with its own labels.

    Returns:
        A tuple ``(pred_test_loader, y_test)``: a DataLoader over the feature
        tensor (one extra dimension inserted at position 1) and the labels as
        a CPU ``long`` tensor.
    """

    if pred_test:
        X_test = load_contest_test_dataset(test_dir)
        y_test = np.loadtxt(test_labels_dir, delimiter = ',')
    else: # use with caution, only for predicting training embeddings
        X_test, y_test, _ = load_contest_train_dataset(test_dir)

    # Normalizer rescales every row (spectrum) on its own.
    scaler = Normalizer(norm = 'max')
    X_test = scaler.fit_transform(X_test)

    # Convert data to torch tensors
    X_test = torch.from_numpy(X_test).unsqueeze(1).float() # Add extra dimension for channels
    y_test = torch.from_numpy(np.array(y_test)).long()

    # The data is deliberately left on the CPU. Tensor.to() returns a new
    # tensor, so the former bare `X_test.to(device)` / `y_test.to(device)`
    # calls only produced throw-away copies of the whole dataset.

    # Fixed order (no shuffling) so predictions line up with y_test
    pred_test_loader = DataLoader(X_test, batch_size=batch_size)

    return pred_test_loader, y_test

def predict_test(
                model: torch.nn.Module, 
                dataloader: torch.utils.data.DataLoader,
                device: torch.device,
                test_dir: str, 
                test_labels_dir: str,
                batch_size: int,
                y_test
                ):
    """Runs ``model.forward_once`` over every batch and stacks the outputs.

    The model is switched to eval mode for the duration of the call, so
    layers such as dropout behave deterministically, and is put back into
    its previous mode afterwards.

    Args:
        model: Model exposing a ``forward_once(batch)`` method.
        dataloader: DataLoader yielding input batches (tensors only, no labels).
        device: Device each batch is moved to before the forward pass. The
            model is expected to be on this device already.
        test_dir: Unused; kept for backward compatibility.
        test_labels_dir: Unused; kept for backward compatibility.
        batch_size: Unused; kept for backward compatibility.
        y_test: Unused; kept for backward compatibility.

    Returns:
        A NumPy array with the per-batch outputs concatenated along axis 0.

    Raises:
        ValueError: Raised by ``np.concatenate`` when the dataloader yields
            no batches.
    """
    # Remember the current mode so the caller's model is left as it was found
    was_training = model.training
    model.eval()

    batch_outputs = []
    try:
        with torch.no_grad():
            for data in dataloader:
                batch = data.to(device)
                # Bring the result back to the CPU before converting to NumPy
                output = model.forward_once(batch).detach().cpu()
                batch_outputs.append(output.numpy())
    finally:
        model.train(was_training)

    return np.concatenate(batch_outputs, axis = 0)






Overwriting src/prediction_engine.py


In [87]:
%%writefile src/prediction.py

import torch
import prediction_engine
import siamese_net
import numpy as np

# Hyperparameters. NUM_EPOCHS and LEARNING_RATE are not referenced anywhere
# in this script; the remaining values configure the batch size and the
# SiameseNetwork constructor below.
NUM_EPOCHS = 50
BATCH_SIZE = 128
LEARNING_RATE = 0.0001
INPUT_SIZE = 40000
OUTPUT_SIZE = 12
CHANNELS=50
KERNEL_SIZES=[50, 10]
STRIDES=[2, 2]
PADDINGS=[1, 1]
HIDDEN_SIZES=[256]

# Setup directories
# NOTE: test_dir currently points at the TRAIN file; together with
# pred_test=False below, this run works on the training data.
#test_dir = "datasets/contest_TEST.h5"
test_labels_dir = "datasets/test_labels.csv"
model_dir = 'models/final_model2_dashing_dream_256b_50ep.pth'
test_dir = "datasets/contest_TRAIN.h5"


# Setup target device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create the DataLoader with help from prediction_engine.py
test_dataloader, y_test = prediction_engine.create_dataloaders(
    test_dir=test_dir,
    test_labels_dir=test_labels_dir,
    batch_size=BATCH_SIZE,
    device = device,
    pred_test = False # USE WITH CAUTION, turn to 'False' if you want to get embeddings of the training data
)


# The checkpoint is first loaded onto the CPU; load_state_dict below copies
# the weights into the model that already lives on `device`.
saved_state_dict = torch.load(model_dir, map_location=torch.device('cpu'))

# Create a new instance of your model
model = siamese_net.SiameseNetwork(
    input_size=INPUT_SIZE, 
    output_size=OUTPUT_SIZE, 
    channels=CHANNELS, 
    kernel_sizes=KERNEL_SIZES, 
    strides=STRIDES, 
    paddings=PADDINGS, 
    hidden_sizes=HIDDEN_SIZES
).to(device)
# Load the saved state into the new model instance
model.load_state_dict(saved_state_dict)

# Compute the model outputs for every batch; they are written to disk below.
prediction_X_test = prediction_engine.predict_test(
                    model=model, 
                    dataloader=test_dataloader,
                    device=device,
                    test_dir=test_dir, 
                    test_labels_dir=test_labels_dir,
                    batch_size=BATCH_SIZE,
                    y_test=y_test
                    )


# The output names say "train" because this run loads the training data
# (see test_dir / pred_test above); adjust them when switching datasets.
np.save('datasets/prediction_X_train_dashing_dream.npy', prediction_X_test)        
np.save('datasets/y_train.npy', y_test)          

#https://colab.research.google.com/drive/15D5vAYkhbAs5-txhYTCb_Fp2jiCnHXVN#scrollTo=82F_qINOBbkL

Writing src/prediction.py


In [17]:
import numpy as np
# NOTE(review): the src/prediction.py written by this notebook saves
# 'datasets/prediction_X_train_dashing_dream.npy', not the file loaded here;
# presumably 'prediction_X_test.npy' comes from an earlier run - confirm.
test = np.load('datasets/prediction_X_test.npy')
test.shape

(20000, 12)