# Dry Bean Variety Prediction by Multiclass Classification using ANN

## Load dataset for the training process

In [1]:
import os
import numpy as np
import pandas as pd

from typing import List

### Open the CSV

In [2]:
# Load the training and validation tables from the dataset folder
df_train, df_val = (
    pd.read_csv(os.path.join("DryBeanDataset", csv_name))
    for csv_name in ("dry_bean_train.csv", "dry_bean_val.csv")
)
# Report the number of rows in each split
print("Num train:", len(df_train))
print("Num val:", len(df_val))
# Preview the first rows of the training table
df_train.head()

Num train: 9528
Num val: 1361


Unnamed: 0.1,Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,1885,45130,793.896,300.179289,191.903596,1.564219,0.76896,45635,239.710869,0.794095,0.988934,0.899804,0.798559,0.006651,0.001668,0.637696,0.997497,SIRA
1,6754,55559,934.184,369.299024,192.668844,1.916755,0.85312,56331,265.969765,0.652009,0.986295,0.800017,0.720202,0.006647,0.001103,0.51869,0.994203,HOROZ
2,8383,83982,1174.925,412.055694,260.285953,1.583088,0.775232,85570,327.000311,0.741956,0.981442,0.764497,0.793583,0.004906,0.0012,0.629774,0.996987,BARBUNYA
3,9464,33213,684.27,261.487558,162.196575,1.612164,0.784377,33610,205.640718,0.727637,0.988188,0.89138,0.786426,0.007873,0.001858,0.618466,0.99707,DERMASON
4,6555,48856,831.807,314.074497,198.512622,1.582139,0.774923,49420,249.410086,0.71281,0.988588,0.887325,0.794111,0.006429,0.001577,0.630613,0.997717,SIRA


### Create dataset loaders for the training and validation processes

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset

### Features to be used and target

In [4]:
# Input columns fed to the network, in the order they appear in the feature tensor
x_names = [
    "Area",
    "Perimeter",
    "MajorAxisLength",
    "MinorAxisLength",
    "Eccentricity",
    "ConvexArea",
    "EquivDiameter",
    "Extent",
    "Solidity",
    "ShapeFactor1",
    "ShapeFactor2",
    "ShapeFactor3",
    "ShapeFactor4",
]
# Column that holds the prediction target
y_name = "Class"
# Known class names; the position in this list is used as the integer label
y_classes = [
    "SEKER",
    "BARBUNYA",
    "BOMBAY",
    "CALI",
    "DERMASON",
    "HOROZ",
    "SIRA",
]

### Convert Pandas dataframe to PyTorch dataset

In [5]:
def df_to_dataset(df: pd.DataFrame, feature_names=None, target_name=None, class_names=None) -> Dataset:
    """Convert a DataFrame into a ``TensorDataset`` of (features, label) pairs.

    The selected feature columns are scaled by fixed per-column divisors and
    returned as a ``float32`` tensor; the target column is mapped to the
    position of each class name and returned as an ``int64`` tensor.

    The input frame is never modified, so calling this function repeatedly on
    the same frame always yields the same dataset.

    Args:
        df: Table containing the feature columns and the target column.
        feature_names: Optional list of feature column names. Defaults to the
            notebook-level ``x_names``.
        target_name: Optional name of the target column. Defaults to the
            notebook-level ``y_name``.
        class_names: Optional list of class names; a label's integer id is its
            position in this list. Defaults to the notebook-level ``y_classes``.

    Returns:
        A ``TensorDataset`` whose first tensor has shape
        ``(len(df), len(feature_names))`` and whose second tensor has shape
        ``(len(df),)``.

    Raises:
        KeyError: If a requested column is missing from ``df``.
        ValueError: If the target column contains a value that is not listed
            in ``class_names``.
    """
    # Fall back to the notebook-level configuration when no override is given
    feature_names = list(x_names if feature_names is None else feature_names)
    target_name = y_name if target_name is None else target_name
    class_names = list(y_classes if class_names is None else class_names)

    # Fixed per-column divisors; feature columns not listed here are used as-is
    scale_factors = {
        "Area": 500_000,
        "Perimeter": 5000,
        "MajorAxisLength": 2000,
        "MinorAxisLength": 2000,
        "ConvexArea": 500_000,
        "EquivDiameter": 2000,
    }
    divisors = np.array(
        [scale_factors.get(name, 1) for name in feature_names], dtype=np.float64
    )

    # preprocess data: scale a fresh array (not the frame) so the scaling really
    # ends up in the features and the caller's DataFrame stays untouched
    features = df[feature_names].to_numpy(dtype=np.float64) / divisors
    features = features.astype(np.float32)

    # preprocess labels: map class names to their integer position
    class_to_index = {}
    for position, class_name in enumerate(class_names):
        # keep the first position if a name is listed twice (same as list.index)
        class_to_index.setdefault(class_name, position)
    raw_labels = df[target_name]
    unknown = sorted(set(raw_labels.unique()) - set(class_to_index), key=str)
    if unknown:
        raise ValueError(
            f"Unknown class label(s) in column {target_name!r}: {unknown}"
        )
    labels = raw_labels.map(class_to_index).to_numpy(dtype=np.int64)

    # create the dataset
    return TensorDataset(torch.from_numpy(features), torch.from_numpy(labels))

In [6]:
# Build one tensor dataset per split with the same conversion routine
ds_train, ds_val = (df_to_dataset(frame) for frame in (df_train, df_val))

### Create PyTorch data loader

In [7]:
# Mini-batch size shared by both loaders
BATCH_SIZE = 64
# Training batches are shuffled; the trailing partial batch is dropped for stability
loader_train = DataLoader(
    dataset=ds_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
# Validation uses the loader defaults: no shuffling and no dropped batch
loader_val = DataLoader(dataset=ds_val, batch_size=BATCH_SIZE)

## Model training

In [8]:
import models

from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, accuracy_score

In [9]:
# Training hyper-parameters
MAX_EPOCHS = 1_000  # number of passes over the training loader
INIT_LR = 0.00001   # initial learning rate handed to the optimizer

### Select the device used for the training process (CPU/GPU)

In [10]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


### Create the model

In [11]:
# Build the classifier sized to the selected features and classes, then move it
# from the CPU to the chosen device.
# The move only matters when the device is not the CPU; on the CPU it has no effect.
model = models.MLP4Layers(
    n_features=len(x_names),
    n_classes=len(y_classes),
).to(device)

### Define the loss function and the optimizer

In [12]:
# Cross-entropy loss over the class scores produced by the model
loss_fn = torch.nn.CrossEntropyLoss()
# Adam over all model parameters, starting from INIT_LR
optimizer = torch.optim.Adam(params=model.parameters(), lr=INIT_LR)
# Reduce the learning rate when the monitored value plateaus, never below 1e-9
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    min_lr=1e-9,
)

### Prepare the logger

In [13]:
# Name the run after its start time so that every run gets its own folder
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = os.path.join('runs_clf', 'train_{}'.format(timestamp))
# Create the folder up front; the writer below logs into it
os.makedirs(save_dir, exist_ok=True)
writer = SummaryWriter(log_dir=save_dir)
print(f"Saving model weights and logs to: {save_dir}")

Saving model weights and logs to: runs_clf/train_20220904_081814


### The training and validation process

During the training process, launch TensorBoard to view the logged train/val metrics:
```bash
tensorboard --logdir runs_clf
```
Then, open the link printed by TensorBoard in a web browser.

In [14]:
# Fail early with a clear message: the per-batch averages below divide by the
# number of batches, so both loaders must yield at least one batch
if len(loader_train) == 0 or len(loader_val) == 0:
    raise ValueError("loader_train and loader_val must each yield at least one batch")

# Variables to hold some training status
epoch_number = 0      # number of completed epochs; epoch_number + 1 is the logging step
lowest_loss = np.inf  # lowest average validation loss seen so far
best_f1 = 0.          # highest weighted F1 on the validation set seen so far

try:
    # Training loop
    for epoch in tqdm(range(MAX_EPOCHS)):
        # Make sure gradient tracking is on, and do a pass over the data for the training process
        model.train()
        running_loss = 0.
        for inputs, labels in loader_train:
            # Every data instance is an input & label pair; move it from CPU to the device
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Zero your gradients for every batch!
            optimizer.zero_grad()
            # Make predictions for this batch
            outputs = model(inputs)
            # Compute the loss and its gradients
            loss = loss_fn(outputs, labels)
            loss.backward()
            # Adjust learning weights
            optimizer.step()
            # Gather data and report (.item() already returns a plain Python float)
            running_loss += loss.item()
        # Calculate the average training loss per batch
        avg_loss = running_loss / len(loader_train)

        # We don't need gradients for the model validation process
        model.eval()
        running_vloss = 0.0
        y_true = []
        y_pred = []
        with torch.no_grad():
            for vinputs, vlabels in loader_val:
                voutputs = model(vinputs.to(device))
                running_vloss += loss_fn(voutputs, vlabels.to(device)).item()
                y_true.extend(vlabels.tolist())
                # the class with the highest energy is what we choose as prediction
                y_pred.extend(voutputs.argmax(dim=1).tolist())

        # Calculate the average validation loss per batch and let the scheduler react to it
        avg_vloss = running_vloss / len(loader_val)
        scheduler.step(avg_vloss)
        # Calculate our classification metrics
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average="weighted")

        # Log the running loss averaged per batch
        # for both training and validation
        writer.add_scalar('train/loss', avg_loss, epoch_number + 1)
        writer.add_scalar('val/loss', avg_vloss, epoch_number + 1)
        writer.add_scalar('val/acc', acc, epoch_number + 1)
        writer.add_scalar('val/weighted_f1', f1, epoch_number + 1)
        writer.flush()

        # Track best performance, and save the model's state (weights)
        if f1 > best_f1:
            best_f1 = f1
            model_path = os.path.join(save_dir, 'best.pt')
            torch.save(model.state_dict(), model_path)
        if avg_vloss < lowest_loss:
            lowest_loss = avg_vloss
            model_path = os.path.join(save_dir, 'lowest_loss.pt')
            torch.save(model.state_dict(), model_path)
        # Always keep the most recent weights as well
        model_path = os.path.join(save_dir, 'last.pt')
        torch.save(model.state_dict(), model_path)

        epoch_number += 1
finally:
    # Flush and release the TensorBoard event file even if training is interrupted
    writer.close()

  0%|          | 0/1000 [00:00<?, ?it/s]

## References
[1] https://pytorch.org/tutorials/beginner/introyt/trainingyt.html