In [None]:
# ---------------------------------------------------------------------------
# Run identity
# ---------------------------------------------------------------------------
# Prepended to every artefact this run writes: the log files and the
# checkpoints folder.
train_prefix = "b_mosquito_simple_classifier-1_"

# ---------------------------------------------------------------------------
# Data locations
# ---------------------------------------------------------------------------
# Root folders of the training / validation images.
train_folder = "../b_mosquito_database_train_val_25_01_21/train/"
valid_folder = "../b_mosquito_database_train_val_25_01_21/validation/"
# Destination of the CSV index files generated from the two folders above.
train_csv = "./csv_selected/trainData.csv"
valid_csv = "./csv_selected/valiData.csv"

# ---------------------------------------------------------------------------
# Model options
# ---------------------------------------------------------------------------
# Forwarded to the model constructor to switch self-attention on or off.
b_selfattention = True

# ---------------------------------------------------------------------------
# Optimisation hyper-parameters
# ---------------------------------------------------------------------------
# Number of samples per batch handed to the data loaders.
batch_size = 4
# Initial learning rate handed to the optimizer.
learning_rate = 1e-3
# Number of full passes over the training set.
epochs = 500


In [None]:
import dataPreprocess
import model_modified
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import time
import os



In [None]:
# Build CSV index files from the class-per-subfolder image directories.
# Columns written to each CSV: Fname, Genera, Species

import os
import csv

# Create the folders that will hold the generated CSV files if they are missing.
# os.path.dirname() returns "" for a bare filename and os.makedirs("") raises,
# so fall back to the current directory in that case.
os.makedirs(os.path.dirname(train_csv) or ".", exist_ok=True)
os.makedirs(os.path.dirname(valid_csv) or ".", exist_ok=True)

def generate_csv(folder_path, output_csv):
    """
    Write a CSV index of the files found in a class-per-subfolder directory.

    Every immediate subdirectory of `folder_path` is treated as one class and
    its name is stored in the "Species" column. Each regular file directly
    inside such a subdirectory becomes one row; "Fname" holds the joined path
    with backslashes replaced by forward slashes, and "Genera" is left empty.
    Entries of `folder_path` that are not directories, and entries of a class
    folder that are not regular files, are skipped.

    Directory listings are sorted before use because os.listdir() returns
    entries in arbitrary order; sorting makes the generated CSV reproducible
    across runs and platforms.

    Args:
        folder_path: Directory whose subfolders are the class labels.
        output_csv: Path of the CSV file to create (overwritten if present).

    Returns:
        None. The result is the file written to `output_csv`.
    """
    rows = []

    for species in sorted(os.listdir(folder_path)):
        species_path = os.path.join(folder_path, species)
        if not os.path.isdir(species_path):
            continue
        for fname in sorted(os.listdir(species_path)):
            file_path = os.path.join(species_path, fname)
            # Nested directories are ignored; only regular files are indexed.
            if os.path.isfile(file_path):
                rows.append({
                    "Fname": file_path.replace("\\", "/"),
                    "Genera": "",
                    "Species": species
                })

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Fname", "Genera", "Species"])
        writer.writeheader()
        writer.writerows(rows)

# Index both dataset splits: write each split's CSV, then report where it went.
for split_name, split_folder, split_csv in (
    ("Train", train_folder, train_csv),
    ("Validation", valid_folder, valid_csv),
):
    generate_csv(split_folder, split_csv)
    print(f"{split_name} CSV created: {split_csv}")




In [None]:
def train(model, device, train_loader, valid_loader, epochs, lf, optimizer):
    """
    Train `model`, validating, logging and checkpointing after every epoch.

    Side effects (all paths are derived from the module-level `train_prefix`):
      * `<train_prefix>output_big.txt` is truncated at start and then receives,
        per epoch, the train/validation loss and accuracy, the classification
        report, the confusion matrix and the epoch duration.
      * `./<train_prefix>checkpoints/` is created if missing and receives
        `best_loss_<epoch>.pth` (new lowest validation loss),
        `big_<epoch>.pth` (new highest validation accuracy) and
        `last_model.pth` (overwritten every epoch).
      * `<train_prefix>output.txt` gets a summary appended after the last epoch.

    Args:
        model: Network to train; called as `model(x)`. The index of the
            largest value along dim 1 of its output is used as the prediction.
        device: Device the batches are moved to.
        train_loader: Iterable of batches; element 0 is the input, element 1
            the target.
        valid_loader: Same batch layout as `train_loader`, used for validation.
        epochs: Number of epochs to run (epochs are numbered from 1).
        lf: Loss callable invoked as `lf(predicted, gt)`.
        optimizer: Optimizer that updates the parameters of `model`.

    Returns:
        None.
    """
    # NOTE(review): T_max is fixed at 200 regardless of `epochs` -- confirm
    # this schedule length is intended.
    sched = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200, eta_min=0)
    best_valid_acc = 0
    best_acc_epoch = None
    # Starts at +inf so the first finite validation loss always counts as an
    # improvement (previously relied on an `epoch == 1` special case).
    best_loss_val = float('inf')
    best_model_report = ''

    log_path = train_prefix + 'output_big.txt'
    checkpoint_dir = f'./{train_prefix}checkpoints'
    # torch.save does not create missing folders, so make sure it exists.
    os.makedirs(checkpoint_dir, exist_ok=True)

    with open(log_path, 'w') as output_file:
        output_file.write('start...\n')
    print('start...')

    for epoch in range(1, epochs + 1):
        start_time = time.time()

        # ---- training pass -------------------------------------------------
        model.train()
        loss_val = 0
        true_running = 0
        total_running = 0
        for data in train_loader:
            x = data[0].to(device, dtype=torch.float32)
            gt = data[1].to(device, dtype=torch.long)
            optimizer.zero_grad()
            predicted = model(x)
            loss = lf(predicted, gt)

            _, predicted_class = torch.max(predicted, 1)
            true_running += (predicted_class == gt).sum()
            total_running += predicted_class.shape[0]

            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        train_loss = loss_val / len(train_loader)
        train_accuracy = torch.true_divide(true_running, total_running)
        print(f'Epoch - {epoch} Train - Loss : {train_loss} Accuracy : {train_accuracy}')
        with open(log_path, 'a') as output_file:
            # The header gets its own line; it used to be fused with "loss:".
            output_file.write(f'Epoch {epoch}/{epochs} - Train\n')
            output_file.write(f'loss: {train_loss}\n')
            output_file.write(f'accuracy: {train_accuracy}\n')

        sched.step()

        # ---- validation pass -----------------------------------------------
        model.eval()
        valid_loss_val = 0
        valid_true_running = 0
        valid_total_running = 0
        y_pred = np.array([])
        y_test = np.array([])
        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            for data in valid_loader:
                x = data[0].to(device, dtype=torch.float32)
                gt = data[1].to(device, dtype=torch.long)
                predicted = model(x)
                loss = lf(predicted, gt)

                _, predicted_class = torch.max(predicted, 1)
                valid_true_running += (predicted_class == gt).sum()
                valid_total_running += predicted_class.shape[0]

                valid_loss_val += loss.item()

                y_pred = np.append(y_pred, predicted_class.cpu().numpy())
                y_test = np.append(y_test, gt.cpu().numpy())

        # Average over the number of *validation* batches (the training
        # loader's length was used here before, skewing the reported loss).
        valid_loss = valid_loss_val / len(valid_loader)
        accuracy = torch.true_divide(valid_true_running, valid_total_running)
        print(f'Epoch - {epoch} Validation - Loss : {valid_loss} Accuracy : {accuracy}')

        epoch_report = classification_report(y_test, y_pred, zero_division=0)
        conf_matrix = confusion_matrix(y_test, y_pred)
        usage_time = time.time() - start_time

        with open(log_path, 'a') as output_file:
            # accuracy and loss
            output_file.write(f'Epoch {epoch}/{epochs} - Validation\n')
            output_file.write(f'loss: {valid_loss}\n')
            output_file.write(f'accuracy: {accuracy}\n')

            # precision, recall, f1-score
            output_file.write('\nClassification Report\n')
            output_file.write(epoch_report)
            output_file.write('\n')

            # confusion matrix
            output_file.write('\nConfusion Matrix\n')
            output_file.write(str(conf_matrix))
            output_file.write('\n')

            # time usage for each epoch
            output_file.write(f'Time usage: {usage_time} secs\n\n')

        # ---- checkpoints ---------------------------------------------------
        # Lowest validation loss so far.
        if valid_loss < best_loss_val:
            best_loss_val = valid_loss
            torch.save(model.state_dict(), f'{checkpoint_dir}/best_loss_{epoch}.pth')

        # Highest validation accuracy so far.
        if accuracy > best_valid_acc:
            best_valid_acc = accuracy
            best_acc_epoch = epoch
            best_model_report = epoch_report
            torch.save(model.state_dict(), f'{checkpoint_dir}/big_{epoch}.pth')

        # Always keep the most recent weights so training can be resumed.
        torch.save(model.state_dict(), f'{checkpoint_dir}/last_model.pth')

    # ---- end-of-training summary (only if at least one epoch ran) ----------
    if epochs >= 1:
        with open(train_prefix + 'output.txt', 'a') as output_file:
            output_file.write('End Training Overall Report\n')
            output_file.write(f'Best Validation Accuracy: {best_valid_acc}\n')
            output_file.write(f'Classification Report: {best_model_report}\n')
            if best_acc_epoch is None:
                output_file.write(
                    'No best-accuracy checkpoint was saved '
                    '(validation accuracy never exceeded 0)\n'
                )
            else:
                # Point at the checkpoint that was actually written.
                output_file.write(
                    f'The best model is saved under {checkpoint_dir}/big_{best_acc_epoch}.pth\n'
                )


In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Make sure the folder that receives the model checkpoints exists.
# os.path.isdir is used instead of scanning os.listdir("./") so that a regular
# file with the same name is not mistaken for the folder (os.mkdir then fails
# loudly), and so that a prefix containing a path separator is still handled.
if not os.path.isdir(f"./{train_prefix}checkpoints"):
    os.mkdir(f"./{train_prefix}checkpoints")
else:
    print("Checkpoints folder already exists.")


# Classification loss applied to the model outputs during training/validation.
loss_function = nn.CrossEntropyLoss()


In [None]:
# Class name -> integer class index.
# NOTE(review): presumably matched against the "Species" column of the
# generated CSVs -- verify in dataPreprocess.ListDataset.
label_list = {
    "not": 0,
    "mosquito": 1,
}

# One dataset per split, each built from that split's CSV index.
train_data = dataPreprocess.ListDataset(train_csv, label_list, "train")
vali_data = dataPreprocess.ListDataset(valid_csv, label_list, "validation")

# Only the training batches are shuffled; validation keeps the dataset order.
print('loading training data')
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
print('loading validation data')
vali_loader = DataLoader(vali_data, batch_size=batch_size, shuffle=False)


In [None]:

# Build the classifier with one output per entry in label_list and move it to
# the selected device before the optimizer captures its parameters.
# NOTE(review): the leading 1 is presumably the number of input channels --
# confirm against model_modified.resnet18_attention.
resnet_model = model_modified.resnet18_attention(
    1,
    len(label_list),
    b_selfattention=b_selfattention,
).to(device)

optimizer = optim.Adam(resnet_model.parameters(), lr=learning_rate)

# Launch the full training / validation run.
train(
    model=resnet_model,
    device=device,
    train_loader=train_loader,
    valid_loader=vali_loader,
    epochs=epochs,
    lf=loss_function,
    optimizer=optimizer,
)
