# **Setup and Library Imports**

### **Connect to Google Drive**

In [None]:
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build

# Authenticate the Colab user first; presumably this supplies the credentials the
# Drive API client below relies on — keep it ahead of build().
auth.authenticate_user()
# Drive v3 API client; CustomSaveCallback.on_epoch_end uses it to empty the shared drive's trash.
drive_service = build('drive', 'v3')
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

### **Logging into Hugging Face Hub**

In [None]:
# Hugging Face Hub login is currently disabled in this notebook.
# NOTE(review): the training arguments further down set push_to_hub=True, which
# presumably needs a Hub token — confirm one is provided some other way, or
# re-enable the two commented-out lines below.
# from huggingface_hub import notebook_login
## Execute the login function to access the Hugging Face account
# notebook_login()

### **Installing Required Packages**

In [None]:
# Remove the torch build that is already installed; the next cell installs a pinned one.
! pip uninstall -y torch

In [None]:
# Install torch / torchvision / torchaudio at pinned versions built for cu124,
# using the PyTorch wheel index as an extra package source.
!pip install torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --extra-index-url https://download.pytorch.org/whl/cu124

In [None]:
# Hugging Face libraries, pinned to exact versions.
! pip install --quiet transformers==4.48.3
! pip install --quiet datasets==3.3.2
# The remaining packages are not pinned, so a re-run may pick up newer releases.
# TODO(review): pin these once known-good versions are recorded.
! pip install --quiet evaluate
! pip install --quiet tabulate
! pip install --quiet ipywidgets
! pip install --quiet pillow
! pip install --quiet scikit-learn
! pip install --quiet tensorboard
! pip install --quiet openpyxl

### **Importing Libraries**

In [None]:
# PyTorch for tensor operations
import torch

# Hugging Face libraries for training and transformer models
from transformers import Trainer, TrainingArguments, TrainerCallback
from transformers import AutoImageProcessor, AutoModelForImageClassification
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import AdamW

# Evaluation metrics and utilities
import os
import evaluate
import numpy as np
from datetime import datetime

# Loading datasets for training and evaluation
from datasets import load_dataset

# Data manipulation and display utilities
import pandas as pd
from tabulate import tabulate
from collections import Counter

In [None]:
# Show the torch version that was actually imported, to confirm the pinned install took effect.
print(torch.__version__)

### **Defining Model, Dataset Paths, and Output Directories**

List of Models

base ~90M
```
microsoft/resnet-152
facebook/convnextv2-base-1k-224
google/vit-hybrid-base-bit-384

google/vit-base-patch16-224
microsoft/swin-base-patch4-window7-224
facebook/deit-base-patch16-224
facebook/dinov2-base
```

small ~20M
```
facebook/convnextv2-tiny-1k-224
WinKawaks/vit-small-patch16-224
microsoft/swin-tiny-patch4-window7-224
facebook/deit-small-patch16-224
```

tiny ~5M
```
apple/mobilevit-small
google/efficientnet-b0
facebook/convnextv2-femto-1k-224
WinKawaks/vit-tiny-patch16-224
microsoft/swin-tiny-patch4-window7-224
facebook/deit-tiny-patch16-224
```

List of Datasets


```
cvmil/rice-leaf-disease-augmented-v4
cvmil/rice-leaf-disease-augmented-v3
cvmil/rice-leaf-disease-augmented-v2
cvmil/rice-leaf-disease-augmented
cvmil/rice-leaf-disease-augmented-test
cvmil/rice-disease-02
```

Define paths for saving model training outputs and logs, incorporating model and dataset names along with the current date.

In [None]:
# ---- Run configuration ------------------------------------------------------
# model_path is a placeholder: fill in one of the model IDs listed above before
# running the cells that call from_pretrained(model_path).
model_path = ""
dataset_path = "cvmil/rice-leaf-disease-augmented-v4"
train_epochs = 15
resume_from_checkpoint = True

# Keep only the part after the last "/" of each "<owner>/<name>" identifier
base_model_name = model_path.rpartition("/")[2]
dataset_name = dataset_path.rpartition("/")[2]

# Run name: "<model>_<dataset>_fft"; it also names the output folder on the shared drive
model_name = "_".join((base_model_name, dataset_name, "fft"))
output_dir = "./drive/Shareddrives/CS198-Drones/[v4] Training Output/" + model_name

# TensorBoard logs live in a sub-folder of the output directory.
logging_dir = output_dir + "/logs"
# Despite its name, metrics_dir is the path of the Excel *file* that receives the metrics.
metrics_dir = output_dir + "/training_metrics.xlsx"

# **Data Preparation and Processing Pipeline**

This section handles the dataset loading, label extraction, image processing setup, and defines necessary functions for data transformation, batching, and metric computation to prepare the data for model training and evaluation.

### **Load Dataset and Extract Labels**

Load the dataset from huggingface and extract the class labels from the training data.

In [None]:
# Load the dataset identified by dataset_path (the later cells use its 'train' and 'validation' splits)
dataset = load_dataset(dataset_path)

# Class names read from the 'label' feature of the training split;
# their position in this list is used as the class ID throughout the notebook.
labels = dataset['train'].features['label'].names

Generate and display a table showing class distribution across training and validation splits.

In [None]:
# ID -> name converter of the training split's 'label' feature
label_mapping = dataset["train"].features["label"].int2str

# Per-class sample counts for each split
class_ids = list(range(len(labels)))
train_counts = Counter(dataset["train"]["label"])
validation_counts = Counter(dataset["validation"]["label"])

# One row per class; a Counter reports 0 for a class that never occurs in a split
data = {
    "ID": class_ids,
    "Label": labels,
    "Training": [train_counts[class_id] for class_id in class_ids],
    "Validation": [validation_counts[class_id] for class_id in class_ids],
}
df = pd.DataFrame(data)

# Render the distribution as a grid table
print(tabulate(df, headers="keys", tablefmt="grid", showindex=False))

### **Initialize Image Processor**

Load and initialize the image processor from the pre-trained model.

In [None]:
# Load the image processor that belongs to the pre-trained checkpoint at model_path.
# It is applied to every batch in transforms() and handed to the Trainer later on.
processor = AutoImageProcessor.from_pretrained(model_path)
# Print the processor so its configuration can be inspected.
print(processor)

### **Data Preparation and Processing Pipeline**

Create mappings for label-to-ID and ID-to-label.

In [None]:
# Class ID -> class name, following the order of `labels`
id2label = dict(enumerate(labels))
# Class name -> class ID (the inverse of the mapping above)
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

Define the transformation function to process the image batch.

In [None]:
def transforms(batch):
    """Convert a batch of images to RGB and run them through the image processor.

    Args:
        batch: Mapping with an ``'image'`` list (objects exposing ``convert``)
            and a ``'label'`` list.

    Returns:
        The processor output for the RGB images (requested with
        ``return_tensors='pt'``) with the batch labels added under ``'labels'``.

    Note:
        ``batch['image']`` is replaced in place by the RGB-converted images.
    """
    rgb_images = []
    for image in batch['image']:
        rgb_images.append(image.convert('RGB'))
    batch['image'] = rgb_images

    encoded = processor(rgb_images, return_tensors='pt')
    encoded['labels'] = batch['label']
    return encoded

Define the custom collation function for batching pixel values and labels.

In [None]:
def collate_fn(batch):
    """Collate per-example dicts into one batch dict for the model.

    Args:
        batch: List of examples, each with a ``'pixel_values'`` tensor and a
            ``'labels'`` value.

    Returns:
        Dict with ``'pixel_values'`` (the example tensors stacked along a new
        first dimension) and ``'labels'`` (a tensor built from the labels).
    """
    pixel_values = []
    targets = []
    for example in batch:
        pixel_values.append(example['pixel_values'])
        targets.append(example['labels'])

    return {
        'pixel_values': torch.stack(pixel_values),
        'labels': torch.tensor(targets),
    }


Define the function to compute accuracy during evaluation.

In [None]:
# Accuracy metric object; compute_metrics() below calls its compute() method.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_preds):
    """Score predictions with the module-level ``accuracy`` metric.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class of each row
            is the index of its largest logit along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predictions and labels.
    """
    scores, references = eval_preds
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


### **Apply Data Transformations to Dataset**

Apply the defined transformation function to the dataset for preprocessing. <br>
Note: This assumes that data augmentation and normalization have already been handled in the previous pipeline and is ready for fine-tuning.

In [None]:
# Attach transforms() to the dataset; the Trainer receives the 'train' and 'validation' splits of the result.
# NOTE(review): with_transform presumably applies the function lazily when examples are accessed — confirm.
processed_dataset = dataset.with_transform(transforms)

# **Model Initialization and Trainer Setup**

This section handles the initialization of the model, configuration of training parameters, and setting up the Trainer for fine-tuning, including the datasets, data processing, and evaluation metrics.

### **Initialize Pre-trained Model for Fine-tuning**

Load a pre-trained image classification model, configuring it with the correct label mappings and number of labels for the fine-tuning task.

In [None]:
# Settings that adapt the checkpoint's classification head to this dataset
classification_head_config = dict(
    num_labels=len(labels),        # one output per class
    id2label=id2label,             # class ID -> class name
    label2id=label2id,             # class name -> class ID
    ignore_mismatched_sizes=True,  # tolerate weights whose shape differs from the checkpoint
)

# Load the pre-trained weights from model_path with the head configured as above
model = AutoModelForImageClassification.from_pretrained(model_path, **classification_head_config)

### **Check Model Parameters for Fine-tuning**

Unfreeze all layers of the model for full fine-tuning

In [None]:
# Full fine-tuning: mark every parameter tensor as trainable
for param in model.parameters():
    param.requires_grad_(True)

We can check how many parameters the model has in total and how many of them will actually be trained.

In [None]:
# Tally total and trainable element counts in one pass over the parameters
num_params = 0
trainable_params = 0
for weight in model.parameters():
    weight_size = weight.numel()
    num_params += weight_size
    if weight.requires_grad:
        trainable_params += weight_size

print(f"Total parameters: {num_params:,} | Trainable parameters: {trainable_params:,}")

### **Define Training Arguments**

Set the learning rates for the model's layers: a lower learning rate is used to fine-tune the pre-trained weights, and a higher learning rate is used for the classification layer. The cell below lists the parameters that belong to the classification layer.

In [None]:
# Names of the parameters whose name contains "classifier"; the optimizer cell
# below uses the same test to pick the parameters that get the higher learning rate.
classifier_param_names = [
    name for name, _weight in model.named_parameters() if "classifier" in name
]
for name in classifier_param_names:
    print(name)

### **Create LR Scheduler**

In [None]:
# Learning rates: a small one for the pre-trained layers and a 10x larger one
# for the parameters whose name contains "classifier".
base_lr = 3e-5
classifier_lr = 3e-4
weight_decay = 0.1

# Split the parameters into the two groups in a single pass over the model
backbone_params = []
classifier_params = []
for param_name, param in model.named_parameters():
    if "classifier" in param_name:
        classifier_params.append(param)
    else:
        backbone_params.append(param)

optimizer_grouped_parameters = [
    {
        "params": backbone_params,
        "lr": base_lr,
        "weight_decay": weight_decay
    },
    {
        "params": classifier_params,
        "lr": classifier_lr,
        "weight_decay": weight_decay
    },
]

# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated in
# favour of torch.optim.AdamW. eps=1e-6 keeps the epsilon the transformers
# implementation used by default, so the update rule stays numerically comparable.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, eps=1e-6)

In [None]:
batch_size = 64
train_dataset_size = len(processed_dataset["train"])

# Steps per epoch, rounded UP: a final partial batch is still one optimizer step,
# so floor division would undercount whenever the dataset size is not a multiple
# of the batch size. (Assumes one device and no gradient accumulation, which is
# what the training arguments in this notebook configure.)
steps_per_epoch = max(1, (train_dataset_size + batch_size - 1) // batch_size)

# Log / evaluate / save twice per epoch
logging_steps = max(1, steps_per_epoch // 2)

# Warm up the learning rate over the first 2 epochs
warmup_steps = steps_per_epoch * 2

Set up the training configuration with parameters such as batch size, number of epochs, learning rate, and logging strategies for the fine-tuning process.

In [None]:
# fp16 mixed precision needs a CUDA device; enable it only when one is present so
# the notebook still runs (slowly) on a CPU-only runtime instead of failing here.
use_fp16 = torch.cuda.is_available()
if not use_fp16:
    print("CUDA is not available: training without fp16 mixed precision.")

training_args = TrainingArguments(
    num_train_epochs=train_epochs,              # Number of training epochs
    per_device_train_batch_size=batch_size,     # Batch size for training
    per_device_eval_batch_size=batch_size,      # Batch size for evaluation

    fp16=use_fp16,                              # Mixed precision training (CUDA only)
    warmup_steps=warmup_steps,                  # Warmup steps for the learning rate scheduler
    weight_decay=weight_decay,                  # Weight decay for regularization
    lr_scheduler_type='cosine_with_restarts',   # Learning rate scheduler type
    lr_scheduler_kwargs = { "num_cycles": 3 },  # Number of cycles for learning rate scheduler

    save_total_limit=3,                         # Limit the number of saved checkpoints
    report_to=['tensorboard'],                  # Log to TensorBoard
    save_strategy="steps",                      # Save strategy
    eval_strategy="steps",                      # Evaluation strategy
    logging_strategy="steps",                   # Logging strategy
    save_steps=logging_steps,                   # Save every logging_steps steps
    eval_steps=logging_steps,                   # Evaluate every logging_steps steps
    logging_steps=logging_steps,                # Log every logging_steps steps
    logging_dir=logging_dir,                    # Directory for logging
    output_dir=output_dir,                      # Directory for saving outputs

    remove_unused_columns=False,                # Retain unused columns in the dataset
    load_best_model_at_end=True,                # Load best model at the end of training
    metric_for_best_model="eval_loss",          # Specify the metric to track
    greater_is_better=False,                    # For loss, lower is better
    push_to_hub=True,                           # Push model to Hugging Face Hub
)

### **Trainer Callback**

In [None]:
class CustomSaveCallback(TrainerCallback):
    """Trainer callback that empties the Drive trash and logs metrics to Excel.

    * ``on_epoch_end`` empties the trash of the shared drive ``drive_id`` through
      the module-level ``drive_service`` client.
    * ``on_evaluate`` merges the two most recent ``state.log_history`` entries
      with a timestamp and GPU memory figures, and writes them to the Excel file
      at the module-level ``metrics_dir`` path (one row per ``epoch`` value).

    Args:
        trainer: Trainer this callback belongs to (stored on the instance; not
            read by the hooks defined here).
        drive_id: ID of the shared drive whose trash is emptied after each epoch.
    """

    def __init__(self, trainer, drive_id="0AND7L-n1cnFpUk9PVA"):
        self.trainer = trainer
        self.drive_id = drive_id

    def on_epoch_end(self, args, state, control, **kwargs):
        """Empty the shared drive's trash; failures are reported, never raised."""
        try:
            drive_service.files().emptyTrash(driveId=self.drive_id).execute()
        except Exception as e:
            # Best effort: a Drive API error must not interrupt training.
            print(f"Error: {e}")

    def on_evaluate(self, args, state, control, **kwargs):
        """Append (or update) the metrics row for the current epoch in the Excel file.

        Returns:
            The ``control`` object, unchanged.
        """
        # Merge the last two log entries into one record (later keys win).
        previous_logs = state.log_history[-2:]
        new_logs = {k: v for log in previous_logs for k, v in log.items()}

        new_logs["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Add GPU VRAM usage details (in MB)
        if torch.cuda.is_available():
            new_logs["gpu_vram_allocated_mb"] = torch.cuda.memory_allocated() / (1024 ** 2)
            new_logs["gpu_vram_reserved_mb"] = torch.cuda.memory_reserved() / (1024 ** 2)
        else:
            new_logs["gpu_vram_allocated_mb"] = None
            new_logs["gpu_vram_reserved_mb"] = None

        # Read the existing Excel file, if it exists
        if os.path.exists(metrics_dir):
            try:
                df_existing = pd.read_excel(metrics_dir)
            except Exception as e:
                print(f"Error reading {metrics_dir}: {e}")
                df_existing = pd.DataFrame()
        else:
            df_existing = pd.DataFrame()

        # Update the row of this epoch if there is one, otherwise append a new row.
        # Both the record and the sheet may lack an "epoch" entry (e.g. evaluation
        # before any training step, or an unreadable sheet), so check before indexing.
        epoch = new_logs.get("epoch")
        if epoch is not None and "epoch" in df_existing.columns:
            epoch_rows = df_existing["epoch"] == epoch
        else:
            epoch_rows = None

        if epoch_rows is not None and epoch_rows.any():
            # Assign column by column; this also creates columns the sheet lacks.
            for column, value in new_logs.items():
                df_existing.loc[epoch_rows, column] = value
            df_to_save = df_existing
        else:
            df_new = pd.DataFrame([new_logs])
            # Skip the concat when there is nothing to concatenate with.
            if df_existing.empty:
                df_to_save = df_new
            else:
                df_to_save = pd.concat([df_existing, df_new], ignore_index=True)

        # Make sure the target folder exists, then save the DataFrame back to Excel
        parent_dir = os.path.dirname(metrics_dir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df_to_save.to_excel(metrics_dir, index=False)
        return control

### **Initialize Trainer**

Initialize the Trainer object with the model, training arguments, data collator, metrics computation, and datasets for training and evaluation.

In [None]:
# Keep a direct reference to the custom callback. The Trainer puts its own default
# callbacks ahead of user-supplied ones, so callback_handler.callbacks[0] is NOT
# this instance.
metrics_callback = CustomSaveCallback(trainer=None)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    optimizers=(optimizer, None),        # custom optimizer; the scheduler is built from training_args
    compute_metrics=compute_metrics,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    processing_class=processor,          # replaces the deprecated `tokenizer=` argument
    callbacks=[metrics_callback]
)

# Give the callback its back-reference to the trainer
metrics_callback.trainer = trainer

### **Create Model Card**

In [None]:
# Write the model card (README.md) for this run.
trainer.create_model_card(
    language="en",
    license="mit",                       # Hub license identifiers are lowercase
    tags=["image-classification", "fine-tuning", "image", "rice-leaf_disease"],
    model_name=model_name,
    finetuned_from=model_path,           # full "<owner>/<name>" ID, so the base model resolves on the Hub
    tasks=["image-classification"],
    dataset_tags=dataset_path,           # Hub ID of the dataset (goes into the card's `datasets` metadata)
    dataset=dataset_name,                # display name, paired with dataset_tags
    # Single string: dataset_args are paired one-to-one with dataset_tags
    dataset_args="size: 224x224, augmentation: true",
)

# **Model Training and Evaluation**

### **Start Fine-tuning Process**

Initiates the fine-tuning of the model using the Trainer, applying the specified training configurations, such as the batch size, learning rate, and number of epochs. During training, the model is evaluated on the validation dataset every `logging_steps` steps (about twice per epoch) using the compute_metrics function, which calculates accuracy.

The model will undergo the following process during fine-tuning:
- **Training**: The model will be trained on the training dataset for the specified number of epochs.
- **Evaluation**: Every `logging_steps` steps (about twice per epoch), the model will be evaluated on the validation dataset, and accuracy will be computed using the compute_metrics function.
- **Metrics Logging**: The training progress and evaluation results will be logged to TensorBoard and can be monitored during training.

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# Decide what to resume from. Passing resume_from_checkpoint=True to train() raises
# when output_dir holds no checkpoint yet (i.e. on the very first run), so look
# the checkpoint up first and fall back to a fresh start when there is none.
if isinstance(resume_from_checkpoint, str):
    # An explicit checkpoint path was configured: use it as is.
    checkpoint = resume_from_checkpoint
elif resume_from_checkpoint and os.path.isdir(output_dir):
    checkpoint = get_last_checkpoint(output_dir)
else:
    checkpoint = None

if resume_from_checkpoint and checkpoint is None:
    print(f"No checkpoint found in {output_dir}; starting training from scratch.")
elif checkpoint is not None:
    print(f"Resuming training from {checkpoint}")

print(f"Training {model_name} on {dataset_name} dataset...")
train_results = trainer.train(resume_from_checkpoint=checkpoint)

### **Save Model and Training State**

After the training process, the model and relevant training state are saved. This includes saving the model weights, training metrics, and the state of the trainer, ensuring that training progress can be restored if needed.

In [None]:
# Persist the trained model
trainer.save_model()

# Report the training metrics and write them to disk for later reference
final_train_metrics = train_results.metrics
trainer.log_metrics("train", final_train_metrics)
trainer.save_metrics("train", final_train_metrics)

# Persist the trainer state
# NOTE(review): this presumably stores the TrainerState (step counters, log history),
# not the optimizer; optimizer state lives in the checkpoints — confirm.
trainer.save_state()