# ML4EO LAB5
## Transfer Learning

## Importing necessary packages

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import cv2
from glob import glob
import os
import pandas as pd
from sklearn.utils import shuffle
import random
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from PIL import Image

import torch.nn.functional as F
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Select the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

## Getting dataset

Link to the dataset:  https://drive.google.com/drive/folders/1-CCSBRiMvnNQFMvaSdi110vlM7mk2pbP?usp=sharing


In [None]:

# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Sanity check: open one training image and display it.
# Specify the path to your image
file_path = "/content/drive/MyDrive/NCST-YieldPrediction/RwandaDroneImagery/imagery/_modeling/train/banana/2564_Feb2019.png"

# Open the image
image = Image.open(file_path)

# Display the image
plt.imshow(image)
plt.axis('off')  # Turn off axis numbers
plt.show()

In [None]:
def count_images_in_directory(directory_path):
    """
    Count the ``.png`` files found in each immediate subdirectory of
    ``directory_path`` (e.g. the train or the test split folder).

    One line per subdirectory is printed with that subdirectory's count.
    Entries of ``directory_path`` that are not directories are ignored.

    Returns the total number of ``.png`` files over all subdirectories.
    """
    # Only this (case-sensitive) suffix is treated as an image.
    png_suffixes = ('.png',)

    grand_total = 0
    for entry in os.listdir(directory_path):
        class_dir = os.path.join(directory_path, entry)

        # Skip loose files sitting next to the class folders.
        if not os.path.isdir(class_dir):
            continue

        n_images = sum(1 for name in os.listdir(class_dir) if name.endswith(png_suffixes))

        print(f'Number of images in "{entry}" of {os.path.basename(directory_path)}: {n_images}')
        grand_total += n_images

    return grand_total

# Main train directory paths
# Root folder of the training split; each subdirectory is counted separately.
train_dir_path = '/content/drive/MyDrive/NCST-YieldPrediction/RwandaDroneImagery/imagery/_modeling/train'
total_train_images = count_images_in_directory(train_dir_path)
print(f'\nTotal number of images in training: {total_train_images}')

In [None]:
# Main test directory path
# Root folder of the held-out test split; each subdirectory is counted separately.
test_dir_path = '/content/drive/MyDrive/NCST-YieldPrediction/RwandaDroneImagery/imagery/_modeling/test'
total_test_images = count_images_in_directory(test_dir_path)
# The message names the test split (it previously said "training").
print(f'\nTotal number of images in test: {total_test_images}')

In [None]:
def create_dataframe_with_labels(directory_path):
    """
    Build a dataframe listing every ``.png`` image below ``directory_path``
    together with its class label.

    Each immediate subdirectory is treated as one class: the subdirectory
    name becomes the ``label_name`` of every ``.png`` file it contains, and
    the file's joined path goes into the ``Image`` column.

    Returns a ``pandas.DataFrame`` with the columns ``Image`` and ``label_name``.
    """
    png_suffixes = ('.png',)
    image_paths = []
    label_names = []

    for class_name in os.listdir(directory_path):
        class_dir = os.path.join(directory_path, class_name)
        if not os.path.isdir(class_dir):
            continue

        matches = [
            os.path.join(class_dir, file_name)
            for file_name in os.listdir(class_dir)
            if file_name.endswith(png_suffixes)
        ]
        image_paths.extend(matches)
        # Subdirectory name as label, repeated once per image found in it.
        label_names.extend([class_name] * len(matches))

    return pd.DataFrame({'Image': image_paths, 'label_name': label_names})

In [None]:
# Create the training dataframe (one row per image: path + class folder name)
train_df = create_dataframe_with_labels(train_dir_path)
train_df.sample(10) # Show 10 random rows from the dataframe

In [None]:
# Create the test dataframe (one row per image: path + class folder name)
test_df = create_dataframe_with_labels(test_dir_path)
test_df.sample(10) # Show 10 random rows from the dataframe

In [None]:
def plot_images(column_name, df, label_column="label_name"):
    """
    Plot a 2 x 6 grid of randomly sampled images with their labels as titles,
    to get a visual impression of the different crop types in the dataset.

    Parameters
    ----------
    column_name : str
        Name of the dataframe column holding the image file paths.
    df : pandas.DataFrame
        Dataframe containing ``column_name`` and ``label_column``.
    label_column : str, optional
        Name of the column whose value is used as the subplot title.
        Defaults to ``"label_name"``, the label column produced by
        ``create_dataframe_with_labels`` (the previous hard-coded ``"Label"``
        column does not exist in those dataframes and raised ``KeyError``).
    """
    n_rows, n_cols = 2, 6

    # Never ask for more rows than the dataframe has; df.sample would raise.
    sample_df = df.sample(min(n_rows * n_cols, len(df)), random_state=0)

    # Create subplots
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4))

    # Iterate over the subplots
    for i, ax in enumerate(axes.flat):
        ax.axis("off")
        if i >= len(sample_df):
            # Fewer samples than grid cells: leave the remaining cells blank.
            continue

        row = sample_df.iloc[i]

        # Plot the image and put its label on top; the file handle is
        # released as soon as the image has been handed to matplotlib.
        with Image.open(row[column_name]) as image:
            ax.imshow(image)
        ax.set_title(row[label_column])

    # Display the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Class name -> integer id; the id is the position of the name in this tuple.
labels_dict = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ('legumes', 'maize', 'banana', 'forest', 'other', 'structure')
    )
}

In [None]:
# Encode labels: add an integer "label" column derived from the folder name.
# A folder name that is not a key of labels_dict raises KeyError here.
train_df["label"] = [labels_dict[label] for label in train_df.label_name.values]
test_df["label"] = [labels_dict[label] for label in test_df.label_name.values]

In [None]:
# Split the training dataframe into training (80%) and validation (20%) parts.
# random_state is fixed so the split is reproducible.
training_df, validation_df = train_test_split(train_df, test_size=0.2, random_state=42)


In [None]:
# Image paths and integer labels of the held-out test set.
test_imgs, test_labels = test_df["Image"], test_df["label"]

In [None]:
# Pull the image-path and integer-label columns out of the training and validation dataframes
train_imgs, val_imgs, train_labels, val_labels = training_df["Image"], validation_df["Image"], training_df["label"], validation_df["label"]


# Every training image must have exactly one label.
assert len(train_imgs)==len(train_labels), 'Not equal'

len(train_imgs), len(val_imgs)

In [None]:
# Wrap the test columns in dataframes with a fresh 0..n-1 index so that the
# datasets below can look rows up by integer position.
test_imgs=pd.DataFrame(test_imgs).reset_index(drop=True)
test_labels=pd.DataFrame(test_labels).reset_index(drop=True)

In [None]:
# train_test_split keeps the original row index of each sample; rebuild a fresh
# 0..n-1 index so that the datasets below can look rows up by integer position.
train_imgs=pd.DataFrame(train_imgs).reset_index(drop=True)
val_imgs=pd.DataFrame(val_imgs).reset_index(drop=True)
train_labels=pd.DataFrame(train_labels).reset_index(drop=True)
val_labels=pd.DataFrame(val_labels).reset_index(drop=True)
train_imgs

## Let's visualize


In [None]:
# Draw six random row positions of the training set to preview.
# `high` is exclusive in np.random.randint, so passing len(train_imgs)
# (rather than len(train_imgs)-1) keeps the last row eligible.
random_idx = np.random.randint(low=0, high=len(train_imgs), size=6)
random_idx

In [None]:
# Classes: show the class-name -> integer-id mapping for reference
labels_dict

In [None]:
# Visualize six randomly chosen training images in a 2 x 3 grid.
# Each title shows the image array shape and the integer class id.
nrows=2
ncols=3
fig, ax=plt.subplots(nrows=nrows, ncols=ncols,figsize=(10,8))
i=0
for row in range(0,nrows):
    for col in range(0,ncols):
        # cv2.imread returns channels in BGR order while matplotlib expects
        # RGB, so convert before plotting; otherwise red and blue are swapped.
        img=cv2.cvtColor(cv2.imread(train_imgs["Image"][random_idx[i]]), cv2.COLOR_BGR2RGB)
        ax[row][col].imshow(img)
        ax[row][col].set_title(f'{img.shape}->{train_labels["label"][random_idx[i]]}')
        ax[row][col].axis(False)
        i+=1


## Dataset preparation  and building a data loader

In [None]:
class TrainDataset(Dataset):
    """
    Dataset that reads images from disk on demand and pairs them with labels.

    Parameters
    ----------
    img_list :
        Image file paths, indexable by integer position (0..n-1).
    label_list :
        Labels, indexable the same way as ``img_list``.
    """

    def __init__(self, img_list, label_list):
        self.img_s = img_list
        self.label_s = label_list

    def __len__(self):
        """Return the number of images."""
        return len(self.img_s)

    def __getitem__(self, idx):
        """
        Load sample ``idx``.

        Returns
        -------
        (image, label)
            ``image`` is a channel-first (C, H, W) tensor in RGB channel
            order with the pixel values exactly as stored in the file (no
            scaling or normalisation is applied here); ``label`` is the
            entry of ``label_list`` at ``idx``.

        Raises
        ------
        FileNotFoundError
            If OpenCV cannot read the image file.
        """
        image_path = self.img_s[idx]
        label = self.label_s[idx]

        # cv2.imread signals failure by returning None instead of raising.
        bgr = cv2.imread(image_path)
        if bgr is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")

        # OpenCV decodes to BGR; the pretrained torchvision backbone used in
        # this notebook was trained on RGB images, so reorder the channels.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        # (H, W, C) -> (C, H, W), the layout PyTorch convolution layers expect.
        image = torch.from_numpy(rgb).permute(2, 0, 1)

        return image, label

In [None]:
class ValDataset(Dataset):
    """
    Dataset used for the validation and test splits: reads images from disk
    on demand and pairs them with labels.

    Parameters
    ----------
    img_list :
        Image file paths, indexable by integer position (0..n-1).
    label_list :
        Labels, indexable the same way as ``img_list``.
    """

    def __init__(self, img_list, label_list):
        self.img_s = img_list
        self.label_s = label_list

    def __len__(self):
        """Return the number of images."""
        return len(self.img_s)

    def __getitem__(self, idx):
        """
        Load sample ``idx``.

        Returns
        -------
        (image, label)
            ``image`` is a channel-first (C, H, W) tensor in RGB channel
            order with the pixel values exactly as stored in the file (no
            scaling or normalisation is applied here); ``label`` is the
            entry of ``label_list`` at ``idx``.

        Raises
        ------
        FileNotFoundError
            If OpenCV cannot read the image file.
        """
        image_path = self.img_s[idx]
        label = self.label_s[idx]

        # cv2.imread signals failure by returning None instead of raising.
        bgr = cv2.imread(image_path)
        if bgr is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")

        # OpenCV decodes to BGR; the pretrained torchvision backbone used in
        # this notebook was trained on RGB images, so reorder the channels.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        # (H, W, C) -> (C, H, W), the layout PyTorch convolution layers expect.
        image = torch.from_numpy(rgb).permute(2, 0, 1)

        return image, label

In [None]:
# Creating data: wrap the path / label columns in Dataset objects
Train_data = TrainDataset(train_imgs["Image"],train_labels["label"])
val_data = ValDataset(val_imgs["Image"],val_labels["label"])
# Quick sanity check: dataset length plus the shape and label of the first sample
f"Len of train dataset: {len(Train_data)}->shape of image at idx 0: {Train_data[0][0].shape}->Label:{Train_data[0][1]}"

In [None]:
# The test split reuses ValDataset.
test_data = ValDataset(test_imgs["Image"], test_labels['label'])

In [None]:
# Mini-batch size used when constructing the DataLoaders below.
batch = 32

In [None]:
# Test loader: no shuffling, so the evaluation order is deterministic.
test_loader = DataLoader(
    dataset     = test_data,
    batch_size  = batch,
    shuffle     = False
)

In [None]:
# Data loading
# The training loader reshuffles the samples every epoch.
train_loader = DataLoader(
    dataset     = Train_data,
    batch_size  = batch,
    shuffle     = True
)

# The validation loader keeps a fixed order.
val_loader = DataLoader(
    dataset     = val_data,
    batch_size  = batch,
    shuffle     = False
)

In [None]:
# Check data loaders: number of mini-batches per epoch in each loader.
# The second figure comes from val_loader, so it is labelled "Val"
# (it was previously mislabelled "Test").
f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}"

In [None]:
# Peek at the first training batch only, to check the tensor shapes.
for i, data in enumerate(train_loader):
    img, labels = data
    print(img.shape, labels.shape)
    break

In [None]:
# Peek at the first validation batch only, to check the tensor shapes.
for i, data in enumerate(val_loader):
    img,labels = data
    print(img.shape,labels.shape)
    break

In [None]:
from torchvision.models import vgg16

In [None]:
# Load VGG16 with its pretrained weights.
# The constructor is reached through the module namespace: the model instance
# is bound to the name `vgg16` (later cells rely on that name), which replaces
# the imported constructor function of the same name. Calling `vgg16(...)`
# directly would therefore fail as soon as this cell is run a second time.
from torchvision import models as tv_models

vgg16 = tv_models.vgg16(pretrained=True)
vgg16

In [None]:
# Number of input features of the classification head.
# 25088 = 512 * 7 * 7; presumably the flattened VGG16 feature map -- confirm
# against the printed model architecture.
input_size = 25088

# Custom 6-class head: flatten -> 512 hidden units -> dropout -> 6 logits.
# There is deliberately NO Softmax at the end: the training criterion used in
# this notebook is CrossEntropyLoss, which applies log-softmax internally and
# expects raw logits. A Softmax layer here would apply softmax twice, which
# flattens the gradients and slows learning. argmax over the logits gives the
# same predicted class; use torch.softmax(outputs, dim=1) when probabilities
# are needed.
classifier = nn.Sequential(
    nn.Flatten(),
    nn.Linear(input_size, 512),
    nn.Sigmoid(),
    nn.Dropout(p=0.2),
    nn.Linear(512, 6),
)

In [None]:
# Swap the model's classifier attribute for the custom head defined above,
# then display the resulting architecture.
vgg16.classifier=classifier
vgg16

In [None]:
# Move the network to the selected device; `model` is the name used from here on.
model =vgg16.to(device)

## Let's check the model output

In [None]:
# Get the batch to test the model
one_batch = next(iter(train_loader))
# one_batch is (images, labels); keep only the images and move them to the device
my_batch = one_batch[0].to(device)
my_batch.shape,my_batch.dtype

In [None]:
# Testing the model with a single batch
# The images are cast to float before the forward pass.
model(my_batch.float())

In [None]:
# Print a layer-by-layer summary of the model (output shapes, parameter counts).

# torchinfo is installed on demand. Only a missing package triggers the
# install; any other error raised while importing is left to surface instead
# of being swallowed by a bare `except`.
try:
    import torchinfo
except ImportError:
    !pip install torchinfo
    import torchinfo

from torchinfo import summary
# The summary is computed for one 3-channel 200x200 input.
summary(model,input_size=(1,3,200,200))

In [None]:
# Count the trainable parameters: one entry per parameter tensor that has
# requires_grad set, then the overall total.

params = [tensor.numel() for tensor in model.parameters() if tensor.requires_grad]
sum(params)

## Ready to train our model

In [None]:
# Optimizer, loss function and learning-rate schedule
import torch.optim as optim

# Adam is given every parameter of the model, i.e. the whole network is
# optimized, not only the new classifier head.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
# Step schedule configured with step_size=2 and gamma=0.95.
scheduler =optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.95)

In [None]:
def train(model, dataloader, optimizer, criterion):
    """
    Train ``model`` for one epoch over ``dataloader``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; switched to training mode by this function.
    dataloader : iterable
        Yields ``(images, target)`` batches. Images are cast to float before
        the forward pass.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the parameters of ``model``.
    criterion : callable
        Loss function called as ``criterion(outputs, target)``.

    Returns
    -------
    (acc, total_loss) : tuple of float
        ``acc`` is the accuracy in percent over all samples seen in the
        epoch; ``total_loss`` is the mean of the per-batch losses.
    """
    model.train()

    # Progress Bar
    batch_bar   = tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train', ncols=5)

    num_correct = 0
    num_samples = 0
    total_loss  = 0.0

    for batch_idx, (images, target) in enumerate(dataloader):

        optimizer.zero_grad() # Zero gradients

        images, target = images.to(device), target.to(device)

        outputs = model(images.float())
        loss    = criterion(outputs, target)

        # Backpropagate and update the weights. Without these two calls the
        # loss is only measured and the model never learns.
        loss.backward()
        optimizer.step()

        # Update correct predictions & loss. Samples are counted explicitly
        # because the last batch may hold fewer items than the batch size.
        num_correct     += int((torch.argmax(outputs, dim=1) == target).sum())
        num_samples     += int(target.size(0))
        total_loss      += float(loss.item())

        # tqdm - adding info to be checked on the progress bar
        batch_bar.set_postfix(
            acc         = f"{(100 * num_correct / max(num_samples, 1)):.04f}%",
            loss        = f"{total_loss / (batch_idx + 1):.04f}",
            num_correct = num_correct,
            lr          = f"{optimizer.param_groups[0]['lr']:.04f}"
        )

        batch_bar.update() # Update tqdm bar

    batch_bar.close() # close the tqdm bar after each epoch

    # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError.
    acc         = 100 * num_correct / max(num_samples, 1)
    total_loss  = float(total_loss / max(len(dataloader), 1))

    return acc, total_loss

In [None]:
def validate(model, dataloader, criterion):
    """
    Evaluate ``model`` on ``dataloader`` without updating any weights.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate; switched to evaluation mode by this function.
    dataloader : iterable
        Yields ``(images, target)`` batches. Images are cast to float before
        the forward pass.
    criterion : callable
        Loss function called as ``criterion(outputs, target)``.

    Returns
    -------
    (acc, total_loss) : tuple of float
        ``acc`` is the accuracy in percent over all samples seen;
        ``total_loss`` is the mean of the per-batch losses.
    """
    model.eval()
    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc='Val', ncols=5)

    num_correct = 0
    num_samples = 0
    total_loss = 0.0

    for batch_idx, (images, target) in enumerate(dataloader):

        # Move images to device
        images, target = images.to(device), target.to(device)

        # Get model outputs; inference mode disables gradient tracking
        with torch.inference_mode():
            outputs = model(images.float())
            loss = criterion(outputs, target)

        # Samples are counted explicitly because the last batch may hold
        # fewer items than the batch size.
        num_correct += int((torch.argmax(outputs, dim=1) == target).sum())
        num_samples += int(target.size(0))
        total_loss += float(loss.item())

        batch_bar.set_postfix(
            acc="{:.04f}%".format(100 * num_correct / max(num_samples, 1)),
            loss="{:.04f}".format(float(total_loss / (batch_idx + 1))),
            num_correct=num_correct)

        batch_bar.update()

    batch_bar.close()

    # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError.
    acc = 100 * num_correct / max(num_samples, 1)
    total_loss = float(total_loss / max(len(dataloader), 1))
    return acc, total_loss

In [None]:
best_valAcc = 0.0 # Initialize the accuracy to help saving the best model

# Checkpoint destination. The folder is created up front because torch.save
# does not create missing directories.
# NOTE(review): this folder differs from the dataset root used earlier in the
# notebook (.../MyDrive/NCST-YieldPrediction/RwandaDroneImagery/...) -- confirm
# that this is the intended location.
path = '/content/drive/MyDrive/RwandaDroneImagery/checkpoint.pth'
os.makedirs(os.path.dirname(path), exist_ok=True)

Epochs = 5
for epoch in range(Epochs):

    # Learning rate in effect during this epoch (read before the scheduler steps)
    curr_lr = float(optimizer.param_groups[0]['lr'])

    train_acc, train_loss = train(model, train_loader, optimizer, criterion)

    print("\nEpoch {}/{}: \nTrain Acc {:.04f}%\t Train Loss {:.04f}\t Learning Rate {:.04f}".format(
        epoch + 1,
        Epochs,
        train_acc,
        train_loss,
        curr_lr))

    val_acc, val_loss = validate(model, val_loader, criterion)

    print("Val Acc {:.04f}%\t Val Loss {:.04f}".format(val_acc, val_loss))

    # learning rate scheduler, update
    scheduler.step()

    # Save the best model based on validation accuracy. `>=` means that a tie
    # overwrites the checkpoint with the more recent epoch.
    if val_acc >= best_valAcc:

        print("Saving model")
        torch.save({'model_state_dict':model.state_dict(),
                  'optimizer_state_dict':optimizer.state_dict(),
                  'scheduler_state_dict':scheduler.state_dict(),
                  'val_acc': val_acc,
                  'epoch': epoch}, path)
        best_valAcc = val_acc



In [None]:
# Run the model over the test set and collect true / predicted class ids.
# NOTE(review): this evaluates the weights as they are after the last epoch,
# not the checkpoint saved at the best validation accuracy -- confirm that this
# is intended.
model.eval()

true_labels = []
predicted_labels = []

# Inference mode disables gradient tracking, so no autograd graph is built for
# the forward passes (less memory, faster evaluation).
with torch.inference_mode():
    for images, labels in test_loader:

        images, labels = images.to(device), labels.to(device)
        outputs = model(images.float())

        # Store true labels and predicted labels for F1-score calculation
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(torch.argmax(outputs, dim=1).cpu().numpy())


In [None]:
# Integer id -> class name; the id is the position of the name in this tuple.
inv_labels_dict = dict(enumerate(('legumes', 'maize', 'banana', 'forest', 'other', 'structure')))

In [None]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np


# Class ids that occur in the ground truth or in the predictions.
# They are passed explicitly as `labels` together with matching class names:
# without `labels`, classification_report raises ValueError whenever the model
# predicts a class that is absent from the true labels, because the number of
# target names then no longer matches the number of classes it detects.
class_ids = [int(c) for c in np.unique(list(true_labels) + list(predicted_labels))]
class_names = [inv_labels_dict[c] for c in class_ids]

# Compute classification report (a dict keyed by class name plus summary rows)
report = classification_report(true_labels, predicted_labels,
                               labels=class_ids,
                               target_names=class_names,
                               output_dict=True)

# Create a dictionary to hold evaluation metrics per class.
# Looking the classes up by name skips the summary rows of the report without
# having to enumerate their keys.
metrics_per_class = {}
for class_name in class_names:
    metrics = report[class_name]
    metrics_per_class[class_name] = {
        'F1 Score': metrics['f1-score'],
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
    }

# Add overall metrics (unweighted mean over the classes, i.e. the macro average)
metrics_per_class['Overall'] = {
    'F1 Score': report['macro avg']['f1-score'],
    'Precision': report['macro avg']['precision'],
    'Recall': report['macro avg']['recall'],
}

# Print the table
print("| {:<10} | {:<9} | {:<9} | {:<6} |".format("Class", "F1 Score", "Precision", "Recall"))
print("|" + "-"*12 + "|" + "-"*11 + "|" + "-"*11 + "|" + "-"*8 + "|")
for class_name, metrics in metrics_per_class.items():
    print("| {:<10} | {:<9.2f} | {:<9.2f} | {:<6.2f} |".format(class_name,
                                                              metrics['F1 Score'],
                                                              metrics['Precision'],
                                                              metrics['Recall'],
                                                              ))


## To Do
Discuss the benefits (5 marks) and limitations (5 marks) of transfer learning in deep learning applications.