## Working with a Manageable Sample  


Starting from this section, we will work with a smaller sample of our dataset. By using this sample, we can run and iterate faster, enabling quicker experimentation and refinement of our algorithms. Once we have fine-tuned our approach and are confident in its effectiveness, we can scale up to the full dataset for the final model training and evaluation. This approach will help us efficiently develop a robust Bloom Classifier while reducing processing time and computational resources during the experimentation phase. Let's make the most of this manageable sample to optimize our workflow and achieve our goals more efficiently.      

It's generally a good idea to make sure that our sample includes all classes, especially in a multi-class problem such as this. This is important because the model needs to learn features from all the classes during the training process. If some classes are missing in the training set, our model won't be able to recognize them during inference.

If the classes are imbalanced (which is the case here), we might also want to consider using stratified sampling, which ensures that the distribution of classes in your sample matches the distribution in the full dataset. This can help the model learn more effectively, particularly for classes that have fewer examples. 

## Training on a Sample dataset 

In [13]:
import torch
import copy
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torchvision import models
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import numpy as np
from scipy.io import loadmat

# Specify the size of the sample you want to create
sample_size = 800  # Adjust this as needed

# Create a StratifiedShuffleSplit instance
sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size, random_state=0)

# Load the data
image_folder = './data/jpg/'
image_files = [os.path.join(image_folder, file) for file in os.listdir(image_folder)]
labels_file = "./data/imagelabels.mat"
labels_data = loadmat(labels_file)
labels = labels_data['labels'][0] - 1  # Adjusting labels to be in the range 0-101

# Get index for the sample
for train_index, sample_index in sss.split(image_files, labels):
    image_files_sample = [image_files[i] for i in sample_index]
    labels_sample = [labels[i] for i in sample_index]

# Now use this sample to split into training and validation data
image_files_train, image_files_val, labels_train, labels_val = train_test_split(image_files_sample, labels_sample, test_size=0.2, stratify=labels_sample)

class FlowerDataset(Dataset):
    def __init__(self, image_files, labels, transform=None):
        self.image_files = image_files
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image = Image.open(self.image_files[idx])  # Open image file
        image = image.convert('RGB')  # Convert image to RGB channels
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# Define transforms for the training data and testing data
train_transforms = transforms.Compose([transforms.RandomRotation(30),
                                       transforms.RandomResizedCrop(224),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406],
                                                            [0.229, 0.224, 0.225])])

test_transforms = transforms.Compose([transforms.Resize(255),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406],
                                                           [0.229, 0.224, 0.225])])

# Create the custom dataset for training and validation data
dataset_train = FlowerDataset(image_files_train, labels_train, transform=train_transforms)
dataset_val = FlowerDataset(image_files_val, labels_val, transform=test_transforms)

# Create data loaders
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=64, shuffle=False)

dataloaders = {'train': dataloader_train, 'val': dataloader_val}

# Specify the device for computation
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Number of classes
num_classes = len(np.unique(labels))

dataset_sizes = {'train': len(dataset_train), 'val': len(dataset_val)}


# Get a pretrained ResNet model and modify the final layer
model = models.resnet50(pretrained=True)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
model = model.to(device)

# Define the criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Define a learning rate scheduler
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)


# Training function
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model

# Train the model
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=25)

# Save the model
torch.save(model.state_dict(), 'model_flowers.pth')




Epoch 0/24
----------
train Loss: 4.6788 Acc: 0.0141
val Loss: 4.6518 Acc: 0.0312

Epoch 1/24
----------
train Loss: 4.5820 Acc: 0.0328
val Loss: 4.5787 Acc: 0.0437

Epoch 2/24
----------
train Loss: 4.4959 Acc: 0.0312
val Loss: 4.5572 Acc: 0.0250

Epoch 3/24
----------
train Loss: 4.4374 Acc: 0.0406
val Loss: 4.5484 Acc: 0.0250

Epoch 4/24
----------
train Loss: 4.3977 Acc: 0.0484
val Loss: 4.5429 Acc: 0.0187

Epoch 5/24
----------
train Loss: 4.3644 Acc: 0.0563
val Loss: 4.5414 Acc: 0.0250

Epoch 6/24
----------
train Loss: 4.3198 Acc: 0.0594
val Loss: 4.5387 Acc: 0.0312

Epoch 7/24
----------
train Loss: 4.2934 Acc: 0.0609
val Loss: 4.5400 Acc: 0.0312

Epoch 8/24
----------
train Loss: 4.2867 Acc: 0.0516


KeyboardInterrupt: 

## Training on Full dataset 

In [3]:
import torch
import copy
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torchvision import models
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import numpy as np
from scipy.io import loadmat

# data directories

image_folder = './data/jpg/'
image_files = [os.path.join(image_folder, file) for file in os.listdir(image_folder)]

labels_file = "./data/imagelabels.mat"
labels_data = scipy.io.loadmat(labels_file)
labels = labels_data['labels'][0] - 1  # Adjusting labels to be in the range 0-101

# Create the custom dataset

class FlowerDataset(Dataset):
    def __init__(self, image_files, labels, transform=None):
        self.image_files = image_files
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image = Image.open(self.image_files[idx])  # Open image file
        image = image.convert('RGB')  # Convert image to RGB channels
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

In [16]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, random_split
from torchvision import models, transforms
from torch.optim import lr_scheduler
import time
import os
import copy

# Setup device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Specify the size of the sample you want to create
sample_size = 500  # Adjust this as needed

# Define the transform
transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Create dataset instance and apply the transform
dataset = FlowerDataset(image_files, labels, transform=transform)

# Split the dataset into training, validation and test sets
n_train = int(len(dataset) * 0.7)
n_valid = int(len(dataset) * 0.15)
n_test = len(dataset) - n_train - n_valid
dataset_train, dataset_valid, dataset_test = random_split(dataset, [n_train, n_valid, n_test])

# Create data loaders
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True)
dataloader_valid = DataLoader(dataset_valid, batch_size=64, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=64, shuffle=False)

# Specify number of classes
num_classes = 102

# Load pre-trained model
model = models.resnet50(pretrained=True)

# Update the classifier part of the pre-trained model
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            if phase == 'train':
                dataloader = dataloader_train
            else:
                dataloader = dataloader_valid

            for inputs, labels in dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / len(dataloader.dataset)
            epoch_acc = running_corrects.double() / len(dataloader.dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Train the model
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=25)

# Save the model
torch.save(model.state_dict(), 'model_flowers.pth')


Epoch 0/24
----------
