In [1]:
#imports
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
import os
import cv2 as cv
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import random

In [2]:
#needs to be more reproducable
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [3]:
#load some data 
trainNames = pd.read_csv('./imagenet_train20a1.txt', sep=r"\s+", names=["imageName", "label"]) #have space inbetween name and label, tab between rows 
trainNames["folder"] = trainNames["imageName"].str.extract(r"(n\d+)_[0-9]+") #need to seperate out folder for later usage
valNames = pd.read_csv('./imagenet_val20.txt', sep=r"\s+", names=["imageName", "label"]) #we can do straight
display(trainNames.iloc[0:5, :]) #to check if they exist
display(valNames.iloc[0:5, :])

Unnamed: 0,imageName,label,folder
0,n01737021_10059.JPEG,0,n01737021
1,n01737021_10083.JPEG,0,n01737021
2,n01737021_10095.JPEG,0,n01737021
3,n01737021_10163.JPEG,0,n01737021
4,n01737021_10235.JPEG,0,n01737021


Unnamed: 0,imageName,label
0,ILSVRC2012_val_00000043.JPEG,1
1,ILSVRC2012_val_00000084.JPEG,0
2,ILSVRC2012_val_00000098.JPEG,4
3,ILSVRC2012_val_00000100.JPEG,8
4,ILSVRC2012_val_00000133.JPEG,6


In [4]:
#dataset checking
print(len(trainNames), len(valNames))
print(min(trainNames['label']), max(trainNames['label']), min(valNames['label']), max(valNames['label']))

6000 1000
0 19 0 19


In [5]:
#validation loading, was inefficent so changed
# valPath = "./imagenet_val20/imagenet_val20"
# def valLoadImage(name, path):
#     imagePath = os.path.join(path, name)
#     img = cv.imread(imagePath)
#     img = cv.resize(img, (224, 224)) #as we want 224 x 224 for alexnet 
#     if img is None:
#         print(f"Failed to load image: {imagePath}")
#     return img
# valFinal = pd.DataFrame()
# valFinal["image"] = val["imageName"].apply(lambda x: valLoadImage(x, valPath))
# valFinal["label"] = val["label"]

In [6]:
#train loading, was inefficent so changed
# trainPath = "./imagenet_train20a/imagenet_train20a"
# def trainLoadImage(name, folder, path): #note that within the train folder we first have image folders which have the images
#     newPath = os.path.join(path, folder)
#     return valLoadImage(name, newPath)
# trainFinal = pd.DataFrame()
# trainFinal["image"] = train[["imageName", "folder"]].apply(lambda row: trainLoadImage(row["imageName"], row["folder"], trainPath), axis=1)
# trainFinal["label"] = train["label"]

In [7]:
#need to actually put into torch rather than numpy, was inefficent so changed
# def convert_image(image):
#     return torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
# trainFinal["image"] = trainFinal["image"].apply(convert_image)
# valFinal["image"] = valFinal["image"].apply(convert_image)

# trainFinal["label"] = pd.Series(torch.from_numpy(trainFinal["label"].to_numpy()))
# valFinal["label"] = pd.Series(torch.from_numpy(valFinal["label"].to_numpy()))

In [11]:
#class to load better
class ImageNetDataset(Dataset):
    def __init__(self, df, base_path, is_train=True):
        self.df = df
        self.base_path = base_path
        self.is_train = is_train
        if self.is_train:
            self.transform = transforms.Compose([
                transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
                transforms.RandomRotation(15),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.4693, 0.4370, 0.3801], #values from below 
                                     std=[0.2276, 0.2210, 0.2184])
            ])
        else:
            self.transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.4693, 0.4370, 0.3801], #assume should be same values
                                     std=[0.2276, 0.2210, 0.2184])
            ])
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        if self.is_train:
            folder = row['folder']
            img_path = os.path.join(self.base_path, folder, row['imageName'])
        else:
            img_path = os.path.join(self.base_path, row['imageName'])

        img = Image.open(img_path).convert('RGB')
        img = self.transform(img)
        label = row['label']
        return img, label

In [12]:
#the actual loading
train_path = './imagenet_train20a/imagenet_train20a'
val_path = './imagenet_val20/imagenet_val20'

trainDS = ImageNetDataset(trainNames, train_path, is_train=True)
valDS = ImageNetDataset(valNames, val_path, is_train=False)

trainLoad = DataLoader(trainDS, batch_size=32, shuffle=True, num_workers=0)
valLoad = DataLoader(valDS, batch_size=32, shuffle=False, num_workers=0)

In [10]:
from tqdm import tqdm

def compute_mean_std(dataloader):
    mean = 0.0
    std = 0.0
    total_images_count = 0

    for images, _ in tqdm(dataloader):
        # Images shape: (batch_size, channels, height, width)
        batch_samples = images.size(0)  # batch size (number of images)
        images = images.view(batch_samples, images.size(1), -1)  # Flatten H and W
        
        mean += images.mean(2).sum(0)
        std += images.std(2).sum(0)
        total_images_count += batch_samples

    mean /= total_images_count
    std /= total_images_count

    return mean, std
mean, std = compute_mean_std(trainLoad)
print("Mean:", mean)
print("Std:", std)

100%|██████████| 188/188 [00:20<00:00,  9.23it/s]

Mean: tensor([0.4693, 0.4370, 0.3801])
Std: tensor([0.2276, 0.2210, 0.2184])





In [13]:
#model
class AlexNet(nn.Module): 
    def __init__(self):
        super(AlexNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 96, kernel_size=11, padding=2, stride=4) #kernel size of 11, stride of 4, padding is 2 if we want to get 55x55
        self.conv2 = nn.Conv2d(96, 256, kernel_size=5, padding=2, stride=1) #k=5, s=1, p=2 if we want 27 to 27
        self.conv3 = nn.Conv2d(256, 384, kernel_size=3, padding=1, stride=1) #k=3, s=1, p=1 for all 13 to 13 convs 
        self.conv4 = nn.Conv2d(384, 384, kernel_size=3, padding=1, stride=1)
        self.conv5 = nn.Conv2d(384, 256, kernel_size=3, padding=1, stride=1)

        self.fc1 = nn.Linear(9216, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 20)

        self.relu = nn.LeakyReLU() #first few kept dying so switched to leaky relu 
        #self.softmax = nn.Softmax(dim=1) #apparently cross entropy loss already does softmax so better to not have
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2) #as we do a 3x3 pooling with stride of 2 
        self.dropout = nn.Dropout(p=0.5) #apparently makes overfitting less lkeley

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        Z1 = self.pool(self.relu(self.conv1(x)))
        Z2 = self.pool(self.relu(self.conv2(Z1)))
        Z3 = self.relu(self.conv3(Z2))
        Z4 = self.relu(self.conv4(Z3))
        Z5 = self.pool(self.relu(self.conv5(Z4)))
        flatten = torch.flatten(Z5, start_dim=1) #so now is batch size, flatten channels 
        x1 = self.relu(self.fc1(flatten))
        x1 = self.dropout(x1)
        x2 = self.relu(self.fc2(x1))
        x2 = self.dropout(x2)
        x3 = self.fc3(x2)
        return x3

In [14]:
#make model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AlexNet()
model.to(device)
print(device)

cuda


In [15]:
#other stuff we need before actual
def calcAcc(X, Y):
    predictions = torch.argmax(X, dim=1)
    return (predictions == Y).sum().item()/len(Y)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
num_epochs = 50
#endEarly = 20 #when I was infinitely looping 

In [13]:
#too expensive so will do data loading, redundant as change process 
# from torch.utils.data import DataLoader, TensorDataset

# X = torch.stack(trainFinal["image"].tolist())  # Make sure images are tensors
# Y = torch.tensor(trainFinal["label"].values)

# trainDS = TensorDataset(X, Y)
# trainLoad = DataLoader(trainDS, batch_size=128, shuffle=True, num_workers=4)

# val_images = torch.stack(valFinal["image"].tolist())
# val_labels = torch.tensor(valFinal["label"].values, dtype=torch.long)

# valDS = TensorDataset(val_images, val_labels)
# valLoad = DataLoader(valDS, batch_size=128, shuffle=False, num_workers=4)

In [16]:
#actual
#currBestAcc = -1
count = 0
for epoch in range(num_epochs):
    model.train()
    trainLoss, correct, total = 0.0, 0, 0 #just set all here
    for inputs, labels in trainLoad:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()
        trainLoss += loss.item() * inputs.size(0)
        batchAcc = calcAcc(outputs, labels)
        correct += batchAcc * inputs.size(0)
        total += inputs.size(0)
    trainLoss /= len(trainLoad.dataset)
    trainAcc = correct / total


    model.eval()
    with torch.no_grad():
        valLoss, correct, total = 0.0, 0, 0
        for inputs, labels in valLoad:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            valLoss += criterion(outputs, labels).item() * inputs.size(0)
            batchAcc = calcAcc(outputs, labels)
            correct += batchAcc * inputs.size(0)
            total += inputs.size(0)
    valLoss /= total
    valAcc = correct / total

    # if valAcc > currBestAcc:
    #     currBestAcc = valAcc
    #     count = 0
    # else:
    #     count += 1
    #     if count >= endEarly:
    #         break

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {trainLoss:.4f}, Train Acc: {trainAcc:.4f},  Val Loss: {valLoss:.4f}, Val Acc: {valAcc:.4f}")

Epoch 1/50, Train Loss: 2.8654, Train Acc: 0.1082,  Val Loss: 2.7121, Val Acc: 0.1650
Epoch 2/50, Train Loss: 2.5961, Train Acc: 0.1927,  Val Loss: 2.5715, Val Acc: 0.2110
Epoch 3/50, Train Loss: 2.3806, Train Acc: 0.2633,  Val Loss: 2.3892, Val Acc: 0.2720
Epoch 4/50, Train Loss: 2.1836, Train Acc: 0.3160,  Val Loss: 2.2017, Val Acc: 0.3140
Epoch 5/50, Train Loss: 1.9954, Train Acc: 0.3707,  Val Loss: 2.1550, Val Acc: 0.3390
Epoch 6/50, Train Loss: 1.8733, Train Acc: 0.3942,  Val Loss: 2.0705, Val Acc: 0.3370
Epoch 7/50, Train Loss: 1.7592, Train Acc: 0.4388,  Val Loss: 1.9098, Val Acc: 0.4040
Epoch 8/50, Train Loss: 1.6404, Train Acc: 0.4755,  Val Loss: 2.0433, Val Acc: 0.3940
Epoch 9/50, Train Loss: 1.5310, Train Acc: 0.5035,  Val Loss: 1.8608, Val Acc: 0.4230
Epoch 10/50, Train Loss: 1.4144, Train Acc: 0.5475,  Val Loss: 1.8642, Val Acc: 0.4250
Epoch 11/50, Train Loss: 1.3419, Train Acc: 0.5650,  Val Loss: 1.9491, Val Acc: 0.4370
Epoch 12/50, Train Loss: 1.2283, Train Acc: 0.6010, 

In [15]:
#failed first one as too little left in cuda
# for epoch in range(num_epochs):
#     #set up
#     model.train()
#     trainLoss = 0.0
#     A0 = torch.stack(trainFinal["image"].apply(lambda x: x.to(device)).tolist())
#     Y =  torch.tensor(trainFinal["label"].values, dtype=torch.long).to(device)

#     #forward
#     output = model(A0)
#     loss = criterion(output, Y)

#     #backward
#     optimizer.zero_grad()  # clear the gradients of all optimized variables
#     loss.backward()
#     optimizer.step()
#     train_loss += loss.item()

#     #see how good with val
#     model.eval()
#     output = model(valFinal["image"].to(device))
#     valLoss = criterion(output, A0)
#     valAcc = calculate_accuracy(output, A0)

#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {trainLoss:.4f}, "
#           f"Val Acc: {valAcc:.4f}, Val Loss: {valLoss:.4f}")