### Transfer learning

Install and import packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import torch as th
from torch.utils.data import DataLoader
import torchvision as tv
from torchvision import transforms
import os
from tqdm import tqdm
from xml.dom import minidom


### Load dataset
Maybe normalize dataset

In [None]:

from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms.functional as TF
import torchvision.transforms as tttttt
import random

# Seed Python's and NumPy's global random generators so the random.shuffle calls below are repeatable.
# NOTE(review): torch's own generator (DataLoader shuffling, random_split) is not seeded here — confirm whether that is intended.
random.seed(5)
np.random.seed(100)

def initPathsBase():
    """Return the ``.jpg`` file names found in ``./data/images/`` in shuffled order.

    The shuffle uses the module-level ``random`` generator, so the resulting
    order depends on that generator's current state.
    """
    jpgList = [entry for entry in os.listdir("./data/images/") if entry.endswith(".jpg")]
    random.shuffle(jpgList)
    return jpgList

def initClasses(binary):
    """Return the sorted list of distinct class labels found among the image files.

    The labels are sorted so that the label -> index mapping is the same on
    every call. Previously the order followed the shuffled file listing, so two
    datasets built one after the other could disagree on what index a class has.

    Args:
        binary: Passed through to ``getClass``; selects cat/dog labels
            instead of per-breed labels.

    Returns:
        A list of unique label strings in ascending order.
    """
    # A set removes duplicates in O(1) per file instead of scanning a list each time.
    return sorted({getClass(binary, fname) for fname in initPathsBase()})
    
    
def initClassesCatDog(binary):
    """Return the fixed label list used for binary classification.

    The previous implementation listed the image directory and built a class
    list that was then discarded; the result was always ``["cat", "dog"]``.
    That dead work (disk access plus a shuffle on the global RNG) is removed.

    Args:
        binary: Unused; kept so existing callers keep working.

    Returns:
        A new list ``["cat", "dog"]`` (index 0 is cat, index 1 is dog).
    """
    return ["cat", "dog"]

def getClassCatDog(fileName):
    """Return "cat" when the first character of ``fileName`` is uppercase, otherwise "dog"."""
    return "cat" if fileName[0].isupper() else "dog"
    
def cleanFileName(name):
    """Strip a leading "AUG" marker from ``name``; names without it are returned unchanged."""
    return name[3:] if name.startswith("AUG") else name
    
    # Classifies data into Cat/Dog or into one of the 37 classes of breeds
def getClass(binary, fileName):
    """Derive the class label from an image file name.

    A leading "AUG" marker is ignored. With ``binary`` truthy the label is
    "cat" or "dog"; otherwise it is the file name's underscore-separated
    parts joined together, leaving out any part that contains ".jpg".
    """
    baseName = cleanFileName(fileName)
    if binary:
        return getClassCatDog(baseName)
    # The part holding ".jpg" is the trailing "<number>.jpg" piece, not part of the label.
    return "".join(piece for piece in baseName.split("_") if ".jpg" not in piece)

    
def open_image(path): # https://jovian.ai/aakashns/transfer-learning-pytorch
    """Open the image file at ``path`` and return it converted to RGB."""
    with open(path, 'rb') as handle:
        return Image.open(handle).convert('RGB')
    
def getDataLists(train_fraction=0.7, seed=5):
    """Split the image file names into a training list and a held-out list.

    ``initPathsBase`` reshuffles with the global RNG on every call, so relying
    on its order gave a different split each time this function was called and
    let training images end up in a later validation/test set. The names are
    therefore sorted and shuffled with a dedicated, seeded generator, which
    makes the split identical across calls.

    Args:
        train_fraction: Share of the files placed in the training list.
        seed: Seed for the private shuffle generator.

    Returns:
        ``(train_files, held_out_files)`` as two lists of file names.
    """
    files = sorted(initPathsBase())
    random.Random(seed).shuffle(files)
    cut = int(len(files) * train_fraction)
    return files[:cut], files[cut:]

class MyDataset(Dataset):
    """Dataset over the images in ``./data/images/`` with augmentation and pseudo-labelling.

    Entries of ``self.files`` are file names that may carry marker prefixes:

    * ``"AUG"`` - the image is loaded through the augmenting transform.
    * ``"PSU"`` - the label is not taken from the file name but predicted by
      ``self.model`` (pseudo-label). A ``"PSU"`` prefix may be followed by ``"AUG"``.
    """

    def __init__(self, binary=False, limit=1, train=True, paths=None, percentage=1.0, pseu=False, moddel=None):
        """Build the file list, class list and transforms.

        Args:
            binary: Cat/dog labels when truthy, per-breed labels otherwise.
            limit: Fraction of the final file list that is kept.
            train: When truthy, augmented copies are added and ``percentage`` / ``pseu`` apply.
            paths: List of image file names this dataset is built from.
            percentage: Fraction of the (augmented, shuffled) training list that keeps its labels.
            pseu: When truthy, the remaining training files are kept as pseudo-labelled items.
            moddel: Model used to predict labels for pseudo-labelled items.
        """
        super().__init__()
        self.size = 224
        self.model = moddel
        self.binary = binary
        self.train = train
        self.pseu = pseu  # must be set before initPaths, which reads it
        self.files = self.initPaths(limit=limit, train=train, li=paths, percentage=percentage)
        self.classes = initClasses(binary) if not binary else initClassesCatDog(binary)
        # TODO: fix transforms better
        self.transform = self.getTransform()
        self.augTransform = self.getAugTransform()

    def __len__(self):
        """Return the number of entries in the file list."""
        return len(self.files)

    @staticmethod
    def _sourceName(name):
        """Return ``name`` without its leading "PSU" and/or "AUG" markers."""
        if name[:3] == "PSU":
            name = name[3:]
        if name[:3] == "AUG":
            name = name[3:]
        return name

    def __getitem__(self, i, stats=False):
        """Return ``(image, class_index)`` for entry ``i``.

        With ``stats`` truthy the untransformed PIL image is returned (used by
        ``getStats``); it is neither normalised nor moved to the GPU. Otherwise
        the image is a transformed tensor on the "cuda" device.
        """
        name = self.files[i]

        if stats:
            # Marker prefixes are not part of the file on disk, so strip them first.
            source = self._sourceName(name)
            img = open_image("./data/images/" + source)
            return img, self.classes.index(self.getClass(source))

        if name[:3] == "PSU":
            bild = name[3:]
            return self.calcClass(bild, "./data/images/" + bild)

        if name[:3] == "AUG":
            img = self.augTransform(open_image("./data/images/" + name[3:]))
        else:
            img = self.transform(open_image("./data/images/" + name))

        return img.to("cuda"), self.classes.index(self.getClass(name))

    def calcClass(self, bild, path):
        """Load an image and label it with the prediction of ``self.model``.

        Args:
            bild: File name (without the "PSU" marker, possibly with "AUG").
            path: Path used to load the image when ``bild`` has no "AUG" marker.

        Returns:
            ``(image_tensor_on_cuda, predicted_class_index)``.

        Raises:
            RuntimeError: If no model was supplied for pseudo-labelling.
        """
        if self.model is None:
            raise RuntimeError("pseudo-labelled item requested but no model was given")

        if bild[:3] == "AUG":
            img = self.augTransform(open_image("./data/images/" + bild[3:]))
        else:
            img = self.transform(open_image(path))
        img = img.to("cuda")

        # Pure inference: add a batch dimension and skip gradient tracking.
        with th.no_grad():
            klass = self.model(img.unsqueeze(0))[0].argmax()

        return img, klass.item()

    def getClass(self, fileName):
        """Return the label string for ``fileName`` using this dataset's ``binary`` setting."""
        return getClass(self.binary, fileName)

    def getTransform(self):
        """Return the plain pipeline: resize, centre crop, to tensor, normalise."""
        return transforms.Compose([
            transforms.Resize(255),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])

    def getAugTransform(self):
        """Return the augmenting pipeline: random flip and rotation around the plain steps."""
        return transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.7),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
            transforms.RandomRotation(degrees=(-30, 30)),
            transforms.Resize(255),
            transforms.CenterCrop(224),
        ])

    def getCenterBoundingBox(self, fileName):
        """Read the first bounding box from the annotation XML that belongs to ``fileName``.

        Returns:
            ``(xmin, xmax, ymin, ymax)`` as ints, or ``(0, 0, 0, 0)`` when the
            annotation file is missing or cannot be read (best effort).
        """
        try:
            file = minidom.parse('./annotations/xmls/' + fileName.replace(".jpg", '.xml'))
            xmin = file.getElementsByTagName('xmin')[0].firstChild.data
            xmax = file.getElementsByTagName('xmax')[0].firstChild.data
            ymin = file.getElementsByTagName('ymin')[0].firstChild.data
            ymax = file.getElementsByTagName('ymax')[0].firstChild.data
            return int(xmin), int(xmax), int(ymin), int(ymax)
        except Exception:
            # Deliberately best effort, but no longer swallows KeyboardInterrupt/SystemExit.
            return 0, 0, 0, 0

    def getStats(self):
        """Compute, print and return the per-channel mean and std of the dataset's pixels.

        Images go through the same resize and centre crop as ``getTransform``
        but are not normalised.

        Returns:
            ``(mean, std)``, each a list with one float per colour channel.

        Raises:
            ValueError: If the dataset is empty.
        """
        count = len(self)
        if count == 0:
            raise ValueError("cannot compute statistics of an empty dataset")

        toTensor = transforms.Compose([
            transforms.Resize(255),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ])

        # Accumulate per-image channel moments in float64 to limit rounding error.
        firstMoment = th.zeros(3, dtype=th.float64)
        secondMoment = th.zeros(3, dtype=th.float64)
        for i in range(count):
            img, _ = self.__getitem__(i, stats=True)
            pixels = toTensor(img).double()  # channels-first tensor from ToTensor
            firstMoment += pixels.mean(dim=(1, 2))
            secondMoment += (pixels ** 2).mean(dim=(1, 2))

        meanTensor = firstMoment / count
        # Var[x] = E[x^2] - E[x]^2; clamp guards against tiny negative rounding results.
        stdTensor = (secondMoment / count - meanTensor ** 2).clamp(min=0).sqrt()

        mean = meanTensor.tolist()
        std = stdTensor.tolist()
        print(mean, std)
        return mean, std

    def getCropStats(self):
        """Print and return mean and std of the bounding-box centres of this dataset's files.

        Files without a readable annotation (reported as ``(0, 0, 0, 0)``) are skipped.

        Returns:
            ``(mean_x, mean_y, std_x, std_y)`` as floats, or ``None`` when no
            bounding box was found.
        """
        centersX = []
        centersY = []

        for file in self.files:
            xmin, xmax, ymin, ymax = self.getCenterBoundingBox(file)
            if (xmin, xmax, ymin, ymax) == (0, 0, 0, 0):
                continue
            # Store each box centre itself; the std must not be taken over running sums.
            centersX.append((xmax + xmin) / 2)
            centersY.append((ymax + ymin) / 2)

        if not centersX:
            print("No bounding boxes found")
            return None

        x_np = np.asarray(centersX)
        y_np = np.asarray(centersY)

        x_mean = float(np.mean(x_np))
        y_mean = float(np.mean(y_np))
        print("Average x:", x_mean, "Average y:", y_mean)

        x_std = float(np.std(x_np))
        y_std = float(np.std(y_np))
        print("STD x:", x_std, "STD y:", y_std)

        return x_mean, y_mean, x_std, y_std

    def addAugmented(self, jpgList):
        """Return ``jpgList`` followed by an "AUG"-prefixed copy of each entry (doubles the data)."""
        return list(jpgList) + ["AUG" + jpg for jpg in jpgList]

    def addPseudo(self, jpgList):
        """Return a new list with every entry of ``jpgList`` prefixed by "PSU"."""
        return ["PSU" + jpg for jpg in jpgList]

    def initPaths(self, limit, train, li, percentage):
        """Build the list of (possibly prefixed) file names served by this dataset.

        For training, augmented copies are added and the list is shuffled with
        the global ``random`` generator. The first ``percentage`` of it keeps
        its labels; when ``self.pseu`` is set, the rest is kept as "PSU" items
        placed in front. Finally only the first ``limit`` fraction is returned.
        """
        jpgList = li
        if train:
            shuffled = self.addAugmented(li)
            random.shuffle(shuffled)
            cut = int(len(shuffled) * percentage)
            jpgList = shuffled[:cut]
            if self.pseu:
                jpgList = self.addPseudo(shuffled[cut:]) + jpgList

        return jpgList[0:int(len(jpgList) * limit)]
    


In [None]:
from torch.utils.data import random_split

def initDataset(batch_size, binary): #batch size affects computation time
    """Build the training dataset and the train/test/validation loaders.

    The file names from ``getDataLists`` give the training part and a held-out
    part; the held-out part is split in half into validation and test sets
    (70/15/15 overall with the default 70% training share).

    Args:
        batch_size: Batch size used for all three loaders.
        binary: Cat/dog labels when truthy, per-breed labels otherwise.

    Returns:
        ``(train_loader, test_loader, valid_loader, train_dataset)`` - note
        that the test loader comes before the validation loader.
    """
    trainPaths, testValPaths = getDataLists()
    train = MyDataset(binary, train=True, paths=trainPaths)
    testValDataset = MyDataset(binary, train=False, paths=testValPaths)
    # Both datasets must map labels to the same indices, so share one class list.
    testValDataset.classes = train.classes

    # The sizes must add up to the dataset length exactly; the former
    # "half + 1, half" only did so for odd lengths and raised for even ones.
    n_test = len(testValDataset) // 2
    n_valid = len(testValDataset) - n_test
    # A seeded generator keeps the validation/test membership the same on every call.
    valid, test = random_split(testValDataset, [n_valid, n_test],
                               generator=th.Generator().manual_seed(0))

    train_loader = th.utils.data.DataLoader(train,
                                            batch_size=batch_size,
                                            shuffle=True)

    test_loader = th.utils.data.DataLoader(test,
                                            batch_size=batch_size,
                                            shuffle=True)
    valid_loader = th.utils.data.DataLoader(valid,
                                            batch_size=batch_size,
                                            shuffle=True)

    return train_loader , test_loader, valid_loader, train

# Build the loaders for 37-way breed classification (binary=False) with batches of 32.
# initDataset returns the test loader before the validation loader.
train_loader, test_loader, valid_loader, train = initDataset(batch_size=32, binary=False)

### Initialize model

In [None]:
# ResNet-18 from torchvision, requested with pretrained weights; it is fine-tuned in the cells below.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over `pretrained=` — confirm against the installed version.
model = tv.models.resnet18(progress = True, pretrained=True)
# model.eval()

In [None]:

# Freeze every pretrained parameter; selected parts are unfrozen again further down.
moreLayers = True
for param in model.parameters():
    param.requires_grad = False

# Raise the BatchNorm momentum of the "bn" layers inside layer3 and layer4.
# named_modules() returns a generator that can be consumed only once, so both
# layers are handled in one pass (a second loop over the same generator never ran,
# which left layer3 untouched).
for name, module in model.named_modules():
    if isinstance(module, th.nn.BatchNorm2d) and "bn" in name:
        if "layer4" in name or "layer3" in name:
            module.momentum = 0.2

# Replace the last layer of the pretrained model with our own.
# A newly created layer has requires_grad = True by default, so it is trainable.
model.fc = th.nn.Linear(model.fc.in_features, 37) # 37 if not binary

# Unfreeze the last two residual stages so they are fine-tuned as well.
for param in model.layer4.parameters():
    param.requires_grad = True

for param in model.layer3.parameters():
    param.requires_grad = True

In [None]:
# Move the model to the GPU; the datasets also return tensors on "cuda".
model.to("cuda")
# Trailing statement so the cell's last expression is not model.to's return value.
pass

### Train the model

In [None]:
# Train the model on our dataset
# Train the model on our dataset
def train_model(model, train_loader, valid_loader, epochs=5, lr=10**-3, weight_decay=0.0, sheduler_gamma=0.9):
    """Fine-tune ``model`` in place and report validation accuracy after every epoch.

    The optimizer is Adam with two parameter groups: ``model.fc`` at ``lr`` and
    ``model.layer4`` at ``lr / 10``. All learning rates are multiplied by
    ``sheduler_gamma`` after each epoch.

    Args:
        model: Network with ``fc`` and ``layer4`` attributes.
        train_loader: Iterable of ``(images, labels)`` training batches.
        valid_loader: Iterable of ``(images, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate of the ``fc`` group.
        weight_decay: Weight decay passed to Adam.
        sheduler_gamma: Per-epoch multiplicative learning-rate decay.

    Returns:
        The same ``model`` object, trained.
    """
    # Run on whatever device the model lives on (identical to "cuda" for a model on the GPU).
    device = next(model.parameters()).device

    criterion = th.nn.CrossEntropyLoss().to(device)
    optimizer = th.optim.Adam(model.fc.parameters(), lr=lr, weight_decay=weight_decay)
    # Lower learning rate for the pretrained stage than for the new head.
    # NOTE(review): layer3 is unfrozen elsewhere in this notebook but is not optimized here,
    # unlike in train_pseudo_model — confirm whether that is intended.
    optimizer.add_param_group({'params': model.layer4.parameters(), 'lr': lr/10})

    scheduler = th.optim.lr_scheduler.ExponentialLR(optimizer, gamma=sheduler_gamma)

    for epoch in range(epochs):
        # Remember the rates this epoch actually trains with (the scheduler changes them below).
        epoch_lrs = [group['lr'] for group in optimizer.param_groups]

        # Training
        model.train()
        for batch in tqdm(train_loader):
            images = batch[0].to(device)
            labels = batch[1].to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()  # gradients accumulate otherwise
            loss.backward()
            optimizer.step()
        scheduler.step()

        # Validation
        model.eval()
        with th.no_grad():
            correct = 0
            total = 0
            for batch in valid_loader:
                images = batch[0].to(device)
                labels = batch[1].to(device)
                predicted = model(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            # An empty validation loader yields NaN instead of a ZeroDivisionError.
            accuracy = 100 * correct / total if total else float('nan')
            print('Learning rate: {} \n'.format(epoch_lrs))
            print('Accuracy of the network on the validation images: {} %'.format(accuracy))

    return model


# Milestones:
## Grade E:
- [x] Achieve >99% on binary classification
- [x] Achieve >95% on multi-class classification
- [x] Examine fine tuning more layers
- [x] Examine different learning rates
- [x] Examine data augmentation
- [x] Fine tune batch-norm

## Grade A:
### Decrease the percentage of labelled data: 
- [x] 50 %
- [x] 10 %
- [x] 1 %
- [x] Implement Pseudo-labelling


### Pseudo labelling

In [None]:
## Helper functions
def getPseudoTrainLoader(classes, m, batch_size=32, binary=False, percentage=0.5, pseu=False): #batch size affects computation time
    """Build loaders for training with a reduced share of labels and optional pseudo-labels.

    Args:
        classes: Class list assigned to the datasets so label indices are consistent.
        m: Model the training dataset uses to predict labels of pseudo-labelled items.
        batch_size: Batch size used for all three loaders.
        binary: Cat/dog labels when truthy, per-breed labels otherwise.
        percentage: Fraction of the training list that keeps its labels.
        pseu: When truthy, the remaining training files are served with pseudo-labels.

    Returns:
        ``(train_loader, valid_loader, test_loader)`` - note that this order
        differs from ``initDataset``.
    """
    trainPaths, validpath = getDataLists()

    train = MyDataset(binary, train=True, paths=trainPaths, percentage=percentage, moddel=m, pseu=pseu)
    train.classes = classes # Make sure we are labelling correctly

    testValDataset = MyDataset(binary, train=False, paths=validpath)
    # Both datasets must map labels to the same indices, so share one class list.
    testValDataset.classes = train.classes

    # The sizes must add up to the dataset length exactly; the former
    # "half + 1, half" only did so for odd lengths and raised for even ones.
    n_test = len(testValDataset) // 2
    n_valid = len(testValDataset) - n_test
    # A seeded generator keeps the validation/test membership the same on every call.
    valid, test = random_split(testValDataset, [n_valid, n_test],
                               generator=th.Generator().manual_seed(0))

    train_loader = th.utils.data.DataLoader(train,
                                            batch_size=batch_size,
                                            shuffle=True)

    valid_loader = th.utils.data.DataLoader(valid,
                                            batch_size=batch_size,
                                            shuffle=True)

    test_loader = th.utils.data.DataLoader(test,
                                            batch_size=batch_size,
                                            shuffle=True)

    return train_loader, valid_loader, test_loader

# Rebuild the loaders and keep the training dataset's class list; it is handed to
# train_pseudo_model so that later datasets use the same label indices.
train_loader, test_loader, valid_loader, train = initDataset(batch_size=32, binary=False)
classes = train.classes

# Train the model on our dataset
def train_pseudo_model(oldModel, newModel, valid_loader, epochs=5, lr=10**-3, weight_decay=0.0, sheduler_gamma=0.9,percentage=0.5, pseu=False, classes=None):
    """Fine-tune ``oldModel`` in place on a reduced and/or pseudo-labelled training set.

    The data loaders are built here through ``getPseudoTrainLoader``. Adam is
    used with three parameter groups: ``fc`` at ``lr``, ``layer4`` at
    ``lr / 10`` and ``layer3`` at ``lr / 1000``. All learning rates are
    multiplied by ``sheduler_gamma`` after each epoch.

    Args:
        oldModel: Network that is trained; needs ``fc``, ``layer4`` and ``layer3``.
        newModel: Model used to predict labels for pseudo-labelled items (may be
            ``None`` when ``pseu`` is false).
        valid_loader: Not used; validation runs on the loader returned by
            ``getPseudoTrainLoader``. Kept so existing calls keep working.
        epochs: Number of passes over the training loader.
        lr: Initial learning rate of the ``fc`` group.
        weight_decay: Weight decay passed to Adam.
        sheduler_gamma: Per-epoch multiplicative learning-rate decay.
        percentage: Fraction of the training list that keeps its labels.
        pseu: Whether the remaining training files are used with pseudo-labels.
        classes: Class list shared by the datasets.

    Returns:
        The same ``oldModel`` object, trained.
    """
    # Run on whatever device the model lives on (identical to "cuda" for a model on the GPU).
    device = next(oldModel.parameters()).device

    criterion = th.nn.CrossEntropyLoss().to(device)
    optimizer = th.optim.Adam(oldModel.fc.parameters(), lr=lr, weight_decay=weight_decay)
    # Earlier (more generic) stages get smaller learning rates than the new head.
    optimizer.add_param_group({'params': oldModel.layer4.parameters(), 'lr': lr/10})
    optimizer.add_param_group({'params': oldModel.layer3.parameters(), 'lr': lr/1000})
    scheduler = th.optim.lr_scheduler.ExponentialLR(optimizer, gamma=sheduler_gamma)

    print("STARTAR HÄR\n")

    tl, valid, test = getPseudoTrainLoader(classes, newModel, percentage=percentage, pseu=pseu)

    for epoch in range(epochs):
        # Remember the rates this epoch actually trains with (the scheduler changes them below).
        epoch_lrs = [group['lr'] for group in optimizer.param_groups]

        # Training; the mode only has to be switched once per epoch, not per batch.
        oldModel.train()
        for batch in tqdm(tl):
            images = batch[0].to(device)
            labels = batch[1].to(device)

            outputs = oldModel(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()  # gradients accumulate otherwise
            loss.backward()
            optimizer.step()
        scheduler.step()

        # Validation
        oldModel.eval()
        with th.no_grad():
            correct = 0
            total = 0
            for batch in valid:
                images = batch[0].to(device)
                labels = batch[1].to(device)
                predicted = oldModel(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            # An empty validation loader yields NaN instead of a ZeroDivisionError.
            accuracy = 100 * correct / total if total else float('nan')
            print('Learning rate: {} \n'.format(epoch_lrs))
            print('Accuracy of the network on the validation images: {} %'.format(accuracy))

    return oldModel

In [None]:
import copy
# Eleven independent deep copies of the prepared model, bound to model2 ... model12.
(model2, model3, model4, model5, model6, model7,
 model8, model9, model10, model11, model12) = (copy.deepcopy(model) for _ in range(11))

In [None]:
# Baseline: fine-tune on 100% of the labelled training data, without pseudo-labels.
# train_pseudo_model trains its first argument in place, so this run uses the
# otherwise unused model2; model3 is passed to the 1% experiment in a later cell and
# must not already be trained on the full data.
new = train_pseudo_model(model2, None, valid_loader, epochs=10, lr=0.001, weight_decay=0.0, sheduler_gamma=0.5, percentage=1, pseu=False, classes=classes)

In [None]:
# Accuracy of the fully supervised model on the held-out test split.
new.eval()  # make sure BatchNorm uses its running statistics, whatever ran before this cell
with th.no_grad(): # Disables tracking of calculations required to calculate gradients
    correct = 0
    total = 0
    for batch in test_loader:
        images = batch[0]
        labels = batch[1].to("cuda")
        predicted = new(images).argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    # This loop runs over test_loader, so report it as test (not validation) accuracy.
    print('Accuracy of the network on the test images: {} %'.format(100 * correct / total))

In [None]:
# 1% labelled data, lr=1e-3.
# Step 1: train on the labelled share only (pseu=False).
# NOTE(review): train_pseudo_model trains its first argument in place — make sure model3 has not already been trained by an earlier cell.
new123 = train_pseudo_model(model3, None, valid_loader, epochs=10, lr=10**-3, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.01, pseu=False, classes=classes)
# Step 2: snapshot that model; the copy predicts the labels of the pseudo-labelled items.
new3123 = copy.deepcopy(new123)
# Step 3: train model4 on the labelled share plus the pseudo-labelled rest (pseu=True).
new3 = train_pseudo_model(model4, new3123, valid_loader, epochs=10, lr=10**-3, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.01, pseu=True, classes=classes)

In [None]:
# 50% labelled data, lr=1e-3.
# Step 1: train model5 on the labelled share only (pseu=False).
new321 = train_pseudo_model(model5, None, valid_loader, epochs=10, lr=10**-3, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.5, pseu=False, classes=classes)
# Step 2: snapshot that model; the copy predicts the labels of the pseudo-labelled items.
new3321 = copy.deepcopy(new321)
# Step 3: train model6 on the labelled share plus the pseudo-labelled rest (pseu=True).
new4 = train_pseudo_model(model6, new3321, valid_loader, epochs=10, lr=10**-3, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.5, pseu=True, classes=classes)

In [None]:
# 1% labelled data, lr=1e-4.
# Step 1: train model7 on the labelled share only (pseu=False).
new111 = train_pseudo_model(model7, None, valid_loader, epochs=10, lr=10**-4, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.01, pseu=False, classes=classes)
# Step 2: snapshot that model; the copy predicts the labels of the pseudo-labelled items.
new3111 = copy.deepcopy(new111)
# Step 3: train model8 on the labelled share plus the pseudo-labelled rest (pseu=True).
new5 = train_pseudo_model(model8, new3111, valid_loader, epochs=10, lr=10**-4, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.01, pseu=True, classes=classes)

In [None]:
# 10% labelled data, lr=1e-4.
# Step 1: train model9 on the labelled share only (pseu=False).
new1111 = train_pseudo_model(model9, None, valid_loader, epochs=10, lr=10**-4, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.1, pseu=False, classes=classes)
# Step 2: snapshot that model; the copy predicts the labels of the pseudo-labelled items.
new31111 = copy.deepcopy(new1111)
# Step 3: train model10 on the labelled share plus the pseudo-labelled rest (pseu=True).
new6 = train_pseudo_model(model10, new31111, valid_loader, epochs=10, lr=10**-4, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.1, pseu=True, classes=classes)

In [None]:
# 50% labelled data, lr=1e-4.
# Step 1: train model11 on the labelled share only (pseu=False).
new11111 = train_pseudo_model(model11, None, valid_loader, epochs=10, lr=10**-4, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.5, pseu=False, classes=classes)
# Step 2: snapshot that model; the copy predicts the labels of the pseudo-labelled items.
new311111 = copy.deepcopy(new11111)
# Step 3: train model12 on the labelled share plus the pseudo-labelled rest (pseu=True).
new7 = train_pseudo_model(model12, new311111, valid_loader, epochs=10, lr=10**-4, weight_decay=0.0, sheduler_gamma=0.5, percentage=0.5, pseu=True, classes=classes)

In [None]:
# Pseudo labelling

# Reduce the amount of labels
# 50%, 10%, 1%


# Train a model on the reduced dataset

# Use the model to generate pseudo labels

# Train a model on the pseudo labels and the original dataset

# Implement ensembling? Implement pretraining?