# Code  
---
This code is taken from: https://github.com/rafikg/CEAL
## Import all the required stuff

In [1]:
# imported from run_ceal/ceal_learning_algorithm.py
from model import AlexNet
from utils import Normalize, RandomCrop, SquarifyImage, \
    ToTensor, GameImageDataset
from utils import get_uncertain_samples, get_high_confidence_samples, \
    update_threshold
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

import numpy as np
import torch
import logging


# others
from sklearn.metrics import precision_recall_fscore_support as prfs

# Root logging setup: show INFO and above, formatted as
# "<LEVEL>:<logger name>: <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s:%(name)s: %(message)s",
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

In [2]:
# Device argument forwarded to AlexNet(n_classes=4, device=device) when the
# model is created. NOTE(review): None presumably lets the model pick its own
# device -- confirm against model.AlexNet.
device = None

## CEAL Algorithm

In [3]:
def ceal_learning_algorithm(du: DataLoader,
                            dl: DataLoader,
                            dtest: DataLoader,
                            k: int = 5,
                            delta_0: float = 0.005,
                            dr: float = 0.00033,
                            t: int = 1,
                            epochs: int = 10,
                            criteria: str = 'cl',
                            max_iter: int = 10):
    """
    Algorithm 1: learning algorithm of CEAL.

    The notation follows the paper. An AlexNet model is first trained on
    `dl`; then, for `max_iter` iterations, the `k` most uncertain samples
    of `du` and the high confidence samples of `du` (with pseudo labels)
    are added to `dl`, the model is fine-tuned, and the uncertain samples
    are removed from `du`. Metrics on `dtest` are printed after the initial
    training and after every iteration.

    Both `du.sampler.indices` and `dl.sampler.indices` are modified in
    place, and `dl.dataset.labels` is overwritten with pseudo labels for
    the high confidence samples.

    Parameters
    ----------
    du: DataLoader
        Unlabeled samples. Its sampler must expose a mutable `indices` list.
    dl : DataLoader
        Labeled samples. Its sampler must expose a mutable `indices` list
        and its dataset a `labels` container indexable by sample index.
    dtest : DataLoader
        Test data
    k: int, (default = 5)
        Number of uncertain samples requested per iteration
    delta_0: float
        High confidence samples selection threshold
    dr: float
        Threshold decay
    t: int
        Fine-tuning interval: the model is fine-tuned on iterations where
        `iteration % t == 0`
    epochs: int
        Number of epochs passed to every `model.train` call
    criteria: str
        Uncertainty criterion forwarded to `get_uncertain_samples`
    max_iter: int
        Maximum iteration number.

    Returns
    -------
    None
    """
    logger.info('Initial configuration: len(du): {}, len(dl): {} '.format(
        len(du.sampler.indices),
        len(dl.sampler.indices)))

    # Create the model
    model = AlexNet(n_classes=4, device=device)

    # Initialize the model
    logger.info('Intialize training the model on `dl` and test on `dtest`')

    model.train(epochs=epochs, train_loader=dl, valid_loader=None)

    # Evaluate model on dtest
    p, r, f = model.evaluate(test_loader=dtest)

    print('====> Initial precision: {} '.format(p))
    print('====> Initial recall: {} '.format(r))
    print('====> Initial f-score: {} '.format(f))

    for iteration in range(max_iter):

        logger.info('Iteration: {}: run prediction on unlabeled data '
                    '`du` '.format(iteration))

        # NOTE(review): the index mapping below assumes that row i of
        # `pred_prob` belongs to `du.sampler.indices[i]`. If the sampler
        # yields indices in shuffled order (e.g. SubsetRandomSampler) this
        # only holds when `model.predict` restores the order -- TODO confirm.
        pred_prob = model.predict(test_loader=du)

        # get k uncertain samples
        uncert_samp_idx, _ = get_uncertain_samples(pred_prob=pred_prob, k=k,
                                                   criteria=criteria)

        # get original indices
        uncert_samp_idx = [du.sampler.indices[idx] for idx in uncert_samp_idx]

        # add the uncertain samples selected from `du` to the labeled samples
        #  set `dl`
        dl.sampler.indices.extend(uncert_samp_idx)

        logger.info(
            'Update size of `dl`  and `du` by adding uncertain {} samples'
            ' in `dl`'
            ' len(dl): {}, len(du) {}'.
            format(len(uncert_samp_idx), len(dl.sampler.indices),
                   len(du.sampler.indices)))

        # get high confidence samples `dh` (positions in `pred_prob` and
        # their pseudo labels, as two parallel sequences)
        hcs_pos, hcs_labels = get_high_confidence_samples(pred_prob=pred_prob,
                                                          delta=delta_0)

        # Map positions to original indices and drop the samples already
        # selected as uncertain. Index and pseudo label are filtered as a
        # pair so each label stays attached to its own sample.
        uncertain = set(uncert_samp_idx)
        hcs_idx = []
        kept_labels = []
        for pos, pseudo_label in zip(hcs_pos, hcs_labels):
            sample_idx = du.sampler.indices[pos]
            if sample_idx in uncertain:
                continue
            hcs_idx.append(sample_idx)
            kept_labels.append(pseudo_label)

        # add high confidence samples to the labeled set 'dl'

        # (1) update the indices
        dl.sampler.indices.extend(hcs_idx)
        # (2) update the original labels with the pseudo labels.
        for sample_idx, pseudo_label in zip(hcs_idx, kept_labels):
            dl.dataset.labels[sample_idx] = pseudo_label
        logger.info(
            'Update size of `dl`  and `du` by adding {} hcs samples in `dl`'
            ' len(dl): {}, len(du) {}'.
            format(len(hcs_idx), len(dl.sampler.indices),
                   len(du.sampler.indices)))

        if iteration % t == 0:
            logger.info('Iteration: {} fine-tune the model on dh U dl'.
                        format(iteration))
            model.train(epochs=epochs, train_loader=dl)

            # update delta_0
            delta_0 = update_threshold(delta=delta_0, dr=dr, t=iteration)

        # remove the uncertain samples from the original `du`
        logger.info('remove {} uncertain samples from du'.
                    format(len(uncert_samp_idx)))
        for val in uncert_samp_idx:
            du.sampler.indices.remove(val)

        p, r, f = model.evaluate(test_loader=dtest)
        print(
            "Iteration: {}, len(dl): {}, len(du): {},"
            " len(dh) {}, p: {} r: {} f: {} ".format(
                iteration, len(dl.sampler.indices),
                len(du.sampler.indices), len(hcs_idx), p, r, f))

In [4]:
def _make_dataset(root_dir):
    """Build a GameImageDataset over `root_dir` with the preprocessing
    pipeline used for both splits: squarify, random 224 crop, normalize,
    convert to tensor. A fresh pipeline is created on every call."""
    pipeline = transforms.Compose(
        [SquarifyImage(), RandomCrop(224), Normalize(), ToTensor()])
    return GameImageDataset(root_dir=root_dir, transform=pipeline)


dataset_train = _make_dataset("data/train")
dataset_test = _make_dataset("data/test")

# Split configuration
random_seed = 123
validation_split = 0.1  # fraction of the training set held out (10%)
shuffling_dataset = True
batch_size = 16
dataset_size = len(dataset_train)

# Number of held-out samples, and the full index list to split.
split = int(np.floor(validation_split * dataset_size))
indices = list(range(dataset_size))

if shuffling_dataset:
    # Seeded so that the shuffle is repeatable.
    np.random.seed(random_seed)
    np.random.shuffle(indices)
val_indices = indices[:split]
train_indices = indices[split:]

# Samplers restricted to each index subset.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

# The large subset is the unlabeled pool `du`; the small held-out subset is
# the initial labeled set `dl`. Both read from the same training dataset.
loader_options = {"batch_size": batch_size, "num_workers": 4}
du = DataLoader(dataset_train, sampler=train_sampler, **loader_options)
dl = DataLoader(dataset_train, sampler=valid_sampler, **loader_options)
dtest = DataLoader(dataset_test, **loader_options)

ceal_learning_algorithm(du=du, dl=dl, dtest=dtest)

INFO:__main__: Initial configuration: len(du): 54, len(dl): 5 
INFO:model.alexnet: The code is running on cuda:0 
INFO:__main__: Intialize training the model on `dl` and test on `dtest`


====> Epoch: 0 Average loss: 0.1674
====> Epoch: 1 Average loss: 0.1488
====> Epoch: 2 Average loss: 0.1333
====> Epoch: 3 Average loss: 0.0938
====> Epoch: 4 Average loss: 0.0856
====> Epoch: 5 Average loss: 0.0675
====> Epoch: 6 Average loss: 0.0479
====> Epoch: 7 Average loss: 0.0452
====> Epoch: 8 Average loss: 0.0343
====> Epoch: 9 Average loss: 0.0238


INFO:__main__: Iteration: 0: run prediction on unlabeled data `du` 


====> Initial precision: [0.4, 0.5454545454545454, 0.0, 0.0] 
====> Initial recall: [1.0, 0.75, 0.0, 0.0] 
====> Initial f-score: [0.5714285714285715, 0.631578947368421, 0.0, 0.0] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 10, len(du) 54
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 10, len(du) 54
INFO:__main__: Iteration: 0 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0512
====> Epoch: 1 Average loss: 0.0432
====> Epoch: 2 Average loss: 0.0382
====> Epoch: 3 Average loss: 0.0291
====> Epoch: 4 Average loss: 0.0203
====> Epoch: 5 Average loss: 0.0197
====> Epoch: 6 Average loss: 0.0255
====> Epoch: 7 Average loss: 0.0263
====> Epoch: 8 Average loss: 0.0247


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0214


INFO:__main__: Iteration: 1: run prediction on unlabeled data `du` 


Iteration: 0, len(dl): 10, len(du): 49, len(dh) 0, p: [0.75, 0.5, 0.0, 0.25] r: [0.5, 0.375, 0.0, 0.14285714285714285] f: [0.6, 0.42857142857142855, 0.0, 0.18181818181818182] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 15, len(du) 49
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 15, len(du) 49
INFO:__main__: Iteration: 1 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0256
====> Epoch: 1 Average loss: 0.0231
====> Epoch: 2 Average loss: 0.0212
====> Epoch: 3 Average loss: 0.0207
====> Epoch: 4 Average loss: 0.0239
====> Epoch: 5 Average loss: 0.0207
====> Epoch: 6 Average loss: 0.0173
====> Epoch: 7 Average loss: 0.0141
====> Epoch: 8 Average loss: 0.0147


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0145


INFO:__main__: Iteration: 2: run prediction on unlabeled data `du` 


Iteration: 1, len(dl): 15, len(du): 44, len(dh) 0, p: [0.5555555555555556, 0.5, 0.0, 0.5] r: [0.8333333333333334, 0.25, 0.0, 0.2857142857142857] f: [0.6666666666666667, 0.3333333333333333, 0.0, 0.36363636363636365] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 20, len(du) 44
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 20, len(du) 44
INFO:__main__: Iteration: 2 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0286
====> Epoch: 1 Average loss: 0.0423
====> Epoch: 2 Average loss: 0.0262
====> Epoch: 3 Average loss: 0.0260
====> Epoch: 4 Average loss: 0.0290
====> Epoch: 5 Average loss: 0.0201
====> Epoch: 6 Average loss: 0.0227
====> Epoch: 7 Average loss: 0.0274
====> Epoch: 8 Average loss: 0.0156


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0165


INFO:__main__: Iteration: 3: run prediction on unlabeled data `du` 


Iteration: 2, len(dl): 20, len(du): 39, len(dh) 0, p: [0.8333333333333334, 0.3333333333333333, 1.0, 1.0] r: [0.8333333333333334, 0.25, 0.4, 0.14285714285714285] f: [0.8333333333333334, 0.28571428571428575, 0.5714285714285715, 0.25] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 25, len(du) 39
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 25, len(du) 39
INFO:__main__: Iteration: 3 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0295
====> Epoch: 1 Average loss: 0.0289
====> Epoch: 2 Average loss: 0.0279
====> Epoch: 3 Average loss: 0.0232
====> Epoch: 4 Average loss: 0.0240
====> Epoch: 5 Average loss: 0.0220
====> Epoch: 6 Average loss: 0.0200
====> Epoch: 7 Average loss: 0.0195
====> Epoch: 8 Average loss: 0.0201


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0159


INFO:__main__: Iteration: 4: run prediction on unlabeled data `du` 


Iteration: 3, len(dl): 25, len(du): 34, len(dh) 0, p: [0.42857142857142855, 0.5, 0.6666666666666666, 0.0] r: [0.5, 0.25, 0.4, 0.0] f: [0.4615384615384615, 0.3333333333333333, 0.5, 0.0] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 30, len(du) 34
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 30, len(du) 34
INFO:__main__: Iteration: 4 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0166
====> Epoch: 1 Average loss: 0.0158
====> Epoch: 2 Average loss: 0.0163
====> Epoch: 3 Average loss: 0.0158
====> Epoch: 4 Average loss: 0.0143
====> Epoch: 5 Average loss: 0.0155
====> Epoch: 6 Average loss: 0.0156
====> Epoch: 7 Average loss: 0.0127
====> Epoch: 8 Average loss: 0.0143


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0121


INFO:__main__: Iteration: 5: run prediction on unlabeled data `du` 


Iteration: 4, len(dl): 30, len(du): 29, len(dh) 0, p: [0.6666666666666666, 0.4, 1.0, 0.5] r: [0.6666666666666666, 0.25, 0.6, 0.42857142857142855] f: [0.6666666666666666, 0.3076923076923077, 0.7499999999999999, 0.4615384615384615] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 35, len(du) 29
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 35, len(du) 29
INFO:__main__: Iteration: 5 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0175
====> Epoch: 1 Average loss: 0.0156
====> Epoch: 2 Average loss: 0.0165
====> Epoch: 3 Average loss: 0.0142
====> Epoch: 4 Average loss: 0.0179
====> Epoch: 5 Average loss: 0.0158
====> Epoch: 6 Average loss: 0.0133
====> Epoch: 7 Average loss: 0.0145
====> Epoch: 8 Average loss: 0.0095


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0152


INFO:__main__: Iteration: 6: run prediction on unlabeled data `du` 


Iteration: 5, len(dl): 35, len(du): 24, len(dh) 0, p: [0.5714285714285714, 0.0, 1.0, 0.5] r: [0.6666666666666666, 0.0, 0.6, 0.42857142857142855] f: [0.6153846153846153, 0.0, 0.7499999999999999, 0.4615384615384615] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 40, len(du) 24
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 40, len(du) 24
INFO:__main__: Iteration: 6 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0157
====> Epoch: 1 Average loss: 0.0146
====> Epoch: 2 Average loss: 0.0140
====> Epoch: 3 Average loss: 0.0180
====> Epoch: 4 Average loss: 0.0153
====> Epoch: 5 Average loss: 0.0104
====> Epoch: 6 Average loss: 0.0108
====> Epoch: 7 Average loss: 0.0108
====> Epoch: 8 Average loss: 0.0094


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0110


INFO:__main__: Iteration: 7: run prediction on unlabeled data `du` 


Iteration: 6, len(dl): 40, len(du): 19, len(dh) 0, p: [0.625, 0.2, 1.0, 0.4] r: [0.8333333333333334, 0.125, 0.4, 0.2857142857142857] f: [0.7142857142857143, 0.15384615384615385, 0.5714285714285715, 0.3333333333333333] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 45, len(du) 19
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 45, len(du) 19
INFO:__main__: Iteration: 7 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0140
====> Epoch: 1 Average loss: 0.0147
====> Epoch: 2 Average loss: 0.0155
====> Epoch: 3 Average loss: 0.0125
====> Epoch: 4 Average loss: 0.0116
====> Epoch: 5 Average loss: 0.0126
====> Epoch: 6 Average loss: 0.0127
====> Epoch: 7 Average loss: 0.0116
====> Epoch: 8 Average loss: 0.0128


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0122


INFO:__main__: Iteration: 8: run prediction on unlabeled data `du` 


Iteration: 7, len(dl): 45, len(du): 14, len(dh) 0, p: [0.5555555555555556, 0.75, 1.0, 0.5] r: [0.8333333333333334, 0.375, 0.4, 0.2857142857142857] f: [0.6666666666666667, 0.5, 0.5714285714285715, 0.36363636363636365] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 50, len(du) 14
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 50, len(du) 14
INFO:__main__: Iteration: 8 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0172
====> Epoch: 1 Average loss: 0.0168
====> Epoch: 2 Average loss: 0.0173
====> Epoch: 3 Average loss: 0.0172
====> Epoch: 4 Average loss: 0.0161
====> Epoch: 5 Average loss: 0.0125
====> Epoch: 6 Average loss: 0.0136
====> Epoch: 7 Average loss: 0.0131
====> Epoch: 8 Average loss: 0.0136


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0124


INFO:__main__: Iteration: 9: run prediction on unlabeled data `du` 


Iteration: 8, len(dl): 50, len(du): 9, len(dh) 0, p: [0.5555555555555556, 0.4, 0.75, 0.6] r: [0.8333333333333334, 0.25, 0.6, 0.42857142857142855] f: [0.6666666666666667, 0.3076923076923077, 0.6666666666666665, 0.5] 


INFO:__main__: Update size of `dl`  and `du` by adding uncertain 5 samples in `dl` len(dl): 55, len(du) 9
INFO:__main__: Update size of `dl`  and `du` by adding 0 hcs samples in `dl` len(dl): 55, len(du) 9
INFO:__main__: Iteration: 9 fine-tune the model on dh U dl


====> Epoch: 0 Average loss: 0.0145
====> Epoch: 1 Average loss: 0.0140
====> Epoch: 2 Average loss: 0.0137
====> Epoch: 3 Average loss: 0.0153
====> Epoch: 4 Average loss: 0.0146
====> Epoch: 5 Average loss: 0.0145
====> Epoch: 6 Average loss: 0.0164
====> Epoch: 7 Average loss: 0.0122
====> Epoch: 8 Average loss: 0.0103


INFO:__main__: remove 5 uncertain samples from du


====> Epoch: 9 Average loss: 0.0154
Iteration: 9, len(dl): 55, len(du): 4, len(dh) 0, p: [0.5555555555555556, 0.6, 0.6666666666666666, 0.4] r: [0.8333333333333334, 0.375, 0.4, 0.2857142857142857] f: [0.6666666666666667, 0.4615384615384615, 0.5, 0.3333333333333333] 


# Extra processing
---
The cells below have been converted to markdown cells so that they are not executed when you run all cells. Change them back to code cells if you need to run them.

Extracting the data zip file

import zipfile

# Unpack the whole annotated-data archive into the current working directory.
archive = zipfile.ZipFile('annotated-20210501T142205Z-001.zip', 'r')
with archive as zip_ref:
    zip_ref.extractall()

Move files to the train and test directories

import os

# os.rename cannot move a file into a directory that does not exist, so make
# sure both destination folders are there first.
for subdir in ('test', 'train'):
    os.makedirs(os.path.join('data', subdir), exist_ok=True)

# Collect only the entries directly under data/ that are not directories
# (this skips train/ and test/ themselves). os.listdir returns entries in
# arbitrary order, so the list is sorted to make the split reproducible.
dataitems = os.listdir('data')
finalitems = sorted(
    item for item in dataitems
    if not os.path.isdir(os.path.join('data', item)))

for idx, item in enumerate(finalitems):
    # this will assign 20% images(every 5th image) to the test directory
    target = 'test' if idx % 5 == 0 else 'train'
    os.rename(os.path.join('data', item),
              os.path.join('data', target, item))