In [1]:
# import libraries
import csv
import numpy as np
import random
import time
import os
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
from torchvision.io import read_image
from torch.utils.data import DataLoader
import torch.optim as optim

In [2]:
# Show the installed PyTorch version as the cell output.
torch.__version__

'1.13.0'

In [3]:
# directories holding the test and training images
TEST_PATH = "/kaggle/input/captcha-hacker/test"
TRAIN_PATH = "/kaggle/input/captcha-hacker/train"

# compute device: the first CUDA GPU when CUDA is available, the CPU otherwise
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# character -> class index used when one-hot encoding labels:
# 'a'..'z' take 0..25 and '0'..'9' take 26..35
mapping = {symbol: index for index, symbol in enumerate("abcdefghijklmnopqrstuvwxyz0123456789")}

# dataset classes
class Task1Dataset(Dataset):
    """Dataset yielding a scaled image together with an integer label or its filename.

    Args:
        data: iterable of ``(filename, label)`` rows. Only rows whose filename
            starts with the module-level ``TASK`` string are kept.
        root: directory that each filename is resolved against.
        return_filename: when true, ``__getitem__`` returns the filename in
            place of the label.
    """

    def __init__(self, data, root, return_filename=False):
        self.root = root
        self.return_filename = return_filename
        # NOTE(review): the prefix is read from the global TASK, not from an
        # argument, so TASK must already be defined when a dataset is built.
        self.data = list(filter(lambda row: row[0].startswith(TASK), data))

    def __len__(self):
        """Number of rows that survived the prefix filter."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, label)`` or, with ``return_filename``, ``(image, filename)``."""
        name, target = self.data[index]
        pixels = read_image(f"{self.root}/{name}")
        # divide the decoded pixel values by 255 before handing them out
        scaled = torch.FloatTensor(pixels / 255)
        if not self.return_filename:
            return scaled, int(target)
        return scaled, name
    
class Task2Dataset(Dataset):
    """Dataset yielding a scaled image with a 72-wide one-hot label or its filename.

    The label vector is two 36-wide groups placed back to back: the first
    group encodes ``label[0]`` and the second ``label[1]``, each through the
    module-level ``mapping`` table.

    Args:
        data: iterable of ``(filename, label)`` rows. Only rows whose filename
            starts with the module-level ``TASK`` string are kept.
        root: directory that each filename is resolved against.
        return_filename: when true, ``__getitem__`` returns the filename in
            place of the encoded label.
    """

    def __init__(self, data, root, return_filename=False):
        self.root = root
        self.return_filename = return_filename
        # NOTE(review): the prefix is read from the global TASK, not from an
        # argument, so TASK must already be defined when a dataset is built.
        self.data = list(filter(lambda row: row[0].startswith(TASK), data))

    def __len__(self):
        """Number of rows that survived the prefix filter."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, one_hot)`` or, with ``return_filename``, ``(image, filename)``."""
        name, target = self.data[index]
        pixels = read_image(f"{self.root}/{name}")
        # divide the decoded pixel values by 255 before handing them out
        scaled = torch.FloatTensor(pixels / 255)
        if self.return_filename:
            return scaled, name

        # set one entry per character inside that character's 36-wide group
        onehot = [0] * 72
        for position in (1, 0):
            onehot[mapping[target[position]] + 36 * position] = 1
        return scaled, torch.FloatTensor(onehot)

class Task3Dataset(Dataset):
    """Dataset yielding a scaled image with a 144-wide one-hot label or its filename.

    The label vector is four 36-wide groups placed back to back; group ``k``
    encodes ``label[k]`` through the module-level ``mapping`` table.

    Args:
        data: iterable of ``(filename, label)`` rows. Only rows whose filename
            starts with the module-level ``TASK`` string are kept.
        root: directory that each filename is resolved against.
        return_filename: when true, ``__getitem__`` returns the filename in
            place of the encoded label.
    """

    def __init__(self, data, root, return_filename=False):
        self.root = root
        self.return_filename = return_filename
        # NOTE(review): the prefix is read from the global TASK, not from an
        # argument, so TASK must already be defined when a dataset is built.
        self.data = list(filter(lambda row: row[0].startswith(TASK), data))

    def __len__(self):
        """Number of rows that survived the prefix filter."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, one_hot)`` or, with ``return_filename``, ``(image, filename)``."""
        name, target = self.data[index]
        pixels = read_image(f"{self.root}/{name}")
        # divide the decoded pixel values by 255 before handing them out
        scaled = torch.FloatTensor(pixels / 255)
        if self.return_filename:
            return scaled, name

        # set one entry per character inside that character's 36-wide group
        onehot = [0] * 144
        for position in (3, 2, 1, 0):
            onehot[mapping[target[position]] + 36 * position] = 1
        return scaled, torch.FloatTensor(onehot)

In [5]:
# get train data and validation data
def get_dataLoader(TASK, seed=None, batch_size=None, val_fraction=0.1):
    """Build the training and validation DataLoaders for one task.

    Rows of ``{TRAIN_PATH}/annotations.csv`` whose first column starts with
    ``TASK`` are assigned at random to the training or the validation split.

    Args:
        TASK: filename prefix selecting the rows to use. ``"task1"`` and
            ``"task2"`` pick ``Task1Dataset`` / ``Task2Dataset``; any other
            value picks ``Task3Dataset``.
        seed: seed for the split. ``None`` (the default) gives a different
            split on every call, as before; pass an int to make the split
            reproducible.
        batch_size: batch size of both loaders. ``None`` falls back to the
            notebook-level ``BATCH_SIZE``.
        val_fraction: expected fraction of rows sent to the validation split.

    Returns:
        ``(train_dl, val_dl)``. The training loader shuffles and drops the
        last incomplete batch; the validation loader does neither.

    Raises:
        ValueError: if ``val_fraction`` is outside ``[0, 1)`` or the training
            split ends up empty.
    """
    if not 0 <= val_fraction < 1:
        raise ValueError(f"val_fraction must be in [0, 1), got {val_fraction!r}")
    if batch_size is None:
        # backward-compatible fallback for callers that only set the global
        batch_size = BATCH_SIZE

    # A private generator keeps the split from reseeding the process-wide
    # `random` state; with seed=None it is seeded unpredictably.
    rng = random.Random(seed)
    train_fraction = 1 - val_fraction

    train_data = []
    val_data = []
    with open(f'{TRAIN_PATH}/annotations.csv', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # csv.reader yields [] for blank lines; skip them instead of crashing
            if not row or not row[0].startswith(TASK):
                continue
            # random distribute data to train set or validation set
            if rng.random() < train_fraction:
                train_data.append(row)
            else:
                val_data.append(row)

    dataset_cls = {"task1": Task1Dataset, "task2": Task2Dataset}.get(TASK, Task3Dataset)
    # NOTE(review): the dataset classes filter rows again using the global
    # TASK, so it has to match the TASK argument or the datasets come back empty.
    train_ds = dataset_cls(train_data, root=TRAIN_PATH)
    val_ds = dataset_cls(val_data, root=TRAIN_PATH)

    if len(train_ds) == 0:
        # fail with an actionable message rather than an error from the sampler
        raise ValueError(
            f"no training rows for {TASK!r} in {TRAIN_PATH}/annotations.csv "
            "(check the TASK argument and the global TASK)"
        )

    train_dl = DataLoader(train_ds, batch_size=batch_size, drop_last=True, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, drop_last=False, shuffle=False)

    return train_dl, val_dl

In [6]:
def train(model, preprocess, optimizer, loss_fn, train_dl, TASK, PATH, val_dl=None, num_epochs=None):
    """Train ``model``, print validation accuracy after every epoch, then save its weights.

    Args:
        model: network to optimise; called as ``model(image)``.
        preprocess: callable applied to every image batch before it is moved
            to ``device``.
        optimizer: optimizer that updates the parameters of ``model``.
        loss_fn: callable ``loss_fn(pred, label)`` returning the training loss.
        train_dl: iterable of ``(image, label)`` training batches.
        TASK: ``"task1"`` (labels are class indices), ``"task2"`` (labels are
            two 36-wide one-hot groups) or any other value (four 36-wide
            one-hot groups).
        PATH: destination given to ``torch.save`` for the final ``state_dict``.
        val_dl: iterable of ``(image, label)`` validation batches. ``None``
            falls back to the notebook-level ``val_dl``.
        num_epochs: number of epochs. ``None`` falls back to the
            notebook-level ``NUM_EPOCHS``.

    Raises:
        NameError: if ``val_dl`` is omitted and no notebook-level ``val_dl``
            exists.
    """
    # Backward-compatible fallbacks: older call sites rely on notebook globals.
    if val_dl is None:
        if "val_dl" not in globals():
            raise NameError("name 'val_dl' is not defined; pass val_dl=... to train()")
        val_dl = globals()["val_dl"]
    if num_epochs is None:
        num_epochs = NUM_EPOCHS

    # training loop
    for epoch in range(num_epochs):
        print(f"Epoch [{epoch}]", end=" -> ")
        model.train()
        for image, label in train_dl:
            image = preprocess(image).to(device)
            label = label.to(device)

            loss = loss_fn(model(image), label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        sample_count = 0
        correct_count = 0

        # evaluate the training with validation data
        model.eval()
        with torch.no_grad():
            for image, label in val_dl:
                image = preprocess(image).to(device)
                label = label.to(device)
                pred = model(image)

                if TASK == "task1":
                    # label is already the class index
                    hits = pred.argmax(dim=1) == label
                else:
                    # One 36-wide group per character. argmax is taken on the
                    # raw outputs: a softmax over the row would not change
                    # which entry of a group is largest.
                    num_chars = 2 if TASK == "task2" else 4
                    pred_chars = pred.view(-1, num_chars, 36).argmax(dim=2)
                    true_chars = label.view(-1, num_chars, 36).argmax(dim=2)
                    # a sample counts only when every character matches
                    hits = (pred_chars == true_chars).all(dim=1)

                sample_count += len(image)
                # .item() keeps the counter a plain int, so the accuracy
                # prints as a number for every task
                correct_count += int(hits.sum().item())

        # an empty validation loader reports nan instead of dividing by zero
        accuracy = correct_count / sample_count if sample_count else float("nan")
        print("accuracy (validation):", accuracy)

    # save model weight
    torch.save(model.state_dict(), PATH)

    print("Finish Training !!")

In [7]:
# class index -> character, used to turn predicted indices back into text:
# 0..25 give 'a'..'z' and 26..35 give '0'..'9'
mapping2 = dict(enumerate("abcdefghijklmnopqrstuvwxyz0123456789"))
    

def test(PATH, out_features, TASK, preprocess=None):
    """Predict labels for one task's test images and append them to ``submission.csv``.

    The rows to predict are those of ``{TEST_PATH}/../sample_submission.csv``
    whose first column starts with ``TASK``. ``submission.csv`` is created
    with a ``filename,label`` header when it does not exist yet; otherwise the
    predictions are appended to it.

    Args:
        PATH: file holding the ``state_dict`` to load into a ResNet-18 whose
            final layer has ``out_features`` outputs.
        out_features: width of the final linear layer.
        TASK: ``"task1"`` (prediction written as a class index), ``"task2"``
            (two characters) or any other value (four characters).
        preprocess: callable applied to every image batch before inference.
            ``None`` falls back to the notebook-level ``preprocess``.

    Raises:
        NameError: if ``preprocess`` is omitted and no notebook-level
            ``preprocess`` exists.
    """
    # Backward-compatible fallback: older call sites rely on the notebook global.
    if preprocess is None:
        if "preprocess" not in globals():
            raise NameError("name 'preprocess' is not defined; pass preprocess=... to test()")
        preprocess = globals()["preprocess"]

    # load test data
    test_data = []
    with open(f'{TEST_PATH}/../sample_submission.csv', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # csv.reader yields [] for blank lines; skip them instead of crashing
            if row and row[0].startswith(TASK):
                test_data.append(row)

    test_ds = Task1Dataset(test_data, root=TEST_PATH, return_filename=True)
    # The dataset filters again using the *global* TASK. The rows are already
    # filtered by the TASK argument, so make that selection authoritative;
    # otherwise a mismatch would silently produce no predictions.
    test_ds.data = test_data
    test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, drop_last=False, shuffle=False)

    # Build and load the model before touching submission.csv, so a bad
    # weight file does not leave a header-only submission behind.
    model = torchvision.models.resnet18()
    model.fc = nn.Linear(in_features=512, out_features=out_features, bias=True)
    # map_location lets weights saved on a GPU load on a CPU-only machine
    model.load_state_dict(torch.load(PATH, map_location=device))
    model = model.to(device)
    model.eval()

    # open submission.csv for writing predictions ('a' also creates the file);
    # the context manager closes it even if inference raises
    write_header = not os.path.exists('submission.csv')
    with open('submission.csv', 'a', newline='') as file:
        csv_writer = csv.writer(file)
        if write_header:
            csv_writer.writerow(["filename", "label"])

        # testing loop
        with torch.no_grad():
            for image, filenames in test_dl:
                image = preprocess(image).to(device)
                pred = model(image)

                if TASK == "task1":
                    labels = [str(index) for index in pred.argmax(dim=1).tolist()]
                else:
                    # One 36-wide group per character. argmax is taken on the
                    # raw outputs: a softmax over the row would not change
                    # which entry of a group is largest.
                    num_chars = 2 if TASK == "task2" else 4
                    char_indices = pred.view(-1, num_chars, 36).argmax(dim=2).tolist()
                    labels = ["".join(mapping2[index] for index in sample) for sample in char_indices]

                csv_writer.writerows(zip(filenames, labels))

    print("Finish", TASK, "!!")

#### Task 1

In [8]:
# define hyperparameters for task 1
# (get_dataLoader/train/test read TASK, BATCH_SIZE and NUM_EPOCHS as globals)
TASK = "task1"
BATCH_SIZE = 100
NUM_EPOCHS = 10
LR = 1e-3
PATH = "task1_weight.pth"
out_features = 10

train_dl, val_dl = get_dataLoader(TASK)

# define model using resnet18 with pretrained weight
# (`weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
# checkpoint that `pretrained=True` used to select)
task1Model = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1)
# swap the classification head, reading its input width from the model
task1Model.fc = nn.Linear(in_features=task1Model.fc.in_features, out_features=out_features, bias=True)
task1Model = task1Model.to(device)

# define Adam as optimizer and CrossEntropyLoss as loss function
optimizer = torch.optim.Adam(task1Model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss()

# define transforms for data preprocessing
preprocess = transforms.Compose([transforms.Resize((288, 288)), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

train(task1Model, preprocess, optimizer, loss_fn, train_dl, TASK, PATH)
# test(PATH, out_features, TASK)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/captcha-hacker/train/annotations.csv'

#### Task 2

In [None]:
# define hyperparameters for task 2
# (get_dataLoader/train/test read TASK, BATCH_SIZE and NUM_EPOCHS as globals)
TASK = "task2"
BATCH_SIZE = 100
NUM_EPOCHS = 10
LR = 1e-3
PATH = "task2_weight.pth"
# one 36-way group per character, two characters
out_features = 36 + 36

train_dl, val_dl = get_dataLoader(TASK)

# define model using resnet18 with pretrained weight
# (`weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
# checkpoint that `pretrained=True` used to select)
task2Model = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1)
# swap the classification head, reading its input width from the model
task2Model.fc = nn.Linear(in_features=task2Model.fc.in_features, out_features=out_features, bias=True)
task2Model = task2Model.to(device)

# define Adam as optimizer and MultiLabelSoftMarginLoss as loss function
optimizer = torch.optim.Adam(task2Model.parameters(), lr=LR)
loss_fn = nn.MultiLabelSoftMarginLoss()

# define transforms for data preprocessing
preprocess = transforms.Compose([transforms.Resize((288, 288)), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

train(task2Model, preprocess, optimizer, loss_fn, train_dl, TASK, PATH)
# test(PATH, out_features, TASK)

Epoch [0] -> accuracy (validation): 0.072
Epoch [1] -> accuracy (validation): 0.648
Epoch [2] -> accuracy (validation): 0.98
Epoch [3] -> accuracy (validation): 0.988
Epoch [4] -> accuracy (validation): 0.996
Epoch [5] -> accuracy (validation): 0.996
Epoch [6] -> accuracy (validation): 1.0
Epoch [7] -> accuracy (validation): 1.0
Epoch [8] -> accuracy (validation): 1.0
Epoch [9] -> accuracy (validation): 1.0
Finish Training !!
Finish Testing !!


#### Task 3

In [None]:
# define hyperparameters for task 3
# (get_dataLoader/train/test read TASK, BATCH_SIZE and NUM_EPOCHS as globals)
TASK = "task3"
BATCH_SIZE = 50
NUM_EPOCHS = 15
LR = 1e-3
PATH = "task3_weight.pth"
# one 36-way group per character, four characters
out_features = 36 + 36 + 36 + 36

train_dl, val_dl = get_dataLoader(TASK)

# define model using resnet18 with pretrained weight
# (`weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
# checkpoint that `pretrained=True` used to select)
task3Model = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1)
# swap the classification head, reading its input width from the model
task3Model.fc = nn.Linear(in_features=task3Model.fc.in_features, out_features=out_features, bias=True)
task3Model = task3Model.to(device)

# define Adam as optimizer and MultiLabelSoftMarginLoss as loss function
optimizer = torch.optim.Adam(task3Model.parameters(), lr=LR)
loss_fn = nn.MultiLabelSoftMarginLoss()

# define transforms for data preprocessing
preprocess = transforms.Compose([transforms.Resize((384, 288)), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

train(task3Model, preprocess, optimizer, loss_fn, train_dl, TASK, PATH)
# test(PATH, out_features, TASK)

Epoch [0] -> accuracy (validation): 0.0
Epoch [1] -> accuracy (validation): 0.0
Epoch [2] -> accuracy (validation): 0.1258741258741259
Epoch [3] -> accuracy (validation): 0.548951048951049
Epoch [4] -> accuracy (validation): 0.7692307692307693
Epoch [5] -> accuracy (validation): 0.8881118881118881
Epoch [6] -> accuracy (validation): 0.9055944055944056
Epoch [7] -> accuracy (validation): 0.9440559440559441
Epoch [8] -> accuracy (validation): 0.9440559440559441
Epoch [9] -> accuracy (validation): 0.965034965034965
Epoch [10] -> accuracy (validation): 0.958041958041958
Epoch [11] -> accuracy (validation): 0.9755244755244755
Epoch [12] -> accuracy (validation): 0.9825174825174825
Epoch [13] -> accuracy (validation): 0.9790209790209791
Epoch [14] -> accuracy (validation): 0.965034965034965
Finish Training !!
Finish Testing !!
