# Imports

In [None]:
# General
from os import listdir, makedirs
from collections import defaultdict
from os.path import join, isdir, splitext
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from plotcm import plot_confusion_matrix

# Math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Image processing
import pytesseract
from PIL import Image
import cv2

# PyTorch
import torch
from torch.nn.modules.activation import ReLU
from torch.nn.modules.pooling import MaxPool2d
from torchvision import datasets
import torchvision
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary

# Parameters / configuration

In [None]:
# Blur parameters
# NOTE: `kernel` and `level` are passed to cv2.erode (kernel, iterations) in the segmentation cell.
kernel = (3, 3)
level = 2

# Dataset paths (Windows-style separators; raw strings keep the backslashes literal)
raw_path = r"dataset\raw12"
seg_path = r"dataset\segmented"
pad_path = r"dataset\padded"
test_path = r"dataset\test"
test_data_path = r"dataset\test_resize_test"

# Data view of samples - choose the dimensions of the plot. Total CAPTCHAs = row * col
row = 2
col = 4

# CNN
# Raw string: "\s" is not a valid escape sequence, so a plain literal triggers a
# warning on recent Python versions. The resulting value is unchanged.
DATA_DIR = r"archive\samples"

BATCH_SIZE = 8
IMAGE_WIDTH = 300
IMAGE_HEIGHT = 75
NUM_WORKERS = 8
EPOCHS = 200
DEVICE = "cuda"

GBS = 1000  # Global batch size
CH1 = 25  # First channel selection (out)

# Loading images and counting characters

In [None]:
# Symbols for which a sub-folder is prepared: a-z followed by 0-9
allowed_chars = string.ascii_lowercase + string.digits

# Only when the segmentation folder does not exist yet:
# create it together with one sub-folder per symbol
if not isdir(seg_path):
    makedirs(seg_path)
    for symbol in allowed_chars:
        makedirs(seg_path + "/" + symbol)

# Every entry of the raw CAPTCHA folder; the file name without extension is the label
files = listdir(raw_path)

# Running per-symbol counter, used by the segmentation cell to number the symbol images
counts = defaultdict(int)

# Show unique characters/labels
temp = [splitext(name)[0] for name in files]
characters = {char for label in temp for char in label}
print(f'Unique characters/labels: \n{characters}')
print(f'Number of unique characters/labels: {len(characters)}')

# Number of unique characters, kept for the CNN output features
UC = len(characters)

# Filtering and segmentation

In [None]:
print('Number of files:', len(files))

# 5x5 averaging (box) filter used for blurring; identical for every image, so build it once
blur_kernel = np.ones((5, 5), np.float32) / 25
# Connected components with an area of at most this many pixels are ignored
min_area = 50
# Upper bound on the number of symbols extracted from one CAPTCHA
max_symbols = 4

for file in files:
    # Flag 0 loads the image as single-channel grayscale
    image = cv2.imread(raw_path + '\\' + file, 0)
    if image is None:
        # cv2.imread returns None for anything it cannot decode (folders, non-image files)
        print('Skipping unreadable file:', file)
        continue
    letters = splitext(file)[0]

    # blur
    dst = cv2.filter2D(image, -1, blur_kernel)

    # threshold: pixels at or below 110 become 255 (foreground), everything else 0
    _, image = cv2.threshold(dst, 110, 255, cv2.THRESH_BINARY_INV)
    # NOTE(review): `kernel` is the tuple (3, 3) from the configuration cell, not a
    # structuring-element array -- confirm this is the intended erosion kernel.
    image = cv2.erode(image, kernel, iterations=level)

    # Label the connected blobs; label 0 is the background, hence the range starts at 1
    connectivity = 4
    num_labels, _, stats, _ = cv2.connectedComponentsWithStats(image, connectivity, cv2.CV_32S)

    # Bounding boxes (x, y, w, h) of all sufficiently large components
    objects = [
        (
            stats[i, cv2.CC_STAT_LEFT],
            stats[i, cv2.CC_STAT_TOP],
            stats[i, cv2.CC_STAT_WIDTH],
            stats[i, cv2.CC_STAT_HEIGHT],
        )
        for i in range(1, num_labels)
        if stats[i, cv2.CC_STAT_AREA] > min_area
    ]

    # Left-to-right order, so the n-th box is paired with the n-th label character
    objects.sort(key=lambda box: box[0])

    # zip stops at the shorter sequence: a label with fewer characters than
    # detected boxes no longer raises IndexError
    for (x, y, w, h), letter in zip(objects[:max_symbols], letters):
        img = image[y:y+h, x:x+w]

        # <symbol><running number of that symbol, zero-filled to 5 digits>.png
        filename = str(counts[letter]).zfill(5) + ".png"

        path = seg_path + "\\" + letter + filename
        if not cv2.imwrite(path, img):
            print('Could not write:', path)
        counts[letter] += 1
   

# Data view

### CAPTCHA samples

In [None]:

# Grid of raw CAPTCHAs: row x col panels, each titled with its label (the file name without extension)
fig = plt.figure(figsize=(14,4))

panel_count = col*row
for i in range(1, panel_count + 1):
    # The subplot number doubles as the index into `files`, so files[1] is the first one shown
    name = files[i]
    img = cv2.imread(raw_path + '\\' + name,0)
    plot = fig.add_subplot(row, col, i)
    plot.title.set_text(splitext(name)[0])
    plt.imshow(img, cmap='binary')
plt.show()

### CAPTCHA segmentation samples

In [None]:
samples = row*col  # Number of CAPTCHAs shown; row and col come from the parameters / configuration cell
letters_per_captcha = 4  # Grid columns: the segmentation cell saves at most 4 symbols per CAPTCHA
first_shown = 1  # Same window as the CAPTCHA samples cell: files[1 : samples + 1]

# Rebuild the file names written by the segmentation cell for the shown CAPTCHAs.
# Symbol images are numbered per letter in the order the raw files were processed,
# so letters of the files before the shown window are counted too (but not displayed).
# NOTE(review): assumes every symbol of every counted CAPTCHA was segmented and saved -- confirm.
letter_paths = []
shown_letters = []
letter_count = defaultdict(int)
for position, name in enumerate(files[:samples + 1]):
    for l in splitext(name)[0][:letters_per_captcha]:
        if position >= first_shown:
            letter_paths.append(seg_path + '\\' + l + str(letter_count[l]).zfill(5) + '.png')
            shown_letters.append(l)
        letter_count[l] += 1

# Plot segmented CAPTCHAs: one grid row per CAPTCHA, one column per symbol.
# The grid size uses local values, so the global `row` / `col` stay untouched.
fig = plt.figure(figsize = (10,30))
for i, (path, letter) in enumerate(zip(letter_paths, shown_letters), start=1):
    img = cv2.imread(path)
    plot = fig.add_subplot(samples, letters_per_captcha, i)
    plot.title.set_text(letter)
    plt.axis('equal')
    if img is not None:  # cv2.imread returns None when the file is missing or unreadable
        plt.imshow(img, cmap='binary')
plt.show()

# Padding

In [None]:
# Target canvas: every symbol image is centred on a black background of this size
new_image_width = 71
new_image_height = 71
color = (0,0,0) #Black

# cv2.imwrite does not create missing folders, so make sure the output folder exists
if not isdir(pad_path):
    makedirs(pad_path)

result = None
for file in listdir(seg_path):

    img = cv2.imread(seg_path + '\\' + file)
    if img is None:
        # Sub-folders and files that are not images cannot be decoded; skip them
        continue

    old_image_height, old_image_width, channels = img.shape
    if old_image_width > new_image_width or old_image_height > new_image_height:
        # A symbol larger than the canvas cannot be centred on it
        print('Skipping oversized image:', file)
        continue

    # create new image of desired size and color (black) for padding
    result = np.full((new_image_height,new_image_width, channels), color, dtype=np.uint8)

    # compute center offset
    x_center = (new_image_width - old_image_width) // 2
    y_center = (new_image_height - old_image_height) // 2

    # copy img image into center of result image
    result[y_center:y_center+old_image_height,
           x_center:x_center+old_image_width] = img

    path = pad_path + '\\' + file
    cv2.imwrite(path, result)

# Show the last padded image as a visual check (nothing to show if no image was padded)
if result is not None:
    plt.imshow(result)

# Encoding and dataframe (train data)

In [None]:
import os  # used below, but missing from the notebook's import cell

# Folder with the symbol images for this dataframe
image_dir = test_path
to_tensor = ToTensor()  # Formats and converts an image to a tensor; one instance is enough

# Collect [tensor, label] rows first and build the DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call
rows = []
for filename in os.listdir(image_dir):
    if '.png' not in filename:
        continue
    f = os.path.join(image_dir, filename)
    # checking if it is a file
    if not os.path.isfile(f):
        continue
    # The context manager closes the image file after conversion
    with Image.open(f) as picture:
        data = to_tensor(picture.convert('L'))  # Converts to one channel grayscale
    label = filename[0]  # The first character of the file name is the class label
    rows.append([data, label])

df = pd.DataFrame(rows, columns = ['data','targets'])

le = LabelEncoder()
df.targets = le.fit_transform(df.targets) # https://vitalflux.com/labelencoder-example-single-multiple-columns/
# NOTE: `map` shadows the builtin, but later cells read the mapping under this name
map = dict(zip(le.classes_, range(len(le.classes_))))
inv_map = {v: k for k, v in map.items()}

print('Number of targets/labels/classes:',len(le.classes_))
print('Number of keys in map:',len(map))
print('Map:\n', map)
print('Inverse map:', inv_map)

# Encoding and dataframe (test data)

In [None]:
import os  # used below, but missing from the notebook's import cell

# Folder with the symbol images for this dataframe
image_dir = test_data_path
to_tensor = ToTensor()  # Formats and converts an image to a tensor; one instance is enough

# Collect [tensor, label] rows first and build the DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call
rows = []
for filename in os.listdir(image_dir):
    if '.png' not in filename:
        continue
    f = os.path.join(image_dir, filename)
    # checking if it is a file
    if not os.path.isfile(f):
        continue
    # The context manager closes the image file after conversion
    with Image.open(f) as picture:
        data = to_tensor(picture.convert('L'))  # Converts to one channel grayscale
    label = filename[0]  # The first character of the file name is the class label
    rows.append([data, label])

df_test = pd.DataFrame(rows, columns = ['data','targets'])

# NOTE(review): this encoder is fitted separately, so its integer codes only match
# the training encoder `le` when both folders contain the same set of labels --
# confirm, or encode with `le.transform` instead.
le2 = LabelEncoder()
df_test.targets = le2.fit_transform(df_test.targets) # https://vitalflux.com/labelencoder-example-single-multiple-columns/
# Built from this cell's own encoder and map (previously from `le` / `map` of the other dataframe)
map2 = dict(zip(le2.classes_, range(len(le2.classes_))))
inv_map2 = {v: k for k, v in map2.items()}

print('Number of targets/labels/classes:',len(le2.classes_))
print('Number of keys in map:',len(map2))
print('Map:\n', map2)
print('Inverse map:', inv_map2)

# Dataset & dataloader

In [None]:
# DATA LOADER TESTING
class MyDataset(torch.utils.data.Dataset):
    """Expose a DataFrame with ``data`` and ``targets`` columns as a torch dataset."""

    def __init__(self, dataframe):
        """Keep a reference to *dataframe*; nothing is copied or converted here."""
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows of the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(data, targets)`` for *index*, with the targets wrapped in a tensor.

        *index* is handed to ``DataFrame.iloc`` unchanged.
        """
        selected = self.dataframe.iloc[index]
        sample = selected['data']
        target = torch.tensor(selected['targets'])
        return sample, target

# Wrap both DataFrames so that PyTorch can index them
dataset, testset = MyDataset(df), MyDataset(df_test)

# Data example: slicing returns (data, targets) for all rows; print the targets
print(dataset[:][1])

# NOTE(review): test_loader wraps `dataset` and train_loader wraps `testset` --
# the names look swapped; confirm which split is meant for training.
# Shuffle makes sure the loss function doesn't spike
loader_options = dict(batch_size=GBS, shuffle=True)
test_loader = DataLoader(dataset, **loader_options)
train_loader = DataLoader(testset, **loader_options)
print(f'Train datapoints: {len(train_loader.dataset)}')
print(f'Test datapoints: {len(test_loader.dataset)}')

# CNN

In [None]:
# Output channels of the first convolution (replaces the value set in the configuration cell)
CH1 = 50

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create Neural Network (Convolutional)
class CaptchaSolver(torch.nn.Module):
    """Small two-convolution CNN that classifies a single-channel symbol image.

    Layer order in ``forward``: conv1 -> ReLU -> max-pool -> conv2 -> batch norm
    -> ReLU -> max-pool -> flatten -> linear. The layers ``drop`` and ``bnorm2``
    are constructed but not applied in ``forward``.

    Args:
        ch1: Output channels of the first convolution. ``None`` (default) uses
            the module-level ``CH1``.
        num_classes: Number of output features. ``None`` (default) uses
            ``len(le.classes_)`` of the module-level label encoder ``le``.
    """

    def __init__(self, ch1=None, num_classes=None):
        super().__init__()
        # Fall back to the notebook globals only when no explicit value is given
        if ch1 is None:
            ch1 = CH1
        if num_classes is None:
            num_classes = len(le.classes_)

        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=ch1, kernel_size=2)
        self.relu1 = torch.nn.ReLU()
        self.maxpool1 = torch.nn.MaxPool2d(kernel_size=2)
        self.conv2 = torch.nn.Conv2d(in_channels=ch1, out_channels=20, kernel_size=3)
        self.bnorm = torch.nn.BatchNorm2d(20)
        self.drop = torch.nn.Dropout(0.25)
        self.relu2 = torch.nn.ReLU()
        self.maxpool2 = torch.nn.MaxPool2d(kernel_size=2)
        self.flatten = torch.nn.Flatten()
        # 500 = 20 channels * 5 * 5, the flattened size for 28x28 inputs
        # (28 -> conv k=2 -> 27 -> pool -> 13 -> conv k=3 -> 11 -> pool -> 5)
        self.linear = torch.nn.Linear(in_features=500, out_features=num_classes)
        self.bnorm2 = torch.nn.BatchNorm1d(num_classes)

    def forward(self, x):
        """Return the raw class scores (logits) for the image batch *x*."""
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.bnorm(x)
        # (self.drop is currently not applied)
        x = self.relu2(x)
        x = self.maxpool2(x)
        x = self.flatten(x)
        x = self.linear(x)
        # (self.bnorm2 is currently not applied)
        return x

# Creating model
model = CaptchaSolver()
model.to(device)
# Summarise on the device the model actually lives on;
# a hard-coded 'cuda' fails on machines without a GPU
summary(model, (1,28,28), device=device.type)

# Defining loss with nn.CrossEntropyLoss
loss = torch.nn.CrossEntropyLoss()

# Defining optimizer optim.Adam
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 101
all_loss = []        # Mean training loss per epoch
test_loss_plt = []   # Mean validation loss per epoch
acc = 0.0            # Validation accuracy (percent) of the most recent epoch
for epoch in range(num_epochs):
    # ---- Training ----
    model.train()  # batch norm normalises with batch statistics while training
    train_loss = 0.
    for i,(x,y) in enumerate(train_loader):
        x = x.to(device)
        # CrossEntropyLoss takes class indices as int64 (long) targets
        y = y.to(device).long()
        out = model(x)
        l = loss(out, y)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        # .item() stores a plain float, so the autograd graph of the batch can be freed
        train_loss += l.item()

        if i%100 == 0:
            print(f'Train loss = {l.item():.5f}')
    # The criterion already averages within a batch, so average the batch means
    # over the number of batches (not over the number of samples)
    all_loss.append(train_loss / max(len(train_loader), 1))

    # ---- Validation ----
    model.eval()  # batch norm switches to its running statistics
    test_loss = 0.
    correct = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for x, y in test_loader:
            x = x.to(device)
            y = y.to(device).long()
            out = model(x)
            test_loss += loss(out, y).item()
            pred = out.argmax(dim=1)
            correct += (pred == y).sum().item()

    test_loss_plt.append(test_loss / max(len(test_loader), 1))
    # Accuracy over the whole validation set; dividing one batch by GBS was wrong
    # whenever a batch held fewer than GBS samples
    acc = 100 * correct / max(len(test_loader.dataset), 1)

    print(f'Epoch: {epoch}          Batch size = {GBS}')
    print(f'Validation accuracy: {acc:.2f}%')
    plt.plot(np.array(all_loss),label='Train loss')
    plt.plot(np.array(test_loss_plt), label='Validation loss')
    plt.legend(loc="upper left")
    plt.show()


# Rendering individual images and predictions
# `x` is the batch left over from the last pass of the loop above
x = x.to('cpu') # Preventing CUDA compatibility errors
model.to('cpu')
model.eval()  # inference: batch norm uses its running statistics
with torch.no_grad():
    # One forward pass for the whole batch instead of one per rendered image
    predictions = model(x).argmax(dim=1)

# Show at most 50 images; the batch may hold fewer
for idx in range(min(50, len(x))):
    plt.figure()
    plt.title(str(predictions[idx].item())) # Adding title of prediction
    plt.imshow(x[idx][0]) # Rendering image of label
    plt.show()



# Confusion matrix

### Formatting predictions

In [None]:
# In case model didn't finish training
def get_num_correct(preds, labels):
    """Count the rows of *preds* whose highest-scoring column equals the matching entry of *labels*.

    Returns the count as a plain Python number.
    """
    predicted = preds.argmax(dim=1)
    matches = predicted == labels
    return matches.sum().item()

def get_all_preds(net, loader):
    """Run *net* on every batch of *loader* and return all outputs stacked along dim 0.

    Args:
        net: Callable applied to the images of each batch.
        loader: Iterable yielding ``(images, labels)`` pairs; the labels are ignored.

    Returns:
        The per-batch outputs concatenated along dim 0, or an empty tensor when
        *loader* yields no batches. Gradients are not tracked.
    """
    outputs = []
    with torch.no_grad():
        for images, _ in loader:
            outputs.append(net(images))
    if not outputs:
        return torch.tensor([])
    # One concatenation at the end avoids re-copying the growing result for every batch
    return torch.cat(outputs, dim=0)

# Evaluate on the CPU. Eval mode makes batch norm use its running statistics
# instead of the statistics of each prediction batch.
model.to('cpu')
model.eval()
with torch.no_grad():
    prediction_loader = DataLoader(testset, batch_size=1000)
    # Despite its name, `train_preds` holds the model outputs for `testset`
    train_preds = get_all_preds(model, prediction_loader)

# Slice the labels once instead of rebuilding the tensor for every use
test_labels = testset[:][1]
preds_correct = get_num_correct(train_preds, test_labels)
print(f'Total correct: {preds_correct} out of {len(test_labels)}')
print('Accuracy: {:.2f}%'.format(preds_correct / len(test_labels)*100))

### Building confusion matrix

In [None]:
# One (true label, predicted label) row per sample
stacked = torch.stack(
    (
        testset[:][1]
        , train_preds.argmax(dim=1)
    )
    , dim=1
)

# One row/column per class: the class count is the width of the prediction matrix
# (previously hard-coded to 36)
num_classes = train_preds.shape[1]
cmt = torch.zeros(num_classes, num_classes, dtype=torch.int32)

# cmt[t, p] counts the samples with true label t that were predicted as p
for tl, pl in stacked.tolist():
    cmt[tl, pl] += 1

### Plotting confusion matrix

In [None]:
# Confusion matrix of true labels vs. predicted labels for `testset`
true_labels = testset[:][1]
predicted_labels = train_preds.argmax(dim=1)
cm = confusion_matrix(true_labels, predicted_labels)

# Tick names: the keys of the label map (the notebook global `map`), as strings
names = tuple(str(l) for l in map)

plt.figure(figsize=(20,20))
plot_confusion_matrix(cm, names)

# Confidence interval

In [None]:
# Confidence interval function
def CT(p, n, a):
    """Return the half-width ``a * sqrt(p * (1 - p) / n)`` of a confidence interval for a proportion.

    Args:
        p: Observed proportion, as a fraction.
        n: Number of observations.
        a: Critical value (the notebook uses 1.96 for 95% confidence).

    Returns:
        The interval half-width, on the same scale as *p*.
    """
    # Imported here because the notebook imports sqrt only in a later cell
    from math import sqrt
    iv = a * sqrt((p * (1 - p)) / n)
    return iv
# Values
# 95% confidence for all three models
crit_val = 1.96

# Accuracies taken from the confusion matrix accuracy calculation
proportion1 = 0.900191152375751   # Model 1: with batch norm
proportion2 = 0.8667394866193336  # Model 2: without batch norm
proportion3 = 0.8840342253777536  # Model 3: without batch norm but with dropout

# Models 1 and 2 use a fixed observation count
sample_observations = 21972
CIs = [
    CT(proportion1, sample_observations, crit_val),
    CT(proportion2, sample_observations, crit_val),
]

# Model 3 uses the size of the current test set
sample_observations = len(testset[:][1])
CIs.append(CT(proportion3, sample_observations, crit_val))

print(CIs)

In [None]:
from math import sqrt
crit_val = 1.96 # 95% confidence
proportion = preds_correct / len(testset[:][1]) # This is from confusion matrix accuracy calculation
sample_observations = len(testset[:][1])

# Bars are drawn in this order of (proportion, CI) pairs: indices 1, 2, 0.
# Means, error bars and labels are all derived from the same order, so each
# error bar belongs to the bar it is drawn on (yerr previously used CIs as-is).
bar_order = (1, 2, 0)
proportions = (proportion1, proportion2, proportion3)
means = [proportions[k] for k in bar_order]
errors = [CIs[k] for k in bar_order]
labels = [f'M{number}: {error*100:.2f}%' for number, error in enumerate(errors, start=1)]

positions = [0,1,2]
plt.ylim(0.85,0.92)
plt.bar(positions, means,color='tab:blue', yerr=errors, width = 0.8, align='center', ecolor='black', capsize=50)
plt.ylabel('Proportion mellem 0 og 1')
plt.title('KONFIDENSINTERVAL')
plt.xticks(positions, labels)
plt.show()

# CAPTCHA human benchmark

In [None]:
col = 6
row = 4
# Stride between the shown samples: 900, reduced when the test set is too small
# for col * row samples at that stride
steps = max(1, min(900, (len(testset) - 1) // (col*row)))
# Predicted class per sample, computed once instead of once per panel
predicted = train_preds.argmax(dim=1)

fig = plt.figure(figsize=(14,8*(row/2)))
for i in range(1, col*row + 1):
    index = i*steps
    if index >= len(testset):
        break  # fewer samples than panels
    image, target = testset[index]
    TL = inv_map[int(target)]             # true label
    AIP = inv_map[int(predicted[index])]  # model prediction
    plot = fig.add_subplot(row, col, i)
    plot.title.set_text(f'TL: {TL}  AIP: {AIP}')
    plt.imshow(np.squeeze(image))
plt.show()

# Custom Data

In [None]:
# Rebuild train_loader over `dataset` (replaces the loader created earlier)
train_loader = DataLoader(dataset, batch_size=GBS)

# Pull the first batch and inspect it
first_batch = next(iter(train_loader))
x, y = first_batch

print(x.shape)
print(y.shape)
print('TEST:',type(y))
# Channel 0 of the first image of the batch
plt.imshow(x[0,0])

# MNIST data load test

In [None]:
# MNIST train and test splits, stored under 'data' and converted to tensors
train_data_mnist, test_data_mnist = (
    datasets.MNIST(root='data', train=is_train, transform=ToTensor(), download=True)
    for is_train in (True, False)
)

# Example of data: first entry of the dataset's `data` and `targets` attributes
x_mnist, y_mnist = train_data_mnist.data[0], train_data_mnist.targets[0]

# Make train and test dataloaders
train_loader_mnist = DataLoader(train_data_mnist, batch_size=GBS)
test_loader_mnist = DataLoader(test_data_mnist, batch_size=GBS)

# Extract the first batch
xm, ym = next(iter(train_loader_mnist))

for item in (xm.shape, ym.shape, ym):
    print(item)
plt.imshow(xm[0,0])