### Convolutional Model for identifying Captchas from the given dataset.
1. After taking a close look at the labels in the dataset it is observed that they are all of same length. 
2. As the labels are of same lengths it is not required to use complex networks like LSTM . So I built a custom architecture with 2 convolutional blocks . This architecture is trained to spit out 5 categories. The network is trained using adam optimiser and used categorical_crossentropy as the loss function. 
3. The data was split into 30000 for training , 10000 for validation and 2971 for test data set.
4. The model achieved validation accuracy of above 97% and test accuracy of 94.88 %
5. Tried models like resnet with 4 blocks but it turned to be expensive for the given dataset.


In [1]:
import pandas as pd
from glob import glob
from PIL import Image
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.np_utils import to_categorical
from keras.models import Sequential,Model
from keras.layers import Dense,Activation,merge,AveragePooling2D,SeparableConv2D,Conv2D,MaxPooling2D,Dense,Lambda,Flatten,BatchNormalization,Input,Dropout
%matplotlib inline

Using TensorFlow backend.


In [88]:
labels = pd.read_csv('labels2.txt',names=['img','label'])
files = glob('dataset50000/*.png')

In [89]:
char_list = set()
for x in labels.label:
    char_list.update(list(x))
char_list = list(char_list)

In [90]:
labels['img'] = labels['img'].apply(lambda x : 'dataset50000/'+x)
labels['outputarr'] = labels['label'].apply(lambda x:list(x))
test_data = labels.iloc[40000:]

In [31]:
images = []
for f in labels.img:
    ima = np.asarray(Image.open(f))
    images.append(ima)
images = np.array(images)

In [32]:
targets = np.vstack(labels.outputarr.values)

In [35]:
def to_categorical_(targs):
    index_list = []
    targs = [targets[:,i:i+1] for i in range(5)]
    for t in targs:
        index = [char_list.index(v) for v in t]
        categorical = to_categorical(index,37)
        index_list.append(categorical)
    return index_list

In [39]:
Y = to_categorical_(targets)

In [43]:
x_train = images[:30000]
x_val = images[30000:40000]
x_test = images[40000:]
y_train = [y[:30000] for y in Y]
y_val = [y[30000:40000] for y in Y]
y_test = [y[40000:] for y in Y]


In [58]:
def conv_block(x,filters,n):
    for _ in range(n):
        x = Conv2D(filters,3,3,activation='relu')(x)
        x = BatchNormalization()(x)
        x = Conv2D(filters,3,3,activation='relu')(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D()(x)
    return x
d = 0.4
input = Input(shape = (80,215,3))
x = Lambda(lambda x: x/255)(input)
for f in [64,128]:
    x = conv_block(x,f,2)    
x = Flatten()(x)
output = []
x = Dense(500,activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(d)(x)
x = Dense(250,activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(d)(x)
for _ in range(5):
    output.append(Dense(37,activation='softmax')(x))
model = Model(input,output)

In [59]:
loss_weights = [0.2] * 5

In [60]:
model.compile('adam','categorical_crossentropy',loss_weights=loss_weights,metrics=['accuracy'])

In [161]:
model.load_weights('captcha.h5')

In [151]:
model.optimizer.lr = model.optimizer.lr/10

In [152]:
model.fit(x_train,y_train,batch_size=128,nb_epoch=1 , validation_data=(x_val,y_val),verbose=1)

Train on 30000 samples, validate on 10000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f6466d3df28>

In [140]:
#model.save_weights('captcha.h5')

In [162]:
predictions = model.predict(x_test)

In [163]:
predicted_index = [np.argmax(predictions[i],1) for i in range(5)]

In [164]:
pred_labels = np.dstack(predicted_index)

In [165]:
preds = []
for label in np.dstack(predicted_index)[0]:
    l = [char_list[l] for l in label]
    preds.append(''.join(l))
    

In [166]:
test_data['preds'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [167]:
correct = np.sum(test_data['label'] == test_data['preds'])

In [168]:
total = test_data.count()[0]

In [169]:
100 * correct/total 

94.883877482329183