In [1]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
from PIL import Image, ImageFilter
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss, confusion_matrix
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so any NumPy randomness is repeatable.
np.random.seed(100)
# Sub-folder name spliced into the dataset paths built in the loading cell.
LEVEL = 'level_2'

In [2]:
def read_all(folder_path, key_prefix=""):
    '''
    Read every entry of `folder_path` as a grayscale image.

    Parameters
    ----------
    folder_path : str
        Directory to scan. No filtering is done: every directory entry is
        handed to PIL, so a non-image entry makes PIL raise.
    key_prefix : str, optional
        Text prepended to each key of the returned dictionary.

    Returns
    -------
    dict
        Maps `key_prefix` + file name without its extension to the flattened
        (1-D) pixel array of the image converted to mode "L".
        Entries are inserted in `os.listdir` order, which is arbitrary.
    '''
    print("Reading:")
    images = {}
    files = os.listdir(folder_path)
    for file_name in tqdm_notebook(files, total=len(files)):
        file_path = os.path.join(folder_path, file_name)
        # splitext copes with extensions of any length (".png", ".jpeg", ...),
        # unlike slicing off a fixed four characters.
        image_index = key_prefix + os.path.splitext(file_name)[0]
        # The context manager closes the file handle even if conversion fails.
        with Image.open(file_path) as image:
            grey = image.convert("L")
            # flatten() returns a fresh array, so nothing references the image afterwards.
            images[image_index] = np.array(grey).flatten()
    return images

In [3]:
# One training sub-folder per language; the 'background' folder is read separately.
languages = ['ta', 'hi', 'en']

train_root = "../input/level_2_train/" + LEVEL + "/"  # change the path

# Background images get the 'bgr_' key prefix, language images get '<language>_'.
images_train = read_all(train_root + "background", key_prefix='bgr_')
for language in languages:
    language_images = read_all(train_root + language, key_prefix=language + "_")
    images_train.update(language_images)
print(len(images_train))

test_folder = "../input/level_2_test/kaggle_" + LEVEL  # change the path
images_test = read_all(test_folder, key_prefix='')
print(len(images_test))

Reading:


HBox(children=(IntProgress(value=0, max=450), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


900
Reading:


HBox(children=(IntProgress(value=0, max=300), HTML(value='')))


300


In [4]:
# First five test keys (iterating a dict yields its keys).
list(images_test)[:5]

['145', '34', '90', '261', '48']

In [5]:
# Features are the flattened images; the label is 0 for keys with the
# 'bgr_' prefix (background) and 1 for everything else.
X_train = np.array(list(images_train.values()))
Y_train = np.array([0 if key.startswith("bgr_") else 1 for key in images_train])

# Test keys are the numeric image ids; keep them aligned with the rows of X_test.
ID_test = [int(key) for key in images_test]
X_test = np.array(list(images_test.values()))

print(X_train.shape, Y_train.shape)
print(X_test.shape)

(900, 256) (900,)
(300, 256)


In [6]:
# Inspect one flattened training sample (row 10 of X_train).
X_train[10]

array([165, 205, 144, 174, 205, 245,  53, 127, 217, 221,  21, 177, 206,
       236,   8,  75,  20, 115,  91,  87, 169,  29,  43,   0,  37, 192,
        47, 225, 103,   4, 157, 131, 255, 206, 218, 198,   5,  45, 105,
        96, 163,  92, 199, 117,  94,  41, 251, 217, 108, 228,  13,  99,
       238, 192, 165, 165,  36, 254,  40, 166,  55, 117,  66,  26,  94,
       196, 134, 158, 131,  76, 225, 188,  78, 165,  60, 141, 204,  70,
       147, 152, 215,  82,  93, 107, 144, 101, 189, 255, 110, 117, 207,
        60, 247,  33, 101,  64,  78,  18,   5, 204,  25, 144, 206,   4,
        12, 124,  57, 159, 210,   2, 163,  31, 236, 216, 144, 132, 129,
       242, 138, 178, 161, 107, 156,   5, 241,  62, 132, 114, 178, 143,
       165, 174, 195,  25, 161, 229, 201, 112, 120,  82, 233,   8,  81,
       149, 181, 207,  57, 210,  40,   4, 124, 255,  35,  46, 139, 123,
        27, 176,  61,  64,  69, 152, 176, 198,  38, 195,  40, 177,   0,
        57, 148,  70,  67, 255,  97,  27, 243,  82, 190, 143,  2

In [7]:
# Inspect the first training sample for comparison.
X_train[0]

array([229, 255, 182, 203,  16,  59, 147, 208, 190, 188, 141,   8, 148,
        81, 237,  22, 228,  60, 188, 233,   9, 190, 225, 126, 193,  20,
        80, 185, 239, 151, 207,  40,  78, 111,  95, 114, 222, 195, 151,
       214, 183,  57, 100, 176,  78, 202, 218, 115, 230,   9, 251, 171,
        19, 163,  96, 219,  35,  31, 253, 107, 247,  38,  45,  15, 254,
        49,  38, 118, 119,  31, 245, 123, 132, 178, 225,  38, 112,  40,
       116, 189,  58, 191, 177, 122, 114,  67, 135,  35,  64,  66, 150,
       104,  75,   0, 122, 116, 206, 160, 150, 128, 159, 125,   2, 173,
        67, 229, 220, 122, 245,  38,  52, 134,   0, 156, 159, 222,  20,
       237, 213, 145, 144, 213, 138, 241,  32,  64,  84,  93, 201,  35,
       123, 154, 103,  85,  93, 230,  44, 238, 241,  91,  37, 113,  61,
       101, 238, 124, 225, 123, 187,  78,  13, 151, 244, 224, 180,  48,
        29,  22,  14, 202,  24,  48, 211, 193,  44,  60, 255,   5, 117,
       117,  31, 115, 200, 255, 198,   0,  79, 232, 106, 156, 12

In [8]:
def binarise(X):
    '''
    Replace every element of `X` with 1 if it equals 255, otherwise with 0.

    The sequence is modified IN PLACE and the same object is returned, so a
    NumPy row view passed in also changes the array it belongs to.
    '''
    for position, pixel in enumerate(X):
        if pixel == 255:
            X[position] = 1
        else:
            X[position] = 0
    return X

In [9]:
# binarise() writes into each row it receives, so X_train / X_test are
# overwritten with the 0/1 values as well.
X_bin_train = np.array([binarise(row) for row in X_train])
X_bin_test = np.array([binarise(row) for row in X_test])

In [10]:
# Just a white background so MP Neuron should be enough...
class MPNeuron:
    '''
    McCulloch-Pitts style threshold unit.

    A sample is classified as 1 when the sum of its features is at least the
    threshold `b`, otherwise as 0.
    '''

    def __init__(self):
        # Threshold; stays None until fit() (or the caller) assigns it.
        self.b = None

    def model(self, x):
        '''
        Classify a single sample.

        Returns 1 if the features of `x` sum to at least `self.b`, else 0.
        '''
        # np.sum accumulates small integer dtypes in a platform-sized integer;
        # built-in sum() adds NumPy scalars one by one and may wrap around for
        # uint8 pixel rows (e.g. under NumPy 2 promotion rules).
        return int(np.sum(x) >= self.b)

    def predict(self, X):
        '''
        Classify every sample (row) of `X`.

        Returns a NumPy array holding one 0/1 prediction per sample.
        '''
        X = np.asarray(X)
        if X.ndim != 2:
            # Not a regular 2-D matrix (e.g. empty input): go sample by sample.
            return np.array([self.model(x) for x in X])
        return (np.sum(X, axis=1) >= self.b).astype(int)

    def fit(self, X, Y):
        '''
        Choose the threshold by exhaustive search.

        Every integer threshold from 0 to the number of features (inclusive)
        is tried; `self.b` is set to the smallest threshold that reaches the
        highest training accuracy.

        Parameters
        ----------
        X : 2-D array-like, one row per sample.
        Y : array-like of 0/1 labels, one per sample.

        Returns
        -------
        list of float
            Training accuracy for each threshold tried, indexed by threshold.

        Raises
        ------
        ValueError
            If `X` is not 2-D, is empty, or its length differs from `Y`.
        '''
        X = np.asarray(X)
        Y = np.asarray(Y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty dataset")
        if X.shape[0] != Y.shape[0]:
            raise ValueError("X and Y must contain the same number of samples")

        # Row sums do not depend on the threshold, so compute them only once.
        totals = np.sum(X, axis=1)

        accuracies = []
        # Start below any reachable accuracy so a threshold is always selected,
        # even when every candidate scores 0.
        best_acc = -1.0
        best_b = 0

        for b in range(X.shape[1] + 1):
            Y_pred = (totals >= b).astype(int)
            accuracy = float(np.mean(Y_pred == Y))
            accuracies.append(accuracy)
            # Strict comparison keeps the first (smallest) best threshold.
            if accuracy > best_acc:
                best_acc = accuracy
                best_b = b

        self.b = best_b
        return accuracies

In [11]:
# Fresh neuron; its threshold b stays None until fit() sets it.
mpn = MPNeuron()

In [12]:
# Train on the binarised features, the same representation that is passed to
# mpn.predict() for the test set; fit() returns the accuracy per threshold.
accuracy = mpn.fit(X_bin_train, Y_train)

In [13]:
# Chosen threshold first, then the accuracy obtained for every threshold tried.
print(mpn.b, accuracy, sep="\n")

11
[0.5, 0.5, 0.5266666666666666, 0.6033333333333334, 0.6944444444444444, 0.7955555555555556, 0.8922222222222222, 0.9377777777777778, 0.9666666666666667, 0.9755555555555555, 0.9833333333333333, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9988888888888889, 0.9988888888888889, 0.9988888888888889, 0.9988888888888889, 0.9988888888888889, 0.9977777777777778, 0.9977777777777778, 0.9977777777777778, 0.9977777777777778, 0.9955555555555555, 0.9955555555555555, 0.9933333333333333, 0.9922222222222222, 0.99, 0.9877777777777778, 0.9822222222222222, 0.9788888888888889, 0.9788888888888889, 0.9777777777777777, 0.9777777777777777, 0.9766666666666667, 0.9744444444444444, 0.9733333333333334, 0.97, 0.9666666666666667, 0.9655555555555555, 0.9611111111111111, 0.9588888888888889, 0.9555555555555556, 0.9544444444444444, 0.95, 0.95, 0.9466666666666667, 0.94, 0.93, 0.9233333333333333, 0.9188888888888889, 0.9122222222222223, 0.91, 0.8977777777777778, 0.8855555555555555, 0.8811111111111111, 0.8677777777777778,

## Sample Submission

In [14]:
# Predict on the binarised test images and pair each prediction with its image id.
Y_pred_test = mpn.predict(X_bin_test)

submission = pd.DataFrame(
    {'ImageId': ID_test, 'Class': Y_pred_test},
    columns=['ImageId', 'Class'],
).sort_values(['ImageId'])

# Written without the index so the file holds only the two columns.
submission.to_csv("submission_MPN.csv", index=False)

In [15]:
# Preview only the first rows instead of dumping the whole frame.
submission.head(10)

Unnamed: 0,ImageId,Class
289,0,1
147,1,0
245,2,0
175,3,1
43,4,0
224,5,1
115,6,0
90,7,1
148,8,0
71,9,1


In [16]:
# Number of test images predicted as class 1.
submission['Class'].sum()

150