In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import PIL as pil
import hashlib
from pickle import dump, load

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

def evaluate_model(model, train_ds, validation_ds, n_jobs = -1, verbose = 1):
    """Cross-validate a model, fit it on the training data and score a held-out set.

    Parameters
    ----------
    model : estimator
        Object passed to ``cross_val_score``; it must also provide ``fit``
        and ``predict``.
    train_ds : sequence
        ``train_ds[0]`` holds the training features, ``train_ds[1]`` the labels.
    validation_ds : sequence
        Same layout as ``train_ds``, used for the held-out accuracy.
    n_jobs : int
        Forwarded to ``cross_val_score``.
    verbose : int
        Forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(model, cross_scores, validation_score)``: the model after
        ``fit`` on the training data, the scores returned by the 5-fold
        accuracy cross-validation, and the accuracy on the validation set.
    """
    features = train_ds[0]
    targets = train_ds[1]
    fold_accuracies = cross_val_score(
        model, features, targets,
        cv=5, scoring='accuracy', n_jobs=n_jobs, verbose=verbose,
    )

    # Fit on the whole training set before scoring the held-out data.
    model.fit(features, targets)

    expected = validation_ds[1]
    predicted = model.predict(validation_ds[0])
    holdout_accuracy = accuracy_score(expected, predicted)
    return model, fold_accuracies, holdout_accuracy

def create_submission(model, test_index, test_set, filename = 'submission.csv'):
    """Write the model's predictions for ``test_set`` to a CSV file.

    Parameters
    ----------
    model : object with ``predict``
        Used to predict one label per row of ``test_set``.
    test_index : pandas.DataFrame
        Must contain an ``'image'`` column of path strings; every
        occurrence of ``'test/'`` is removed to form the ``id`` column.
    test_set : array-like
        Input handed to ``model.predict``.
    filename : str
        Destination CSV path; the file has columns ``id`` and ``label``
        and no index column.
    """
    def drop_test_dir(path):
        return path.replace('test/', '')

    ids = test_index['image'].apply(drop_test_dir)
    predictions = model.predict(test_set)

    submission = pd.DataFrame({'id': ids, 'label': predictions})
    submission.to_csv(filename, index=False)

def BasicPipeline(model_class):
    """Build a two-step pipeline: pixel normalisation, then a fresh ``model_class()``.

    Parameters
    ----------
    model_class : callable
        Called with no arguments to create the final ``'train'`` step.

    Returns
    -------
    Pipeline
        Steps ``'norm'`` (from ``PixelNormalizer()``) and ``'train'``.
    """
    steps = [
        ('norm', PixelNormalizer()),
        ('train', model_class()),
    ]
    return Pipeline(steps)

def PixelNormalizer():
    """Return a ``FunctionTransformer`` that divides its input by 255."""
    def scale(pixel):
        return pixel / 255

    return FunctionTransformer(scale)

def save_dataset(name, obj):
    """Pickle ``obj`` into the file ``{name}.pickle``.

    Parameters
    ----------
    name : str
        Path without the ``.pickle`` suffix.
    obj : object
        Any picklable object.
    """
    target = f'{name}.pickle'
    with open(target, 'wb') as sink:
        dump(obj, sink)
def reload_dataset(name):
    """Load and return the object pickled in ``{name}.pickle``.

    Parameters
    ----------
    name : str
        Path without the ``.pickle`` suffix.

    Returns
    -------
    object
        Whatever object the pickle file contains.
    """
    source = f'{name}.pickle'
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(source, 'rb') as stream:
        restored = load(stream)
    return restored

def hash_obj(obj):
    """Return the hexadecimal MD5 digest of ``repr(obj)`` encoded as UTF-8.

    NOTE(review): the digest covers only what ``repr`` prints. NumPy
    abbreviates the repr of large arrays under its default print options,
    so for such inputs the digest presumably does not cover every
    element -- confirm before relying on it as a full integrity check.
    """
    text = repr(obj)
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def read_img(filename):
    """Read an image file and return its pixel data as a flat ``uint8`` array.

    Parameters
    ----------
    filename : str or path-like
        Image file handed to ``PIL.Image.open``.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``uint8`` array holding every value of the image.
    """
    # Import the submodule explicitly: a bare ``import PIL`` does not load
    # ``PIL.Image`` by itself, so ``pil.Image`` only works when some other
    # import has already pulled it in.
    from PIL import Image

    # The context manager closes the underlying file once the pixel data
    # has been copied into the array.
    with Image.open(filename) as handle:
        pixels = np.array(handle, dtype=np.uint8)
    return pixels.reshape(-1)

def check_hash(data, h):
    """Tell whether ``hash_obj(data)`` equals the expected digest ``h``."""
    digest = hash_obj(data)
    return digest == h

def check_img_list_hash(imgs, hashes):
    """Tell whether every image hashes to the digest at the same position.

    Returns ``False`` when the two sequences differ in length; otherwise
    returns ``True`` only if ``hash_obj(img)`` equals the paired entry of
    ``hashes`` for every image.
    """
    if len(imgs) != len(hashes):
        return False
    return all(hash_obj(img) == expected for img, expected in zip(imgs, hashes))

def load_dataset(dataset_name, load_labels = True, expected_sizes = (30001, 5000)):
    """Load the index, images and (optionally) labels of one dataset split.

    Reads ``{dataset_name}.txt`` as a header-less CSV with the columns
    ``image``, ``label`` and ``hash``, prefixes every image path with
    ``{dataset_name}/``, reads each image through ``read_img`` and fills
    the ``hash`` column with ``hash_obj`` of the loaded pixel data.

    Parameters
    ----------
    dataset_name : str
        Base name of the index file and of the directory holding the images.
    load_labels : bool
        When true, the ``label`` column is returned as a flat ``uint8``
        array; otherwise ``labels`` is ``None``.
    expected_sizes : int, collection of int, or None
        Row counts accepted for the index file. Defaults to the sizes this
        notebook previously hard-coded. Pass ``None`` to skip the check.

    Returns
    -------
    tuple
        ``(index, images, labels)``: the index ``DataFrame``, the array built
        from the flattened images, and the label array or ``None``.

    Raises
    ------
    AssertionError
        If the number of index rows is not one of ``expected_sizes``.
    """
    index = pd.read_csv(f"{dataset_name}.txt", header=None, names=['image', 'label', 'hash'])
    index['image'] = index['image'].apply(lambda filename: f"{dataset_name}/{filename}")

    if expected_sizes is not None:
        if isinstance(expected_sizes, int):
            expected_sizes = (expected_sizes,)
        n_rows = len(index['image'])
        if n_rows not in expected_sizes:
            # Raised explicitly so the check survives ``python -O``, which
            # strips ``assert`` statements.
            raise AssertionError(
                f"{dataset_name}: found {n_rows} rows, "
                f"expected one of {tuple(expected_sizes)}"
            )

    if load_labels:
        labels = np.array(index["label"], dtype=np.uint8).reshape(-1)
    else:
        labels = None

    images = np.array([read_img(path) for path in index['image']])

    # The hash column is recomputed from the loaded pixels; whatever the
    # index file held in that column is replaced, not compared.
    index['hash'] = [hash_obj(img) for img in images]

    return index, images, labels

In [3]:
# Reload data: each pickle is unpacked into three values (presumably the
# index / images / labels triple produced by load_dataset -- confirm).
train_index, train_set, train_labels = reload_dataset('train')
validation_index, validation_set, validation_labels = reload_dataset('validation')
# Bind the third value to a real name instead of `_`: assigning to `_` in
# IPython shadows the shell's last-output shortcut.
test_index, test_set, test_labels = reload_dataset('test')

# Check integrity: compare the test images against their stored hashes
# (only the test split is verified here).
check_img_list_hash(test_set, test_index['hash'])

True

In [8]:
# Baseline: pixel normalisation followed by a default SGDClassifier.
basic = BasicPipeline(SGDClassifier)
results = evaluate_model(
    basic,
    (train_set, train_labels),
    (validation_set, validation_labels),
    n_jobs=2,
    verbose=10,
)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   36.2s
[Parallel(n_jobs=2)]: Done   3 out of   5 | elapsed:  1.2min remaining:   46.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:  1.7min finished


In [15]:
# Show the (model, cross-validation scores, validation accuracy) tuple
# returned by evaluate_model for the SGD pipeline.
results

(Pipeline(steps=[('norm',
                  FunctionTransformer(func=<function PixelNormalizer.<locals>.<lambda> at 0x7f1fe28cf4d0>)),
                 ('train', SGDClassifier())]),
 array([0.53491085, 0.53833333, 0.50633333, 0.53416667, 0.5405    ]),
 0.5574)

In [14]:
# Write the SGD pipeline's test-set predictions to sgdc_basic.csv.
create_submission(basic, test_index, test_set, 'sgdc_basic.csv')

In [18]:
# Same pipeline with a default SVC as the final step.
svm_basic = BasicPipeline(SVC)
results_svm = evaluate_model(
    svm_basic,
    (train_set, train_labels),
    (validation_set, validation_labels),
    n_jobs=-1,
    verbose=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 14.4min remaining: 21.6min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 14.6min remaining:  9.7min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 20.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 20.7min finished


In [21]:
# Show the (model, cross-validation scores, validation accuracy) tuple
# returned by evaluate_model for the SVC pipeline.
results_svm

(Pipeline(steps=[('norm',
                  FunctionTransformer(func=<function PixelNormalizer.<locals>.<lambda> at 0x7f200d58c440>)),
                 ('train', SVC())]),
 array([0.72854524, 0.73766667, 0.73383333, 0.72816667, 0.72816667]),
 0.735)

In [20]:
# Write the SVC pipeline's test-set predictions to svm_basic.csv.
create_submission(svm_basic, test_index, test_set, 'svm_basic.csv')