In [None]:
import numpy as np
import pandas as pd
import progressbar
import os
from urllib.request import urlopen
import matplotlib.pyplot as pyplot
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import LinearSVC 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
# For reproducibility, seed NumPy's global random generator from a single
# constant so that changing `seed` actually changes the seed in use.
seed = 7
np.random.seed(seed)
def load_dataset(dataset_fp, delimiter=",",chunksize=1000):
    """Load a header-less CSV file into a NumPy array, downloading it if missing.

    When ``dataset_fp`` is not an existing file, it is fetched from the
    hard-coded URL prefix (``dataset_fp`` is appended to it) and stored under
    that same path. The file is then parsed in chunks, with a progress bar,
    and the chunks are concatenated.

    Parameters:
        dataset_fp: local path of the CSV file; also the name requested
            from the remote location.
        delimiter: field separator passed to ``pandas.read_csv``.
        chunksize: number of rows per parsed chunk; the download buffer is
            ``16 * chunksize`` bytes.

    Returns:
        numpy.ndarray holding every row of the file.
    """
    if not os.path.isfile(dataset_fp):
        # Download into a side file and rename it only once complete, so an
        # interrupted transfer never leaves a truncated file that would pass
        # the isfile() check on the next call.
        partial_fp = dataset_fp + ".part"
        buffer_size = 16 * chunksize
        try:
            # The context manager closes the HTTP response even on error.
            with urlopen("http://cs.mcgill.ca/~ksinha4/datasets/kaggle/" + dataset_fp) as response:
                with open(partial_fp, 'wb') as f:
                    while True:
                        block = response.read(buffer_size)
                        if not block:
                            break
                        f.write(block)
            os.replace(partial_fp, dataset_fp)
        finally:
            # Only present here if the download or the rename failed.
            if os.path.exists(partial_fp):
                os.remove(partial_fp)

    chunks = []
    pb = progressbar.ProgressBar()
    for chunk in pb(pd.read_csv(dataset_fp, delimiter=delimiter, chunksize=chunksize, header=None)):
        chunks.append(chunk)

    dataset = pd.concat(chunks)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # replacement named by its deprecation notice and also exists in older
    # pandas releases.
    return dataset.values


def train_validation_set_split(trainset_x, trainset_y, **kwargs):
    """Join inputs and targets column-wise and split the joined rows.

    ``trainset_x`` and ``trainset_y`` are concatenated along axis 1, and the
    combined array is handed to ``train_test_split`` together with any
    keyword arguments. Whatever ``train_test_split`` returns is returned
    unchanged, so each returned piece still carries the target column(s)
    after the input columns.
    """
    combined = np.concatenate([trainset_x, trainset_y], axis=1)
    pieces = train_test_split(combined, **kwargs)
    return pieces


In [None]:
# Image preprocessing helpers: find the connected regions of an image, pick the largest one, and pad it to a square.

import math
import skimage 
import cv2
from skimage.filters import threshold_otsu
from skimage.measure import label, regionprops
from skimage.morphology import closing, square
from skimage.util import pad
from skimage.transform import warp, AffineTransform
from scipy import ndimage
from PIL import Image, ImageOps

def get_regions_otsu_method(image):
    """Return the ``image`` attribute of every labelled region found in ``image``.

    NOTE: despite the function name, no Otsu threshold is computed here;
    pixels are binarised with a fixed cutoff (strictly greater than 0.99).
    """
    # Binarise with the fixed cutoff, then apply a morphological closing
    # using a square(1) footprint.
    foreground = image > 0.99
    closed = closing(foreground, square(1))

    # Label the connected regions and collect what regionprops exposes as
    # ``.image`` for each of them (per scikit-image, the region's mask
    # cropped to its bounding box -- confirm against the installed version).
    labelled = label(closed)
    region_masks = []
    for props in regionprops(labelled):
        region_masks.append(props.image)
    return region_masks

def max_region_by_area(regions):
    """Return the region with the largest bounding square.

    Each region is scored by the larger of height squared and width squared
    (taken from the first two entries of its ``shape``), i.e. the area of
    the smallest square that contains it. On ties the earliest region wins.
    An empty ``regions`` makes the built-in ``max`` raise ``ValueError``.
    """
    def bounding_square_area(region):
        height = region.shape[0]
        width = region.shape[1]
        return max(height * height, width * width)

    return max(regions, key=bounding_square_area)


def to_squre(region):
    """Centre a 2-D region on a 32x32 canvas and return it as an array.

    The region is turned into an 8-bit image (values multiplied by 255),
    surrounded by a border that brings each side to 32 pixels without
    stretching the content, and finally passed through
    ``skimage.transform.resize`` at 32x32.

    NOTE(review): ``region`` is presumably a boolean/0-1 mask as produced by
    ``get_regions_otsu_method`` -- confirm. When a side is longer than 32
    the computed border is negative; how ``ImageOps.expand`` treats that
    should be verified.
    """
    target = 32
    rows, cols = region.shape

    # Slack on each axis, split between the two sides; when the slack is
    # odd the extra pixel goes to the right / bottom.
    slack_x = target - cols
    slack_y = target - rows
    left = slack_x // 2
    top = slack_y // 2
    right = slack_x - left
    bottom = slack_y - top

    # Mask -> 8-bit image holding 0 and 255.
    mask_image = Image.fromarray(region.astype('uint8') * 255)
    # Border order is (left, top, right, bottom).
    bordered = ImageOps.expand(mask_image, (left, top, right, bottom))

    return skimage.transform.resize(np.array(bordered), (target, target))

def preprocess_image(image):
    """Extract the dominant region of one flattened 64x64 image as a square.

    The input is reshaped to 64x64 and cast to float32, its regions are
    extracted with ``get_regions_otsu_method``, the one chosen by
    ``max_region_by_area`` is kept, and ``to_squre`` turns it into the
    final square array, which is returned.

    If no region is found, ``max_region_by_area`` raises ``ValueError``.
    """
    pixels = image.reshape(64, 64).astype('float32')
    largest_region = max_region_by_area(get_regions_otsu_method(pixels))
    return to_squre(largest_region)
    

In [None]:
# Read the training inputs and the training labels; load_dataset downloads
# a file first when it is not already present locally.
xtrainload = load_dataset(dataset_fp="train_x.csv")
ytrainload = load_dataset(dataset_fp="train_y.csv")

In [None]:
# Labels are used exactly as loaded.
ytrain = ytrainload
# Inputs are divided by 255 (presumably 8-bit pixel intensities, which this
# maps into [0, 1] -- confirm against the data).
xtrain = xtrainload / 255.0

In [None]:
# Number of trailing samples held out for validation.
VALIDATION_SIZE = 10000

# This cell rebinds `xtrain` to the preprocessed vectors, so running it a
# second time would feed 1024-value rows to preprocess_image. Fail with a
# clear message instead of an obscure reshape error.
if xtrain.shape[1] != 64 * 64:
    raise ValueError(
        "xtrain does not hold flattened 64x64 images (row length %d); "
        "re-run the cell that scales xtrainload before this one" % xtrain.shape[1]
    )

pb = progressbar.ProgressBar()

# preprocess x: each image becomes the flattened 32x32 output of
# preprocess_image (1024 values).
xtrain_preprocessed = []
for x in pb(xtrain):
    result = preprocess_image(x)
    result = result.reshape(1024)
    xtrain_preprocessed.append(result)
xtrain = np.asarray(xtrain_preprocessed)

# ytrain is presumably a single column of class ids (it is ravel()-ed before
# fitting), so its column count is not the number of classes; count the
# distinct label values instead.
num_classes = np.unique(ytrain).size


# The last VALIDATION_SIZE samples form the validation split, the rest the
# training split.
xtrainset = xtrain[:-VALIDATION_SIZE]
ytrainset = ytrain[:-VALIDATION_SIZE]
xvalidset = xtrain[-VALIDATION_SIZE:]
yvalidset = ytrain[-VALIDATION_SIZE:]

In [None]:
# Fit a logistic regression with default settings on the training split
# (labels flattened to 1-D), then predict labels for the validation split.
clf = LogisticRegression().fit(xtrainset, ytrainset.ravel())
y_pred_log = clf.predict(xvalidset)

In [None]:
# Report the accuracy score of the validation predictions against the
# validation labels.
print("Accuracy: ", metrics.accuracy_score(y_true=yvalidset, y_pred=y_pred_log))