## Loading Original Data

In [2]:
from scipy import ndimage
from skimage.transform import resize
import numpy as np 
import matplotlib.pyplot as plt # to visualize only 
import pandas as pd
import gc
import numpy as np 
import cv2

In [None]:
# Read the raw training images; header=None makes pandas treat the first row as data,
# not as column names.
original_x = pd.read_csv("./data/train_x.csv", delimiter=",", header = None) # load from text

In [None]:
# Read the training labels (again without a header row).
y = pd.read_csv("./data/train_y.csv", delimiter = ",", header = None)

In [None]:
# read_csv returns a DataFrame; take the underlying NumPy array so that rows can be
# reshaped and indexed by the processing steps below.

original_x = original_x.values

In [None]:
# Convert the labels DataFrame to a flat 1-D NumPy array.
y = (y.values).ravel()

In [None]:
# Store the pixel values as 32-bit floats.
original_x = original_x.astype(np.float32)

In [3]:
#visualizes 5 consecutive images from the array `images`, beginning at index `start`, and shows the corresponding labels as titles
def plotter(images, labels, start):
    """Draw up to five consecutive images side by side with their labels.

    Args:
        images: Sequence of flattened images; each one is reshaped to 64x64
            before being displayed in grayscale.
        labels: Sequence of labels aligned with ``images``; each label is
            formatted as an integer in the subplot title.
        start: Index of the first image to display.
    """
    stop = start + 5
    plt.figure(figsize=(20, 4))
    pairs = zip(images[start:stop], labels[start:stop])
    # Subplot positions are 1-based, so number the pairs from 1.
    for position, (picture, tag) in enumerate(pairs, start=1):
        plt.subplot(1, 5, position)
        plt.imshow(np.reshape(picture, (64, 64)), cmap='gray')
        plt.title('Training: %i\n' % tag, fontsize=20)

In [None]:
#plotter(original_x, y, 0)

## Processing the data

In [4]:
#since we know the digits are white, we can set a high threshold
#and set anything at or below that threshold to be black (0)

def binarize_element(x, threshold = 250):
    """Map a single pixel value to pure white (255) or pure black (0).

    Args:
        x: Pixel intensity.
        threshold: Values strictly greater than this become 255; every other
            value becomes 0.

    Returns:
        255 or 0.
    """
    # Flip the comparison to `<` to get black digits on a white background.
    return 255 if x > threshold else 0

In [5]:
def array_map(func, x):
    """Lazily apply ``func`` to every element of ``x``.

    Returns the ``map`` iterator itself; wrap the result in ``list`` (as the
    callers in this notebook do) to materialize the values.
    """
    mapped = map(func, x)
    return mapped

In [None]:
#making the data black and white

# Vectorized equivalent of calling binarize_element (default threshold 250) on
# every pixel: values > 250 become 255, everything else becomes 0. This avoids
# one Python-level function call per pixel. The float literals make the result
# a float64 array with the same shape as original_x.
new_x = np.where(original_x > 250, 255.0, 0.0)

In [None]:
#plotter(new_x, y, 0)

In [None]:
#writing the black and white images to a file
# header=False / index=False: the CSV holds only the pixel values, so it can be read
# back with header=None.
df = pd.DataFrame(new_x)
df.to_csv("./data/bw_x.csv", header = False, index = False)

In [None]:
#more preprocessing on the black and white images

# Reload the binarized images written to ./data/bw_x.csv.
x = pd.read_csv("./data/bw_x.csv", delimiter=",", header = None) # load from processed images

In [None]:
# DataFrame -> NumPy array
x = x.values

# Store the pixel values as 32-bit floats.
x = x.astype(np.float32)

In [None]:
#making sure the background is indeed black and the digits are white
#plotter(x, y, 5)

### Getting the largest digit

In [None]:
# Output buffer with the same shape as x; find_largest_digit fills it row by row.
new_x = np.zeros((x.shape[0], x.shape[1]))

In [6]:
#first argument is original data, second argument is where new images will be stored

def find_largest_digit(images, new_images, size=64):
    """Crop every image to its largest connected object and rescale the crop.

    For each flattened ``size`` x ``size`` image, the connected non-zero
    regions are labelled and the region whose bounding box has the longest
    side is selected (ties go to the first such region). Its bounding box is
    padded, cropped out and resized back to ``size`` x ``size``. The flattened
    result is written into the matching row of ``new_images``; nothing is
    returned.

    An image without any non-zero pixel contains no object to crop; its output
    row is set to all zeros instead of raising an error.

    Args:
        images: Indexable collection of flattened images, each holding
            ``size * size`` pixels with a zero background.
        new_images: Pre-allocated array that receives the results in place.
            Its length determines how many images are processed.
        size: Side length in pixels of the square images (default 64).
    """
    for i in range(len(new_images)):
        im = np.reshape(images[i], (size, size))

        # Label connected non-zero regions and get one bounding box
        # (a pair of row/column slices) per region.
        labeled_image, _ = ndimage.label(im)
        objs = ndimage.find_objects(labeled_image)

        if not objs:
            # Blank image: nothing to crop, so emit a blank image.
            new_images[i] = np.zeros(size * size)
            continue

        # Pick the box with the longest side; max() returns the first one on ties.
        rows, cols = max(
            objs,
            key=lambda box: max(box[0].stop - box[0].start,
                                box[1].stop - box[1].start),
        )
        min_y, max_y = rows.start, rows.stop
        min_x, max_x = cols.start, cols.stop

        # Pad the box by 2 pixels per side, but only on sides where the full
        # 2-pixel margin still fits inside the image.
        if min_y > 1:
            min_y -= 2
        if max_y < size - 1:
            max_y += 2
        if min_x > 1:
            min_x -= 2
        if max_x < size - 1:
            max_x += 2

        crop = im[min_y:max_y, min_x:max_x].astype(np.uint32)
        # preserve_range keeps the original intensity scale instead of
        # rescaling the values to [0, 1].
        img = resize(crop, (size, size), preserve_range=True, mode='reflect')

        new_images[i] = img.reshape(size * size)
    

In [None]:
# Crop each binarized training image to its largest object; results are written into new_x.
find_largest_digit(x, new_x)

Comparing the original images, the black-and-white images, and the fully processed images

In [7]:
#better to make the digits black and the background white

def invert_element(x):
    """Invert a single pixel: any non-zero value becomes 0, zero becomes 255.

    Args:
        x: Pixel intensity.

    Returns:
        0 if ``x`` is non-zero, otherwise 255.
    """
    return 0 if x != 0 else 255

In [None]:
# Vectorized equivalent of calling invert_element on every pixel: any non-zero
# pixel becomes 0 (black digit) and zero pixels become 255 (white background).
# The float literals make the result a float64 array shaped like new_x.
processed_x = np.where(new_x != 0, 0.0, 255.0)

## Preprocessing Pipeline

In [None]:
# Stage 1: raw training images as loaded from train_x.csv.
plotter(original_x, y, 1100)

In [None]:
# Stage 2: binarized images (reloaded from bw_x.csv).
plotter(x, y, 1100)

In [None]:
# Stage 3: output of find_largest_digit (largest object cropped and resized).
plotter(new_x, y, 1100)

In [None]:
# Stage 4: inverted images (black digit on white background).
plotter(processed_x, y, 1100)

In [None]:
#saving fully preprocessed training file to a csv so that we never have to do all of that again

# header=False / index=False: the CSV holds only the pixel values.
df = pd.DataFrame(processed_x)
df.to_csv("./data/processed_x.csv", header = False, index = False)

# Can start from here

In [None]:
import numpy as np 
import matplotlib.pyplot as plt # to visualize only 
import pandas as pd
from scipy import ndimage
from skimage.transform import resize
import cv2
import gc

In [None]:
# Reload the fully preprocessed training images written to ./data/processed_x.csv.
x = pd.read_csv("./data/processed_x.csv", delimiter=",", header = None) # load from processed images

In [None]:
# DataFrame -> NumPy array
x = x.values

In [None]:
# Reload the training labels and flatten them into a 1-D NumPy array.
y = pd.read_csv("./data/train_y.csv", delimiter = ",", header = None)
y = (y.values).ravel()

In [None]:
# Convert to float32, then divide in place by 255 so that a stored 255 becomes 1.0.
x = x.astype(np.float32)
x /= 255

In [None]:
# Spot-check five of the rescaled images starting at index 4140.
plotter(x, y, 4140)

### Augmentation

In [None]:
#kernel = np.ones((5,5),np.float32)/25
    #dst = cv2.filter2D(img,-1,kernel)
    #img = cv2.GaussianBlur(img,(5,5),0)

# Preprocessing Test Data

In [8]:
# Read the raw test images; header=None makes pandas treat the first row as data.
test_x = pd.read_csv("./data/test_x.csv", delimiter=",", header = None) # load from text

In [9]:
# read_csv returns a DataFrame; take the underlying NumPy array for the steps below.

test_x = test_x.values

In [10]:
# Store the pixel values as 32-bit floats.
test_x = test_x.astype(np.float32)

In [12]:
# No test labels are loaded in this notebook, so each image's row index serves as a
# stand-in label for plotter. The commented-out call would show the first 5 test
# images (start index 0), not a random sample.
indices = list(range(test_x.shape[0]))
#plotter(test_x, indices, 0)

In [13]:
#making the data black and white
# Vectorized equivalent of calling binarize_element (default threshold 250) on
# every pixel: values > 250 become 255, everything else becomes 0. The float
# literals make the result a float64 array with the same shape as test_x.
new_test_x = np.where(test_x > 250, 255.0, 0.0)

In [15]:
#plotter(new_test_x, indices, 0)

In [16]:
# Output buffer with the same shape as new_test_x; find_largest_digit fills it row by row.
digit_test_x = np.zeros((new_test_x.shape[0], new_test_x.shape[1]))

In [17]:
# Crop each binarized test image to its largest object; results are written into digit_test_x.
find_largest_digit(new_test_x, digit_test_x)

In [19]:
#plotter(digit_test_x, indices, 0)

In [20]:
# Vectorized equivalent of calling invert_element on every pixel: any non-zero
# pixel becomes 0 (black digit) and zero pixels become 255 (white background).
# The float literals make the result a float64 array shaped like digit_test_x.
processed_test_x = np.where(digit_test_x != 0, 0.0, 255.0)

In [24]:
#plotter(processed_test_x, indices, 0)

In [None]:
# Wrap the processed test images in a DataFrame so they can be written with to_csv.
df = pd.DataFrame(processed_test_x)

In [None]:
# Save the processed test images; header=False / index=False keep only the pixel values.
df.to_csv("./data/processed_test_x.csv", header = False, index = False)