In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import tree
from sklearn import neural_network

from os import listdir
from os.path import isfile, join

from sklearn.datasets import load_digits  

# For picture manipulation
from PIL import Image

# For creating directories
import os

In [2]:
X,y = load_digits(return_X_y=True)

# Do a crude test/train split to evaluate classifiers
NUM_TEST = 100
X_train = X[:-NUM_TEST] # use these for training
y_train = y[:-NUM_TEST]
X_test = X[-NUM_TEST:]  # use these for testing
y_true = y[-NUM_TEST:]  # the right answers to check classifier's performance

print("Data:  ", X.shape, y.shape)
print("Train: ", X_train.shape, y_train.shape)
print("Test:  ", X_test.shape, y_true.shape)

Data:   (1797, 64) (1797,)
Train:  (1697, 64) (1697,)
Test:   (100, 64) (100,)


### Getting list of files in dir

In [3]:
folder = "raw_datasets/handwrittenmathsymbols"
sub_folders = ['+', ')', '(', ',', '-']
file_lists = {'+':[], ')':[], '(':[], ',':[], '-':[]}

for sub_folder in sub_folders:
    mypath = folder +"/"+ sub_folder
    file_lists[sub_folder] = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [4]:
file_lists["+"]

['exp25404.jpg',
 '+_68682.jpg',
 'exp63451.jpg',
 'exp32178.jpg',
 'exp73642.jpg',
 '+_78491.jpg',
 'exp34509.jpg',
 '+_27195.jpg',
 'exp79409.jpg',
 '+_91541.jpg',
 'exp67637.jpg',
 'exp68504.jpg',
 'exp932.jpg',
 '+_32598.jpg',
 'exp21104.jpg',
 'exp82863.jpg',
 '+_23495.jpg',
 'exp39993.jpg',
 'exp16970.jpg',
 'exp75555.jpg',
 'exp23713.jpg',
 '+_70967.jpg',
 'exp23707.jpg',
 'exp22419.jpg',
 '+_58033.jpg',
 'exp5367.jpg',
 'exp78065.jpg',
 '+_98571.jpg',
 'exp56398.jpg',
 'exp20568.jpg',
 'exp3070.jpg',
 '+_24488.jpg',
 'exp5401.jpg',
 '+_35585.jpg',
 'exp63445.jpg',
 'exp6108.jpg',
 'exp25410.jpg',
 '+_79943.jpg',
 '+_78485.jpg',
 'exp73656.jpg',
 'exp1667.jpg',
 'exp72548.jpg',
 '+_4745.jpg',
 'exp8125.jpg',
 'exp79435.jpg',
 'exp10483.jpg',
 'exp21886.jpg',
 'exp60.jpg',
 '+_12112.jpg',
 'exp39011.jpg',
 '+_9249.jpg',
 'exp26131.jpg',
 'exp9213.jpg',
 'exp6120.jpg',
 'exp12294.jpg',
 'exp64302.jpg',
 'exp87099.jpg',
 'exp72560.jpg',
 '+_77946.jpg',
 'exp1897.jpg',
 'exp24726.jp

### Scaling Down Images

In [5]:
# Scales all images down to 28x28 to match the mnist dataset
images_scaled = {'+':[], ')':[], '(':[], ',':[], '-':[]}
for sub_folder in sub_folders:
    print(sub_folder)
    for file in file_lists[sub_folder]:
        mypath = folder + "/" + sub_folder + "/" + file
        # Getting original file
        image = Image.open(mypath)
        # Scaling image down
        images_scaled[sub_folder].append(image.resize((28, 28)))


+
)
(
,
-


In [6]:
digits_mono = {'+':[], ')':[], '(':[], ',':[], '-':[]}
# Turning images into numerical values representing
#  brightness of each pixel
digit_mono = []
for sub_folder in sub_folders:
    for i in images_scaled[sub_folder]:
        digits_mono[sub_folder].append(np.asarray( i, dtype="int32" ))

In [7]:
# Correcting digit pixel values from 0-255 (dark->bright), to 0-255 (bright->dark)
digits_scaled = digits_mono.copy()
for sub_folder in sub_folders:
    print(sub_folder)
    for digit in range(len(digits_scaled[sub_folder])):
        for row in range(len(digits_scaled[sub_folder][digit])):
            for val in range(len(digits_scaled[sub_folder][digit][row])):
                digits_scaled[sub_folder][digit][row][val] = (digits_scaled[sub_folder][digit][row][val]*-1)+255

+
)
(
,
-


In [8]:
# Reshaping digits
digits_shaped = digits_scaled.copy()
for sub_folder in sub_folders:
    for digit in range(len(digits_scaled[sub_folder])):
        digits_shaped[sub_folder][digit] = digits_shaped[sub_folder][digit].reshape(-1,28*28)

In [9]:
for sub_folder in sub_folders:
    print(sub_folder)
    df = pd.DataFrame([digits_shaped[sub_folder][i][0] for i in range(len(digits_shaped[sub_folder]))])
    df.to_csv("CSV/"+sub_folder+".csv")

+
)
(
,
-


### Digits
This will be a bit different than the math symbols. For instance, we won't have to scale down the images and we don't have to re-format the pixel values. But we do have to do stuff we didn't have to for the math digits. Specifically, we will have to change the column names, and the class type.

The column names for the math symbols are 0,1,2,3...

In [10]:
digits_df = pd.read_csv("raw_datasets/mnist_train.csv")
digits_df.columns

Index(['label', '1x1', '1x2', '1x3', '1x4', '1x5', '1x6', '1x7', '1x8', '1x9',
       ...
       '28x19', '28x20', '28x21', '28x22', '28x23', '28x24', '28x25', '28x26',
       '28x27', '28x28'],
      dtype='object', length=785)

In [11]:
# Changing column names
column_change = {"label":"class"}

# Changing column names (excluding class id) to numerical values
for old_name, new_name in zip(digits_df.columns[1:], range(len(digits_df.columns[1:]))):
    column_change[old_name] = str(new_name)
    
digits_df = digits_df.rename(columns=column_change )
digits_df.columns

Index(['class', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '774', '775', '776', '777', '778', '779', '780', '781', '782', '783'],
      dtype='object', length=785)

In [12]:
# changing class from int to string values
digits_df["class"] = digits_df["class"].astype(str)

# Exporting to CSVs and dropping class column
class_names = ['0','1','2','3','4','5','6','7','8','9']
for class_name in class_names:
    # Dropping class because that will be signified by the file name
    temp = digits_df[digits_df["class"] == class_name]
    temp = temp.drop(columns=["class"])
    print(class_name)
    temp.to_csv("CSV/"+class_name+".csv")

0
1
2
3
4
5
6
7
8
9
