In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from os import listdir
from os.path import isfile, join

from sklearn.datasets import load_digits  

# For picture manipulation
from PIL import Image

# For creating directories
import os

---
# Math Symbols
---

### Getting the list of files in each symbol directory

In [2]:
# Root directory of the raw handwritten-math-symbol images; each class is read
# from a sub-directory named after the symbol itself.
folder = "../../Data/raw_datasets/handwrittenmathsymbols"
sub_folders = ['+', ')', '(', ',', '-']

# Map each symbol to the names of the regular files in its directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make any later truncation of these lists pick the same files on every run.
file_lists = {}
for sub_folder in sub_folders:
    mypath = join(folder, sub_folder)
    file_lists[sub_folder] = sorted(
        f for f in listdir(mypath) if isfile(join(mypath, f))
    )

FileNotFoundError: [Errno 2] No such file or directory: '../../Data/raw_datasets/handwrittenmathsymbols/+'

### Reading in images

In [None]:
# Only reads in up to K images from each class (+, -, ), etc.)
K = 1906

# CHANGE THIS FOR DIFFERENT IMAGE SIZES
math_dimensions = 28

# Getting images: one list of resized PIL images per symbol class
images_scaled = {sub_folder: [] for sub_folder in sub_folders}
for sub_folder in sub_folders:
    print("Reading "+sub_folder+" Images")
    for file in file_lists[sub_folder][:K]:
        mypath = folder + "/" + sub_folder + "/" + file
        # Image.open is lazy and keeps the file open; the context manager
        # releases the handle as soon as the resized copy has been made.
        with Image.open(mypath) as image:
            # Scaling image down to math_dimensions x math_dimensions
            images_scaled[sub_folder].append(
                image.resize((math_dimensions, math_dimensions))
            )

print("Done.")

### Converting the RGB images to Monochrome
After this step, each pixel is represented by a single brightness value.

In [None]:
# Turning images into numerical values representing
#  brightness of each pixel.
# convert("L") forces a single 8-bit grayscale channel first, so an RGB input
# yields one value per pixel instead of one value per colour channel.
digits_mono = {}
for sub_folder in sub_folders:
    digits_mono[sub_folder] = [
        np.asarray(img.convert("L"), dtype="int32")
        for img in images_scaled[sub_folder]
    ]

### Reshaping the symbol images

In [None]:
# Reshaping digits: every image array becomes rows of math_dimensions**2
# values (a single row for a single-channel image).
# A fresh dict of fresh lists is built on purpose: dict.copy() is shallow, so
# reshaping the list elements in place would also overwrite digits_mono.
digits_shaped = {
    sub_folder: [
        arr.reshape(-1, math_dimensions**2) for arr in digits_mono[sub_folder]
    ]
    for sub_folder in sub_folders
}

### Converting the pixel values
Currently the pixel values are stored in the form `(0->255, dark->bright)`. For storage purposes, I want each pixel to be represented as `(0->8, bright->dark)`, so the next cell inverts every value and scales it down to that range.

In [None]:
# Build one DataFrame per math symbol and rescale its pixel values.
dfs = {}
for sub_folder in sub_folders:
    print("Math Symbol: "+sub_folder)
    print("Creating DF")
    # One DataFrame row per image: the first row of each reshaped array
    pixel_rows = [shaped[0] for shaped in digits_shaped[sub_folder]]
    raw_frame = pd.DataFrame(pixel_rows)
    print("Converting...")
    # Invert (256 - value) and divide by 32, truncating to an integer
    dfs[sub_folder] = raw_frame.apply(lambda col: ((256 - col) / 32).astype(int))
    print("")

### Putting the pixel values into a CSV file

In [None]:
 for sub_folder in sub_folders:
    print("\""+sub_folder+"\" -> CSV...\n")
    dfs[sub_folder].to_csv("../../Data/CSV/"+sub_folder+"28.csv")
print("done")

### Checking counts

In [None]:
# Tag every symbol frame with its class label and stack the frames to check
# how many samples each class has.
# assign() returns a new frame, so the per-symbol frames stored in dfs are not
# modified and re-running the CSV export still writes pixel columns only.
df_list_math = [
    dfs[sub_folder].assign(**{"class": sub_folder}) for sub_folder in sub_folders
]

df_concat_math = pd.concat(df_list_math)
# Number of rows per class (one number per class instead of one per column)
df_concat_math.groupby("class").size()

---
# Digits
---

### Reading in the dataset

In [None]:
# Load the mnist_train.csv dataset into a DataFrame
mnist_train_csv = "../../Data/raw_datasets/mnist_train.csv"
digits_df = pd.read_csv(mnist_train_csv)
# Display the column names as read from the file
digits_df.columns

### Changing column names

In [None]:
# Rename "label" to "class" and replace every column name after the first one
# with its position as a string ("0", "1", ...).
column_change = {"label": "class"}
column_change.update(
    (old_name, str(position))
    for position, old_name in enumerate(digits_df.columns[1:])
)

digits_df = digits_df.rename(columns=column_change)
digits_df.columns

#### Changing type of class variable

In [None]:
# Cast the class column to string values; all other columns keep their dtype
digits_df = digits_df.astype({"class": str})

### Converting to same numerical representation as math symbols

In [None]:
# Splitting the digits into one frame per class and rescaling the pixel values
class_names = [str(digit) for digit in range(10)]
dfs = {}
for class_name in class_names:
    print("\nDigit: "+class_name)
    # Dropping class because that will be signified by the file name
    print("Seperating and dropping")
    is_class = digits_df["class"] == class_name
    pixels = digits_df[is_class].drop(columns=["class"])
    print("Converting...")
    # Shift by one and divide by 32, truncating to an integer
    dfs[class_name] = pixels.apply(lambda col: ((col + 1) / 32).astype(int))

In [None]:
# Keep at most K rows per digit. K is the per-class image limit defined for
# the math symbols, so reusing it keeps both caps identical instead of
# repeating the literal 1906 here.
# .copy() gives each capped frame its own data, independent of dfs.
dfs2 = {class_name: dfs[class_name].head(K).copy() for class_name in class_names}

In [None]:
# Tag every digit frame with its class label and stack the frames to check
# how many samples each class has.
# assign() returns a new frame, so the frames stored in dfs2 are not modified
# and the CSV export of dfs2 does not pick up an extra "class" column.
df_list_digits = [dfs2[cls].assign(**{"class": cls}) for cls in class_names]

df_concat_digits = pd.concat(df_list_digits)
# Number of rows per class (one number per class instead of one per column)
df_concat_digits.groupby("class").size()

In [None]:
# Exporting to CSVs and dropping class column
csv_dir = "../../Data/CSV/"
# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing.
os.makedirs(csv_dir, exist_ok=True)
for class_name in class_names:
    print("\""+class_name+"\" -> CSV...")
    # The class is encoded in the file name, so a "class" column (which the
    # frame may carry from the count check) is removed before writing;
    # errors="ignore" makes this a no-op when the column is absent.
    dfs2[class_name].drop(columns=["class"], errors="ignore").to_csv(
        csv_dir + class_name + ".csv"
    )