In [72]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from os import listdir
from os.path import isfile, join

from sklearn.datasets import load_digits  

# For picture manipulation
from PIL import Image

# For creating directories
import os

---
# Math Symbols
---

### Getting list of files in dir

In [73]:
# Root folder of the raw handwritten-math-symbol images; each symbol class
# lives in a sub-folder named after the symbol itself.
folder = "../../Data/raw_datasets/handwrittenmathsymbols"
sub_folders = ['+', ')', '(', ',', '-']

# Map each symbol to the names of the regular files in its sub-folder.
# The dict is built from `sub_folders` so the two cannot drift apart, and the
# names are sorted because os.listdir returns entries in arbitrary order --
# without sorting, a later "first K files" slice is not reproducible.
file_lists = {}
for sub_folder in sub_folders:
    mypath = folder + "/" + sub_folder
    file_lists[sub_folder] = sorted(
        f for f in listdir(mypath) if isfile(join(mypath, f))
    )

### Reading in images

In [92]:
# Only reads in up to K images from each class (+, -, ), etc.)
K = 1906

# CHANGE THIS FOR DIFFERENT IMAGE SIZES
math_dimensions = 28

# Load at most K images per symbol and shrink each one to
# math_dimensions x math_dimensions pixels.
images_scaled = {sub_folder: [] for sub_folder in sub_folders}
for sub_folder in sub_folders:
    print("Reading "+sub_folder+" Images")
    for file in file_lists[sub_folder][:K]:
        mypath = folder + "/" + sub_folder + "/" + file
        # Image.open is lazy and holds the file open; the context manager
        # releases the handle once the resized copy (a new, independent
        # image) has been produced.
        with Image.open(mypath) as image:
            images_scaled[sub_folder].append(
                image.resize((math_dimensions, math_dimensions))
            )

print("Done.")

Reading + Images
Reading ) Images
Reading ( Images
Reading , Images
Reading - Images
Done.


### Converting the RGB images to Monochrome
So each pixel will be represented with a single number

In [93]:
# Turn every scaled image into a 2-D int32 array holding one brightness
# value per pixel.
# convert("L") forces a single 8-bit grayscale channel, so the array is always
# (height, width) even if a source file is RGB/RGBA/palette; without it a
# multi-channel image would yield several values per pixel and break the
# later flattening step.
digits_mono = {sub_folder: [] for sub_folder in sub_folders}
for sub_folder in sub_folders:
    for scaled_image in images_scaled[sub_folder]:
        digits_mono[sub_folder].append(
            np.asarray(scaled_image.convert("L"), dtype="int32")
        )

### Reshaping the symbol images

In [94]:
# Flatten every image array into a single row of math_dimensions**2 values
# (shape (1, math_dimensions**2)).
# A new dict of new lists is built on purpose: dict.copy() is shallow and
# would share the inner lists, so reshaping "in the copy" would also
# overwrite the arrays stored in digits_mono.
digits_shaped = {
    sub_folder: [
        pixels.reshape(-1, math_dimensions**2)
        for pixels in digits_mono[sub_folder]
    ]
    for sub_folder in sub_folders
}

### Converting the pixel values
Currently each pixel is stored as a brightness value in the form `(0->255, dark->bright)`. For storage purposes, I want the pixels to be represented on a coarser, inverted scale, `(0->8, bright->dark)`, so the cell below maps each value `x` to `int((256 - x) / 32)`.

In [95]:
# One DataFrame per math symbol: a row per image, a column per pixel.
# Brightness is then rescaled from 0..255 to the inverted, coarser 8..0 scale.
dfs = {}
for symbol in sub_folders:
    print("Math Symbol: "+symbol)
    print("Creating DF")
    # Each stored array is a single flattened row; take that row
    pixel_rows = [flat[0] for flat in digits_shaped[symbol]]
    raw_frame = pd.DataFrame(pixel_rows)
    print("Converting...")
    # Whole-frame arithmetic; truncation to int happens after the division
    dfs[symbol] = ((256 - raw_frame) / 32).astype(int)
    print("")

Math Symbol: +
Creating DF
Converting...

Math Symbol: )
Creating DF
Converting...

Math Symbol: (
Creating DF
Converting...

Math Symbol: ,
Creating DF
Converting...

Math Symbol: -
Creating DF
Converting...



### Putting the pixel values into a CSV file

In [96]:
 # Write one CSV per math symbol; the file is named after the symbol itself.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../../Data/CSV", exist_ok=True)
for sub_folder in sub_folders:
    print("\""+sub_folder+"\" -> CSV...\n")
    dfs[sub_folder].to_csv("../../Data/CSV/"+sub_folder+".csv")
print("done")

"+" -> CSV...

")" -> CSV...

"(" -> CSV...

"," -> CSV...

"-" -> CSV...

done


### Checking counts

In [79]:
# Tag each symbol's frame with its class label and stack the results so the
# per-class row counts can be inspected.
# .assign returns a labelled copy; writing the column into dfs[...] directly
# would permanently add a "class" column to the stored frames.
df_list_math = [
    dfs[sub_folder].assign(**{"class": sub_folder}) for sub_folder in sub_folders
]

df_concat_math = pd.concat(df_list_math)
df_concat_math.groupby("class").count()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
),1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
+,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
",",1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
-,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906


---
# Digits
---

### Reading in the dataset

In [80]:
# Load the MNIST training set, then display its column index.
digits_df = pd.read_csv(
    filepath_or_buffer="../../Data/raw_datasets/mnist_train.csv"
)
digits_df.columns

Index(['label', '1x1', '1x2', '1x3', '1x4', '1x5', '1x6', '1x7', '1x8', '1x9',
       ...
       '28x19', '28x20', '28x21', '28x22', '28x23', '28x24', '28x25', '28x26',
       '28x27', '28x28'],
      dtype='object', length=785)

### Changing column names

In [81]:
# 'label' becomes 'class'; every remaining column is renamed to its
# zero-based position among the non-label columns, stored as a string.
column_change = {"label": "class"}
column_change.update(
    (pixel_column, str(position))
    for position, pixel_column in enumerate(digits_df.columns[1:])
)

digits_df = digits_df.rename(columns=column_change)
digits_df.columns

Index(['class', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '774', '775', '776', '777', '778', '779', '780', '781', '782', '783'],
      dtype='object', length=785)

#### Changing type of class variable

In [82]:
# Store the class labels as strings rather than integers
class_as_text = digits_df["class"].astype(str)
digits_df["class"] = class_as_text

### Converting to same numerical representation as math symbols

In [83]:
# Split the digits into one frame per class and rescale the pixel values
# from 0..255 to 0..8 (brighter input -> larger output).
class_names = [str(digit) for digit in range(10)]
dfs = {}
for class_name in class_names:
    print("\nDigit: "+class_name)
    # The class column is dropped because the file name will carry the class
    print("Seperating and dropping")
    is_this_digit = digits_df["class"] == class_name
    pixels_only = digits_df.loc[is_this_digit].drop(columns=["class"])
    print("Converting...")
    dfs[class_name] = ((pixels_only + 1) / 32).astype(int)


Digit: 0
Seperating and dropping
Converting...

Digit: 1
Seperating and dropping
Converting...

Digit: 2
Seperating and dropping
Converting...

Digit: 3
Seperating and dropping
Converting...

Digit: 4
Seperating and dropping
Converting...

Digit: 5
Seperating and dropping
Converting...

Digit: 6
Seperating and dropping
Converting...

Digit: 7
Seperating and dropping
Converting...

Digit: 8
Seperating and dropping
Converting...

Digit: 9
Seperating and dropping
Converting...


In [84]:
# Keep at most 1906 rows per digit -- the same cap (K) used when reading the
# math-symbol images, so all classes end up comparably sized.
# NOTE(review): a `temp.reindex()` call used to sit at the top of this loop.
# It ran before `temp` was assigned (NameError on a fresh kernel) and its
# result was discarded, so it has been removed. The frames therefore keep the
# row labels of the source table; if a fresh 0..N-1 index is wanted, add
# .reset_index(drop=True) -- confirm against whatever reads the CSVs.
dfs2 = {}
for class_name in class_names:
    # .copy() gives each entry its own data instead of a slice of dfs[...]
    dfs2[class_name] = dfs[class_name].head(1906).copy()

In [85]:
# Attach the class label to each digit's frame and stack them to check that
# every class contributes the same number of rows.
# .assign builds a labelled copy; writing the column into dfs2[cls] directly
# would put a "class" column into the frames that are exported to CSV.
df_list_digits = [dfs2[cls].assign(**{"class": cls}) for cls in class_names]

df_concat_digits = pd.concat(df_list_digits)
df_concat_digits.groupby("class").count()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
1,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
2,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
3,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
4,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
5,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
6,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
7,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
8,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906
9,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906,...,1906,1906,1906,1906,1906,1906,1906,1906,1906,1906


In [86]:
# Export one CSV per digit. The class is conveyed by the file name, so a
# "class" column (if one is present) is dropped before writing;
# errors="ignore" makes the drop a no-op when the column is absent.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../../Data/CSV", exist_ok=True)
for class_name in class_names:
    print("\""+class_name+"\" -> CSV...")
    pixels_only = dfs2[class_name].drop(columns=["class"], errors="ignore")
    pixels_only.to_csv("../../Data/CSV/"+class_name+".csv")

"0" -> CSV...
"1" -> CSV...
"2" -> CSV...
"3" -> CSV...
"4" -> CSV...
"5" -> CSV...
"6" -> CSV...
"7" -> CSV...
"8" -> CSV...
"9" -> CSV...
