### processing inkml as images:

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import inkml2img_pictures as ink
import os
import time
from tqdm import tqdm_notebook

#### parsing:
I put the files into a folder called math and the file unzipped is the ICFHR_package. (I manually unzipped that one). Note, there are zipped folders inside the zipped folder lol

In [7]:
#made a directory named math first and unzipped ICFHR to that
path = "./math/ICFHR_package/"
os.listdir(path)

['crohme.css',
 'CROHME2011_data',
 'CROHME2012_data',
 'CROHME_papers',
 'evaluationTools',
 'index.html',
 'InkmlViewer',
 'ParticipantsResults2012']

In [8]:
os.listdir(path + 'CROHME2011_data/CROHME_test')

['CROHME_test.zip']

In [10]:
#all locations of inkml files:

train_2011 = path + "CROHME2011_data/CROHME_training/CROHME_training/"
train_2012 = path + "CROHME2012_data/trainData/trainData/"

test_2011 = path + "CROHME2011_data/CROHME_test/CROHME_test/"
test_2012 = path + "CROHME2012_data/testData/testData/"

testGT_2011 = path + "CROHME2011_data/CROHME_test/CROHME_test/"
testGT_2012 = path + "CROHME2012_data/testDataGT/" #notice, slightly different path...

paths = [train_2011, train_2012, test_2011, test_2012, testGT_2011, testGT_2012]
paths

['./math/ICFHR_package/CROHME2011_data/CROHME_training/CROHME_training/',
 './math/ICFHR_package/CROHME2012_data/trainData/trainData/',
 './math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/',
 './math/ICFHR_package/CROHME2012_data/testData/testData/',
 './math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/',
 './math/ICFHR_package/CROHME2012_data/testDataGT/']

### unzipping
lots of files are zipped (train files) so this is to unzip them:

In [21]:
import zipfile

for file in tqdm_notebook(paths[:-1]): #for some reason testGT_2012 is not zipped??? Idk why but just exclude that one in unzipping
    with zipfile.ZipFile(file[:-1] + ".zip", 'r') as zip_ref:
        zip_ref.extractall(file)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




##### getting image paths:

In [15]:
#dictionary with all file paths to inkml files (some other files present, so be warry, not all inkmls, just what's in file)
ink_names = {}
for ink_file in paths:
    ind_paths = [ink_file + (ink).strip(".inkml") for ink in os.listdir(ink_file)]
    ink_names[ink_file] = ind_paths


## converting images:

In [34]:
#making new folders for converted images:
folders =  [
    "data/raw_images/train_data_2011", 
    "data/raw_images/train_data_2012", 
    "data/raw_images/test_data_2011", 
    "data/raw_images/test_data_2012", 
    "data/raw_images/testGT_data_2011", 
    "data/raw_images/testGT_data_2012"
]

try:
    os.mkdir('data')
    os.mkdir('data/raw_images')
except:
    print(f'Could not create data/raw_images, perhaps the directory already exists?')

for direct in folders:
    try:
         os.mkdir(direct)
    except:
        print("this directory already exists, '{}', come up with a new one or perish".format(direct))


In [35]:
#example:
for places in zip(folders, paths):
    print(places[0], places[1])
    
#example getting 1 image, generalized below:
# [ink.inkml2img('./{}.inkml'.format(file_name), train_2011_folder + '/{}.png'.format(ink_name.strip(".inkml"))) for file_name, ink_name in zip(paths, train_2011_ink)]


data/raw_images/train_data_2011 ./math/ICFHR_package/CROHME2011_data/CROHME_training/CROHME_training/
data/raw_images/train_data_2012 ./math/ICFHR_package/CROHME2012_data/trainData/trainData/
data/raw_images/test_data_2011 ./math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/
data/raw_images/test_data_2012 ./math/ICFHR_package/CROHME2012_data/testData/testData/
data/raw_images/testGT_data_2011 ./math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/
data/raw_images/testGT_data_2012 ./math/ICFHR_package/CROHME2012_data/testDataGT/


just as a heads up, lots of warnings so I suppressed, will take quite a bit of time though. Also didn't run fully on my own computer, but there's a lot of files hehe so should be fun. Maybe zip after???

In [36]:
import warnings
#there's a lot of warnings based on below... just as a warning haha
warnings.filterwarnings('ignore')

In [41]:
%%time
bad_links = []

#converting images:
for places in tqdm_notebook(zip(folders, paths)):
    #hold onto labels
    traces_dict = {}
    
    print(places[0], places[1])
    new_directory = places[0]
    inkml_directory = places[1]
    
    #I don't know how to parallelize this or if it's even worth it tbh since we're only doing it once:
    for ink_name in tqdm_notebook(ink_names[inkml_directory]):#ink_names[inkml_directory]:
        try:
            file_name = inkml_directory
            #print(file_name, ink_name)
            #print(new_directory)
            #process image:
            ink.inkml2img('{}.inkml'.format(ink_name), output_path = new_directory  + '/{}.png'.format(ink_name.strip(".inkml").split("/")[-1]))

            #get traces:
            trace_data = ink.get_traces_data(ink_name + ".inkml")
            traces_dict[ink_name] = trace_data
        except Exception as e:
            bad_links.append([ink_name, str(e)])
            print(ink_name, str(e))
            
    traces_labels = pd.Series(traces_dict)
    file_prefix = str(new_directory.split('/')[-1])
    traces_labels.to_csv(path_or_buf = new_directory + "/" + file_prefix + "_traces.csv", index = True, header = False)
    
if bad_links:
    pd.DataFrame(bad_links, columns=['Filename', 'Exception']).to_csv("bad_links.csv")
        


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0â€¦

data/raw_images/train_data_2011 ./math/ICFHR_package/CROHME2011_data/CROHME_training/CROHME_training/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=921.0), HTML(value='')))


data/raw_images/train_data_2012 ./math/ICFHR_package/CROHME2012_data/trainData/trainData/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1338.0), HTML(value='')))

./math/ICFHR_package/CROHME2012_data/trainData/trainData/tegral05 [Errno 2] No such file or directory: './math/ICFHR_package/CROHME2012_data/trainData/trainData/tegral05.inkml'

data/raw_images/test_data_2011 ./math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=348.0), HTML(value='')))

./math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/tegral01 [Errno 2] No such file or directory: './math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/tegral01.inkml'

data/raw_images/test_data_2012 ./math/ICFHR_package/CROHME2012_data/testData/testData/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=488.0), HTML(value='')))


data/raw_images/testGT_data_2011 ./math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=348.0), HTML(value='')))

./math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/tegral01 [Errno 2] No such file or directory: './math/ICFHR_package/CROHME2011_data/CROHME_test/CROHME_test/tegral01.inkml'

data/raw_images/testGT_data_2012 ./math/ICFHR_package/CROHME2012_data/testDataGT/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=489.0), HTML(value='')))

./math/ICFHR_package/CROHME2012_data/testDataGT/statSR.txt [Errno 2] No such file or directory: './math/ICFHR_package/CROHME2012_data/testDataGT/statSR.txt.inkml'


Wall time: 3min 57s


<Figure size 432x288 with 0 Axes>

#### getting traces:

just sample code to see how to get traces and output:

In [None]:
sample1 = ink_names[paths[0]][0]
sample1
sample2 = ink_names[paths[0]][1]
sample2

In [None]:
hi = pd.Series({0: ink.get_traces_data(sample1 + ".inkml"), 1: ink.get_traces_data(sample2 + ".inkml")})
hi.index = ink_names[paths[0]][:2]
hi