In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# print(os.listdir('../input'))

# Any results you write to the current directory are saved as output.


import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from skimage.transform import resize
import xml.etree.ElementTree as ET
import os
import numpy as np
from tqdm import tqdm
import cv2 as cv
import pandas as pd
import splitfolders

In [2]:
def get_traces_data(inkml_filepath):
    """Parse an InkML file and return its traces, grouped by annotated symbol.

    Parameters
    ----------
    inkml_filepath : str or path-like
        Location of the InkML document; handed to ``ET.parse``.

    Returns
    -------
    list of dict
        * If the root has a ``traceGroup`` child (treated as a wrapper), one
          ``{'label': <annotation text>, 'trace_group': [trace, ...]}`` entry
          per ``traceGroup`` nested directly inside that wrapper.
        * Otherwise one ``{'trace_group': [trace]}`` entry per ``trace``
          element, ordered by numeric trace id.

        A trace is a list of points and a point is a list of ints.

    Raises
    ------
    ValueError
        If a trace id, a ``traceDataRef`` or a coordinate is not numeric.
    KeyError
        If a ``traceView`` references a trace id that is not in the file.
    AttributeError
        If a nested ``traceGroup`` has no ``annotation`` child.
    """
    doc_namespace = "{http://www.w3.org/2003/InkML}"

    def _to_int(token):
        # Integer-valued coordinates are kept as they are; fractional ones are
        # scaled by 10000 before rounding so their precision is not lost.
        # NOTE(review): a file mixing both kinds ends up on two different
        # scales - confirm that inputs never mix them.
        value = float(token)
        return round(value) if value.is_integer() else round(value * 10000)

    def _parse_points(trace_text):
        # Points are comma separated, axis values whitespace separated.
        # ``split()`` tolerates leading/trailing/repeated blanks, and empty
        # chunks (e.g. after a trailing comma) are dropped instead of raising.
        points = []
        for chunk in (trace_text or '').replace('\n', '').split(','):
            tokens = chunk.split()
            if tokens:
                points.append([_to_int(token) for token in tokens])
        return points

    root = ET.parse(inkml_filepath).getroot()

    # Index traces by their numeric id. Looking a reference up by id (rather
    # than by position in an id-sorted list) stays correct when ids do not
    # start at 0 or are not contiguous.
    coords_by_id = {}
    for trace_tag in root.findall(doc_namespace + 'trace'):
        coords_by_id[int(trace_tag.get('id'))] = _parse_points(trace_tag.text)

    # The first traceGroup is always a redundant wrapper.
    traceGroupWrapper = root.find(doc_namespace + 'traceGroup')

    if traceGroupWrapper is None:
        # Consider validation data that has no labels: every trace becomes
        # its own single-trace group.
        return [{'trace_group': [coords_by_id[trace_id]]}
                for trace_id in sorted(coords_by_id)]

    traces_data = []
    for traceGroup in traceGroupWrapper.findall(doc_namespace + 'traceGroup'):
        label = traceGroup.find(doc_namespace + 'annotation').text

        # Resolve every traceView of the current group to the coordinates of
        # the trace it references.
        traces_curr = [coords_by_id[int(traceView.get('traceDataRef'))]
                       for traceView in traceGroup.findall(doc_namespace + 'traceView')]

        traces_data.append({'label': label, 'trace_group': traces_curr})

    return traces_data

In [3]:
# Labels that get a 'capital' prefix in their output folder name.
# NOTE(review): presumably this keeps e.g. 'X' and 'x' apart on
# case-insensitive file systems - confirm.
capital_list = ['A','B','C','F','X','Y']
def inkml2img(input_path, output_path, linewidth):
    """Render every trace group of an InkML file to its own PNG image.

    Parameters
    ----------
    input_path : str
        InkML file to read (parsed with ``get_traces_data``).
    output_path : str
        Prefix of the output folder. A labelled trace group is saved under
        ``output_path + '_' + label`` (label prefixed with ``'capital'`` when
        it is listed in ``capital_list``); an unlabelled one under
        ``output_path`` itself. Backslashes in the resulting path are
        replaced by underscores.
    linewidth : float
        Stroke width handed to matplotlib.

    Notes
    -----
    Images are named ``<n>.png``. ``n`` starts at 0 for each call, is shared
    by all folders written during the call and is advanced past names that
    are already taken, so existing images are never overwritten.
    """
    traces = get_traces_data(input_path)
    file_name = 0
    for elem in traces:
        # Configure ONE Axes. A bare ``plt.axes()`` call adds a new Axes to
        # the figure on current matplotlib, so repeating it would leave the
        # y-inversion and aspect ratio on a different Axes than the one the
        # strokes are drawn on.
        ax = plt.gca()
        ax.invert_yaxis()
        ax.set_aspect('equal', adjustable='box')
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)

        for subls in elem['trace_group']:
            data = np.array(subls)
            if data.ndim != 2 or data.shape[1] < 2:
                # Nothing drawable (e.g. an empty trace).
                continue
            # Only the first two values of a point are x and y; any further
            # per-point values are ignored.
            ax.plot(data[:, 0], data[:, 1], linewidth=linewidth, c='black')

        if 'label' in elem:
            label = elem['label']
            if label in capital_list:
                label = 'capital' + label
            ind_output_path = output_path + '_' + label
        else:
            ind_output_path = output_path

        ind_output_path = ind_output_path.replace('\\', '_')
        os.makedirs(ind_output_path, exist_ok=True)

        # Advance to the first image name that is still free in this folder.
        while os.path.exists(ind_output_path + '/' + str(file_name) + '.png'):
            file_name += 1
        plt.savefig(ind_output_path + '/' + str(file_name) + '.png', bbox_inches='tight', dpi=100)

        # Start the next trace group from an empty figure.
        plt.gcf().clear()

In [4]:
def create_all_images_from_crohme_inkml_data():
    """Render all InkML files of the CROHME 2011/2012 folders to PNG images.

    For every ``./data/crohme/CROHME<version>_data/<directory>Data`` folder
    (versions 2011 and 2012; directories test, train and testGT) an
    ``Images`` sub-folder is created if needed, and each ``*.inkml`` file in
    the folder is handed to ``inkml2img`` with
    ``<folder>/Images/<file name without the .inkml extension>`` as output
    prefix and a line width of 15.

    Raises
    ------
    FileNotFoundError
        If one of the expected data folders does not exist
        (raised by ``os.listdir``).
    """
    current_path = './'
    directories = ['test', 'train', 'testGT']
    versions = ['2011', '2012']
    linewidth = 15
    extension = '.inkml'
    for version in versions:
        for directory in directories:
            location = current_path + 'data/crohme/CROHME' + version + '_data/' + directory  + 'Data'
            files = os.listdir(location)
            print("Creating images from INKML files of ", location)
            os.makedirs(location + '/Images', exist_ok=True)
            for filename in tqdm(files):
                if not filename.endswith(extension):
                    continue
                print("Creating images of ", filename)
                # Strip only the extension: cutting at the first '.' would
                # truncate names that contain further dots and make
                # different files share one output folder.
                filename_output = filename[:-len(extension)].replace('\\', '_')
                inkml2img(location + '/' + filename, location + '/Images/' + filename_output, linewidth)

In [5]:
def _resize_images_into(source_dir, target_dir, counter, size=(45, 45)):
    """Write a resized grayscale JPEG copy of every image in ``source_dir``.

    Files are written to ``target_dir`` as ``<counter>.jpg``; the counter is
    incremented per written image and its new value is returned.
    """
    for image_filename in os.listdir(source_dir):
        image_path = source_dir + '/' + image_filename
        image = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
        if image is None:
            # cv.imread returns None for files it cannot decode; resizing
            # None would raise, so report the file and move on.
            print("Skipping unreadable image ", image_path)
            continue
        image = cv.resize(image, size)
        cv.imwrite(target_dir + '/' + str(counter) + '.jpg', image)
        counter += 1
    return counter


def create_final_data():
    """Collect the rendered symbol images into 45x45 grayscale JPEG datasets.

    Reads the per-symbol folders found in
    ``./data/crohme/CROHME<version>_data/<directory>Data/Images`` (versions
    2011 and 2012; directories train, test and testGT) and writes:

    * images of the ``test`` directories to
      ``./data/crohme/unlabellized_data/<n>.jpg``;
    * images of the ``train`` and ``testGT`` directories to
      ``./data/crohme/labellized_data/<label>/<n>.jpg``, where ``<label>`` is
      the part of the folder name after its last underscore.

    ``<n>`` is a running counter, one for the unlabellized images and one
    shared by all labels. Target folders are created when missing and reused
    when present, so the function can be run again.

    Raises
    ------
    FileNotFoundError
        If one of the expected ``Images`` folders does not exist
        (raised by ``os.listdir``).
    """
    labellized_root = './data/crohme/labellized_data'
    unlabellized_root = './data/crohme/unlabellized_data'
    os.makedirs(labellized_root, exist_ok=True)
    os.makedirs(unlabellized_root, exist_ok=True)

    counter_images_from_specific_label = 0
    counter_unlabellized_images = 0

    current_path = './'
    directories = ['train', 'test', 'testGT']
    versions = ['2011', '2012']
    for version in versions:
        for directory in directories:
            location_images = current_path + 'data/crohme/CROHME' + version + '_data/' + directory  + 'Data/Images'
            print("Fetching data from ", location_images)
            for foldername in tqdm(os.listdir(location_images)):
                specific_location_images = location_images + '/' + foldername
                if not os.path.isdir(specific_location_images):
                    # Stray files next to the per-symbol folders.
                    continue

                if directory == 'test':
                    counter_unlabellized_images = _resize_images_into(
                        specific_location_images,
                        unlabellized_root,
                        counter_unlabellized_images)
                # We do have labellized images in testGT
                else:
                    label = foldername[foldername.rfind('_') + 1:]
                    label_directory = labellized_root + '/' + label
                    # Many folders share one label; create its directory
                    # once and reuse it afterwards.
                    os.makedirs(label_directory, exist_ok=True)
                    counter_images_from_specific_label = _resize_images_into(
                        specific_location_images,
                        label_directory,
                        counter_images_from_specific_label)


In [6]:
# Build the resized labellized / unlabellized image folders from the rendered
# per-symbol images.
# NOTE(review): create_all_images_from_crohme_inkml_data() is not called in
# any visible cell, yet this step reads the Images folders that function
# writes - presumably they were generated in an earlier session; confirm
# before relying on Restart & Run All.
create_final_data()

  0%|                                                                                         | 0/6663 [00:00<?, ?it/s]

Fetching data from  ./crohme/CROHME2011_data/trainData/Images


100%|█████████████████████████████████████████████████████████████████████████████| 6663/6663 [00:51<00:00, 129.92it/s]
  0%|▏                                                                                 | 1/348 [00:00<00:44,  7.87it/s]

Fetching data from  ./crohme/CROHME2011_data/testData/Images


100%|████████████████████████████████████████████████████████████████████████████████| 348/348 [00:16<00:00, 20.79it/s]
  0%|▏                                                                                | 5/2133 [00:00<00:43, 48.54it/s]

Fetching data from  ./crohme/CROHME2011_data/testGTData/Images


100%|██████████████████████████████████████████████████████████████████████████████| 2133/2133 [00:50<00:00, 42.62it/s]


Fetching data from  ./crohme/CROHME2012_data/trainData/Images


100%|████████████████████████████████████████████████████████████████████████████| 10168/10168 [03:16<00:00, 51.67it/s]
  0%|                                                                                          | 0/488 [00:00<?, ?it/s]

Fetching data from  ./crohme/CROHME2012_data/testData/Images


100%|████████████████████████████████████████████████████████████████████████████████| 488/488 [00:43<00:00, 11.13it/s]
  1%|▍                                                                              | 20/3877 [00:00<00:19, 198.02it/s]

Fetching data from  ./crohme/CROHME2012_data/testGTData/Images


100%|██████████████████████████████████████████████████████████████████████████████| 3877/3877 [01:59<00:00, 32.36it/s]
