### Processing InkML files into images

In [10]:
import os
import pickle
import time
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import inkml2img_pictures as ink

In [15]:
# The Kaggle dataset has to be downloaded and unzipped by hand; root_dir must point at
# the extracted folder (here: ./math/handwritten_math_expressions_kaggle).
root_dir = os.path.join(
    os.getcwd(),
    'math',
    'handwritten_math_expressions_kaggle',
)

# Show the sub-folders that were extracted.
os.listdir(root_dir)

['CROHME_test_2011',
 'CROHME_training_2011',
 'MatricesTest2014',
 'MatricesTrain2014',
 'testData_2012',
 'TestINKML_2013',
 'trainData_2012_part1',
 'trainData_2012_part2',
 'TrainINKML_2013']

In [16]:
# Sub-folders of root_dir that make up each split. The MatricesTrain2014 and
# MatricesTest2014 folders listed above are not part of either split.
train_folders = [
    'CROHME_training_2011',
    'trainData_2012_part1', 'trainData_2012_part2',
    'TrainINKML_2013',
]

test_folders = [
    'CROHME_test_2011',
    'testData_2012',
    'TestINKML_2013',
]

# Root folder under which the generated datasets (images, CSV, pickle) are written.
output_dir = os.path.join(
    os.getcwd(),
    'data',
    'all_years',
)

In [50]:
import warnings

warnings.filterwarnings('ignore')

def get_latex_from_inkml(inkml_path, doc_namespace='{http://www.w3.org/2003/InkML}'):
    """
    Returns the ground-truth LaTeX label stored in an INKML file.

    The label is the text of the single <annotation type="truth"> element that is a
    direct child of the document root.

    Args:
        inkml_path (string): Path of the INKML file to read.
        doc_namespace (string): XML namespace of the document in ElementTree's
            '{uri}' notation; it is prepended to the 'annotation' tag name.

    Returns:
        string: Text of the truth annotation (None if the element has no text).

    Raises:
        ValueError: If the file contains no truth annotation, or more than one.
    """
    # Parse the file that was passed in. The previous version read a leftover notebook
    # global (test_inkml_path) here, so the argument was silently ignored.
    root = ET.parse(inkml_path).getroot()

    latex_labels = [
        annotation.text
        for annotation in root.findall(doc_namespace + 'annotation')
        if annotation.get('type') == 'truth'
    ]

    if len(latex_labels) == 0:
        raise ValueError('Could not find a latex label.')

    if len(latex_labels) > 1:
        raise ValueError('Found multiple latex labels.')

    return latex_labels[0]


def get_tokens_from_traces_data(traces_data):
    """
    Collects the 'label' entry of every item in traces_data, preserving order.

    Args:
        traces_data (iterable of dict): Items that each provide a 'label' key.

    Returns:
        list: The labels, one per item.
    """
    tokens = []
    for trace_group in traces_data:
        tokens.append(trace_group['label'])
    return tokens
    

def process_inkml(inkml_path, dataset_name):
    """
    Renders one INKML file to a PNG and returns its dataset row.

    The image is written to <output_dir>/<dataset_name>/images/<filename>.png, where
    output_dir is the notebook-level variable (it is not a parameter of this function)
    and <filename> is the INKML file's base name including its '.inkml' extension.
    The stored image path is made relative to the current working directory.

    Args:
        inkml_path (string): Path of the INKML file to process.
        dataset_name (string): Either 'train' or 'test'.

    Returns:
        dict: 'traces_data', 'img_path' and 'inkml_path' for both datasets; for 'train'
            additionally 'tokens' (the label of every trace entry) and 'latex' (the
            ground-truth annotation of the file).

    Raises:
        NotImplementedError: If dataset_name is neither 'train' nor 'test'. This is
            checked first, so no image is written for an unknown dataset.
    """
    # Fail fast: do not render an image or parse traces for a dataset we cannot build.
    if dataset_name not in ('train', 'test'):
        raise NotImplementedError(f'Unknown dataset_name \'{dataset_name}\'')

    filename = os.path.basename(inkml_path)
    img_path = os.path.join(output_dir, dataset_name, 'images', f'{filename}.png')
    img_path = os.path.relpath(img_path)

    # Generate image from INKML.
    ink.inkml2img(inkml_path, output_path=img_path)

    # Save trace group / label / other data.
    traces_data = ink.get_traces_data(inkml_path)

    if dataset_name == 'train':
        # Only the training split gets labels; key order here is the CSV column order.
        return {
            'traces_data': traces_data,
            'tokens': get_tokens_from_traces_data(traces_data),
            'latex': get_latex_from_inkml(inkml_path),
            'img_path': img_path,
            'inkml_path': inkml_path
        }

    return {
        'traces_data': traces_data,
        'img_path': img_path,
        'inkml_path': inkml_path
    }


def build_dataset(folder_names, output_dir, dataset_name):
    """
    Builds a dataset from the INKML files in the given folders.
    Output directory structure will be

    <output_dir>:
        <dataset_name>:
            images - Directory containing PNG files for each INKML file.
            <dataset_name>.csv - Dataset in CSV format.
            <dataset_name>.pickle - Dataset in pickle format (list of row dicts).
            errors.csv - One row per INKML file that failed to process.

    Input folders are resolved against the notebook-level root_dir. Note that
    process_inkml derives image paths from the notebook-level output_dir variable, so
    the output_dir passed here should be that same value.

    Args:
        folder_names (list of string): Names of folders under root_dir to scan for
            files ending in '.inkml' (sub-folders are not searched).
        output_dir (string): Directory under which the dataset is written. It is
            created, including missing parent directories, if needed.
        dataset_name (string): Name of dataset (probably 'train' or 'test').

    Returns:
        None. Results are written to disk and a summary is printed.
    """

    # Make a list of all paths for INKML files in the given folders. Each listing is
    # sorted because os.listdir returns names in arbitrary order, which would make the
    # row order of the dataset differ between machines / runs.
    all_inkml_paths = []
    for folder_name in folder_names:
        folder_path = os.path.join(root_dir, folder_name)
        for inkml_file in sorted(os.listdir(folder_path)):
            if inkml_file.endswith('.inkml'):
                all_inkml_paths.append(os.path.join(folder_path, inkml_file))

    # Create directory to store dataset / images. makedirs also creates missing
    # parents (e.g. a not-yet-existing 'data' folder) and is a no-op when re-running.
    dataset_dir = os.path.join(output_dir, dataset_name)
    img_dir = os.path.join(dataset_dir, 'images')
    os.makedirs(img_dir, exist_ok=True)

    # Generate images / trace data for all INKML files. Apparently Python multiprocessing
    # is not supported in a notebook environment.
    data = []
    errors = []
    for inkml_path in tqdm(all_inkml_paths):
        try:
            data.append(process_inkml(inkml_path, dataset_name))
        except Exception as e:
            # Best effort: keep going, but log enough to tell failure kinds apart
            # (a NameError in our own code looks very different from a malformed file).
            errors.append({
                'inkml_path': inkml_path,
                'error': str(e),
                'error_type': type(e).__name__,
            })

    # Save data to CSV and pickle.
    csv_path = os.path.join(dataset_dir, f'{dataset_name}.csv')
    pd.DataFrame(data).to_csv(csv_path, index=False)

    pickle_path = os.path.join(dataset_dir, f'{dataset_name}.pickle')
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

    # Explicit columns so that errors.csv has a header even when nothing failed;
    # a completely empty file cannot be loaded back with pd.read_csv.
    errors_path = os.path.join(dataset_dir, 'errors.csv')
    error_columns = ['inkml_path', 'error', 'error_type']
    pd.DataFrame(errors, columns=error_columns).to_csv(errors_path, index=False)

    print(f'Created dataset \'{dataset_name}\' with {len(data)} examples.')
    print(f'Encountered {len(errors)} errors while processing.')
    print(f'Wrote images to {img_dir}')
    print(f'Wrote CSV to {csv_path}')
    print(f'Wrote pickle to {pickle_path}')
    print(f'Wrote errors to {errors_path}')
    print()

In [48]:
%%time
# Build the 'train' dataset from train_folders.
# NOTE(review): the recorded run below logged 1585 failures out of 11095 files -
# inspect train/errors.csv before relying on this dataset.
build_dataset(folder_names=train_folders, output_dir=output_dir, dataset_name='train')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11095.0), HTML(value='')))


Created dataset 'train' with 9510 examples.
Encountered 1585 errors while processing.
Wrote images to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\train\images
Wrote CSV to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\train\train.csv
Wrote pickle to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\train\train.pickle
Wrote errors to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\train\errors.csv

Wall time: 9min 24s


<Figure size 432x288 with 0 Axes>

In [51]:
%%time
# Build the 'test' dataset from test_folders (rows carry no tokens / latex labels).
build_dataset(folder_names=test_folders, output_dir=output_dir, dataset_name='test')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1507.0), HTML(value='')))


Created dataset 'test' with 1507 examples.
Encountered 0 errors while processing.
Wrote images to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\test\images
Wrote CSV to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\test\test.csv
Wrote pickle to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\test\test.pickle
Wrote errors to C:\Users\Jamin Chen\Development\10617_Project\data\all_years\test\errors.csv

Wall time: 1min 23s


<Figure size 432x288 with 0 Axes>