In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks')

In [4]:
import pandas as pd
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import os
import numpy as np
from preprocessing import *
from features_extraction import *

from glob import glob

In [5]:
def extract_strokes(sample):
  tree = ET.parse(sample)
  r = tree.getroot()
  stroke_set = r.find("StrokeSet")
  strokes = []
  for stroke_node in stroke_set:
      for point in stroke_node:
          x = int(point.attrib['x'])
          y = int(point.attrib['y'])
          time = float(point.attrib['time'])
          strokes.append([x,y,0])
      strokes[-1][-1] = 1 # pen-up
  return strokes

In [6]:
!tar -xf "/content/drive/MyDrive/Colab Notebooks/data/ascii-all.tar.gz"
!tar -xf "/content/drive/MyDrive/Colab Notebooks/data/lineStrokes-all.tar.gz"

In [7]:
PATH= '/content/lineStrokes'
xml_files = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.xml'))]
PATH= '/content/ascii'
txt_files = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.txt'))]

In [8]:
xml_files

['/content/lineStrokes/a07/a07-431/a07-431z-08.xml',
 '/content/lineStrokes/a07/a07-431/a07-431z-01.xml',
 '/content/lineStrokes/a07/a07-431/a07-431z-07.xml',
 '/content/lineStrokes/a07/a07-431/a07-431z-04.xml',
 '/content/lineStrokes/a07/a07-431/a07-431z-05.xml',
 '/content/lineStrokes/a07/a07-431/a07-431z-02.xml',
 '/content/lineStrokes/a07/a07-431/a07-431z-06.xml',
 '/content/lineStrokes/a07/a07-431/a07-431z-03.xml',
 '/content/lineStrokes/a07/a07-421/a07-421z-01.xml',
 '/content/lineStrokes/a07/a07-421/a07-421z-02.xml',
 '/content/lineStrokes/a07/a07-421/a07-421z-03.xml',
 '/content/lineStrokes/a07/a07-421/a07-421z-05.xml',
 '/content/lineStrokes/a07/a07-421/a07-421z-04.xml',
 '/content/lineStrokes/a07/a07-413/a07-413z-05.xml',
 '/content/lineStrokes/a07/a07-413/a07-413z-06.xml',
 '/content/lineStrokes/a07/a07-413/a07-413z-04.xml',
 '/content/lineStrokes/a07/a07-413/a07-413z-01.xml',
 '/content/lineStrokes/a07/a07-413/a07-413z-03.xml',
 '/content/lineStrokes/a07/a07-413/a07-413z-02

In [9]:
data=[]
for txt_file in txt_files:
  with open(txt_file) as f:
    lines = f.readlines()
    try:
      indx = lines.index('CSR:\n')
    except:
      indx = lines.index('CSR: \n')
    lines = lines[indx+2:]
    for i,line in enumerate(lines):
      xml_file = txt_file.replace('ascii','lineStrokes').replace('.txt','')
      xml_file = xml_file + f'-{i+1:02}.xml'
      data.append({'file_path':xml_file,'transcript':line.replace('\n','')})

In [10]:
df = pd.DataFrame(data)
df['exists'] = df['file_path'].apply(lambda x :os.path.exists(x))

In [11]:
df['exists'].value_counts()

Unnamed: 0_level_0,count
exists,Unnamed: 1_level_1
True,12187
False,1021


In [12]:
df.head()

Unnamed: 0,file_path,transcript,exists
0,/content/lineStrokes/a07/a07-431/a07-431z-01.xml,The Conference could,True
1,/content/lineStrokes/a07/a07-431/a07-431z-02.xml,"end in, at least, a postponement",True
2,/content/lineStrokes/a07/a07-431/a07-431z-03.xml,of the strike notices. For the,True
3,/content/lineStrokes/a07/a07-431/a07-431z-04.xml,players' leaders are keen to tell,True
4,/content/lineStrokes/a07/a07-431/a07-431z-05.xml,"the league: ""We wish to negotiate",True


In [13]:
data =df[df['exists']]

In [14]:
NORM_ARGS = ["origin","smooth", "slope", "resample", "slant", "height"]
FEAT_ARGS = ["x_cor","y_cor","penup","dir", "curv", "vic_aspect", "vic_curl", "vic_line", "vic_slope", "bitmap"]

In [16]:
for i,sample in data.iterrows():
  strokes = extract_strokes(sample['file_path'])
  strokes = np.array(strokes)
  ink = preprocess_handwriting(strokes, NORM_ARGS)
  feat = calculate_feature_vector_sequence(ink, FEAT_ARGS)
  outfilename = sample['file_path'].split('/')[-1].replace('.xml','.bin')
  outfilename= '/content/drive/MyDrive/Colab Notebooks/bin_files/'+ outfilename
  feat.tofile(outfilename)

std by feature axis: [0.2811032  0.1857446  0.00564997 0.0111906  0.01333243 0.01141299
 0.00863974 0.00792686 0.01007401 0.15146288 0.00793437 0.00461434
 0.00438797 0.00450087 0.00498307 0.00414627 0.00434981 0.00555868
 0.00497837 0.00281532]
std by feature axis: [0.28288916 0.13760495 0.00583223 0.00952135 0.012345   0.01062764
 0.00896723 0.00659479 0.00577556 0.15790735 0.00527389 0.0046027
 0.00411182 0.00385467 0.00394909 0.00386576 0.00314747 0.00450302
 0.00358609 0.00299239]
std by feature axis: [0.39135894 0.18965992 0.00727316 0.01105671 0.01243437 0.01057748
 0.00831369 0.00825755 0.00902047 0.1368056  0.00709302 0.00438533
 0.00423991 0.00457288 0.00510266 0.0047352  0.00370992 0.00488569
 0.00427474 0.00247138]
std by feature axis: [0.26674542 0.16450892 0.00621473 0.00897788 0.0127049  0.00889268
 0.00864054 0.00658791 0.00556388 0.15189396 0.00517206 0.00463563
 0.00411815 0.0042178  0.00366651 0.00376848 0.00323135 0.00457373
 0.00355201 0.00299749]
std by feature ax

In [None]:
data.head()

In [None]:
data.to_excel('/content/drive/MyDrive/Colab Notebooks/iam_data.xlsx')