In [7]:
import json
import numpy as np

In [11]:
# Load the ndjson file: one JSON document per line.
# The encoding is pinned so decoding does not depend on the platform locale.
with open('data/face.ndjson', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    drawings = [json.loads(line) for line in f if line.strip()]

In [12]:
# Pretty-print the first record to inspect the structure of one drawing.
print(json.dumps(drawings[0], indent=4))

{
    "word": "face",
    "countrycode": "CH",
    "timestamp": "2017-03-19 15:52:39.229 UTC",
    "recognized": true,
    "key_id": "5152530285723648",
    "drawing": [
        [
            [
                318,
                312,
                306,
                300,
                294,
                289,
                283,
                278,
                270,
                264,
                260,
                256,
                254,
                250,
                247,
                244,
                242,
                239,
                237,
                234,
                232,
                229,
                227,
                225,
                223,
                221,
                220,
                218,
                216,
                215,
                214,
                213,
                211,
                209,
                208,
                207,
                206,
                205,
        

In [41]:
def process_drawing(drawing):
    """
    Converts the raw drawing data into the SketchRNN format:
    (∆x, ∆y, p1, p2, p3) where p1 is pen down, p2 is pen up, p3 is end of drawing.

    Offsets are measured from the previously emitted point, *including across
    stroke boundaries*: the first point of a stroke carries the pen-up jump from
    the last point of the preceding stroke, so the absolute layout of the
    drawing can be recovered by a cumulative sum. The very first point of the
    drawing has offset (0, 0).

    Parameters:
        drawing (list): The 'drawing' field from the ndjson data, representing strokes.
            Each stroke is indexed as stroke[0] (x-coordinates) and
            stroke[1] (y-coordinates). Strokes with no points are skipped.

    Returns:
        List of tuples: Converted strokes with pen states. All values are plain
        Python ints. The list always ends with the end-of-drawing marker
        (0, 0, 0, 0, 1), even for an empty drawing.
    """
    processed_data = []

    # Last absolute point emitted so far; None until the first point is seen.
    prev_x = prev_y = None

    for stroke in drawing:
        x = np.asarray(stroke[0], dtype=int)  # x-coordinates of the stroke
        y = np.asarray(stroke[1], dtype=int)  # y-coordinates of the stroke

        # An empty stroke contributes no points (and has no x[0] to anchor on).
        if x.size == 0 or y.size == 0:
            continue

        # The first point of the whole drawing is its own reference -> (0, 0).
        if prev_x is None:
            prev_x, prev_y = x[0], y[0]

        # Prepending the previous point makes the first delta of this stroke
        # the jump from the end of the preceding stroke instead of zero.
        # .tolist() yields plain Python ints rather than numpy scalars.
        delta_x = np.diff(x, prepend=prev_x).tolist()
        delta_y = np.diff(y, prepend=prev_y).tolist()

        # Pen state: pen is down for all points except the last point of the stroke
        last_index = len(delta_x) - 1
        for i, (dx, dy) in enumerate(zip(delta_x, delta_y)):
            if i < last_index:
                # Pen down (p1 = 1, p2 = 0, p3 = 0)
                processed_data.append((dx, dy, 1, 0, 0))
            else:
                # Pen up (p1 = 0, p2 = 1, p3 = 0) after finishing the stroke
                processed_data.append((dx, dy, 0, 1, 0))

        # Carry the end of this stroke forward as the reference for the next one.
        prev_x, prev_y = x[-1], y[-1]

    # End of the drawing (p1 = 0, p2 = 0, p3 = 1)
    processed_data.append((0, 0, 0, 0, 1))

    return processed_data

In [42]:
# Sanity check: the first value of the first converted point should be a plain
# Python int (presumably so the result can be serialized by the json module).
type(process_drawing(drawings[0]["drawing"])[0][0])

int

In [43]:
# Convert the 'drawing' field of every loaded record into the SketchRNN sequence format.
output = [process_drawing(record["drawing"]) for record in drawings]

In [46]:
# Number of converted drawings (one entry per input record).
len(output)

161666

In [45]:
# Write the converted drawings as one JSON array.
# In CPython, json.dump(obj, fp) encodes through the slower pure-Python
# iterencode path, which is very slow for a list this large. Encoding each
# drawing with json.dumps and joining with ", " writes the same bytes as
# json.dump(output, f_out) with default separators, without building one
# huge string in memory.
with open("data/processed_face.json", 'w', encoding="utf-8") as f_out:
    f_out.write("[")
    f_out.writelines(
        (", " if position else "") + json.dumps(sequence)
        for position, sequence in enumerate(output)
    )
    f_out.write("]")

KeyboardInterrupt: 