In [1]:
import ndjson
import os
import numpy as np
import cv2
import pandas as pd

In [21]:
# Directory that receives the preprocessed batches (.npy image arrays and .csv label files).
# NOTE(review): machine-specific absolute path; adjust when running on another machine.
output_dir = r'C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data'
# exist_ok=True creates the folder (and any missing parents) when absent and does nothing
# when it already exists, without the check-then-create race of a separate exists() test.
# It also fails loudly if the path exists but is a regular file.
os.makedirs(output_dir, exist_ok=True)

# Directory that holds the raw .ndjson files; each file name is used as the class label.
# NOTE(review): machine-specific absolute path; adjust when running on another machine.
raw_data_dir = r'D:\QuickDrawDataset'

In [22]:
def _parse_point(raw_point):
    """Return raw_point as a finite (x, y) pair of floats, or None if it is not one."""
    try:
        x, y = raw_point
        x, y = float(x), float(y)
    except (TypeError, ValueError, OverflowError):
        # Not a two-element sequence, or the elements are not numeric.
        return None
    infinity = float('inf')
    # NaN fails every comparison, so this rejects NaN as well as +/-inf.
    if -infinity < x < infinity and -infinity < y < infinity:
        return x, y
    return None


def _is_parallel_arrays(stroke):
    """Return True when stroke can only be read as [xs, ys] or [xs, ys, ts] arrays."""
    try:
        if len(stroke) not in (2, 3):
            return False
        lengths = {len(axis) for axis in stroke}
    except TypeError:
        # The stroke, or one of its elements, has no length: not parallel arrays.
        return False
    # Equal-length arrays whose length is not 2 cannot be a list of (x, y) pairs.
    return len(lengths) == 1 and lengths != {2}


def coordinates_to_image(coordinates, image_size=28):
    """
    Rasterize a drawing into a square grayscale image (black strokes on white).

    Two stroke layouts are accepted:

    * a list of (x, y) pairs, e.g. ``[(x0, y0), (x1, y1), ...]``;
    * the Quick, Draw! layout of parallel arrays, ``[xs, ys]`` or ``[xs, ys, ts]``
      (the third array, timestamps, is ignored).

    The layout is detected once per drawing: if any stroke can only be parallel
    arrays, every stroke of the drawing is read that way. A drawing made up solely
    of two-point strokes is ambiguous and is read as (x, y) pairs.

    Drawings whose coordinates already lie inside the canvas are drawn as-is
    (coordinates truncated to int). Any other drawing is shifted so its bounding
    box starts at (0, 0) and scaled uniformly so its longer side spans the canvas;
    without this, everything beyond ``image_size`` pixels would be clipped away.

    Args:
        coordinates: iterable of strokes in one of the layouts above. ``None`` is
            treated as an empty drawing.
        image_size: side length, in pixels, of the returned image.

    Returns:
        ``numpy.ndarray`` of shape ``(image_size, image_size)`` and dtype ``uint8``;
        background pixels are 255, stroke pixels are 0.
    """
    image = np.ones((image_size, image_size), dtype=np.uint8) * 255  # White background
    if coordinates is None:
        return image

    raw_strokes = list(coordinates)  # materialize: the drawing is walked more than once
    parallel_arrays = any(_is_parallel_arrays(stroke) for stroke in raw_strokes)

    # Normalize every stroke to a list of (x, y) float pairs; unusable points become None.
    strokes = []
    for stroke in raw_strokes:
        try:
            if parallel_arrays:
                raw_points = list(zip(stroke[0], stroke[1]))
            else:
                raw_points = list(stroke)
        except (TypeError, IndexError, KeyError):
            continue  # stroke does not have the expected shape; skip it
        strokes.append([_parse_point(point) for point in raw_points])

    valid_points = [point for points in strokes for point in points if point is not None]
    if not valid_points:
        return image

    xs = [point[0] for point in valid_points]
    ys = [point[1] for point in valid_points]
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)

    if min_x >= 0 and min_y >= 0 and max_x < image_size and max_y < image_size:
        # The drawing already fits: keep the coordinates untouched.
        def to_pixel(point):
            return int(point[0]), int(point[1])
    else:
        limit = image_size - 1
        extent = max(max_x - min_x, max_y - min_y)

        def to_pixel(point):
            if extent == 0:
                return 0, 0  # all points coincide; nothing to scale
            # One scale factor for both axes keeps the aspect ratio.
            return (int(round((point[0] - min_x) * limit / extent)),
                    int(round((point[1] - min_y) * limit / extent)))

    for points in strokes:
        # Connect consecutive points; a segment with an unusable endpoint is skipped.
        for start, end in zip(points, points[1:]):
            if start is None or end is None:
                continue
            cv2.line(image, to_pixel(start), to_pixel(end), 0, 1)  # Black, 1 px wide

    return image


In [23]:
def _save_batch(images, labels, label, batch_index):
    """Write one batch into output_dir as <label>_images_<n>.npy and <label>_labels_<n>.csv."""
    np.save(os.path.join(output_dir, f'{label}_images_{batch_index}.npy'), np.array(images))
    labels_df = pd.DataFrame(labels, columns=['label'])
    labels_df.to_csv(os.path.join(output_dir, f'{label}_labels_{batch_index}.csv'), index=False)


def preprocess_and_save_raw_data(input_file, label, image_size=28, batch_size=100):
    """
    Stream an NDJSON file, rasterize every drawing and save the results in batches.

    Each entry's ``'drawing'`` is converted with ``coordinates_to_image``. Entries
    whose drawing is empty, or contains an empty stroke, are skipped. Every
    ``batch_size`` converted images are written to the module-level ``output_dir``
    as ``<label>_images_<n>.npy`` plus ``<label>_labels_<n>.csv`` (n starts at 1);
    a final, possibly smaller, batch holds whatever is left over.

    Args:
        input_file: path of the .ndjson file to read.
        label: class label stored for every image and used as the file-name prefix.
        image_size: side length passed on to ``coordinates_to_image``.
        batch_size: number of images per saved batch; must be at least 1.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    images = []
    labels = []
    batch_index = 1  # Start with batch 1

    # NDJSON is UTF-8; be explicit so the platform's locale encoding is never used.
    with open(input_file, 'r', encoding='utf-8') as file:
        for entry in ndjson.reader(file):
            drawing = entry['drawing']

            # Skip empty or malformed data (empty drawings or empty strokes)
            if not drawing or any(len(stroke) == 0 for stroke in drawing):
                continue

            images.append(coordinates_to_image(drawing, image_size))
            labels.append(label)  # Use the category name as the label

            # Flush on the number of images actually collected. Counting input rows
            # instead would also count skipped entries, giving short batches, and a
            # skipped entry on a batch boundary would miss the flush entirely.
            if len(images) >= batch_size:
                _save_batch(images, labels, label, batch_index)
                images.clear()
                labels.clear()
                batch_index += 1

    # Save the remainder when the number of images is not a multiple of batch_size
    if images:
        _save_batch(images, labels, label, batch_index)

def process_all_files():
    """
    Preprocess every ``.ndjson`` file found directly inside ``raw_data_dir``.

    The file name without its ``.ndjson`` extension becomes the class label and is
    passed, together with the file path, to ``preprocess_and_save_raw_data``.
    Files are handled in sorted name order so repeated runs behave the same.
    """
    # NOTE(review): the label is the whole file stem (e.g. "full_raw_cat" for the
    # files in this notebook's output) while class_names uses bare names such as
    # "cat" - confirm that downstream code maps between the two.
    # os.listdir() returns entries in arbitrary order, hence the sort.
    for filename in sorted(os.listdir(raw_data_dir)):
        if not filename.endswith('.ndjson'):
            continue
        # splitext drops only the final extension; split('.')[0] cut at the first
        # dot, so "a.b.ndjson" and "a.c.ndjson" would both have been labelled "a".
        label = os.path.splitext(filename)[0]
        input_file = os.path.join(raw_data_dir, filename)
        print(f"Processing {label} data...")
        preprocess_and_save_raw_data(input_file, label)

# Run the full preprocessing pass: every .ndjson file found in raw_data_dir is
# handed to preprocess_and_save_raw_data, which writes its batches to output_dir.
process_all_files()

Processing full_raw_airplane data...
Processing full_raw_bicycle data...
Processing full_raw_bus data...
Processing full_raw_car data...
Processing full_raw_cat data...
Processing full_raw_computer data...
Processing full_raw_dog data...
Processing full_raw_elephant data...
Processing full_raw_fish data...
Processing full_raw_flower data...
Processing full_raw_horse data...
Processing full_raw_house data...
Processing full_raw_moon data...
Processing full_raw_rabbit data...
Processing full_raw_smiley face data...
Processing full_raw_star data...
Processing full_raw_sun data...
Processing full_raw_tree data...
Processing full_raw_truck data...


In [17]:
# The 19 class names used by this project. A name's position in the list is its
# numeric label in the exported mapping below.
# NOTE(review): the preprocessing output above shows labels such as
# "full_raw_cat", while this list uses bare names such as "cat" - confirm how
# downstream code reconciles the two.
class_names = [
    # Animals
    "cat", "dog", "fish", "rabbit", "horse", "elephant",
    # Vehicles
    "car", "truck", "airplane", "bicycle", "bus",
    # Tools & Objects
    "house", "tree", "flower", "computer",
    # Miscellaneous Objects
    "star", "sun", "moon", "smiley face",
]

# Two-column lookup table: "Label" is the numeric id, "Class" is the name.
labels_df = pd.DataFrame({
    "Label": range(len(class_names)),
    "Class": class_names,
})

# Write the mapping to labels.csv (no index column).
labels_df.to_csv(
    r'C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\labels.csv',
    index=False,
)