In [21]:
### Imports
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.transforms import Bbox
import random
import json
import os
import shutil


In [2]:
# Pool of axis labels ("Quantity [unit]") from which x and y labels are drawn at random.
labels = [
    # lengths and distances
    "Length [nm]", "Size [um]", "Length [mm]", "Size [cm]",
    "Distance [m]", "Distance [km]",
    # weight and temperature
    "Weight [kg]", "Temperature [K]", "Temperature [C]",
    # area and volume
    "Area [m^2]", "Area [cm^2]", "Volume [m^3]", "Volume [cm^3]",
    # speed and time
    "Speed [m/s]", "Speed [km/h]", "Time [s]", "Time [min]", "Time [hr]",
    # frequency, energy, power and pressure
    "Frequency [Hz]", "Energy [J]", "Power [W]", "Pressure [Pa]", "Pressure [kPa]",
    # electrical quantities
    "Voltage [V]", "Current [A]", "Resistance [ohm]", "Capacitance [F]", "Inductance [H]",
    # mechanics
    "Force [N]", "Torque [Nm]", "Velocity [m/s]", "Acceleration [m/s^2]",
    # material and fluid properties
    "Density [kg/m^3]", "Density [g/cm^3]", "Viscosity [Pa.s]", "Viscosity [cP]",
    "Flow rate [m^3/s]", "Flow rate [L/min]",
    "Concentration [mol/m^3]", "Concentration [mg/L]",
    # light, magnetism and radiation
    "Luminance [cd/m^2]", "Illuminance [lux]",
    "Magnetic field [T]", "Magnetic flux [Wb]",
    "Radiation dose [Gy]", "Radiation dose rate [Gy/s]",
    # angles
    "Angle [rad]", "Angle [deg]",
]


In [3]:
# Pool of adjectives used as the first word of a random plot title.
# Every entry is unique so that random.choice() picks each adjective with the
# same probability (repeated entries would be drawn more often than the others).
adjectives = [
    "Happy", "Sad", "Excited", "Mysterious", "Playful",
    "Enthusiastic", "Cautious", "Brave", "Shy", "Energetic",
    "Lazy", "Curious", "Confident", "Clumsy", "Grateful",
    "Generous", "Ambitious", "Compassionate", "Wise", "Silly",
    "Serious", "Sincere", "Gentle", "Tough", "Carefree",
    "Charming", "Radiant", "Elegant", "Witty", "Artistic",
    "Dynamic", "Resilient", "Daring", "Friendly", "Loyal",
    "Mellow", "Vibrant", "Vivacious", "Creative", "Humble",
    "Sassy", "Reckless", "Pensive", "Candid", "Adaptable",
    "Tenacious", "Resourceful", "Modest", "Charismatic", "Nurturing",
    "Fierce", "Optimistic", "Pessimistic", "Eccentric",
    "Dazzling", "Bewildered", "Spirited", "Relaxed", "Cooperative",
    "Outgoing", "Introverted", "Quirky", "Sensitive", "Inquisitive",
    "Stoic", "Dramatic", "Whimsical", "Zesty", "Thoughtful",
    "Exuberant", "Spontaneous", "Confused",
    "Determined", "Jovial",
    "Adventurous", "Reserved", "Hilarious", "Perceptive", "Easygoing",
    "Observant", "Enigmatic",
    "Hopeful", "Caring", "Bold", "Chivalrous",
    "Courageous", "Grounded"
]


In [4]:
# Pool of nouns used as the second word of a random plot title.
# Every entry is unique so that random.choice() picks each noun with the same
# probability (a repeated entry would be drawn twice as often as the others).
nouns = [
    "Sun", "Moon", "Ocean", "Mountain", "River",
    "Tree", "Cloud", "Bird", "Fish", "Flower",
    "Star", "Book", "Pen", "Key", "Door",
    "Window", "Table", "Chair", "Lamp", "Phone",
    "Computer", "Car", "Bicycle", "Train", "Plane",
    "City", "Country", "Friend", "Family", "Love",
    "Dream", "Adventure", "Journey", "Song", "Dance",
    "Art", "Science", "Math", "History", "Future",
    "Past", "Present", "Hope", "Fear", "Courage",
    "Wisdom", "Knowledge", "Truth", "Lie", "Freedom",
    "Justice", "Peace", "War", "Happiness", "Sadness",
    "Joy", "Sorrow", "Laughter", "Tear", "Smile",
    "Frown", "Success", "Failure", "Victory", "Defeat",
    "Challenge", "Reward", "Risk", "Discovery",
    "Wonder", "Imagination", "Creativity", "Innovation", "Silence",
    "Noise", "Nature", "Cityscape", "Island", "Desert",
    "Forest", "Meadow", "Valley", "Castle", "Kingdom",
    "Village", "Market", "Cafe", "Restaurant", "Library",
    "School", "Hospital", "Factory", "Office", "Home",
    "Ship", "Rocket", "Planet", "Galaxy", "Universe"
]


In [5]:
# Pool of series names shown in the plot legend.
# Every entry is unique so that random.choice() picks each name with the same
# probability (a repeated entry would be drawn twice as often as the others).
legends = [
    'Quantum', 'Aether', 'Nucleus', 'Neutron', 'Photon', 'Spectrum', 'Catalyst',
    'Polarity', 'Ionization', 'Isotope', 'Helix', 'Luminous', 'Kinetics', 'Entropy',
    'Synthesis', 'Amino', 'Genome', 'Proton', 'Orbit', 'Radiance', 'Cognate',
    'Inertia', 'Momentum', 'Inversion', 'Fluorescence', 'Catalysis', 'Receptor',
    'Polymer', 'Chromatin', 'Matrix', 'Ergonomics', 'Thermodynamics', 'Zeta', 'Delta',
    'Alpha', 'Beta', 'Sigma', 'Gamma', 'Omega', 'Psi', 'Kappa', 'Lambda', 'Omicron',
    'Epsilon', 'Enzyme', 'Quasar', 'Hydrogen', 'Oxygen', 'Astron', 'Electron',
    'Plasma', 'Cortex', 'Neuron', 'Eclipse', 'Asteroid', 'Kinematics'
]


In [6]:
# Matplotlib marker codes; one is drawn at random for every plotted series.
markers = [
    '.', ',', 'o',                  # point, pixel, circle
    'v', '^', '<', '>',             # filled triangles: down, up, left, right
    '1', '2', '3', '4',             # tri markers: down, up, left, right
    's', 'p', '*',                  # square, pentagon, star
    'h', 'H',                       # hexagon variants
    '+', 'x',                       # plus, x
    'D', 'd',                       # diamond, thin diamond
]


In [7]:
# Pool of marker colors; one is drawn at random for every plotted series.
# 'white' is deliberately not part of the pool: the scatter plots are drawn on
# matplotlib's default white axes background, so white markers would be invisible
# in the saved picture while their points still end up in the ground truth.
colors = [
    'b', 'g', 'r', 'c', 'm', 'y', 'k',  # Single-letter abbreviations for basic colors
    'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black',  # Full color names
    'skyblue', 'tomato', 'gold', 'purple', 'lime', 'orange', 'pink', 'brown',  # Some named colors
    '#FF5733', '#33FF57', '#5733FF',  # Hexadecimal color values
    (0.1, 0.2, 0.3), (0.4, 0.5, 0.6), (0.7, 0.8, 0.9),  # RGB tuples
]


In [41]:
### Create data function
def create_data(start: int = 0, end: int = 10, folder: str = './raw_data', purpose: str = 'train'):
    '''This generates a number of Train, Evaluate or Test scatterplot pictures in a specific folder,
    together with a metadata.jsonl file that holds the ground truth of every picture.
    start: number of first scatterplot picture (inclusive)
    end: number after the last scatterplot picture (exclusive, as in range)
    folder: location for saving final pictures and metadata.jsonl; created if it is missing
    purpose: prefix of the picture file names, e.g. 'train', 'val' or 'test'
    '''

    # Make sure the target folder exists so saving cannot fail on a missing directory
    os.makedirs(folder, exist_ok=True)

    metadata_list = []

    for j in range(start, end):
        # Random upper bounds of the value ranges (arrays of shape (1,), broadcast below)
        xlim = np.random.randint(low=0, high=1000, size=1)
        ylim = np.random.randint(low=0, high=1000, size=1)

        # NOTE(review): randint's upper bound is exclusive, so num_series is always 2,
        # and the loop below draws num_series + 1 series. Every plot therefore has
        # exactly 3 series. Kept as-is to preserve the generated data; confirm intent.
        num_series = np.random.randint(2, 3)
        num_points = np.random.randint(10, 40)

        # Ground-truth data of all series in this picture
        series = []

        fig = plt.figure(figsize=(3.2, 2.4), dpi=100)
        try:
            # Generate and plot random data for each series
            for _ in range(num_series + 1):
                name = random.choice(legends)

                # Roughly equal share of the points with a small jitter, at least one point
                points_serie = max(num_points // num_series + np.random.randint(-2, 2), 1)
                x_values = np.round(np.random.rand(points_serie) * xlim, decimals=1)
                y_values = np.round(np.random.rand(points_serie) * ylim, decimals=1)

                # tolist() stores plain Python floats in the ground truth
                series.append({'name': name, 'x': x_values.tolist(), 'y': y_values.tolist()})

                plt.scatter(x=x_values, y=y_values, label=name,
                            marker=random.choice(markers), color=random.choice(colors))

            # Add legend
            plt.legend(loc='upper right', framealpha=0.3)

            # Add labels and title
            x_label = random.choice(labels)
            y_label = random.choice(labels)
            plt.xlabel(x_label)
            plt.ylabel(y_label)
            plot_title = random.choice(adjectives) + ' ' + random.choice(nouns)
            plt.title(plot_title)

            # Build the path with os.path.join so it is correct whether or not
            # `folder` ends with a path separator
            image_name = purpose + '_' + str(j).zfill(4) + '.jpg'

            # Save the plot with smaller margins
            plt.savefig(os.path.join(folder, image_name), dpi=100,
                        bbox_inches=Bbox.from_bounds(-0.26, -0.2, 3.2, 2.56))
        finally:
            # Always release the figure, even if plotting or saving failed
            plt.close(fig)

        # Create ground truth dictionary
        ground_truth = {
            "title": plot_title,
            "x_label": x_label,
            "y_label": y_label,
            "series": series
        }

        # Metadata record: the ground truth is stored as a JSON string under "gt_parse"
        metadata_list.append({
            "file_name": image_name,
            "ground_truth": json.dumps({"gt_parse": ground_truth})
        })

    # Writing data to the JSONL file: one JSON object per line
    with open(os.path.join(folder, 'metadata.jsonl'), 'w') as file:
        for item in metadata_list:
            json.dump(item, file, default=str)  # Use str() for non-serializable objects
            file.write('\n')  # Add a newline character to separate JSON objects


In [23]:
### Constants
train_size = 8        # number of pictures in the training set
val_split = 0.125     # validation set size as a fraction of train_size
test_split = 0.125    # test set size as a fraction of train_size

# Root folder of the generated dataset
dataset = './raw_data'

# One sub-folder per split; the trailing slash is part of each path
train_dir, val_dir, test_dir = (
    f'{dataset}/{split_folder}/'
    for split_folder in ('1. train', '2. validation', '3. test')
)


In [44]:
### Generate data

# One (folder, number of pictures, file-name prefix) entry per dataset split.
# Validation and test sizes are fractions of the training size, rounded to an integer.
for split_dir, split_size, split_name in (
    (train_dir, train_size, 'train'),
    (val_dir, round(train_size * val_split), 'val'),
    (test_dir, round(train_size * test_split), 'test'),
):
    # exist_ok=True already covers an existing folder, so no separate check is needed
    os.makedirs(split_dir, exist_ok=True)
    create_data(0, split_size, split_dir, split_name)


In [43]:
### Delete data
# Removes every generated split folder that currently exists.
# NOTE: this cell directly follows the generation cell, so running the whole
# notebook top to bottom ends with the freshly generated data deleted again.
for generated_dir in (train_dir, val_dir, test_dir):
    if os.path.exists(generated_dir):
        shutil.rmtree(generated_dir)
