# Generate Data
This is a simple notebook used for generating all the data we will be training our models on. For $1 \le n \le 10$, generate CSV data files. The header line should be `parenthesization,valid` and each subsequent row is a parenthesization followed by either `0` or `1`, where `0` denotes invalid and `1` denotes valid.

The functions `parenthesizations.valid()` and `parenthesizations.invalid()` have been implemented for you, but make sure to balance the two classes so they have an equal number of data points.

In [1]:
import importlib
import os
import random
# Create the data directory if it does not exist.
os.makedirs("data", exist_ok=True)
parenthesizations = importlib.import_module('parenthesizations')

# The task statement asks for one CSV per n with 1 <= n <= 10, so the upper
# bound passed to range() must be 11 (range excludes its stop value).
for n in range(1, 11):
    # Materialize both collections as lists: len() and random.sample() below
    # need a sized sequence, and random.sample() rejects sets on Python 3.11+.
    # NOTE(review): the return types of valid()/invalid() are not visible
    # here; list() keeps this cell working for a list, set, or generator.
    valid_parenthesizations = list(parenthesizations.valid(n))
    invalid_parenthesizations = list(parenthesizations.invalid(n))

    # Balance the two classes by down-sampling whichever one is larger, so
    # each file holds an equal number of valid and invalid rows.
    class_size = min(len(valid_parenthesizations), len(invalid_parenthesizations))
    if len(invalid_parenthesizations) > class_size:
        invalid_parenthesizations = random.sample(invalid_parenthesizations, class_size)
    elif len(valid_parenthesizations) > class_size:
        valid_parenthesizations = random.sample(valid_parenthesizations, class_size)

    with open(f"data/parenthesizations_{n}.csv", "w", encoding="utf-8") as f:
        f.write("parenthesization,valid\n")
        # Valid parenthesizations are labelled 1.
        f.writelines(f"{vp},1\n" for vp in valid_parenthesizations)
        # Invalid parenthesizations are labelled 0.
        f.writelines(f"{ip},0\n" for ip in invalid_parenthesizations)

    # Report only after the file has been closed (and therefore flushed).
    print(f"Generated data for n = {n}")

Generated data for n = 1
Generated data for n = 2
Generated data for n = 3
Generated data for n = 4
Generated data for n = 5
Generated data for n = 6
Generated data for n = 7
Generated data for n = 8
Generated data for n = 9
