In [1]:
import json
import random
import datetime

# Helper Functions for Each Category

In [2]:
def generate_numeric_example():
    """Build one (input, target) table pair for an evenly spaced numeric axis.

    A tick count between 5 and 15, a start value in [0, 1000] and a step in
    [0.5, 50] are drawn at random. Half of the time both start and step are
    rounded to whole numbers; otherwise they are rounded to one decimal place.

    The start and step are quantised *before* the sequence is built so that
    the printed labels form an exact arithmetic progression. Rounding each
    tick independently at format time would produce uneven visible gaps
    (e.g. 0.5, 0.6, 0.5, ...), which makes the blanked ticks impossible to
    reconstruct exactly.

    Returns:
        Whatever ``make_table`` returns for the formatted tick labels.
    """
    # Choose a number of ticks between 5 and 15.
    n = random.randint(5, 15)

    # Random starting value between 0 and 1000 and a step between 0.5 and 50.
    start = random.uniform(0, 1000)
    step = random.uniform(0.5, 50)

    # With 50% chance, round to integer values; otherwise keep one decimal.
    if random.random() < 0.5:
        start = round(start)
        # round() uses round-half-to-even, so round(0.5) == 0; a zero step
        # would make every tick identical, hence the floor of 1.
        step = max(1, round(step))
    else:
        start = round(start, 1)
        step = round(step, 1)

    # Generate the full sequence; re-rounding strips binary floating-point
    # noise such as 0.30000000000000004 without changing the intended value.
    ticks = [round(start + i * step, 1) for i in range(n)]

    # Format ticks: whole values are printed without decimals, everything
    # else with exactly one decimal place.
    formatted_ticks = []
    for t in ticks:
        if t == int(t):
            formatted_ticks.append(str(int(t)))
        else:
            formatted_ticks.append(f"{t:.1f}")
    return make_table(formatted_ticks)

In [3]:
def generate_datetime_example():
    """Build one (input, target) table pair for an evenly spaced date/time axis.

    A variant ("daily", "monthly", "hourly", "yearly", "quarterly" or
    "datetime") and a tick count between 5 and 15 are drawn at random. The
    variant decides how the start value and step are drawn and which set of
    label formats is available; one format is picked and applied to every tick.

    Returns:
        Whatever ``make_table`` returns for the formatted tick labels.
    """
    variant = random.choice(["daily", "monthly", "hourly", "yearly", "quarterly", "datetime"])
    count = random.randint(5, 15)
    labels = []

    if variant == "daily":
        # Start up to 10000 days after 2000-01-01 and advance 1-14 days per tick.
        offset_days = random.randint(0, 10000)
        first_day = datetime.date(2000, 1, 1) + datetime.timedelta(days=offset_days)
        gap_days = random.randint(1, 14)
        pattern = random.choice(["%Y-%m-%d", "%m/%d/%Y", "%d-%b-%Y"])
        for k in range(count):
            day = first_day + datetime.timedelta(days=k * gap_days)
            labels.append(day.strftime(pattern))

    elif variant == "monthly":
        base_year = random.randint(1990, 2025)
        base_month = random.randint(1, 12)
        gap_months = random.randint(1, 3)
        month_starts = []
        for k in range(count):
            # Work with a zero-based month index so divmod yields the year carry directly.
            year_carry, month_index = divmod(base_month - 1 + k * gap_months, 12)
            month_starts.append(datetime.date(base_year + year_carry, month_index + 1, 1))
        pattern = random.choice(["%Y-%m", "%b %Y", "%m/%Y"])
        labels = [d.strftime(pattern) for d in month_starts]

    elif variant == "hourly":
        # Only the time of day is printed; the date part is a fixed placeholder.
        first_hour = random.randint(0, 23)
        first_minute = random.randint(0, 59)
        first_moment = datetime.datetime(2010, 1, 1, first_hour, first_minute)
        gap_hours = random.randint(1, 3)
        pattern = random.choice(["%H:%M", "%I:%M %p", "%H:%M:%S"])
        for k in range(count):
            moment = first_moment + datetime.timedelta(hours=k * gap_hours)
            labels.append(moment.strftime(pattern))

    elif variant == "yearly":
        first_year = random.randint(1980, 2020)
        gap_years = random.randint(1, 10)
        pattern = random.choice(["%Y", "FY %Y", "Year %Y"])
        # Years are plain integers, so the "%Y" placeholder is substituted textually;
        # for the bare "%Y" pattern this yields just the year number.
        for k in range(count):
            labels.append(pattern.replace("%Y", str(first_year + k * gap_years)))

    elif variant == "quarterly":
        first_year = random.randint(2000, 2020)
        first_quarter = random.randint(1, 4)
        year_quarter_pairs = []
        for k in range(count):
            # Consecutive quarters: carry into the next year after Q4.
            year_carry, quarter_index = divmod(first_quarter - 1 + k, 4)
            year_quarter_pairs.append((quarter_index + 1, first_year + year_carry))
        template = random.choice(["Q{q} {y}", "Quarter {q}, {y}", "{y} Q{q}"])
        labels = [template.format(q=q, y=y) for q, y in year_quarter_pairs]

    elif variant == "datetime":
        # Full stamps: the date starts at 2000-01-01 with a random time of day.
        first_hour = random.randint(0, 23)
        first_minute = random.randint(0, 59)
        first_second = random.randint(0, 59)
        first_stamp = datetime.datetime(2000, 1, 1, first_hour, first_minute, first_second)
        gap_hours = random.randint(1, 4)
        pattern = random.choice([
            "%Y-%m-%d %H:%M:%S",
            "%m/%d/%Y %I:%M %p",
            "%d-%b-%Y %H:%M:%S",
            "%Y/%m/%d %H:%M",
        ])
        for k in range(count):
            stamp = first_stamp + datetime.timedelta(hours=k * gap_hours)
            labels.append(stamp.strftime(pattern))

    return make_table(labels)


In [4]:
def generate_categorical_example():
    """Build one (input, target) table pair for an ordered categorical axis.

    With equal probability the labels come from either:

    * a built-in ordinal scale, reversed 30% of the time and, whenever it has
      more than three entries, trimmed to a random contiguous run of at least
      three entries (the run may be the whole scale); or
    * a synthetic "<prefix> <number>" series with a random prefix, length
      (3-10), starting number (1-20) and increment (1-4).

    Returns:
        Whatever ``make_table`` returns for the chosen labels.
    """
    if random.random() >= 0.5:
        # Synthetic series: a random prefix followed by an arithmetic progression.
        stem = random.choice(
            ["Item", "Segment", "Option", "Choice", "Level", "Phase", "Step", "Part", "Round", "Category", "Group"]
        )
        size = random.randint(3, 10)
        first_number = random.randint(1, 20)
        increment = random.randint(1, 4)
        return make_table([f"{stem} {first_number + k * increment}" for k in range(size)])

    known_scales = [
        ["Very Unsatisfied", "Unsatisfied", "Neutral", "Satisfied", "Very Satisfied"],
        ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"],
        ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
        ["Mild", "Medium", "Hot", "Extra Hot"],
        ["Level 1", "Level 2", "Level 3", "Level 4", "Level 5", "Level 6"],
        ["Part 1", "Part 2", "Part 3", "Part 4", "Part 5"],
        ["Round 1", "Round 2", "Round 3", "Round 4"],
        ["Beginner", "Intermediate", "Advanced", "Expert"],
        ["Low", "Medium", "High"],
        ["Bronze", "Silver", "Gold", "Platinum"],
        ["Small", "Medium", "Large", "Extra Large"],
        ["Poor", "Fair", "Good", "Very Good", "Excellent"],
        ["Freshman", "Sophomore", "Junior", "Senior"],
        ["Stage 1", "Stage 2", "Stage 3", "Stage 4", "Stage 5"],
        ["Grade F", "Grade D", "Grade C", "Grade B", "Grade A"],
        ["Novice", "Competent", "Proficient", "Master"],
        ["Phase 1", "Phase 2", "Phase 3"],
        ["Step 1", "Step 2", "Step 3", "Step 4", "Step 5", "Step 6"],
        ["Tier 1", "Tier 2", "Tier 3", "Tier 4"],
        ["Rank 1", "Rank 2", "Rank 3", "Rank 4", "Rank 5"],
        ["Option A", "Option B", "Option C", "Option D"],
        ["One Star", "Two Stars", "Three Stars", "Four Stars", "Five Stars"],
        ["Level I", "Level II", "Level III", "Level IV", "Level V"],
        ["Section 1", "Section 2", "Section 3", "Section 4"],
        ["Part A", "Part B", "Part C", "Part D", "Part E"],
    ]
    labels = random.choice(known_scales)

    # 30% of the time present the scale from high to low.
    if random.random() < 0.3:
        labels = labels[::-1]

    # Scales longer than three entries are always cut to a contiguous window
    # of at least three entries; shorter scales are used whole.
    if len(labels) > 3:
        window_start = random.randint(0, len(labels) - 3)
        window_end = random.randint(window_start + 3, len(labels))
        labels = labels[window_start:window_end]

    return make_table(labels)

In [5]:
def generate_ordered_bins_example():
    """Build one (input, target) table pair whose ticks are consecutive numeric bins.

    Draws a bin count between 5 and 15, an integer starting edge between
    -1000 and 1000 and an integer bin width between 5 and 50. Each label is
    written as "[lower, upper]", with the upper edge of one bin equal to the
    lower edge of the next.

    Returns:
        Whatever ``make_table`` returns for the bin labels.
    """
    count = random.randint(5, 15)
    origin = random.randint(-1000, 1000)
    width = random.randint(5, 50)

    # count bins need count + 1 edges; adjacent edges pair up into one bin each.
    edges = [origin + k * width for k in range(count + 1)]
    labels = [f"[{lower}, {upper}]" for lower, upper in zip(edges, edges[1:])]
    return make_table(labels)

# Main Dataset Generation

In [6]:
def make_table(values):
    """Render tick labels as a pair of tab-separated mapping tables.

    Both tables start with the header "Mapping Value<TAB>X Axis Tick" and
    then carry one row per label, numbered from 1. The target table holds
    every label; the input table keeps labels only on odd-numbered rows and
    leaves the tick column empty on even-numbered rows.

    Args:
        values: Iterable of tick labels (strings).

    Returns:
        A tuple ``(input_table, target_table)`` of newline-joined strings.
    """
    header = "Mapping Value\tX Axis Tick"

    # Complete rows first; the masked variant is derived from them.
    full_rows = [f"{number}\t{tick}" for number, tick in enumerate(values, start=1)]
    masked_rows = [
        row if number % 2 else f"{number}\t"
        for number, row in enumerate(full_rows, start=1)
    ]

    return "\n".join([header, *masked_rows]), "\n".join([header, *full_rows])

In [7]:
# Total examples per category.
NUM_EXAMPLES_PER_CATEGORY = 20000

# Fixed seed so that a fresh kernel + Run All regenerates identical files.
# Change the value (or remove the seed call) to obtain a different dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Define the four categories with their corresponding generator functions.
# Each generator returns an (input_table, target_table) pair of strings.
categories = [
    ("Numeric", generate_numeric_example),
    ("DateTime", generate_datetime_example),
    ("Categorical", generate_categorical_example),
    ("OrderedBins", generate_ordered_bins_example)
]

# For each category, create a separate JSON Lines file in the working directory.
for cat_name, gen_func in categories:
    filename = f"training_data_{cat_name}.jsonl"

    with open(filename, mode="w", encoding="utf-8") as outfile:

        for _ in range(NUM_EXAMPLES_PER_CATEGORY):

            input_table, target_table = gen_func()

            # The input carries the task prompt followed by the partially blanked table;
            # the target is the fully populated table.
            input_text = f"fill in the missing X Axis ticks:\n{input_table}"
            target_text = f"{target_table}"

            # One JSON object per line (JSON Lines).
            record = {"input_text": input_text, "target_text": target_text}
            outfile.write(json.dumps(record) + "\n")

    print(f"File '{filename}' with {NUM_EXAMPLES_PER_CATEGORY} records generated.")

print("All files have been generated.")

File 'training_data_Numeric.jsonl' with 20000 records generated.
File 'training_data_DateTime.jsonl' with 20000 records generated.
File 'training_data_Categorical.jsonl' with 20000 records generated.
File 'training_data_OrderedBins.jsonl' with 20000 records generated.
All files have been generated.
