In [1]:
import random

def generate_number(integer=True, digits=1):
    """Draw a random number with a randomly chosen sign.

    Args:
        integer: When true the magnitude is drawn with ``random.randint``;
            otherwise it is drawn with ``random.uniform``.
        digits: Sets the magnitude bounds to ``10**(digits-1)`` and
            ``10**digits - 1``.

    Returns:
        The drawn magnitude, negated when the sign draw comes up ``False``.
    """
    # The sign is drawn before the magnitude, so the random stream is
    # consumed in the order: sign first, value second.
    keep_positive = random.choice([True, False])
    lower = 10**(digits-1)
    upper = 10**digits - 1
    draw = random.randint if integer else random.uniform
    magnitude = draw(lower, upper)
    return magnitude if keep_positive else -magnitude

def format_number(n, index):
    """Render ``n`` as text for use as one term of a printed expression.

    Args:
        n: The number to render.
        index: Position of the term in the expression; the term at
            position 0 is never parenthesised.

    Returns:
        ``str(n)``, wrapped in parentheses when ``n`` is negative and the
        term is not the first one.
    """
    # Only non-leading negative terms get parentheses, e.g. "3 + (-4)".
    if index != 0 and n < 0:
        return f"({n})"
    return str(n)

def generate_addition(integer=True, digits=1):
    """Create one two-term addition problem.

    Args:
        integer: Forwarded to ``generate_number`` for both operands.
        digits: Forwarded to ``generate_number`` for both operands.

    Returns:
        A ``(question, answer)`` tuple: the question text ends in ``" ="``
        and the answer is the sum of the two operands.
    """
    left, right = (generate_number(integer, digits) for _ in range(2))
    # Integer and non-integer problems share the same text layout and sum.
    question = " ".join([format_number(left, 0), "+", format_number(right, 1), "="])
    return question, left + right

def generate_subtraction(integer=True, digits=1):
    """Create one two-term subtraction problem.

    Args:
        integer: Forwarded to ``generate_number`` for both operands.
        digits: Forwarded to ``generate_number`` for both operands.

    Returns:
        A ``(question, answer)`` tuple: the question text ends in ``" ="``
        and the answer is the first operand minus the second.
    """
    minuend = generate_number(integer, digits)
    subtrahend = generate_number(integer, digits)
    # The same layout and arithmetic apply to integer and non-integer operands.
    question = "{} - {} =".format(
        format_number(minuend, 0),
        format_number(subtrahend, 1),
    )
    return question, minuend - subtrahend

def generate_multiplication(integer=True, digits=1):
    """Create one two-term multiplication problem.

    Args:
        integer: Forwarded to ``generate_number`` for both factors.
        digits: Forwarded to ``generate_number`` for both factors.

    Returns:
        A ``(question, answer)`` tuple: the question text ends in ``" ="``
        and the answer is the product of the two factors.
    """
    factors = [generate_number(integer, digits), generate_number(integer, digits)]
    # The position decides parenthesisation: only the second factor can be wrapped.
    shown = [format_number(factor, position) for position, factor in enumerate(factors)]
    question = f"{shown[0]} * {shown[1]} ="
    return question, factors[0] * factors[1]

def generate_division(integer=True, digits=1):
    """Create one division problem whose dividend is built from the divisor.

    The divisor and a second number are drawn with ``generate_number`` and
    the dividend is their product, so integer problems always divide evenly.

    Args:
        integer: Forwarded to ``generate_number`` for both draws; also
            selects exact integer division for the answer.
        digits: Forwarded to ``generate_number`` for both draws.

    Returns:
        A ``(question, answer)`` tuple. For integer problems the answer is
        an ``int``; otherwise it is the float ``dividend / divisor``.
    """
    # The divisor is drawn first, then the factor used to build the dividend.
    divisor = generate_number(integer, digits)
    dividend = divisor * generate_number(integer, digits)
    question = f"{format_number(dividend, 0)} / {format_number(divisor, 1)} ="
    if integer:
        # The dividend is an exact multiple of the divisor, so floor division
        # is exact and keeps the answer an int ("-7" instead of "-7.0").
        answer = dividend // divisor
    else:
        answer = dividend / divisor
    return question, answer

def _evaluate_with_precedence(terms, operations):
    """Evaluate ``terms`` joined by ``operations`` with standard precedence.

    ``*`` and ``/`` bind tighter than ``+`` and ``-``; operators of equal
    precedence are applied left to right. The inputs are not modified.
    """
    # First pass: fold every run of * and / into a single additive operand.
    additive_terms = [terms[0]]
    additive_operations = []
    for operation, term in zip(operations, terms[1:]):
        if operation == '*':
            additive_terms[-1] *= term
        elif operation == '/':
            additive_terms[-1] /= term
        else:
            additive_operations.append(operation)
            additive_terms.append(term)

    # Second pass: apply the remaining + and - from left to right.
    result = additive_terms[0]
    for operation, term in zip(additive_operations, additive_terms[1:]):
        if operation == '+':
            result += term
        else:
            result -= term
    return result


def generate_mixed_operation(integer=True, num_terms=3, digits=1):
    """Create one problem that chains several terms with random operators.

    The question is printed as a plain infix expression without grouping
    parentheses, so the answer is computed with standard operator precedence
    (``*`` and ``/`` before ``+`` and ``-``) to match how the text reads.

    Args:
        integer: Forwarded to ``generate_number`` for every term.
        num_terms: Number of terms in the expression; must be at least 1.
        digits: Forwarded to ``generate_number`` for every term.

    Returns:
        A ``(question, answer)`` tuple. The question text ends in ``" ="``.
        The answer is a float whenever a division is involved.

    Raises:
        ValueError: If ``num_terms`` is smaller than 1.
    """
    if num_terms < 1:
        raise ValueError("num_terms must be at least 1")

    operations = ['+', '-', '*', '/']
    # Terms are drawn before the operators, in that order.
    terms = [generate_number(integer, digits) for _ in range(num_terms)]
    chosen_operations = random.choices(operations, k=num_terms-1)

    # Interleave terms and operators: "t0 op0 t1 op1 ... tN =".
    tokens = [format_number(terms[0], 0)]
    for position, (operation, term) in enumerate(zip(chosen_operations, terms[1:]), start=1):
        tokens.append(operation)
        tokens.append(format_number(term, position))
    question = " ".join(tokens) + " ="

    answer = _evaluate_with_precedence(terms, chosen_operations)
    return question, answer

def generate_problems(num_problems):
    """Build ``num_problems`` random arithmetic problems.

    Each problem gets a random type, a random integer/non-integer flag, a
    random term count and a random digit count.

    Args:
        num_problems: How many problems to generate.

    Returns:
        ``(problems, answers)``: two lists of equal length. ``problems``
        holds the question texts; each ``answers`` entry is the question
        text followed by a space and the answer (one decimal place for
        non-integer problems, default formatting otherwise).
    """
    two_operand_generators = {
        'addition': generate_addition,
        'subtraction': generate_subtraction,
        'multiplication': generate_multiplication,
        'division': generate_division,
    }
    problems, answers = [], []

    for _ in range(num_problems):
        # All four draws happen for every problem, in this order, even though
        # num_terms is only used by the 'mixed' type.
        problem_type = random.choice(['addition', 'subtraction', 'multiplication', 'division', 'mixed'])
        integer = random.choice([True, False])
        num_terms = random.randint(2, 5)  # random term count from 2 to 5
        digits = random.randint(1, 5)  # random digit count from 1 to 5

        if problem_type == 'mixed':
            q, a = generate_mixed_operation(integer, num_terms, digits)
        else:
            q, a = two_operand_generators[problem_type](integer, digits)

        problems.append(q)
        if integer:
            answers.append(f"{q} {a}")
        else:
            answers.append(f"{q} {a:.1f}")

    return problems, answers


In [2]:
from tqdm import tqdm

# Build n_problems samples; each sample is a newline-joined batch of 1-20
# problems. q_list collects the question texts and a_list the matching
# question-plus-answer texts.
# NOTE(review): no random.seed(...) call is visible in this notebook, so the
# generated data presumably differs from run to run.
n_problems = 10**6
q_list, a_list = [], []

for i in tqdm(range(n_problems)):
    num_problems = random.randint(1, 20)  # problems per sample; adjust as needed
    # After this line `problems` and `answers` are single newline-joined strings.
    problems, answers = ("\n".join(lines) for lines in generate_problems(num_problems))
    q_list.append(problems)
    a_list.append(answers)

  0%|          | 0/1000000 [00:00<?, ?it/s]

100%|██████████| 1000000/1000000 [00:53<00:00, 18605.97it/s]


In [3]:
# Preview one sample: `problems` still holds the newline-joined questions
# assigned during the last iteration of the generation loop.
print(problems)

-573 - (-513) =
-81575 - 86451 =
4.011852999147356 / 8.71314970064429 * (-6.9017175955423316) =
-43926.70337446031 * 37463.2934646677 =
-332.8889664710741 + 119.51777745340755 / (-786.9374006450711) + 556.0103472829967 + 285.7562562582849 =
5 - 9 =
63.2333983923888 + (-49.32729351506989) =
-606.1658151973718 + 656.298850170222 =
156.91047225774162 + (-291.09474036136385) - 233.29376001585717 =
6 + (-7) =
564.8362418271255 * (-600.3704923973667) =
18090.159595631387 + (-54750.8129175688) - (-79357.6271833576) - 62559.19125298541 =
6.092756955324712 + 5.28978354196366 =
-790622.4704388035 / (-985.1941490024643) =
2 - 3 =
-54320 * (-38592) =
-5 + (-3) =


In [4]:
from pathlib import Path

import pandas as pd

# One row per generated sample: the joined question text and the joined
# question-plus-answer text.
df = pd.DataFrame({'question': q_list, 'answer': a_list})

# Writing fails if the parent directory is missing, so make sure "data/"
# exists before saving.
Path("data").mkdir(parents=True, exist_ok=True)
df.to_parquet("data/sansu.parquet")

In [5]:
from huggingface_hub import HfApi, logging
# Upload data/sansu.parquet to the dataset repo as "1.parquet".
# NOTE(review): no credentials are passed here; presumably a cached
# Hugging Face login is used - confirm before re-running.
hf = HfApi()
# Kept as the final expression so the notebook displays the returned value.
hf.upload_file(
    path_or_fileobj="data/sansu.parquet",
    path_in_repo="1.parquet",
    repo_id="kanhatakeyama/Sansu",
    repo_type="dataset",
)




  from .autonotebook import tqdm as notebook_tqdm
sansu.parquet: 100%|██████████| 621M/621M [08:22<00:00, 1.24MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/kanhatakeyama/Sansu/commit/d30a7fc3f7dd0984d69f873bc5fe4550e5acb8d6', commit_message='Upload 1.parquet with huggingface_hub', commit_description='', oid='d30a7fc3f7dd0984d69f873bc5fe4550e5acb8d6', pr_url=None, pr_revision=None, pr_num=None)