In [1]:
import random

def generate_number(integer=True, digits=1):
    if integer:
        if random.choice([True, False]):
            return random.randint(10**(digits-1), 10**digits - 1)
        else:
            return -random.randint(10**(digits-1), 10**digits - 1)
    else:
        # ランダムにTrueまたはFalseを選択して正または負の数を生成
        if random.choice([True, False]):
            number = random.uniform(10**(digits-1), 10**digits - 1)
        else:
            number = -random.uniform(10**(digits-1), 10**digits - 1)
        
        # 四捨五入する桁数をランダムに決定
        rounding_digits = random.randint(-digits, digits-1)
        
        # 四捨五入
        rounded_number = round(number, rounding_digits)
        return rounded_number

def format_number(n, index):
    if index == 0:
        return str(n)
    return f"({n})" if n < 0 else str(n)

def generate_addition(integer=True, digits=1):
    a = generate_number(integer, digits)
    b = generate_number(integer, digits)
    question = f"{format_number(a, 0)} + {format_number(b, 1)} ="
    answer = a + b
    return question, answer

def generate_subtraction(integer=True, digits=1):
    a = generate_number(integer, digits)
    b = generate_number(integer, digits)
    question = f"{format_number(a, 0)} - {format_number(b, 1)} ="
    answer = a - b
    return question, answer

def generate_multiplication(integer=True, digits=1):
    a = generate_number(integer, digits)
    b = generate_number(integer, digits)
    question = f"{format_number(a, 0)} * {format_number(b, 1)} ="
    answer = a * b
    return question, answer

def generate_division(integer=True, digits=1):
    while True:
        b = generate_number(integer, digits)
        if b != 0:  # 分母がゼロでないことを確認
            break
    a = b * generate_number(integer, digits)
    question = f"{format_number(a, 0)} / {format_number(b, 1)} ="
    answer = a / b
    return question, answer

def generate_mixed_operation(integer=True, num_terms=3, digits=1):
    operations = ['+', '-', '*', '/']
    terms = [generate_number(integer, digits) for _ in range(num_terms)]
    chosen_operations = random.choices(operations, k=num_terms-1)
    answer = terms[0]
    question_parts = [format_number(terms[0], 0)]

    for i in range(num_terms-1):
        operation = chosen_operations[i]
        if operation == '/' and terms[i+1] == 0:
            terms[i+1] = 1  # 分母がゼロの場合は1に置き換え
        question_parts.append(f"{operation} {format_number(terms[i+1], i+1)}")
        if operation == '+':
            answer += terms[i+1]
        elif operation == '-':
            answer -= terms[i+1]
        elif operation == '*':
            answer *= terms[i+1]
        elif operation == '/':
            answer /= terms[i+1]

    question = " ".join(question_parts) + " ="
    return question, answer

def generate_problems(num_problems):
    problems = []
    answers = []

    for _ in range(num_problems):
        problem_type = random.choice(['addition', 'subtraction', 'multiplication', 'division', 'mixed'])
        integer = random.choice([True, True, True, True, True, True, False])
        num_terms = random.randint(2, 4)  # 2から4の項数をランダムに選択
        digits = random.randint(1, 4)  # 1から4桁をランダムに選択
        
        if problem_type == 'addition':
            q, a = generate_addition(integer, digits)
        elif problem_type == 'subtraction':
            q, a = generate_subtraction(integer, digits)
        elif problem_type == 'multiplication':
            q, a = generate_multiplication(integer, digits)
        elif problem_type == 'division':
            q, a = generate_division(integer, digits)
        elif problem_type == 'mixed':
            q, a = generate_mixed_operation(integer, num_terms, digits)
        problems.append(q)
        answers.append(f"{q} {a:.1f}" if not integer else f"{q} {a}")

    return problems, answers


In [2]:
from tqdm import tqdm

q_list = []
a_list=[]
n_problems=2*10**6

for i in tqdm(range(n_problems)):
    num_problems = random.randint(1,20)  # 任意の問題数に調整可能
    problems, answers = generate_problems(num_problems)
    problems = "\n".join(problems)
    answers = "\n".join(answers)
    q_list.append(problems)
    a_list.append(answers)

  0%|          | 0/2000000 [00:00<?, ?it/s]

100%|██████████| 2000000/2000000 [01:37<00:00, 20513.22it/s]


In [3]:
import pandas as pd
df = pd.DataFrame({'question': q_list, 'answer': a_list})
df.to_parquet("data/sansu.parquet")

In [4]:
from huggingface_hub import HfApi, logging
hf = HfApi()
hf.upload_file(path_or_fileobj="data/sansu.parquet",
                path_in_repo=f"1.parquet",
                repo_id="kanhatakeyama/Sansu", repo_type="dataset")




  from .autonotebook import tqdm as notebook_tqdm
sansu.parquet: 100%|██████████| 537M/537M [06:49<00:00, 1.31MB/s]    


CommitInfo(commit_url='https://huggingface.co/datasets/kanhatakeyama/Sansu/commit/a78b39b9837c0278000872c8df6403733c805c91', commit_message='Upload 1.parquet with huggingface_hub', commit_description='', oid='a78b39b9837c0278000872c8df6403733c805c91', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
print(problems)

348 + (-594) =
1404 * 4089 / 6837 =
10000.0 * 3782.55 / (-9666.7) * 6538.413 =
-2 + 6 =
58118208 / 8628 =
-2543 - (-2872) =
5399 / (-9218) + (-9427) =
-3 - (-8) * 9 =
-2865 + (-1018) =
75366 / 237 =
-8805 - 1524 =
3709 + 2167 =
794.64 * 930.0 =
8300.0 + 6259.9 =
-30 - 53 =
-272 - 946 =


In [6]:
print(answers)

348 + (-594) = -246
1404 * 4089 / 6837 = 839.6893374286968
10000.0 * 3782.55 / (-9666.7) * 6538.413 = -25584609.1
-2 + 6 = 4
58118208 / 8628 = 6736.0
-2543 - (-2872) = 329
5399 / (-9218) + (-9427) = -9427.58570188761
-3 - (-8) * 9 = 45
-2865 + (-1018) = -3883
75366 / 237 = 318.0
-8805 - 1524 = -10329
3709 + 2167 = 5876
794.64 * 930.0 = 739015.2
8300.0 + 6259.9 = 14559.9
-30 - 53 = -83
-272 - 946 = -1218
