In [9]:
import os
import pandas as pd
import glob
from pathlib import Path

base_dir = "/shared/3/projects/instruction-retrieval/mathematics_dataset"
output_dir = "/shared/3/projects/instruction-retrieval/mathematics_dataset"

In [12]:
import os
import pandas as pd
import glob
import argparse
from pathlib import Path

def process_difficulty(difficulty, base_dir, samples_per_topic):
    """Process a single difficulty level with the specified number of samples per topic."""
    print(f"Processing {difficulty} difficulty with {samples_per_topic} samples per topic...")
    
    # Path to the difficulty folder
    difficulty_dir = os.path.join(base_dir, f"train-{difficulty}")
    
    # List to store all sampled data
    all_sampled_data = []
    
    # Get all text files in the difficulty directory
    txt_files = glob.glob(os.path.join(difficulty_dir, "*.txt"))
    
    for file_path in txt_files:
        # Extract topic from filename
        topic = os.path.basename(file_path).replace(".txt", "")
        print(f"  Processing topic: {topic}")
        
        # List to store data for this topic
        topic_data = []
        
        # Read only enough lines to get samples_per_topic examples
        with open(file_path, 'r', encoding='utf-8') as f:
            samples_collected = 0
            
            while samples_collected < samples_per_topic:
                # Read question line
                question_line = f.readline()
                if not question_line:  # End of file
                    break
                
                # Read answer line
                answer_line = f.readline()
                if not answer_line:  # Unexpected end of file
                    break
                
                question = question_line.strip()
                answer = answer_line.strip()
                
                # Add to topic data
                topic_data.append({
                    "topic": topic,
                    "question": question,
                    "answer": answer
                })
                
                samples_collected += 1
        
        # Add collected samples to the main list
        all_sampled_data.extend(topic_data)
        print(f"    Collected {len(topic_data)} examples")
    
    # Create DataFrame from all sampled data
    df = pd.DataFrame(all_sampled_data)
    
    # Create output directory if it doesn't exist
    output_dir = os.path.join(base_dir, "processed")
    os.makedirs(output_dir, exist_ok=True)
    
    # Save to TSV
    output_file = os.path.join(output_dir, f"{difficulty}_{samples_per_topic}.tsv")
    df.to_csv(output_file, sep='\t', index=False)
    print(f"Saved {len(df)} examples to {output_file}")
    
    return len(df)

def build_math_datasets(sample_sizes=None):
    """Build math datasets with different sample sizes per topic."""
    if sample_sizes is None:
        sample_sizes = [100, 1000, 10000]
    
    # Base directory
    base_dir = "/shared/3/projects/instruction-retrieval/mathematics_dataset"
    
    # Difficulty levels
    difficulties = ["easy", "medium", "hard"]
    
    # Process each sample size
    for samples_per_topic in sample_sizes:
        print(f"\nProcessing datasets with {samples_per_topic} samples per topic")
        
        # Process each difficulty level with this sample size
        for difficulty in difficulties:
            total_examples = process_difficulty(difficulty, base_dir, samples_per_topic)
            print(f"Completed {difficulty} difficulty with {total_examples} total examples")

In [14]:
SAMPLES_PER_TOPIC = 100
build_math_datasets( [100, 1000, 10000])


Processing datasets with 100 samples per topic
Processing easy difficulty with 100 samples per topic...
  Processing topic: algebra__linear_1d
    Collected 100 examples
  Processing topic: arithmetic__mul
    Collected 100 examples
  Processing topic: calculus__differentiate
    Collected 100 examples
  Processing topic: comparison__pair
    Collected 100 examples
  Processing topic: numbers__gcd
    Collected 100 examples
  Processing topic: numbers__is_prime_composed
    Collected 100 examples
  Processing topic: polynomials__add
    Collected 100 examples
  Processing topic: numbers__list_prime_factors
    Collected 100 examples
  Processing topic: polynomials__evaluate_composed
    Collected 100 examples
  Processing topic: algebra__polynomial_roots_composed
    Collected 100 examples
  Processing topic: algebra__linear_2d
    Collected 100 examples
  Processing topic: comparison__kth_biggest
    Collected 100 examples
  Processing topic: polynomials__collect
    Collected 100 ex

## MVP Dataset

In [16]:
import random

base_dir = "/shared/3/projects/instruction-retrieval/mathematics_dataset/processed"
easy_fp = os.path.join(base_dir, 'easy_10000.tsv')
sample_size = 100

topics = ['calculus__differentiate_composed', 'algebra__polynomial_roots_composed', 'numbers__list_prime_factors_composed']

df = pd.read_csv(easy_fp, sep='\t')
    
# Sample from each topic and combine
samples = []
for topic in topics:
    topic_df = df[df['topic'] == topic]
    if len(topic_df) > 0:
        topic_sample = topic_df.sample(min(sample_size, len(topic_df)), random_state=42)
        samples.append(topic_sample)
        print(f"Added {len(topic_sample)} examples from {topic}")

# Combine and save
mvp_df = pd.concat(samples, ignore_index=True)
output_file = os.path.join(base_dir, "mvp.tsv")
mvp_df.to_csv(output_file, sep='\t', index=False)
print(f"Saved {len(mvp_df)} examples to {output_file}")

Added 100 examples from calculus__differentiate_composed
Added 100 examples from algebra__polynomial_roots_composed
Added 100 examples from numbers__list_prime_factors_composed
Saved 300 examples to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/mvp.tsv
