In [49]:
import os
import pandas as pd
import random
from pathlib import Path

In [50]:
# Input locations: the processed dataset directory and the root of the
# per-variant instruction prompt files.
base_dir = "/shared/3/projects/instruction-retrieval/mathematics_dataset/processed"
prompts_base_dir = "/home/kalkiek/projects/instruction-retrieval/src/data/prompts/mvp"
cot_examples_path = os.path.join(base_dir, "cot_examples_with_solutions.tsv")

# Generated prompt files are written below this directory.
output_dir = os.path.join(base_dir, "prompt_variants")

# Names of the instruction variants; each one is a sub-directory of prompts_base_dir.
instruction_variants = ["baseline", "concise", "student"]

# Read the tab-separated table of worked (chain-of-thought) examples.
df_cot = pd.read_csv(cot_examples_path, sep="\t")
print(f"Loaded {len(df_cot)} examples with CoT solutions")

Loaded 30 examples with CoT solutions


In [51]:
# Function to load instruction prompts for a specific topic and variant
def load_instruction(topic, variant, prompts_dir=None):
    """Read the instruction prompt for one topic/variant pair.

    The prompt is looked up at ``<prompts_dir>/<variant>/<topic>.txt``.

    Args:
        topic: Topic name; used as the file stem.
        variant: Instruction variant name; used as the sub-directory.
        prompts_dir: Root directory of the prompt files. When omitted, the
            notebook-level ``prompts_base_dir`` is used.

    Returns:
        The file contents with leading/trailing whitespace stripped, or an
        empty string (after printing a warning) if the file does not exist.
    """
    if prompts_dir is None:
        prompts_dir = prompts_base_dir

    prompt_path = os.path.join(prompts_dir, variant, f"{topic}.txt")
    try:
        # Decode explicitly as UTF-8: without it open() falls back to the
        # locale's preferred encoding, so non-ASCII characters in a prompt
        # could be mis-decoded or raise depending on the machine.
        with open(prompt_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        # Best-effort: a missing prompt yields an empty instruction rather
        # than aborting the whole run.
        print(f"Warning: Instruction file not found for {topic} in {variant} variant")
        return ""

# Function to get CoT examples for a specific topic
def get_cot_examples(topic, num_examples=3):
    """Select few-shot CoT rows for `topic` from the notebook-level `df_cot`.

    Returns `num_examples` rows drawn with a fixed seed (random_state=42).
    If the topic has fewer rows than requested, a warning is printed and
    all of the topic's rows are returned without sampling.
    """
    matches = df_cot[df_cot['topic'] == topic]
    available = len(matches)

    if available >= num_examples:
        # Enough rows: draw a reproducible sample
        return matches.sample(num_examples, random_state=42)

    print(f"Warning: Only {available} CoT examples available for {topic}")
    return matches

# Function to format CoT examples as text
def format_cot_examples(examples):
    """Render a DataFrame of worked examples as one few-shot text block.

    Each row contributes its 'question', 'cot_solution' and 'answer' fields
    followed by a 50-dash divider line; the per-row blocks are joined with a
    newline. An empty DataFrame produces an empty string.
    """
    divider = "-" * 50
    blocks = [
        f"Question: {record['question']}\n\n"
        f"Solution:\n{record['cot_solution']}\n\n"
        f"Answer: {record['answer']}\n\n"
        f"{divider}\n"
        for _, record in examples.iterrows()
    ]
    return "\n".join(blocks)

In [52]:

# Accumulators: one list of row dicts per output variant.
variant_dfs = {
    'base': [],          # topic, question, answer only
    'few_show_cot': [],  # CoT examples followed by the question
}

# Each instruction variant gets two lists: instruction-only, and
# instruction followed by the CoT examples (registered in that order).
for variant in instruction_variants:
    for suffix in ('', '_few_show_cot'):
        variant_dfs[f'instructions_{variant}{suffix}'] = []

# Build one row per question for every variant.
# Every list in variant_dfs receives exactly one append per question, so the
# i-th entry of each list refers to the same question (the combined export
# further down indexes the lists by position and relies on this).
topics = df_cot['topic'].unique()

for topic in topics:
    # Get questions for this topic
    topic_questions = df_cot[df_cot['topic'] == topic]

    # Default few-shot block shared by the questions of this topic
    cot_examples = get_cot_examples(topic)
    cot_text = format_cot_examples(cot_examples)

    # Load instructions for each variant
    instructions = {}
    for variant in instruction_variants:
        instructions[variant] = load_instruction(topic, variant)

    # For each question in this topic
    for row_idx, row in topic_questions.iterrows():
        question = row['question']
        answer = row['answer']

        # The few-shot examples are drawn from the same rows that serve as
        # target questions. If this question is one of the sampled examples,
        # the default block would contain its own solution and answer, so
        # draw a replacement block from the topic's other rows instead.
        if row_idx in cot_examples.index:
            other_rows = topic_questions.drop(index=row_idx)
            n_shots = min(len(cot_examples), len(other_rows))
            replacement = other_rows.sample(n_shots, random_state=42) if n_shots else other_rows
            question_cot_text = format_cot_examples(replacement)
        else:
            question_cot_text = cot_text

        # Base data (topic, question, answer)
        base_data = {
            'topic': topic,
            'question': question,
            'answer': answer
        }
        variant_dfs['base'].append(base_data)

        # CoT examples
        cot_data = base_data.copy()
        cot_data['prompt'] = question_cot_text + "\n\n" + question
        variant_dfs['few_show_cot'].append(cot_data)

        # Instruction-only variants
        for variant in instruction_variants:
            inst_data = base_data.copy()
            inst_data['prompt'] = f"{instructions[variant]}\n\n{question}"
            variant_dfs[f'instructions_{variant}'].append(inst_data)

        # Instruction + CoT variants
        for variant in instruction_variants:
            inst_cot_data = base_data.copy()
            inst_cot_data['prompt'] = f"{instructions[variant]}\n\n{question_cot_text}\n\n{question}"
            variant_dfs[f'instructions_{variant}_few_show_cot'].append(inst_cot_data)

for variant_name, rows in variant_dfs.items():
    # Each variant is written to <output_dir>/<variant_name>/<variant_name>.json
    variant_dir = os.path.join(output_dir, variant_name)
    output_path = os.path.join(variant_dir, f"{variant_name}.json")
    os.makedirs(variant_dir, exist_ok=True)

    # One JSON record per prompt row
    df = pd.DataFrame(rows)
    df.to_json(output_path, orient='records', indent=2)
    print(f"Saved {len(df)} {variant_name} prompts to {output_path}")

# Combined export: one record per question holding the base fields plus one
# column per variant containing that variant's prompt text.
combined_rows = []
for i, base_row in enumerate(variant_dfs['base']):
    row = base_row.copy()

    # The lists are aligned by position, so entry i of every variant list
    # belongs to the same question as base_row.
    for variant_name, rows in variant_dfs.items():
        if variant_name == 'base':
            continue
        row[variant_name] = rows[i]['prompt']

    combined_rows.append(row)

combined_path = os.path.join(output_dir, "all_variants.json")
df_combined = pd.DataFrame(combined_rows)
df_combined.to_json(combined_path, orient='records', indent=2)
print(f"\nSaved combined dataset with all variants to {combined_path}")

# List the prompt-bearing variants ('base' holds no prompt column and is skipped)
print("\nSaved the following variant files:")
for variant_name in variant_dfs.keys():
    if variant_name != 'base':
        print(f"- {variant_name}.json")

# Preview the first combined record, truncating long values to 100 characters
print("\nSample row from combined file (truncated):")
if df_combined.empty:
    # No rows were generated: iloc[0] would raise IndexError, so report it instead
    print("(combined dataset is empty - nothing to display)")
else:
    sample_row = df_combined.iloc[0]
    for col in df_combined.columns:
        value = str(sample_row[col])
        if len(value) > 100:
            value = value[:100] + "..."
        print(f"{col}: {value}")

Saved 30 base prompts to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/prompt_variants/base/base.json
Saved 30 few_show_cot prompts to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/prompt_variants/few_show_cot/few_show_cot.json
Saved 30 instructions_baseline prompts to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/prompt_variants/instructions_baseline/instructions_baseline.json
Saved 30 instructions_baseline_few_show_cot prompts to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/prompt_variants/instructions_baseline_few_show_cot/instructions_baseline_few_show_cot.json
Saved 30 instructions_concise prompts to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/prompt_variants/instructions_concise/instructions_concise.json
Saved 30 instructions_concise_few_show_cot prompts to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/prompt_variants/instruction