In [1]:
import os
import pandas as pd
import random
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm
import time

### Sample problems for chain-of-thought (CoT) examples

**Load in the data**

In [2]:
# Directory that holds the processed mathematics-dataset TSV files
base_dir = "/shared/3/projects/instruction-retrieval/mathematics_dataset/processed"

# Full paths of the two input files
mvp_path = os.path.join(base_dir, 'mvp.tsv')
easy_10k_path = os.path.join(base_dir, 'easy_10000.tsv')

# Read both tab-separated files into dataframes
df_mvp = pd.read_csv(mvp_path, sep='\t')
df_10k = pd.read_csv(easy_10k_path, sep='\t')

print(f"Loaded {len(df_10k)} examples from 10k dataset")
print(f"Loaded {len(df_mvp)} examples from MVP dataset")

# Distinct values of the 'topic' column in the 10k dataset
topics = pd.unique(df_10k['topic'])
print(f"Found {len(topics)} unique topics")

# Topics that problems are sampled from in the next cell
selected_topics = [
    'algebra__polynomial_roots_composed',
    'calculus__differentiate_composed',
    'numbers__list_prime_factors_composed',
]

# (question, answer) pairs of the MVP dataset, kept in a set so membership
# checks are constant-time
mvp_set = {
    (question, answer)
    for question, answer in zip(df_mvp['question'], df_mvp['answer'])
}

Loaded 560000 examples from 10k dataset
Loaded 300 examples from MVP dataset
Found 56 unique topics


In [3]:
# Number of problems drawn per topic, and the seed that makes the draw repeatable
N_PER_TOPIC = 10
SAMPLE_SEED = 42

# Maps each selected topic to the dataframe of problems sampled from it
sampled_problems = {}

for topic in selected_topics:
    # Rows of the source dataframe that belong to the current topic
    topic_df = df_10k[df_10k['topic'] == topic]

    # Flag problems whose (question, answer) pair already exists in the MVP set.
    # Zipping the two columns avoids building a Series per row, as
    # DataFrame.apply(axis=1) does, and still yields a proper empty boolean
    # mask when the topic has no rows at all.
    in_mvp = pd.Series(
        [pair in mvp_set for pair in zip(topic_df['question'], topic_df['answer'])],
        index=topic_df.index,
        dtype=bool,
    )
    filtered_df = topic_df.loc[~in_mvp]

    # Fail with a message that names the topic instead of pandas' generic
    # "larger sample than population" error (same exception type).
    if len(filtered_df) < N_PER_TOPIC:
        raise ValueError(
            f"Topic {topic!r} has only {len(filtered_df)} problems outside the "
            f"MVP set; {N_PER_TOPIC} are required"
        )

    # Seeded draw so re-running the cell selects the same problems
    sampled = filtered_df.sample(N_PER_TOPIC, random_state=SAMPLE_SEED)
    sampled_problems[topic] = sampled
    print(f"Sampled {len(sampled)} examples from {topic}")

# Stack the per-topic samples into one dataframe with a fresh 0..n-1 index
sampled_df = pd.concat(sampled_problems.values(), ignore_index=True)
print(f"\nTotal sampled problems: {len(sampled_df)}")

Sampled 10 examples from algebra__polynomial_roots_composed
Sampled 10 examples from calculus__differentiate_composed
Sampled 10 examples from numbers__list_prime_factors_composed

Total sampled problems: 30


In [4]:
# Write the sampled problems as a tab-separated file (no index column) into
# the processed-data directory
output_file = os.path.join(base_dir, "cot_examples.tsv")
sampled_df.to_csv(path_or_buf=output_file, index=False, sep='\t')
print(f"Saved {len(sampled_df)} examples to {output_file}")

Saved 30 examples to /shared/3/projects/instruction-retrieval/mathematics_dataset/processed/cot_examples.tsv


### Generate CoT

In [7]:
# Load variables from the project's .env file into the process environment
dotenv_path = Path("/home/kalkiek/projects/instruction-retrieval/.env")
load_dotenv(dotenv_path=dotenv_path)

# Read the key once and reuse it for the client below
api_key = os.environ.get('OPEN_AI_KEY')

# Fail here, with a message naming the expected variable, rather than later
# inside the client. OPENAI_API_KEY is checked as well because the OpenAI
# client falls back to it when no api_key is passed.
if not api_key and not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        f"No OpenAI API key found: set OPEN_AI_KEY in {dotenv_path} "
        "or in the environment"
    )

# An empty OPEN_AI_KEY is passed as None so the client's own fallback applies
client = OpenAI(api_key=api_key or None)

In [8]:
# Processed-data directory and the TSV of sampled problems inside it
base_dir = "/shared/3/projects/instruction-retrieval/mathematics_dataset/processed"
cot_examples_path = os.path.join(base_dir, 'cot_examples.tsv')

# Read the tab-separated sample file into a dataframe
df_problems = pd.read_csv(cot_examples_path, delimiter='\t')
print(f"Loaded {len(df_problems)} problems for CoT generation")

def generate_cot(question, answer, model="gpt-4o", max_retries=3, retry_delay=5):
    """Ask an OpenAI chat model for a step-by-step solution to a math problem.

    Args:
        question: Problem text inserted into the user prompt.
        answer: Reference answer. Accepted so existing call sites keep working,
            but not used: the prompt contains only the question.
        model: Name of the chat-completions model to query.
        max_retries: Number of additional attempts made after the first
            attempt fails. Negative values are treated as 0.
        retry_delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        The content of the first choice of the model's reply on success.
        If every attempt fails, the string "Error: <last exception message>".
    """
    prompt = f"Solve this math problem step by step:\n\nProblem: {question}\n\nThink step by step to find the answer."

    messages = [
        {"role": "system", "content": "You are a helpful math tutor. Provide clear step-by-step solutions to math problems."},
        {"role": "user", "content": prompt}
    ]

    retries = max(0, max_retries)
    last_error = None
    for attempt in range(retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.2,
                max_tokens=4096
            )
            return response.choices[0].message.content
        except Exception as e:
            # Deliberately broad: any failure of the API call is logged and
            # retried rather than aborting the caller's loop.
            last_error = e
            print(f"Error generating CoT: {e}")
            # Only wait when another attempt will actually follow
            if attempt < retries:
                time.sleep(retry_delay)

    # All attempts failed: keep the error-string return so callers that store
    # the result keep working.
    return f"Error: {str(last_error)}"

Loaded 30 problems for CoT generation


In [10]:
# Single destination for the periodic checkpoints and the final result
solutions_path = os.path.join(base_dir, "cot_examples_with_solutions.tsv")
# Write a checkpoint after this many processed problems
CHECKPOINT_EVERY = 5

# Start every run with an empty solution column
df_problems['cot_solution'] = None

# Generate a CoT solution for each problem, with a progress bar
progress = tqdm(df_problems.iterrows(), total=len(df_problems), desc="Generating CoT solutions")
for n_done, (i, row) in enumerate(progress, start=1):
    cot = generate_cot(row['question'], row['answer'])

    # Store the solution on the row it was generated for
    df_problems.at[i, 'cot_solution'] = cot

    # Checkpoint on the number of rows processed rather than on the index
    # label, so the cadence does not depend on the index being 0..n-1.
    if n_done % CHECKPOINT_EVERY == 0:
        df_problems.to_csv(solutions_path, sep='\t', index=False)
        print(f"Saved progress after {n_done} examples")

    # Small delay between requests to avoid rate limits
    time.sleep(1)

# Save the final results
df_problems.to_csv(solutions_path, sep='\t', index=False)
print(f"Completed CoT generation for {len(df_problems)} problems")

# generate_cot signals a failed call by returning a string starting with
# "Error: "; report those rows instead of letting them pass as solutions.
failed = df_problems['cot_solution'].astype(str).str.startswith("Error: ")
if failed.any():
    print(f"WARNING: {int(failed.sum())} problems have an error instead of a solution (rows {failed[failed].index.tolist()})")

# Display a few examples with their CoT solutions. A dedicated seeded generator
# keeps the preview reproducible without touching the global random state.
preview_rng = random.Random(42)
sample_indices = preview_rng.sample(range(len(df_problems)), min(3, len(df_problems)))
for idx in sample_indices:
    row = df_problems.iloc[idx]
    print(f"\n--- Example from {row['topic']} ---")
    print(f"Question: {row['question']}")
    print(f"CoT Solution:\n{row['cot_solution']}")
    print(f"Correct Answer: {row['answer']}")
    print("-" * 80)

Generating CoT solutions:   0%|                                                                                                            | 0/30 [00:00<?, ?it/s]

Generating CoT solutions:  13%|█████████████▎                                                                                      | 4/30 [01:45<11:04, 25.56s/it]

Saved progress after 5 examples


Generating CoT solutions:  30%|██████████████████████████████                                                                      | 9/30 [04:36<11:48, 33.75s/it]

Saved progress after 10 examples


Generating CoT solutions:  47%|██████████████████████████████████████████████▏                                                    | 14/30 [05:55<04:57, 18.58s/it]

Saved progress after 15 examples


Generating CoT solutions:  63%|██████████████████████████████████████████████████████████████▋                                    | 19/30 [07:21<03:23, 18.52s/it]

Saved progress after 20 examples


Generating CoT solutions:  80%|███████████████████████████████████████████████████████████████████████████████▏                   | 24/30 [08:18<01:18, 13.12s/it]

Saved progress after 25 examples


Generating CoT solutions:  97%|███████████████████████████████████████████████████████████████████████████████████████████████▋   | 29/30 [09:28<00:12, 12.91s/it]

Saved progress after 30 examples


Generating CoT solutions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [09:44<00:00, 19.48s/it]

Completed CoT generation for 30 problems

--- Example from algebra__polynomial_roots_composed ---
Question: Factor o**3 + 15/4*o**2 - 1 + 3*o.
CoT Solution:
To factor the expression \( o^3 + \frac{15}{4}o^2 - 1 + 3o \), we will follow these steps:

### Step 1: Rearrange the Terms
First, let's rearrange the terms in descending order of the powers of \( o \):

\[ o^3 + \frac{15}{4}o^2 + 3o - 1 \]

### Step 2: Group Terms
Next, we will try to group the terms in a way that might make factoring easier. Let's group the first two terms and the last two terms:

\[ (o^3 + \frac{15}{4}o^2) + (3o - 1) \]

### Step 3: Factor by Grouping
Now, we will factor out the greatest common factor from each group:

- From the first group \( o^3 + \frac{15}{4}o^2 \), we can factor out \( o^2 \):

  \[ o^2(o + \frac{15}{4}) \]

- From the second group \( 3o - 1 \), there is no common factor other than 1, so it remains as it is:

  \[ 3o - 1 \]

Now, our expression looks like this:

\[ o^2(o + \frac{15}{4}) + (


