In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Parameters
input_path = './magpie_data/magpie_llama70b_00-of-13.parquet'
output_dir = './processed_magpie'
num_samples = 30000


def format_sample_count(count):
    """Return a short size label for use in output filenames.

    Exact multiples of 1000 are abbreviated with a 'k' suffix (30000 -> '30k');
    every other count is written out in full (1250 -> '1250'). Plain integer
    division would turn 500 into '0k' and both 1250 and 1900 into '1k', so
    differently sized exports would overwrite each other.

    Args:
        count: Number of samples (non-negative int).

    Returns:
        The label as a string.
    """
    if count >= 1000 and count % 1000 == 0:
        return f'{count // 1000}k'
    return str(count)


output_filename = f'magpie_clean_{format_sample_count(num_samples)}.parquet'

# Placeholder process_prompt function
def process_prompt(prompt):
    """Wrap a raw instruction in the chat template as a single user turn.

    The result starts with the begin-of-text token, contains the instruction
    inside a user header / end-of-turn pair, and ends with an open assistant
    header so that the completion can be appended directly after it.

    Args:
        prompt: The instruction text to embed.

    Returns:
        The templated prompt string.
    """
    user_turn = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n{}<|eot_id|>\n".format(prompt)
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
    return user_turn + assistant_header

def process_response(response):
    """Terminate a response with the end-of-turn token.

    Args:
        response: The assistant's reply text.

    Returns:
        The reply followed immediately by ``<|eot_id|>``.
    """
    end_of_turn = "<|eot_id|>"
    return "{}{}".format(response, end_of_turn)

# Load the source Parquet file into a DataFrame.
# Later cells read its 'instruction' and 'response' columns.
df = pd.read_parquet(input_path)

In [3]:
# Fixed seed so that Restart & Run All draws the same rows every time
random_seed = 42

# Randomly sample up to num_samples rows.
# The sample is bound to a new name instead of overwriting `df`, so re-running
# this cell always draws from the full dataset rather than from a previous sample.
if num_samples < len(df):
    df_sampled = df.sample(n=num_samples, random_state=random_seed)
else:
    # Only warn when the request really exceeds what is available; an exact
    # match is not a shortfall.
    if num_samples > len(df):
        print(f"Requested {num_samples} samples, but only {len(df)} available. Using all rows.")
    df_sampled = df

# Build the training frame: templated prompt plus terminated completion
df_out = pd.DataFrame({
    'prompt': df_sampled['instruction'].apply(process_prompt),
    'completion': df_sampled['response'].apply(process_response),
})

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Save to Parquet (the row index is dropped; it only reflects the sampling order)
output_path = os.path.join(output_dir, output_filename)
df_out.to_parquet(output_path, index=False)

print(f"Saved {len(df_out)} samples to {output_path}")

Saved 30000 samples to ./processed_magpie\magpie_clean_30k.parquet


# Optionally view our datasets after creating them
---


In [87]:
# Choose the dataset to inspect (leave exactly one path uncommented)
# parquet_path = './train/magpie_clean_10k.parquet'
parquet_path = './train/explainer_clean_1250.parquet'
# parquet_path = './train/explanation_traindata_1k_cleaned.parquet'

# Read the file a single time and report how many rows it holds
view_df = pd.read_parquet(parquet_path)
print("Loaded df", parquet_path, "with length =", len(view_df))

Loaded df ./train/explainer_clean_1250.parquet with length = 1250


In [86]:
# # Old code used to process pre-generated data 
# def split_prompt_completion(text):
#     """
#     Splits the input string into 'prompt' and 'completion' based on the marker.
#     Returns (prompt, completion).
#     """
#     marker = '<|start_header_id|>assistant<|end_header_id|>'
#     idx = text.find(marker)
#     prompt = text[:idx + len(marker)]
#     completion = text[idx + len(marker):]
#     return prompt, completion

# # Loop through all rows, process, and collect results
# prompts = []
# completions = []

# for idx, row in view_df.iterrows():
#     prompt, completion = split_prompt_completion(row['prompt'])
#     prompts.append(prompt)
#     completions.append(completion)

# # Create new DataFrame
# df_processed = pd.DataFrame({
#     'prompt': prompts,
#     'completion': completions
# })

# # Save to Parquet
# output_path = './train/explainer_clean_1k.parquet'
# df_processed.to_parquet(output_path, index=False)
# print(f"Saved processed prompts to {output_path}")

In [95]:
# Helper function
def print_random_sample(n=1, df=None, random_state=None):
    """Print ``n`` randomly chosen prompt/completion pairs from a dataframe.

    Args:
        n: Number of rows to print. Capped at the number of rows available,
            so asking for more rows than exist prints every row instead of
            raising.
        df: Frame with 'prompt' and 'completion' columns. When omitted, the
            notebook-level ``view_df`` is used.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the selection repeatable. ``None`` draws a fresh sample each call.

    Returns:
        None. The samples are written to stdout.
    """
    # Resolve the global lazily so the function also works on any other frame
    frame = view_df if df is None else df
    n_rows = min(n, len(frame))
    sample = frame.sample(n=n_rows, random_state=random_state)
    for idx, row in sample.iterrows():
        # idx is the row's index label in the source frame
        print(f"--- Sample {idx} ---")
        print("Prompt:\n", row['prompt'])
        print("\nResponse:\n", row['completion'])
        print("\n" + "="*40 + "\n")

# Show one randomly chosen prompt/completion pair from the loaded dataset
print_random_sample()  # Pass n=<count> to print more than one sample

--- Sample 872 ---
Prompt:
 <|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a chess grandmaster currently playing a very strong opponent. Assume they will be playing optimally. 

Shortly you'll be provided with a board state by the user -- please analyze it and think through your possible moves.

You should refer to moves in UCI notation (e.g., d7d5) and should include your thinking in think tags (e.g., <think> your_thinking </think>) and your answer in answer tags (e.g., <answer> UCI_move </answer>). 

As a technique you may want to consider enumerating possible moves and simulating the likely trajectory that would ensue. <|eot_id|>
<|start_header_id|>user<|end_header_id|>
8| . . . . . . . k
7| p . p . . . p .
6| . p . . . p . p
5| . . . . . . . .
4| . . P P . . . P
3| . P . . . K P .
2| P . . . . R . .
1| . . . r . . . .
   _ _ _ _ _ _ _ _
   A B C D E F G H

- It is Black’s turn to move.
- No castling rights available.
- No en passant target square.
- Halfmove c