In [None]:
import json
import pandas as pd
from typing import List, Dict, Any
import re
import numpy as np

def read_jsonl_file(filename: str) -> List[Dict[str, Any]]:
    """Load a JSON Lines file into a list of parsed objects.

    Blank (whitespace-only) lines are ignored. A line that is not valid
    JSON is reported on stdout together with its 1-based line number and
    then skipped, so a single bad line never aborts the load.

    Args:
        filename: Path of the UTF-8 encoded JSONL file.

    Returns:
        The parsed objects, in file order.
    """
    parsed = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for line_num, raw in enumerate(handle, 1):
            text = raw.strip()
            if text:
                try:
                    parsed.append(json.loads(text))
                except json.JSONDecodeError as e:
                    print(f"Error parsing JSON on line {line_num}: {e}")
    return parsed

def convert_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten Wordle game records into one DataFrame row per game.

    Args:
        data: Game records. Each record must provide ``metadata`` (with the
            keys ``target_word``, ``attempts_used``, ``solved`` and
            ``total_reward``) and ``messages`` (a list of dicts carrying
            ``role`` and ``content``).

    Returns:
        A DataFrame with the columns ``target_word``, ``attempts_used``,
        ``solved``, ``total_reward``, ``num_messages`` and ``first_guess``,
        in that order. ``first_guess`` is the word inside the first boxed
        marker of the first assistant message, or ``'unknown'`` when the
        game has no assistant message or that message has no boxed word.
        The columns are present even when ``data`` is empty, so callers
        can filter on them unconditionally.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    columns = [
        'target_word',
        'attempts_used',
        'solved',
        'total_reward',
        'num_messages',
        'first_guess',
    ]
    records = []

    for game in data:
        metadata = game['metadata']
        messages = game['messages']

        # Content of the first assistant turn, or None when the model never replied.
        first_reply = next(
            (msg['content'] for msg in messages if msg['role'] == 'assistant'),
            None,
        )

        # Always populate first_guess so every row uses the same sentinel
        # instead of mixing 'unknown' with NaN.
        first_guess = 'unknown'
        if isinstance(first_reply, str):
            match = re.search(r'\\boxed\{(\w+)\}', first_reply)
            if match:
                first_guess = match.group(1)

        records.append({
            'target_word': metadata['target_word'],
            'attempts_used': metadata['attempts_used'],
            'solved': metadata['solved'],
            'total_reward': metadata['total_reward'],
            'num_messages': len(messages),
            'first_guess': first_guess,
        })

    # Pinning the columns keeps the schema stable for an empty input.
    return pd.DataFrame(records, columns=columns)


In [None]:
# Load every game record from disk, flatten it to one row per game, and
# derive the initial solved_df that the filtering cells below read.
print("Reading JSONL file...")
data = read_jsonl_file('wordle_examples.jsonl')
df = convert_to_dataframe(data)
print(f"DataFrame shape: {df.shape}")

# Keep only the rows whose 'solved' value equals True.
solved_df = df[df['solved'] == True]
print(f"Solved games: {len(solved_df)}")

# Number of solved games per attempt count, ordered by attempt count.
print("\nSolved games by attempts:")
print(solved_df['attempts_used'].value_counts().sort_index())

# Frequency of each target word among solved games; only words that occur
# more than 25 times are listed.
print("\nTarget word counts (showing words with >25 games):")
target_counts = solved_df['target_word'].value_counts()
high_count_words = target_counts[target_counts > 25]
for word, count in high_count_words.items():
    print(f"{word} {count}")

# Summary figures. The ">15" count is the number of words that exceed the
# per-word cap of 15 applied by the METHOD 1 / METHOD 2 cells.
print(f"\nTotal unique target words: {solved_df['target_word'].nunique()}")
print(f"Max games per target word: {target_counts.max()}")
print(f"Words with >15 games: {sum(target_counts > 15)}")


In [None]:
# METHOD 1: Take first 15 rows for each target word (deterministic)
# groupby(...).head(15) keeps at most the first 15 rows of every target word;
# the index is then renumbered from 0.
print("=== METHOD 1: First 15 rows per target word ===")
filtered_df_first15 = solved_df.groupby('target_word').head(15).reset_index(drop=True)

# Size before and after capping, and how many rows the cap removed.
print(f"Original solved_df shape: {solved_df.shape}")
print(f"Filtered (first 15 per target) shape: {filtered_df_first15.shape}")
print(f"Reduction: {len(solved_df) - len(filtered_df_first15)} rows removed")

# The ten most frequent target words after capping.
print("\nTarget word counts after filtering (top 10):")
print(filtered_df_first15['target_word'].value_counts().head(10))

# Largest and smallest per-word row counts that remain after capping.
print(f"\nMax rows per target word: {filtered_df_first15['target_word'].value_counts().max()}")
print(f"Min rows per target word: {filtered_df_first15['target_word'].value_counts().min()}")


In [None]:
# METHOD 2: Random sample of up to 15 rows per target word
print("=== METHOD 2: Random 15 rows per target word ===")

# Seed NumPy's global random state for reproducibility.
# NOTE(review): the sampling below calls group.sample(n=n) without a
# random_state, so repeatability presumably relies on this seed being set
# immediately before the sampling runs - confirm against the pandas docs.
np.random.seed(42)

def sample_max_n(group, n=15):
    """Cap a group at ``n`` rows by random sampling.

    Args:
        group: Object supporting ``len()`` and ``.sample(n=...)``; the
            notebook passes one per-target-word DataFrame.
        n: Maximum number of rows to keep.

    Returns:
        ``group`` itself when it already has at most ``n`` rows, otherwise
        the result of ``group.sample(n=n)``.
    """
    within_cap = len(group) <= n
    return group if within_cap else group.sample(n=n)

# Sample each target word's rows independently and stitch the pieces back
# together. Iterating the groupby yields every group with all of its columns
# (including 'target_word'), whereas groupby(...).apply(...) only passes the
# grouping column to the function on pandas versions that have not yet removed
# that deprecated behaviour - and reset_index(drop=True) would then discard
# it, breaking the value_counts on 'target_word' below.
# Groups are visited in sorted key order, so the row order and the sequence
# of random draws match the apply-based form.
sampled_groups = [sample_max_n(group, 15) for _, group in solved_df.groupby('target_word')]
if sampled_groups:
    filtered_df_random15 = pd.concat(sampled_groups).reset_index(drop=True)
else:
    # pd.concat rejects an empty list; keep an empty frame with the same columns.
    filtered_df_random15 = solved_df.iloc[0:0].reset_index(drop=True)

# Size before and after capping, and how many rows the cap removed.
print(f"Original solved_df shape: {solved_df.shape}")
print(f"Filtered (random 15 per target) shape: {filtered_df_random15.shape}")
print(f"Reduction: {len(solved_df) - len(filtered_df_random15)} rows removed")

# The ten most frequent target words after random capping.
print("\nTarget word counts after random filtering (top 10):")
print(filtered_df_random15['target_word'].value_counts().head(10))

print(f"\nMax rows per target word: {filtered_df_random15['target_word'].value_counts().max()}")

# Compare attempt distributions
# Side-by-side check of how each capping method changes the attempts_used
# distribution of the solved games.
print("\n--- Attempt Distribution Comparison ---")
print("Original:")
print(solved_df['attempts_used'].value_counts().sort_index())
print("\nFiltered (first 15):")
print(filtered_df_first15['attempts_used'].value_counts().sort_index())
print("\nFiltered (random 15):")
print(filtered_df_random15['attempts_used'].value_counts().sort_index())


In [None]:
# FINAL STEP: Choose your preferred filtering method
# NOTE(review): this cell rebinds solved_df to the capped frame, while the
# METHOD 1 and METHOD 2 cells read solved_df as their input. Re-running
# those cells after this one therefore operates on already-filtered data;
# restart the kernel and run from the top to get a clean result.
print("=== CHOOSE YOUR FILTERING METHOD ===")

# Option 1: First 15 (deterministic, preserves original order)
solved_df = filtered_df_first15.copy()

# Option 2: Random 15 (random sampling - uncomment to use)
# solved_df = filtered_df_random15.copy()

# Shape and per-word row-count range of the frame that was selected above.
print(f"Final solved_df shape: {solved_df.shape}")
print(f"Max rows per target word: {solved_df['target_word'].value_counts().max()}")
print(f"Min rows per target word: {solved_df['target_word'].value_counts().min()}")
print(f"Number of unique target words: {solved_df['target_word'].nunique()}")

# len(data) counts every record loaded from the JSONL file (solved or not);
# len(solved_df) counts the capped solved games.
print(f"\nDataFrame successfully filtered to max 15 rows per target word!")
print(f"Reduced from {len(data)} total games to {len(solved_df)} solved games with balanced target word distribution.")


In [23]:
import json
import pandas as pd
from typing import List, Dict, Any
import random

def read_jsonl_file(filename: str) -> List[Dict[str, Any]]:
    """
    Parse a JSONL file (one JSON value per line) into a list.

    Whitespace-only lines are skipped. Lines that fail to parse are
    reported on stdout with their 1-based line number and skipped; the
    remaining lines are still returned, in file order.
    """
    records = []
    with open(filename, 'r', encoding='utf-8') as fh:
        for line_num, raw_line in enumerate(fh, 1):
            stripped = raw_line.strip()
            if not stripped:
                # Nothing to parse on a blank line.
                continue

            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON on line {line_num}: {e}")
            else:
                records.append(parsed)

    return records

def analyze_wordle_data(data: List[Dict[str, Any]]) -> None:
    """
    Print basic statistics about a collection of Wordle games.

    Reports the total, solved and failed game counts, the success rate,
    the mean ``attempts_used`` over solved games (only when at least one
    game was solved), and the target words of the first ten games.

    Args:
        data: Game records; each must carry ``metadata`` with the keys
            ``solved``, ``attempts_used`` and ``target_word``.

    An empty ``data`` list is reported as such rather than raising
    ``ZeroDivisionError`` while computing the success rate.
    """
    total = len(data)
    print(f"Total games: {total}")

    # Split on the solved flag; every game lands in exactly one bucket.
    solved_games = [game for game in data if game['metadata']['solved']]
    failed_count = total - len(solved_games)

    print(f"Games solved: {len(solved_games)}")
    print(f"Games failed: {failed_count}")
    if total:
        print(f"Success rate: {len(solved_games) / total * 100:.1f}%")
    else:
        # No games means the rate is undefined, not zero.
        print("Success rate: n/a (no games)")

    if solved_games:
        avg_attempts = sum(game['metadata']['attempts_used'] for game in solved_games) / len(solved_games)
        print(f"Average attempts for solved games: {avg_attempts:.2f}")

    # Show some target words
    target_words = [game['metadata']['target_word'] for game in data[:10]]
    print(f"\nFirst 10 target words: {target_words}")

def sample_max_n_games(games: List[Dict[str, Any]], n: int = 10) -> List[Dict[str, Any]]:
    """
    Cap a list of games at ``n`` entries by random sampling.

    When ``games`` already holds at most ``n`` entries the same list object
    is returned untouched; otherwise ``random.sample`` picks ``n`` of them.
    The game dicts themselves are never copied or modified.
    """
    needs_sampling = len(games) > n
    return random.sample(games, n) if needs_sampling else games

def filter_and_save_jsonl(input_filename: str, output_filename: str, max_per_word: int = 10) -> None:
    """
    Write a capped, solved-only copy of a Wordle JSONL file.

    Steps: load ``input_filename``, drop games whose ``metadata['solved']``
    is falsy, bucket the rest by ``metadata['target_word']``, keep at most
    ``max_per_word`` games per bucket (randomly sampled when a bucket is
    larger), and write the kept games unchanged, one JSON object per line,
    to ``output_filename``. Progress and the final per-word distribution
    are printed to stdout.
    """
    print("Reading JSONL file...")
    all_games = read_jsonl_file(input_filename)

    # Only solved games are eligible for the output file.
    solved = [entry for entry in all_games if entry['metadata']['solved']]
    print(f"Found {len(solved)} solved games out of {len(all_games)} total games")

    # Bucket by target word; dict insertion order keeps first-seen word order.
    buckets = {}
    for entry in solved:
        buckets.setdefault(entry['metadata']['target_word'], []).append(entry)

    print(f"Found {len(buckets)} unique target words")

    # Cap every bucket and report how much each one shrank.
    kept = []
    for word, bucket in buckets.items():
        chosen = sample_max_n_games(bucket, max_per_word)
        kept.extend(chosen)
        print(f"Target '{word}': {len(bucket)} -> {len(chosen)} games")

    print(f"\nTotal filtered games: {len(kept)}")

    # One JSON document per line; non-ASCII characters are written as-is.
    with open(output_filename, 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(entry, ensure_ascii=False) + '\n' for entry in kept)

    print(f"Saved filtered data to {output_filename}")

    # Recount from what was actually kept, then list alphabetically.
    tally = {}
    for entry in kept:
        key = entry['metadata']['target_word']
        tally[key] = tally.get(key, 0) + 1

    print(f"\nFinal distribution:")
    for key in sorted(tally):
        print(f"  {key}: {tally[key]}")

# Example usage: runs the filtering pipeline when this code is executed as
# the main module.
if __name__ == "__main__":
    # Seed the stdlib RNG before filtering so that the random.sample calls
    # made by sample_max_n_games pick the same games on every run.
    random.seed(42)
    
    # Keep at most 10 solved games per target word from wordle_examples.jsonl
    # and write them to wordle_finetune.jsonl.
    filter_and_save_jsonl('wordle_examples.jsonl', 'wordle_finetune.jsonl', max_per_word=10)

Reading JSONL file...
Found 8424 solved games out of 8424 total games
Found 503 unique target words
Target 'refer': 23 -> 10 games
Target 'lower': 10 -> 10 games
Target 'shine': 18 -> 10 games
Target 'virus': 16 -> 10 games
Target 'blast': 21 -> 10 games
Target 'storm': 14 -> 10 games
Target 'taste': 25 -> 10 games
Target 'dream': 23 -> 10 games
Target 'trunk': 18 -> 10 games
Target 'chose': 16 -> 10 games
Target 'brief': 21 -> 10 games
Target 'mixed': 21 -> 10 games
Target 'focus': 16 -> 10 games
Target 'lunch': 15 -> 10 games
Target 'spare': 22 -> 10 games
Target 'peter': 19 -> 10 games
Target 'legal': 25 -> 10 games
Target 'draft': 16 -> 10 games
Target 'array': 12 -> 10 games
Target 'dying': 25 -> 10 games
Target 'globe': 21 -> 10 games
Target 'power': 24 -> 10 games
Target 'plant': 23 -> 10 games
Target 'earth': 16 -> 10 games
Target 'lying': 22 -> 10 games
Target 'train': 18 -> 10 games
Target 'human': 14 -> 10 games
Target 'smith': 21 -> 10 games
Target 'steam': 24 -> 10 games
T