In [5]:
import pandas as pd
import re

# Prioritised answer patterns, compiled once because extract_selection is
# applied to every row of a DataFrame.  The first pattern (in this order) that
# matches anywhere in the text wins.  Prefix patterns end in \b so that the
# captured letter must stand alone: without it "answer depends" would yield
# 'd' and "optional" would yield 'a'.
_SELECTION_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE | re.MULTILINE)
    for pattern in (
        r'Choice:\s*\**\s*([a-e])\b',                 # Choice: d) ...
        # Must precede the bare "Answer" pattern, which would otherwise
        # shadow it and pick up an earlier, non-final "Answer: x".
        r'Final Answer:?\s*\**\s*([a-e])\b',          # Final Answer: e)
        r'Answer:?\s*\**\s*([a-e])\b',                # Answer: c)
        r'The correct answer is\s*\**\s*([a-e])\b',   # The correct answer is c)
        r'Single letter choice:\s*\**\s*([a-e])\b',   # Single letter choice: c
        # A line that is just "d", "(d)", "**d**", "d." or "d)".
        r'^[^a-zA-Z0-9]*\**\(?([a-eA-E])\)?\**\s*(?:[).])?\s*$',
        r'\b([a-e])\)\s',                             # "d) " inside text
        # "d." inside text, but not part of a dotted abbreviation such as
        # "e.g." or "i.e." (letter preceded by a dot / followed by a letter).
        r'(?<![\w.])([a-e])\.(?![A-Za-z])',
        r'option\s*\**([a-e])\b',                     # option c
    )
)


def extract_selection(text) -> str:
    """Extract a multiple-choice letter (a-e) from a free-form response.

    The heuristics are tried in this order, returning on the first hit:

    1. Explicit answer phrasings and stand-alone letters
       (``_SELECTION_PATTERNS``); the first occurrence in the text is used.
    2. The phrases "none of these" / "none of the above", mapped to ``'e'``.
    3. The last bold letter of the form ``**x**``.
    4. The last ``x)`` in the text.

    Args:
        text: The response to parse. Missing values (anything ``pd.isna``
            reports as NA) are accepted; other values are converted with
            ``str``.

    Returns:
        The selected letter in lower case, or ``''`` when the value is
        missing or no heuristic matches.
    """
    if pd.isna(text):
        return ''
    text = str(text)

    # 1. Explicit answer lines and stand-alone letters (case-insensitive).
    for pattern in _SELECTION_PATTERNS:
        m = pattern.search(text)
        if m:
            return m.group(1).lower()

    # 2. Special case for "None of these" / "None of the above".
    if re.search(r'none of (these|the above)', text, re.IGNORECASE):
        return 'e'

    # 3. As fallback, use the last "**[a-e]**" in the response.
    bold_letters = re.findall(r'\*\*([a-eA-E])\*\*', text)
    if bold_letters:
        return bold_letters[-1].lower()

    # 4. As a last resort, use the last "a)" ... "e)" with no prefix.
    paren_letters = re.findall(r'\b([a-eA-E])\)', text)
    if paren_letters:
        return paren_letters[-1].lower()

    return ''

# Helper to process a file
def process_file(path, colname='answers'):
    """Add a ``selection_filtered`` column to a CSV and save it as a new file.

    The CSV at ``path`` is read, ``extract_selection`` is applied to every
    value of ``colname``, and the result is written (without the index) next
    to the input with ``_filtered`` inserted before the ``.csv`` extension.

    Args:
        path: Path of the input CSV, as a string.
        colname: Name of the column holding the raw responses.

    Returns:
        The path of the file that was written.
    """
    df = pd.read_csv(path)
    df['selection_filtered'] = df[colname].apply(extract_selection)

    # Only the trailing extension is rewritten.  A plain str.replace would
    # also touch ".csv" inside directory names, and would leave the path
    # unchanged (overwriting the input) when no lowercase ".csv" is present.
    if path.lower().endswith('.csv'):
        out_path = f'{path[:-4]}_filtered{path[-4:]}'
    else:
        out_path = f'{path}_filtered.csv'

    df.to_csv(out_path, index=False)
    return out_path

# Input CSV files to run through process_file. Adjust the column name passed
# below if a file does not keep its responses in 'answers'.
files = [
    'aya-expanse-32bresponses_with_selection.csv',
    'Sky-T1-32B-Previewresponses_with_selection.csv',
    'QwQ-32Bresponses_with_selection.csv',
    'DeepHermes-3-Mistral-24B-Previewresponses_with_selection.csv',
]

# Each file is read from its 'answers' column and written back out by
# process_file; the returned output path is not needed here.
for f in files:
    process_file(f, colname='answers')

print("Done. All filtered files are saved with _filtered.csv suffix.")


Selections saved to aya-expanse-32bresponses_with_selection.csv
