In [None]:
import pickle
import json

def clean_surrogates_robust(text):
    """Return ``text`` with all surrogate code points removed or replaced.

    Surrogate code points (U+D800-U+DFFF) cannot be encoded as UTF-8, so a
    string containing them cannot be written to a UTF-8 file. Two strategies
    are tried in order:

    1. ``surrogateescape`` round trip. Surrogates in U+DC80-U+DCFF are turned
       back into the raw bytes 0x80-0xFF they stand for, and the result is
       decoded as UTF-8 again. Bytes that form a valid UTF-8 sequence become
       the real character; any other byte becomes U+FFFD.
    2. If the text holds any surrogate outside U+DC80-U+DCFF, the encode in
       step 1 raises ``UnicodeEncodeError`` and the whole string is encoded
       with ``errors='replace'`` instead, which turns every surrogate into
       ``"?"``.

    NOTE: the two paths use different placeholders (U+FFFD vs ``"?"``), and
    the fallback also discards escaped bytes that step 1 could have recovered.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        A ``str`` free of surrogate code points, or ``text`` itself when it is
        not a string.
    """
    if not isinstance(text, str):
        return text
    try:
        # Only the encode can fail; keep the try body minimal.
        raw = text.encode('utf-8', errors='surrogateescape')
    except UnicodeEncodeError:
        # A surrogate outside the escape range is present. The 'replace'
        # handler emits plain ASCII '?', so the bytes are valid UTF-8 and a
        # strict decode is safe.
        return text.encode('utf-8', errors='replace').decode('utf-8')
    return raw.decode('utf-8', errors='replace')

# Load the raw responses from the binary pickle backup (read in "rb" mode, so
# no text decoding happens here and no encoding errors can occur).
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load backups produced by this pipeline itself.
with open("raw_responses_backup.pkl", "rb") as f:
    responses = pickle.load(f)

print(f"Loaded {len(responses)} responses.")

# Clean ALL responses (non-string entries pass through unchanged).
responses_cleaned = [clean_surrogates_robust(r) for r in responses]

# Verify no surrogates remain. The whole surrogate block is U+D800-U+DFFF:
# high surrogates (U+D800-U+DBFF) are just as unencodable as low ones
# (U+DC00-U+DFFF), so both halves must be checked.
bad_count = sum(
    1
    for r in responses_cleaned
    if isinstance(r, str) and any(0xD800 <= ord(c) <= 0xDFFF for c in r)
)
print(f"Remaining bad surrogates: {bad_count}")


import pandas as pd

test_df = pd.read_json("test_df.json")  # <-- Make sure this exists!

# Responses are matched to test rows purely by position, so the two sequences
# must line up. Fail early with a clear message when responses are missing,
# and make it visible when surplus responses are being dropped.
n_rows = len(test_df)
n_responses = len(responses_cleaned)
if n_responses < n_rows:
    raise ValueError(
        f"Only {n_responses} responses for {n_rows} test rows; "
        "cannot build a complete submission."
    )
if n_responses > n_rows:
    print(f"Warning: {n_responses - n_rows} extra responses will be ignored.")

# One record per test row: the row's id (as a plain int so it is JSON
# serializable) paired with the cleaned response at the same position.
submission_data = [
    {"id": int(row_id), "response": response}
    for row_id, response in zip(test_df["id"], responses_cleaned)
]

submission_file = "submission.json"
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding.
with open(submission_file, 'w', encoding='utf-8') as f:
    json.dump(submission_data, f, ensure_ascii=False, indent=2)

print(f"Successfully saved {len(submission_data)} submissions to {submission_file}")