In [1]:
import json
from pathlib import Path
import re

In [2]:
def preprocess_wino_file(input_path, label_value=None):
    """Turn a line-oriented scenario file into a list of scenario records.

    Every line that still contains text after cleaning becomes one record.
    Cleaning strips surrounding whitespace and removes a leading run of
    digits (plus the whitespace after it) from the line.

    Args:
        input_path: Path of a UTF-8 text file holding one scenario per line.
        label_value: Label stored on every record. When ``None`` the label
            is the string ``"unknown"``.

    Returns:
        A list of dicts, in file order, each with the keys ``scenario_id``,
        ``scenario_text`` and ``label``.
    """
    # Use the full file name rather than ``.stem``: ``.stem`` only drops the
    # last suffix, so "x.txt.dev" and "x.txt.test" would both yield "x.txt"
    # and produce colliding scenario IDs.
    source_name = Path(input_path).name
    label = "unknown" if label_value is None else label_value

    data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            clean_text = re.sub(r"^\d+\s*", "", line.strip())
            if not clean_text:
                # Blank (or number-only) lines carry no scenario; skip them
                # but keep ``i`` as the original line index so the IDs of
                # the remaining lines do not shift.
                continue
            data.append(
                {
                    "scenario_id": f"{source_name}_{i}",  # Unique per file and line
                    "scenario_text": clean_text,
                    "label": label,
                }
            )

    return data

In [12]:
# Input files for the dev split, each paired with the label its scenarios receive
label_by_file = {
    "anti_stereotyped_type1.txt.dev": "anti_stereotyped",
    "pro_stereotyped_type1.txt.dev": "pro_stereotyped",
}
files = list(label_by_file.items())

# Accumulator for the scenarios read from those files
data_1 = []

In [13]:
# Rebuild the list from scratch instead of extending a list created in another
# cell, so re-running this cell cannot append a second copy of every scenario.
data_1 = [
    scenario
    for file_path, label in files
    for scenario in preprocess_wino_file(file_path, label)
]

In [14]:
# Persist the labelled dev scenarios as pretty-printed UTF-8 JSON
with Path("dev_type1.json").open("w", encoding="utf-8") as out_file:
    json.dump(data_1, out_file, indent=2, ensure_ascii=False)

In [15]:
# Report how many scenario records are in data_1 after writing dev_type1.json
print(f"✅ Saved {len(data_1)} scenarios to dev_type1.json")

✅ Saved 792 scenarios to dev_type1.json


### Collect 100 samples from dev_type1.json

In [4]:
# Reload the serialized dev scenarios from disk; the cell displays how many were read
data = json.loads(Path("dev_type1.json").read_text(encoding="utf-8"))
len(data)

792

In [16]:
# Number of scenarios to draw from each label group
SAMPLES_PER_LABEL = 50

# Pick the first 50 anti-stereotyped and the last 50 pro-stereotyped records.
# Selecting by label (not by position in the file) keeps the groups correct
# even if the record order changes, and avoids the negative-index wrap that
# data[len(data)-50:] produces when fewer than 50 records exist.
anti_samples = [s for s in data if s["label"] == "anti_stereotyped"][:SAMPLES_PER_LABEL]
pro_samples = [s for s in data if s["label"] == "pro_stereotyped"][-SAMPLES_PER_LABEL:]

In [20]:
# Concatenate the two sample groups and write them to a single JSON file
sample_data = [*anti_samples, *pro_samples]
with Path("sample_data_100.json").open(mode="w", encoding="utf-8") as out_file:
    json.dump(sample_data, out_file, indent=2, ensure_ascii=False)

### Collect pro-stereotyped samples

In [3]:
# Both pro-stereotyped splits (dev and test), each tagged with the same label
files = [
    (file_name, "pro_stereotyped")
    for file_name in (
        "pro_stereotyped_type1.txt.dev",
        "pro_stereotyped_type1.txt.test",
    )
]

# Accumulator for the scenarios read from those files
data_pro_stereotyped = []

In [4]:
# Rebuild the list from scratch instead of extending a list created in another
# cell, so re-running this cell cannot append a second copy of every scenario.
data_pro_stereotyped = [
    scenario
    for file_path, label in files
    for scenario in preprocess_wino_file(file_path, label)
]

In [6]:
# Persist every pro-stereotyped scenario as pretty-printed UTF-8 JSON
with Path("pro_stereotyped_type1.json").open("w", encoding="utf-8") as out_file:
    json.dump(data_pro_stereotyped, out_file, indent=2, ensure_ascii=False)

In [7]:
# Display the number of pro-stereotyped scenario records collected
len(data_pro_stereotyped)

792

In [8]:
# Write the first 500 pro-stereotyped scenarios to their own JSON file
first_500 = data_pro_stereotyped[:500]
with Path("pro_stereotyped_samples_500.json").open("w", encoding="utf-8") as out_file:
    json.dump(first_500, out_file, indent=2, ensure_ascii=False)