In [2]:
import pandas as pd
import random
import json
import re

In [3]:
# Input CSVs and the JSONL file produced for the reverse-direction eval.
EVAL_PATH = "data/eval.csv"
SLANG_PATH = "data/dataset.csv"
OUT_PATH  = "data/eval_reverse.jsonl"

eval_df = pd.read_csv(EVAL_PATH)
slang_df = pd.read_csv(SLANG_PATH)

# Normalize slang keys (trim + lowercase) so both frames use the same
# spelling of each term when looking definitions up.
eval_df["slang_norm"]  = eval_df["slang"].astype(str).str.strip().str.lower()
slang_df["slang_norm"] = slang_df["slang"].astype(str).str.strip().str.lower()

# Build slang_norm -> definition mapping (first definition per slang).
# Rows without a definition are dropped before grouping: otherwise a term
# whose definitions are all missing would survive as NaN and str() below
# would turn it into the literal text "nan", which would then be written
# out as a gold target. Terms with no definition are simply absent from the
# mapping instead.
defs = (
    slang_df
    .dropna(subset=["definition"])
    .loc[:, ["slang_norm", "definition"]]
    .groupby("slang_norm")
    .first()
    .reset_index()
)

# Plain dict for O(1) lookups; definitions are coerced to str and trimmed.
slang_to_def = {
    slang: str(definition).strip()
    for slang, definition in zip(defs["slang_norm"], defs["definition"])
}

def clean_def(d: str) -> str:
    """Return a definition without surrounding whitespace or trailing punctuation.

    Args:
        d: The raw definition. Non-string values are converted with ``str()``.

    Returns:
        The text with leading/trailing whitespace removed and any trailing
        mix of ``.``, ``!``, ``?`` and whitespace stripped.
    """
    d = str(d).strip()
    # Strip whitespace together with the punctuation so inputs such as
    # "no cap ." or "really? ..." do not keep a dangling space or stray mark.
    d = re.sub(r"[\s\.!\?]+$", "", d)
    return d

# Collect one {"prompt", "target"} record per usable example sentence.
records = []

for _, eval_row in eval_df.iterrows():
    term = eval_row["slang_norm"]

    # Only terms that have an entry in slang_to_def get a gold target.
    if term in slang_to_def:
        gold_meaning = clean_def(slang_to_def[term])

        for example_col in ("example_1", "example_2"):
            sentence = eval_row[example_col]

            # Anything that is not a string (e.g. a missing cell) is skipped.
            if isinstance(sentence, str):
                prompt = (
                    "You will be given a sentence that contains a modern slang term.\n"
                    + "Explain what the sentence means in standard English, focusing on the slang.\n\n"
                    + f"Sentence: \"{sentence}\"\n"
                    + "Meaning:"
                )
                # The cleaned definition is the gold meaning to compare against.
                records.append({"prompt": prompt, "target": gold_meaning})

print(f"Eval examples: {len(records)}")

# One JSON object per line; ensure_ascii=False writes non-ASCII characters
# as-is instead of \u escapes.
with open(OUT_PATH, "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in records
    )

print(f"Wrote: {OUT_PATH}")


Eval examples: 394
Wrote: data/eval_reverse.jsonl


In [4]:
# Load training source
slang_df = pd.read_csv('data/dataset.csv')

# Normalize (in case there are stray spaces / case issues)
slang_df['slang_norm'] = slang_df['slang'].astype(str).str.strip().str.lower()

def clean_def(d: str) -> str:
    """Return a definition without surrounding whitespace or trailing punctuation.

    Args:
        d: The raw definition. Non-string values are converted with ``str()``.

    Returns:
        The text with leading/trailing whitespace removed and any trailing
        mix of ``.``, ``!``, ``?`` and whitespace stripped.
    """
    d = str(d).strip()
    # Remove trailing punctuation like "." or "...", together with any
    # whitespace around it, so "no cap ." does not keep a dangling space
    # and "really? ..." does not keep a stray "?".
    d = re.sub(r'[\s\.!\?]+$', '', d)
    return d

# Build prompt/completion pairs for fine-tuning from the slang dataset.
records = []

for _, src_row in slang_df.iterrows():
    sentence = src_row['example']
    meaning = src_row['definition']

    # Keep the row only when both fields are actual strings.
    if isinstance(sentence, str) and isinstance(meaning, str):
        cleaned = clean_def(meaning)

        prompt = (
            "You will be given a sentence that contains a modern slang term.\n"
            + "Explain what the sentence means in standard English, focusing on the slang.\n\n"
            + f"Sentence: \"{sentence}\"\n"
            + "Meaning:"
        )

        # Completion is the lowercased definition with a leading space and
        # a closing period.
        records.append({
            "prompt": prompt,
            "completion": f" {cleaned.lower()}.",
        })

out_path = 'data/finetune_reverse.jsonl'

# One JSON object per line; ensure_ascii=False writes non-ASCII characters
# as-is instead of \u escapes.
with open(out_path, 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in records
    )

print(f"Wrote {len(records)} examples to {out_path}")


Wrote 605 examples to data/finetune_reverse.jsonl
