In [5]:
import pandas as pd
import random
import json
import re

In [8]:
import pandas as pd
import random

# Load datasets
eval_df = pd.read_csv('data/eval.csv')
slang_df = pd.read_csv('data/dataset.csv')

# Normalize slang keys
eval_df['slang_norm'] = eval_df['slang'].astype(str).str.strip().str.lower()
slang_df['slang_norm'] = slang_df['slang'].astype(str).str.strip().str.lower()

# Deduplicate slang definitions
slang_defs = slang_df[['slang_norm', 'definition']].drop_duplicates()

# Build dict: slang -> definition
slang_to_def = {row['slang_norm']: row['definition'] for _, row in slang_defs.iterrows()}

# All definitions for distractor pool
all_defs = list(slang_to_def.values())

rows = []

# Template patterns for paraphrases
templates = [
    "Here, {} probably means {}",
    "It suggests that {} is like {}",
    "It implies that {} could be {}",
    "In other words, {} likely means {}"
]

for _, row in eval_df.iterrows():

    slang = row['slang_norm']
    examples = [row['example_1'], row['example_2']]

    # Correct definition
    true_def = slang_to_def[slang]

    # Sample distractor definitions
    distractor_defs = random.sample(
        [d for d in all_defs if d != true_def], 
        3
    )

    for ex in examples:
        sentence = ex

        # Build correct paraphrase
        correct = random.choice(templates).format(slang, true_def.lower())

        # Build distractor paraphrases
        distractors = [
            random.choice(templates).format(slang, d.lower())
            for d in distractor_defs
        ]

        # Randomize option order
        options = [('A', correct)] + [('B', distractors[0]), ('C', distractors[1]), ('D', distractors[2])]
        
        # Shuffle but track correct label
        random.shuffle(options)
        labels = ['A', 'B', 'C', 'D']
        option_map = {labels[i]: options[i][1] for i in range(4)}
        correct_label = next(label for label, text in option_map.items() if text == correct)

        # Add row
        rows.append({
            'sentence': sentence,
            'option_A': options[0][1],
            'option_B': options[1][1],
            'option_C': options[2][1],
            'option_D': options[3][1],
            'correct': correct_label
        })

# Convert to DataFrame and save
reverse_df = pd.DataFrame(rows)
reverse_df.to_csv('data/eval_reverse.csv', index=False)

reverse_df


Unnamed: 0,sentence,option_A,option_B,option_C,option_D,correct
0,His drip was so clean that even the photograph...,"In other words, drip likely means commonly use...",It suggests that drip is like common in textin...,"Here, drip probably means fear of missing out;...",It implies that drip could be stylish or fashi...,D
1,She upgraded her drip after finding that thrif...,"Here, drip probably means common in texting be...","In other words, drip likely means stylish or f...",It implies that drip could be fear of missing ...,"In other words, drip likely means commonly use...",B
2,Her boujee taste had her rejecting every resta...,"In other words, boujee likely means a casual w...",It suggests that boujee is like a short break ...,It suggests that boujee is like commonly used ...,It suggests that boujee is like luxurious or f...,D
3,He acted boujee all night after getting a sing...,"In other words, boujee likely means luxurious ...","In other words, boujee likely means commonly u...",It suggests that boujee is like a short break ...,"Here, boujee probably means a casual way to as...",A
4,He stayed home with popcorn feeling pure jomo ...,It implies that jomo could be refers to a soft...,"Here, jomo probably means joy of missing out.","Here, jomo probably means a call for help or a...","In other words, jomo likely means exciting, fu...",B
...,...,...,...,...,...,...
389,She said smh when she heard the gossip.,It implies that smh could be a short break dur...,"Here, smh probably means good or cool.","Here, smh probably means often used to share a...",It suggests that smh is like used to express d...,D
390,"What's good, fam?","In other words, fam likely means over the top,...","Here, fam probably means refers to the end of ...",It implies that fam could be used in casual co...,It suggests that fam is like angry or frustrated.,C
391,She greeted the whole group like fam.,"It implies that fam could be over the top, exc...","Here, fam probably means used in casual conver...","Here, fam probably means angry or frustrated.",It implies that fam could be refers to the end...,B
392,"Ffs, why does this app keep crashing?",It suggests that ffs is like expresses feeling...,It implies that ffs could be someone who can s...,It suggests that ffs is like a shorthand way t...,"It implies that ffs could be used informally, ...",D


In [None]:
import pandas as pd
import json
import re

# Load training source
slang_df = pd.read_csv('data/dataset.csv')

# Normalize (in case there are stray spaces / case issues)
slang_df['slang_norm'] = slang_df['slang'].astype(str).str.strip().str.lower()

def clean_def(d: str) -> str:
    d = str(d).strip()
    # remove trailing punctuation like "." or "..."
    d = re.sub(r'[\.!\?]+\s*$', '', d)
    return d

records = []

for _, row in slang_df.iterrows():
    example = row['example']
    definition = row['definition']

    if not isinstance(example, str) or not isinstance(definition, str):
        continue

    def_clean = clean_def(definition)

    prompt = (
        "You will be given a sentence that contains a modern slang term.\n"
        "Explain what the sentence means in standard English, focusing on the slang.\n\n"
        f"Sentence: \"{example}\"\n"
        "Meaning:"
    )

    # simple completion based on the definition
    completion = " " + def_clean.lower() + "."

    records.append({
        "prompt": prompt,
        "completion": completion
    })

out_path = 'data/finetune_reverse.jsonl'
with open(out_path, 'w', encoding='utf-8') as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Wrote", len(records), "examples to", out_path)


Wrote 547 examples to data/finetune_reverse.jsonl


In [11]:
with open(out_path, 'r', encoding='utf-8') as f:
    records = f.readlines()
    records = [json.loads(line) for line in records]
    
print("Loaded", len(records), "examples from", out_path)

Loaded 547 examples from data/finetune_reverse.jsonl


In [15]:
print(records[0]['prompt'], records[0]['completion'])

You will be given a sentence that contains a modern slang term.
Explain what the sentence means in standard English, focusing on the slang.

Sentence: "His drip today is on point."
Meaning:  stylish or fashionable, especially in clothing.


In [10]:
import pandas as pd
import json
import random
import re

# ---------- config ----------
SLANG_PATH = "data/dataset.csv"
FINETUNE_JSONL = "data/finetune_slang.jsonl"
random.seed(42)
# ----------------------------

slang_df = pd.read_csv(SLANG_PATH)
slang_df["slang_norm"] = slang_df["slang"].astype(str).str.strip().str.lower()

# build mapping slang_norm -> definition
defs_df = slang_df[["slang_norm", "definition"]].drop_duplicates()
slang_to_def = {
    row["slang_norm"]: str(row["definition"]).strip()
    for _, row in defs_df.iterrows()
}

all_defs = list(set(slang_to_def.values()))

def clean_def(d: str) -> str:
    d = str(d).strip()
    d = re.sub(r"[\.!\?]+\s*$", "", d)
    return d

def make_paraphrase(slang: str, def_text: str) -> str:
    base = clean_def(def_text).lower()
    templates = [
        "Here, {} probably means {}",
        "It suggests that {} is like {}",
        "It implies that {} could be {}",
        "In other words, {} likely means {}"
    ]
    return random.choice(templates).format(slang, base)

records = []

for _, row in slang_df.iterrows():
    slang = row["slang"]
    slang_norm = row["slang_norm"]
    example = row["example"]

    if not isinstance(example, str):
        continue
    if slang_norm not in slang_to_def:
        continue

    true_def = slang_to_def[slang_norm]
    true_paraphrase = make_paraphrase(slang_norm,true_def)

    # choose 3 *different* definitions for distractors
    distractor_pool = [d for d in all_defs if d != true_def]
    if len(distractor_pool) < 3:
        continue
    distractor_defs = random.sample(distractor_pool, 3)
    distractor_paraphrases = [make_paraphrase(slang_norm, d) for d in distractor_defs]

    # assign to A/B/C/D and then shuffle positions
    letters = ["A", "B", "C", "D"]
    opts = [true_paraphrase] + distractor_paraphrases
    positions = list(range(4))
    random.shuffle(positions)

    shuffled_opts = [opts[i] for i in positions]
    correct_index = shuffled_opts.index(true_paraphrase)
    correct_letter = letters[correct_index]

    prompt = (
        "You will be given a sentence that contains a modern slang term and four possible interpretations.\n"
        "Choose the option that best explains the meaning of the sentence in standard English.\n\n"
        f"Sentence: \"{example}\"\n\n"
        "Options:\n"
        f"A) {shuffled_opts[0]}\n"
        f"B) {shuffled_opts[1]}\n"
        f"C) {shuffled_opts[2]}\n"
        f"D) {shuffled_opts[3]}\n\n"
        "Answer with just the letter."
    )

    completion = correct_letter

    records.append({"prompt": prompt, "completion": completion})

print("Total reverse-MCQ finetune examples:", len(records))

with open(FINETUNE_REVERSE_MCQ_JSONL, "w", encoding="utf-8") as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Wrote:", FINETUNE_REVERSE_MCQ_JSONL)


Total reverse-MCQ finetune examples: 605
Wrote: data/finetune_reverse_mcq.jsonl


In [11]:
with open(FINETUNE_REVERSE_MCQ_JSONL, "r", encoding="utf-8") as f:
    records = f.readlines()
    records = [json.loads(line) for line in records]

record = random.choice(records)
print(record["prompt"])
print(record["completion"])

You will be given a sentence that contains a modern slang term and four possible interpretations.
Choose the option that best explains the meaning of the sentence in standard English.

Sentence: "This project is such a PITA."

Options:
A) It implies that pita could be used to confirm something or agree
B) Here, pita probably means a call for help or assistance
C) In other words, pita likely means used to describe something annoying or frustrating
D) It implies that pita could be under the influence

Answer with just the letter.
C
