In [1]:
import pandas as pd

In [2]:
# Load the evaluation set (one row per slang term, with two example
# sentences each) and show it as the cell's rich output.
eval_csv_path = "data/eval.csv"
df = pd.read_csv(eval_csv_path)
df

Unnamed: 0,slang,example_1,example_2
0,drip,His drip was so clean that even the photograph...,She upgraded her drip after finding that thrif...
1,boujee,Her boujee taste had her rejecting every resta...,He acted boujee all night after getting a sing...
2,jomo,He stayed home with popcorn feeling pure jomo ...,"She curled up with a book, embracing the jomo ..."
3,salty,He got salty when his little brother beat him ...,She stayed salty the whole ride home after los...
4,woke,She stayed woke during the whole discussion an...,He became more woke after taking that sociolog...
...,...,...,...
192,yas,"Yas, that outfit is everything.",She shouted yas when her song came on.
193,wth,Wth was that noise outside?,He looked at me and said wth after reading the...
194,smh,"Smh, he forgot his wallet again.",She said smh when she heard the gossip.
195,fam,"What's good, fam?",She greeted the whole group like fam.


In [7]:
# The table view truncates long cells, so print both example sentences of
# the first row in full.
first_row = df.iloc[0]
for example_col in ('example_1', 'example_2'):
    print(first_row[example_col])

His drip was so clean that even the photographer asked him to pose.
She upgraded her drip after finding that thrift-store jacket.


In [8]:
import random
import re
import string

# Build a 4-option multiple-choice question from every example sentence in
# `df` and save the result to data/eval_mcq.csv.

# Local, seeded RNG: Restart & Run All now regenerates the same distractors
# and answer positions (and therefore the same CSV) on every run.
rng = random.Random(42)

# Get all slang words
slang_words = df['slang'].tolist()

# Distractor pool: every term once, in first-seen order, so a term that is
# repeated in the CSV can never appear twice among one question's options.
distractor_pool = list(dict.fromkeys(slang_words))

# Process each row to create MCQ format
mcq_data = []

for idx, row in df.iterrows():
    slang = row['slang']

    # Whole-word, case-insensitive matcher for the slang term.
    # - IGNORECASE blanks sentence-initial uses such as "Yas, ..." for "yas",
    #   which a plain str.replace leaves in place (leaking the answer).
    # - The lookarounds stop short terms ("g", "fam") from blanking pieces
    #   of unrelated words, and unlike \b they also behave for terms that
    #   contain spaces or punctuation ("big yikes", "high-key").
    whole_word = re.compile(rf"(?<!\w){re.escape(slang)}(?!\w)", re.IGNORECASE)

    # Same for both examples of this row, so compute it once.
    wrong_options = [w for w in distractor_pool if w != slang]

    # Process each of the examples
    for example_col in ['example_1', 'example_2']:
        example = row[example_col]

        # Replace the slang term with ____
        sentence, n_blanked = whole_word.subn('____', example)
        if n_blanked == 0:
            # No standalone occurrence (e.g. an inflected form): fall back to
            # a case-insensitive substring replacement so the answer is still
            # hidden rather than left visible in the question.
            sentence = re.sub(re.escape(slang), '____', example, flags=re.IGNORECASE)

        # Create options: correct answer + 3 random wrong answers
        random_wrong = rng.sample(wrong_options, 3)

        # Shuffle all 4 options and find the correct answer letter
        options = [slang] + random_wrong
        rng.shuffle(options)
        correct_letter = string.ascii_uppercase[options.index(slang)]

        mcq_data.append({
            'slang': slang,
            'sentence': sentence,
            'option_a': options[0],
            'option_b': options[1],
            'option_c': options[2],
            'option_d': options[3],
            'correct_answer': correct_letter
        })

# Create new DataFrame
mcq_df = pd.DataFrame(mcq_data)

# Save to CSV
mcq_df.to_csv('data/eval_mcq.csv', index=False)

# Display first few rows
mcq_df.head(10)


Unnamed: 0,slang,sentence,option_a,option_b,option_c,option_d,correct_answer
0,drip,His ____ was so clean that even the photograph...,yas,bounce,drip,bot,C
1,drip,She upgraded her ____ after finding that thrif...,whip,drip,fire,irl,B
2,boujee,Her ____ taste had her rejecting every restaur...,dope,fyi,big yikes,boujee,D
3,boujee,He acted ____ all night after getting a single...,prt,otp,eod,boujee,D
4,jomo,He stayed home with popcorn feeling pure ____ ...,high-key,ily,jomo,g2g,C
5,jomo,"She curled up with a book, embracing the ____ ...",jomo,periodt,drip,aak,A
6,salty,He got ____ when his little brother beat him a...,nfm,salty,bae,iow,B
7,salty,She stayed ____ the whole ride home after losi...,salty,oic,finesse,imho,A
8,woke,She stayed ____ during the whole discussion an...,asap,pov,woke,awol,C
9,woke,He became more ____ after taking that sociolog...,g,afk,aisb,woke,D


In [9]:
import json
import random
import re
import string

# Turn every (slang, example) row of data/dataset.csv into a fill-in-the-blank
# multiple-choice prompt plus its answer letter, and write the pairs to
# data/finetune_slang.jsonl (one JSON object per line).

# Local, seeded RNG so re-running the cell reproduces the same JSONL file.
rng = random.Random(42)

# Read dataset.csv
dataset_df = pd.read_csv("data/dataset.csv")

# Get all options from other entries. Hoisted out of the loop (it does not
# change per row) and de-duplicated in first-seen order: if a term occurs in
# several rows, sampling from the raw column could pick the same distractor
# twice for one prompt.
all_slang = dataset_df['slang'].tolist()
distractor_pool = list(dict.fromkeys(all_slang))

# Every prompt lists options A-D, so at least four distinct terms are needed.
# Fail here with a clear message instead of an IndexError inside the prompt
# template further down.
if len(distractor_pool) < 4:
    raise ValueError(
        f"Need at least 4 distinct slang terms to build 4-option prompts, "
        f"found {len(distractor_pool)}"
    )

# Create finetune data in JSONL format
finetune_data = []

for idx, row in dataset_df.iterrows():
    slang = row['slang']
    example = row['example']

    # Replace slang word with blank: whole-word and case-insensitive, so a
    # capitalised sentence-initial use is blanked too and short terms do not
    # blank fragments of other words. Lookarounds are used instead of \b so
    # terms containing spaces or punctuation still match.
    whole_word = re.compile(rf"(?<!\w){re.escape(slang)}(?!\w)", re.IGNORECASE)
    sentence, n_blanked = whole_word.subn('____', example)
    if n_blanked == 0:
        # No standalone occurrence (e.g. an inflected form): fall back to a
        # case-insensitive substring replacement so the answer stays hidden.
        sentence = re.sub(re.escape(slang), '____', example, flags=re.IGNORECASE)

    wrong_options = [w for w in distractor_pool if w != slang]
    random_wrong = rng.sample(wrong_options, 3)

    # Create options and shuffle
    options = [slang] + random_wrong
    rng.shuffle(options)
    correct_letter = string.ascii_uppercase[options.index(slang)]

    # Build prompt
    prompt = f"""You will be given a sentence with a missing word and four answer options.
Choose the one that best completes the sentence in natural modern English.

Sentence: "{sentence}"

Options:
A) {options[0]}
B) {options[1]}
C) {options[2]}
D) {options[3]}

Answer with just the letter."""

    finetune_data.append({
        "prompt": prompt,
        "completion": correct_letter
    })

# Save to JSONL file
with open('data/finetune_slang.jsonl', 'w', encoding='utf-8') as f:
    for item in finetune_data:
        f.write(json.dumps(item) + '\n')

print(f"Created finetune_slang.jsonl with {len(finetune_data)} examples")


Created finetune_slang.jsonl with 605 examples


In [11]:
# Sanity check: show the first generated prompt followed by its answer letter.
first_item = finetune_data[0]
for field in ('prompt', 'completion'):
    print(first_item[field])

You will be given a sentence with a missing word and four answer options.
Choose the one that best completes the sentence in natural modern English.

Sentence: "They were so ____ they couldn't stop laughing at nothing."

Options:
A) extra
B) zooted
C) dope
D) rofl

Answer with just the letter.
B
