In [1]:
import functools

import regex as re
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from accelerate import init_empty_weights

## Dataset

In [2]:
# Load the joke-explanation dataset by its repository identifier.
dataset = load_dataset(path="theblackcat102/joke_explaination")

In [4]:
def _is_question_joke(example):
    """Return True when the example's joke text begins with 'Q:'."""
    return example['joke'].startswith('Q:')


# Keep only the jokes whose text starts with the 'Q:' marker.
dataset = dataset.filter(_is_question_joke)

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

In [8]:
# Write the filtered train split to disk; the call's return value is the cell output.
train_split = dataset['train']
train_split.to_csv('./data/jokes.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

245175

In [2]:
# Read the explanation file and the exported jokes back into DataFrames.
explain_df = pd.read_csv(filepath_or_buffer='./data/gpt4turbo_explained_full.csv')
jokes_df = pd.read_csv(filepath_or_buffer='./data/jokes.csv')

In [3]:
# Copy the 'Explanation' column over to the jokes frame (pandas aligns it on the row index).
reference_explanations = explain_df['Explanation']
jokes_df['explaination'] = reference_explanations

In [4]:
# Persist the jokes together with their explanations.
# index=False keeps the pandas row index out of the file; without it the index
# is written as an unnamed first column that comes back as "Unnamed: 0" when
# the file is read again. This also matches the other DataFrame exports in
# this notebook, which all pass index=False.
jokes_df.to_csv('./data/jokes.csv', index=False)

In [10]:
# Draw 60 jokes and split them into 10 train / 50 test examples.
# Both random steps are seeded so the split written to disk can be reproduced
# after a kernel restart.
SPLIT_SEED = 42
select_df = jokes_df.sample(60, random_state=SPLIT_SEED)
# from_pandas keeps the sampled frame's index as an '__index_level_0__'
# column; a later cell removes it before exporting.
select_ds = Dataset.from_pandas(select_df).train_test_split(test_size=50, seed=SPLIT_SEED)

In [5]:
# Pick up to 70 distinct row positions from the train split.
# replace=False guarantees no joke is selected twice (np.random.choice samples
# with replacement by default), and the seeded generator makes the selection
# reproducible. The size is capped so a small split cannot raise.
_n_train_rows = len(dataset['train'])
_sampling_rng = np.random.default_rng(42)
sampled_index = _sampling_rng.choice(_n_train_rows,
                                     size=min(70, _n_train_rows),
                                     replace=False)

In [None]:
# Materialise the sampled rows of the train split as their own dataset.
train_split = dataset['train']
selected_ds = train_split.select(sampled_index)

In [27]:
# Keep only sampled jokes whose text starts with the 'Q:' marker.
selected_ds = selected_ds.filter(lambda row: row['joke'][:2] == 'Q:')

Filter:   0%|          | 0/70 [00:00<?, ? examples/s]

In [32]:
# Export the sampled jokes; the call's return value is the cell output.
jokes_csv_path = './data/jokes.csv'
selected_ds.to_csv(jokes_csv_path)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

42102

In [29]:
# Hold out 50 of the sampled jokes as the test portion.
train_eval_ds = selected_ds.train_test_split(
    test_size=50,
)

In [48]:
# Drop the index column that Dataset.from_pandas carried over, then export
# both splits. The membership check makes the cell safe to re-run: once the
# column is gone, remove_columns would otherwise raise on the second execution.
_INDEX_COLUMN = '__index_level_0__'
if _INDEX_COLUMN in select_ds['train'].column_names:
    select_ds = select_ds.remove_columns([_INDEX_COLUMN])
select_ds['train'].to_csv('./data/train.csv')
select_ds['test'].to_csv('./data/test.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

27319

In [2]:
# Reload the jokes table from disk.
jokes_df = pd.read_csv(filepath_or_buffer='./data/jokes.csv')

## Models

In [2]:
# Read the evaluation table and the training table from disk.
eval_df = pd.read_csv(filepath_or_buffer='./data/eval-Llama2.csv')
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [3]:
# Build the evaluation dataset from just the joke and explanation columns.
eval_columns = ['joke', 'explaination']
eval_ds = Dataset.from_pandas(eval_df[eval_columns])

  if _pandas_api.is_sparse(col):


In [4]:
# 4-bit loading settings: NF4 quantization type, bfloat16 compute dtype,
# double quantization switched off.
quan_config = BitsAndBytesConfig(
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="bfloat16",
    load_in_4bit=True,
)

# Tokenizer for the Llama-2 7B chat checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    use_safetensors=True,
    trust_remote_code=True,
)



In [5]:
# Load the Llama-2 7B chat checkpoint with the quantization settings above,
# letting device_map='auto' decide weight placement.
model_load_kwargs = dict(
    quantization_config=quan_config,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    trust_remote_code=True,
    device_map='auto',
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    **model_load_kwargs,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# zero-shot
# Prompt template with a system block ([INST] / <<SYS>> markers) and no worked
# examples. The single %s placeholder is filled with the joke text via
# `prompt % joke`. Assigning `prompt` here replaces any template set by
# another cell, so later cells use whichever template cell ran last.
prompt = """<s>[INST] <<SYS>>
You are a joke explainer.

Each input joke contains a question and an answer. Your job is to explain why the answer together with the question forms a joke. Use no more than 3-4 sentences to explain the joke. Do not add "Ah I see" or similar terms to your explanation.
<</SYS>>
%s[/INST]"""

In [42]:
# Start from the held-out test split and attach the filled-in prompt to each joke.
eval_ds = select_ds['test']
# Drop the helper columns only if they are still present. An earlier cell
# already strips '__index_level_0__' from select_ds, and Dataset.map raises a
# ValueError when asked to remove a column that does not exist.
_droppable_columns = [col for col in ('url', '__index_level_0__')
                      if col in eval_ds.column_names]
eval_ds = eval_ds.map(lambda x: {'prompt': prompt % x['joke']},
                      remove_columns=_droppable_columns)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# one-shot
# Same system-prompt template as the zero-shot variant, with one worked
# joke/explanation example added before the length instruction. The single %s
# placeholder is filled with the joke text via `prompt % joke`.
prompt = """<s>[INST] <<SYS>>
You are a joke explainer.

Each input joke contains a question and an answer. Your job is to explain why the answer together with the question forms a joke.

Here are some examples of joke explanations:
Joke: Q: Why didn't the string ever win a race? A: It always tied!
Explanation: The joke plays on the double meaning of the word 'tied.' In one sense, 'tied' means to finish a race at the same time as another competitor, resulting in a draw. In another sense, 'tied' refers to the action of tying a knot. Since the subject of the joke is a 'string,' the humor arises from imagining the string as always tying itself into knots, which humorously explains why it could never win a race.

Use no more than 3-4 sentences to explain the joke. Do not add "Ah I see" or similar terms to your explanation.
<</SYS>>
%s[/INST]"""

In [15]:
# five-shot
# Same system-prompt template with five numbered joke/explanation examples
# before the length instruction. The single %s placeholder is filled with the
# joke text via `prompt % joke`.
prompt = """<s>[INST] <<SYS>>
You are a joke explainer.

Each input joke contains a question and an answer. Your job is to explain why the answer together with the question forms a joke.

Here are some examples of joke explanations:
1.
Joke: Q: Why didn't the string ever win a race? A: It always tied!
Explanation: The joke plays on the double meaning of the word 'tied.' In one sense, 'tied' means to finish a race at the same time as another competitor, resulting in a draw. In another sense, 'tied' refers to the action of tying a knot. Since the subject of the joke is a 'string,' the humor arises from imagining the string as always tying itself into knots, which humorously explains why it could never win a race.
2.
Joke: Q: Why did the golfer wear two pairs of pants? A: In case he got a hole in one!
Explanation: This joke plays on the double meaning of the phrase 'hole in one'. In golf, 'a hole in one' refers to a player hitting the ball directly from the tee into the hole with one stroke, which is a very good outcome. However, in the context of wearing pants, 'a hole in one' humorously suggests a literal hole in one of the pairs of pants. Thus, the golfer wearing two pairs of pants is a silly precaution to ensure he still has one intact pair if one gets a hole.
3.
Joke: Q:What do prisoners use to call each other? A:Cell phones!
Explanation: The joke plays on the double meaning of the word 'cell.' In one sense, a 'cell' is a small room where prisoners are kept, and in another sense, it refers to 'cellular phones,' commonly known as cell phones. The humor arises from the pun created by using 'cell phones' to imply that prisoners would use phones related to their incarceration cells to communicate.
4.
Joke: Q: What kind of shoes does a ninja wear? A: Sneakers!
Explanation: The joke plays on the word 'sneakers' which is a type of shoe known for being comfortable and quiet, ideal for casual wear. However, in the context of the joke, 'sneakers' also refers to the ability to sneak around silently, which is a stereotypical attribute of a ninja. Thus, the humor arises from the double meaning of 'sneakers' as both a type of shoe and a description of a ninja's stealthy behavior.
5.
Joke: Q: Why did Cinderella get kicked off the soccer team? A: Because she ran away from the ball!
Explanation: The joke plays on the double meaning of the word 'ball'. In the context of the story of Cinderella, 'ball' refers to the formal dance she attends, from which she famously runs away at midnight. In the context of soccer, 'ball' refers to the soccer ball, which players should not run away from if they want to play effectively. The humor arises from the absurdity of applying the fairy tale behavior to the soccer field, creating a funny and unexpected reason for Cinderella to be kicked off the team.

Use no more than 3-4 sentences to explain the joke. Do not add "Ah I see" or similar terms to your explanation.
<</SYS>>
%s[/INST]"""

In [16]:
def _attach_prompt(example):
    """Return the 'prompt' field: the current template filled with the example's joke."""
    return {'prompt': prompt % example['joke']}


# (Re)compute the prompt column using whichever template `prompt` currently holds.
eval_ds = eval_ds.map(_attach_prompt)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
assist_pipeline = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
)

In [17]:
# Generate one sampled explanation per prompt in the evaluation set.
# max_new_tokens bounds only the generated continuation. The previous
# max_length=1000 also counted prompt tokens, so the long five-shot prompt was
# left with a much smaller generation budget than the zero-shot prompt, which
# made the prompt variants non-comparable and risked cut-off explanations.
# return_full_text=False keeps the prompt out of the returned text.
results = assist_pipeline(KeyDataset(eval_ds, 'prompt'),
                          do_sample=True,
                          top_k=10,
                          num_return_sequences=1,
                          eos_token_id=tokenizer.eos_token_id,
                          max_new_tokens=512,
                          return_full_text=False)

In [18]:
# Take the first generated sequence for each prompt, collapse every run of
# whitespace into a single space, and trim the ends.
ans_ls = [
    re.sub(r'\s+', ' ', res[0]['generated_text']).strip()
    for res in results
]

In [19]:
# Store the latest generations under the 'five-shot' column.
# NOTE(review): ans_ls holds the output of whichever prompt template was active
# when generation ran -- confirm it was the five-shot template before saving.
eval_df['five-shot'] = ans_ls

In [20]:
# Write the evaluation table (without the row index) back to its CSV.
eval_df.to_csv(path_or_buf='./data/eval-Llama2.csv', index=False)

In [43]:
# Attach the latest generations to the dataset as the 'zero-shot' column.
zero_shot_column = 'zero-shot'
eval_ds = eval_ds.add_column(zero_shot_column, ans_ls)

In [44]:
# The filled-in prompt is no longer needed once the answers are attached.
eval_ds = eval_ds.remove_columns('prompt')

In [45]:
# Export the evaluation dataset; the call's return value is the cell output.
eval_csv_path = './data/eval-Llama2.csv'
eval_ds.to_csv(eval_csv_path)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

42727

### Old version

In [17]:
# Fixed sampling settings for the single-prompt helper below.
_explain_generation_kwargs = dict(
    return_full_text=False,
    max_length=300,
    eos_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
    top_k=10,
    do_sample=True,
)
# get_explain(text) calls the pipeline on `text` with the settings above.
get_explain = functools.partial(assist_pipeline, **_explain_generation_kwargs)

In [18]:
def _explain_joke(joke_text):
    """Fill the current prompt template with the joke and return the stripped generated text."""
    generated = get_explain(prompt % joke_text)
    return generated[0]['generated_text'].strip()


# One generation per joke, stored next to the joke text.
jokes_df['Llama-2-7b-chat-hf_explanation'] = jokes_df['joke'].apply(_explain_joke)

In [21]:
# Export only the joke text and the model's explanation.
export_columns = ['joke', 'Llama-2-7b-chat-hf_explanation']
jokes_df[export_columns].to_csv('./data/jokes_llama2_explained.csv', index=False)

In [14]:
# Export every column of the jokes table (without the row index) to the explained-jokes CSV.
jokes_df.to_csv(path_or_buf='./data/jokes_llama2_explained.csv', index=False)

In [10]:
# Single hand-picked joke used to spot-check the prompt in the cells below.
joke = "Q: Why did the two 4s decide to skip dinner? A: Because they already 8."

In [16]:
# Run the helper on the sample joke and display the stripped explanation.
single_joke_output = get_explain(prompt % joke)
single_joke_output[0]['generated_text'].strip()



'The joke works because the question "Why did the two 4s decide to skip dinner?" implies that there are two identical numbers, "4," which are having a decision-making conversation. The punchline, "Because they already 8," is unexpected and creates humor by revealing that the two "4s" are actually two different numbers, not the same one, which is the expected answer to the question.'

In [14]:
# Direct pipeline call on the sample joke with a 200-token max_length cap.
filled_prompt = prompt % joke
sequences = assist_pipeline(
    filled_prompt,
    return_full_text=False,
    max_length=200,
    eos_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
    top_k=10,
    do_sample=True,
)

In [15]:
# Show the first returned sequence with surrounding whitespace removed.
first_sequence = sequences[0]
print(first_sequence['generated_text'].strip())

Ah, I see! The joke here is a play on words. The question asks "Why did the two 4s decide to skip dinner?" implying that there are two numbers, both of which are equal to 4. However, the answer is "Because they already 8," which is a pun on the number 8, as it sounds like "ate." So the punchline is that the two 4s didn't need to eat dinner because they already ate 8, which is a clever play on words.


Ah, I see! The joke is a play on words. The question "Why did the two 4s decide to skip dinner?" is asking why two numbers, both of which are equal to 4, would decide to skip dinner. The answer "Because they already 8" is a pun, as the number 8 is a multiple of 4. So, the joke is that the two 4s are already "8" (meaning they are equal to 8), so they don\'t need to skip dinner. It\'s a simple but clever