In [1]:
import functools

import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
from accelerate import init_empty_weights

## Dataset

In [2]:
dataset = load_dataset("theblackcat102/joke_explaination")

In [4]:
dataset = dataset.filter(lambda x: x['joke'].startswith('Q:'))

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

In [8]:
dataset['train'].to_csv('./data/jokes.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

245175

In [16]:
sampled_index = np.random.choice(range(len(dataset['train'])), size=70)

In [17]:
selected_ds = dataset['train'].select(sampled_index)

In [27]:
selected_ds = selected_ds.filter(lambda x: x['joke'].startswith('Q:'))

Filter:   0%|          | 0/70 [00:00<?, ? examples/s]

In [32]:
selected_ds.to_csv('./data/jokes.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

42102

In [29]:
train_eval_ds = selected_ds.train_test_split(test_size=50)

In [30]:
train_eval_ds['train'].to_csv('./data/train.csv')
train_eval_ds['test'].to_csv('./data/test.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

35527

In [2]:
jokes_df = pd.read_csv('./data/jokes.csv')

## Models

In [3]:
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    trust_remote_code=True,
    use_safetensors=True,
)
quan_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
)



In [4]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    device_map='auto',
    trust_remote_code=True,
    use_safetensors=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quan_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
prompt = """<s>[INST] <<SYS>>
You are a joke explainer.

Each input joke contains a question and an answer. Your job is to explain why the answer together with the question forms a joke. Use 3-4 sentences to explain the joke. Do not add "Ah I see" or similar terms to your explanation.
<</SYS>>
%s[/INST]"""

In [None]:
assist_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [17]:
get_explain = functools.partial(assist_pipeline,
                                do_sample=True,
                                top_k=10,
                                num_return_sequences=1,
                                eos_token_id=tokenizer.eos_token_id,
                                max_length=300,
                                return_full_text=False)

In [18]:
jokes_df['Llama-2-7b-chat-hf_explanation'] = jokes_df['joke'].apply(
    lambda x: get_explain(prompt % x)[0]['generated_text'].strip())

In [21]:
jokes_df[['joke', 'Llama-2-7b-chat-hf_explanation'
          ]].to_csv('./data/jokes_llama2_explained.csv', index=False)

In [14]:
jokes_df.to_csv('./data/jokes_llama2_explained.csv', index=False)

In [10]:
joke = "Q: Why did the two 4s decide to skip dinner? A: Because they already 8."

In [16]:
get_explain(prompt % joke)[0]['generated_text'].strip()



'The joke works because the question "Why did the two 4s decide to skip dinner?" implies that there are two identical numbers, "4," which are having a decision-making conversation. The punchline, "Because they already 8," is unexpected and creates humor by revealing that the two "4s" are actually two different numbers, not the same one, which is the expected answer to the question.'

In [14]:
sequences = assist_pipeline(
    prompt % joke,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
    return_full_text=False,
)

In [15]:
print(sequences[0]['generated_text'].strip())

Ah, I see! The joke here is a play on words. The question asks "Why did the two 4s decide to skip dinner?" implying that there are two numbers, both of which are equal to 4. However, the answer is "Because they already 8," which is a pun on the number 8, as it sounds like "ate." So the punchline is that the two 4s didn't need to eat dinner because they already ate 8, which is a clever play on words.


Ah, I see! The joke is a play on words. The question "Why did the two 4s decide to skip dinner?" is asking why two numbers, both of which are equal to 4, would decide to skip dinner. The answer "Because they already 8" is a pun, as the number 8 is a multiple of 4. So, the joke is that the two 4s are already "8" (meaning they are equal to 8), so they don\'t need to skip dinner. It\'s a simple but clever