Tell a Joke instruction dataset, based on https://huggingface.co/datasets/SocialGrep/one-million-reddit-jokes and augmented with instructions built from keywords extracted with KeyBERT.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/data/datasets/tell_a_joke/tell_a_joke.ipynb)

In [None]:
!pip install keybert
!pip install datasets
!pip install pandas
!pip install huggingface_hub
!pip install swifter

In [8]:
import random
from datasets import load_dataset
import pandas as pd
from keybert import KeyBERT
from huggingface_hub import notebook_login
from datasets import Dataset
import swifter

In [None]:
# Upper bound on the number of jokes kept (highest-scoring first).
SIZE_LIMIT = 20000
# Only jokes whose score is strictly greater than this are kept.
MIN_JOKE_SCORE = 100
HF_DATASET = "SocialGrep/one-million-reddit-jokes"
# Show full cell contents when a DataFrame is displayed (jokes can be long).
pd.set_option("display.max_colwidth", None)
jokes_ds = load_dataset(HF_DATASET)
kw_model = KeyBERT()
pd_jokes = jokes_ds["train"].to_pandas()
pd_jokes = pd_jokes.reset_index(drop=True)

# Keep only posts not flagged NSFW. The explicit comparison is intentional:
# it is an element-wise pandas comparison, not a Python truthiness test.
filtered_jokes = pd_jokes[pd_jokes["subreddit.nsfw"] == False]  # noqa: E712
# Reassign instead of using inplace=True: the frame above is derived from
# pd_jokes by boolean indexing, and mutating it in place triggers pandas'
# SettingWithCopyWarning.
filtered_jokes = filtered_jokes.dropna(subset=["selftext"])
filtered_jokes = filtered_jokes[filtered_jokes["score"] > MIN_JOKE_SCORE]
# Drop posts whose body is one of the "[deleted]" / "[removed]" placeholders.
filtered_jokes = filtered_jokes[~filtered_jokes["selftext"].isin(["[deleted]", "[removed]"])]
# Drop posts whose body contains "edit:" (any casing); plain substring match.
filtered_jokes = filtered_jokes[~filtered_jokes["selftext"].str.contains("edit:", case=False, regex=False)]

filtered_jokes = filtered_jokes.sort_values("score", ascending=False)

filtered_jokes = filtered_jokes.head(SIZE_LIMIT)
print(len(filtered_jokes))

# Preview the selected jokes. This must be the last expression of the cell,
# otherwise the notebook does not render it.
filtered_jokes[["score", "title", "selftext", "subreddit.nsfw"]].head(10)

In [None]:
# Instruction templates. Each contains exactly one "{}" placeholder that
# make_item fills with the main keyword extracted from the joke; one template
# is picked at random per joke.
joke_requests = [
    "Can you share a joke that involves {}?",
    "Do you know any jokes related to {}?",
    "Could you tell me a funny joke that has to do with {}?",
    "I'm in the mood for a joke about {}. Do you have any good ones?",
    "Would you happen to have a joke about {} that you could tell me?",
    "Can you think of a joke that centers around {}?",
    "I'd love to hear a witty joke related to {}. Do you have one?",
    "Tell me a humorous joke that involves {}.",
    "Could you please entertain me with a joke related to {}?",
    "What's a good joke that relates to {}?",
    "I could use a good laugh. How about a joke about {}?",
    "What's a funny joke that relates to {}?",
    "Can you make me chuckle with a joke that involves {}?",
    "I'm curious if you have a joke up your sleeve that pertains to {}?",
    "Do you have a favorite joke that involves {}?",
    "Mind sharing a joke with me that has to do with {}?",
    "How about a joke related to {}? Do you have one?",
    "I'm in need of a good joke. Something that centers around {} should do the trick.",
    "Would you be willing to share a joke about {} with me?",
    "Can you think of a joke that relates to {} that you could tell me?",
]


def make_item(joke):
    """Turn one joke row into an instruction/response record.

    The response is the joke's title and body joined by a newline. The
    instruction is a randomly chosen request template filled with the
    top-ranked keyword that ``kw_model`` extracts from the response. If
    anything goes wrong while extracting or formatting, the error is printed
    and a generic "random joke" instruction is used instead.

    Args:
        joke: Row-like object providing "title", "selftext" and "permalink".

    Returns:
        A ``pd.Series`` indexed by INSTRUCTION, RESPONSE, SOURCE and METADATA.
    """
    response = f"{joke['title']}\n{joke['selftext']}"
    link = joke["permalink"]
    template = random.choice(joke_requests)

    try:
        ranked = kw_model.extract_keywords(
            response,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
        )
        # First entry is the top-ranked (keyword, score) pair.
        top_keyword = ranked[0][0]
        instruction = template.format(top_keyword)
    except Exception as err:
        # Best effort: report the problem and fall back to a generic request.
        print("Error:", err, response, joke)
        instruction = "Could you tell me a random joke?"

    metadata = {"nsfw": False, "link": link}
    columns = ["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]
    values = [instruction, response, HF_DATASET, metadata]
    return pd.Series(values, index=columns)

In [None]:
# Build one INSTRUCTION/RESPONSE/SOURCE/METADATA row per filtered joke by
# applying make_item row-wise through the swifter accessor; the %time magic
# reports how long the statement took.
%time oa_format = filtered_jokes.swifter.apply(make_item, axis=1)
print(len(oa_format))

In [None]:
# Write the generated records to a local parquet file (pyarrow engine,
# row groups of 100 rows, DataFrame index not stored).
oa_format.to_parquet("dataset.parquet", row_group_size=100, engine="pyarrow", index=False)

In [None]:
# Reload the parquet file as a datasets.Dataset and upload it to the Hub under
# a repository name that embeds SIZE_LIMIT.
# NOTE(review): notebook_login is imported but never called in the visible
# cells; push_to_hub presumably needs a Hugging Face token to already be
# configured in this environment — confirm before running.
ds = Dataset.from_parquet("dataset.parquet")
ds.push_to_hub(f"mikegarts/oa_tell_a_joke_{SIZE_LIMIT}")