In [1]:
# Install transformers from source - only needed for versions <= v4.34
# pip install git+https://github.com/huggingface/transformers.git
# pip install accelerate
import torch
from transformers import pipeline

pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto")

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating


Downloading (…)lve/main/config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [2]:
prompt = "Given a sentence with some blanks, you need to generate a funny joke by filling in the blanks, while sticking exactly to the template given to you"

In [3]:
import pickle
with open("../../masked_templates_test.pkl","rb") as f:
    masked_data = pickle.load(f)

In [4]:
len(masked_data)

200

In [5]:
templates = [i[1] for i in masked_data if i]
templates = [t.replace("[MASK]","___") for t in templates]

In [6]:
len(templates)

200

In [7]:
model_outputs = []

In [8]:
from tqdm import tqdm

In [9]:
templates[:5]

["What's Louis C.K.'s ___ ___ of ___ other than his own? ___",
 'Did you hear about the failed Origami ___? If folded.',
 'How much does a ___ ___ weigh? A skele**ton**.',
 'How did the ___ burn his tongue? He ate his pizza BEFORE it was ___.',
 "What do you call a ___ full of ___'s? A ___ cellar."]

In [163]:
prompt_text = """
You are a very funny comedian in the midst of a hilarious game. Your challenge is to fill in the blanks of a given sentence to complete the joke. Remember, you must adhere strictly to the template: no changing of existing words, only filling in the blanks to craft a witty punchline. Your response should be a single line with the completed joke.

I will give the input to you. You need to respond with the output.
"""

In [164]:
model_outputs = []
for i in tqdm(templates[:10]):
    prompt = prompt_text 
    messages = [
        {       "role": "system",
                "content": prompt,
            },
        {"role": "user", "content": i}
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=200, do_sample=True, temperature=0.7, top_k=5, top_p=0.95)
    model_output = outputs[0]["generated_text"]
    model_outputs.append(model_output)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:24<00:00,  2.47s/it]


In [165]:
import re

In [166]:
model_outputs[1]

'<|system|>\n\nYou are a very funny comedian in the midst of a hilarious game. Your challenge is to fill in the blanks of a given sentence to complete the joke. Remember, you must adhere strictly to the template: no changing of existing words, only filling in the blanks to craft a witty punchline. Your response should be a single line with the completed joke.\n\nI will give the input to you. You need to respond with the output.\n</s>\n<|user|>\nDid you hear about the failed Origami ___? If folded.</s>\n<|assistant|>\nDid you hear about the failed Origami swan? If folded, it turned into a crane.'

In [167]:
pattern = r'Output: "(.*?)"'
# matches = re.findall(pattern, text)

# for match in matches:
#     print(match)

In [168]:
def extract_data(sentences):
    output = []
    pattern = r'Output:(.*?)\n'
    
    for i,sentence in enumerate(sentences):
        sents = sentence.split("\n")
        sents = [i for i in sents if i]
        output.append(sents[-1])
#         for sent in sents:
#             if "Output"
#         if len(matches) == 2 and '___' not in matches[1]:
#             output.append(matches[1])
#         else:
#             output.append("NAN")
    return output

In [169]:
clean_outputs = extract_data(model_outputs)

In [170]:
len(clean_outputs)

10

In [171]:
templates[:10]

["What's Louis C.K.'s ___ ___ of ___ other than his own? ___",
 'Did you hear about the failed Origami ___? If folded.',
 'How much does a ___ ___ weigh? A skele**ton**.',
 'How did the ___ burn his tongue? He ate his pizza BEFORE it was ___.',
 "What do you call a ___ full of ___'s? A ___ cellar.",
 '___',
 'Why is ___’s ___ so big ? Because he only comes ___ a ___ .',
 "Why didn't the ___ buy Velcro ___? ....because they were a ___ off",
 "What's the ___ thing about Jonestown ___? The ___ line.",
 'Which real-life ___ was the ___ ___? ___ BonaPARTAY!!!']

In [172]:
clean_outputs

["What's Louis C.K.'s stand-up comedy of laughter other than his own? Audiences'",
 'Did you hear about the failed Origami swan? If folded, it turned into a crane.',
 'How much does a feather weigh? A skeleton weighs a feather (because the weight of living flesh is what gives a skeleton weight, but without the flesh, a skeleton is essentially weightless).',
 'How did the greedy burn his tongue? He ate his pizza BEFORE it was cold.',
 'What do you call a brewery full of barrels? A barrel cellar.',
 'I\'m not a person, but if I were filling in the blank in this joke, it might be: "Why did the tomato turn red? Because it saw the salad dressing and blushed!"',
 "Why is Dave's handshake so strong? Because he only comes out of meetings with a handshake.",
 "Why didn't the penguin buy Velcro socks? Because they were a feather short.",
 'I\'ll explain: In the infamous Jonestown tragedy, a cult called the Peoples Temple led by Jim Jones encouraged its members to drink a poisoned drink made of F

In [24]:
import pickle
with open("outputs_1.pkl","wb") as f:
    pickle.dump(clean_outputs,f)

In [1]:
import pickle

In [2]:
with open("outputs_1.pkl","rb") as f:
    outputs_zephyr = pickle.load(f)

In [3]:
import pickle
with open("../../masked_templates_test.pkl","rb") as f:
    masked_data = pickle.load(f)

In [8]:
with open("../../bert_outputs.pkl","rb") as f:
    outputs_bert = pickle.load(f)

In [10]:
original_jokes = [i[0] for i in masked_data]
templates = [i[1] for i in masked_data]
outputs_bert = [i["bert"] for i in outputs_bert]

In [17]:
final = []
for i,j,k in zip(templates, outputs_zephyr, outputs_bert):
    if len(j) > 20 and '___' not in j and '?' in j and '?' in k:
        final.append((i,j,k))
final = list(set(final))
print(len(final))

128


In [18]:
final = final[:100]

In [20]:
final_templates = [i[0] for i in final]

In [30]:
final_templates[90:]

['What does the Sun and my [MASK] have in [MASK]? they circle the [MASK].',
 'What makes an ISIS [MASK] [MASK]? The [MASK]',
 "What do you call a lazy [MASK]? A procrastigator (I'm [MASK])",
 "Why did Yoda turn [MASK]'s [MASK] around? Because he always reverses [MASK].",
 "What do you call a half [MASK] half Muslim [MASK]? O'Pressive.",
 'What does [MASK] Boy say when he dance-steps in a [MASK]?',
 'Which [MASK] duo do [MASK] like the most? Run the [MASK]',
 "Why'd the pizza [MASK] have a burnt [MASK]? He ate it before it was [MASK].",
 "Did you know [MASK] Wonder was [MASK]? He didn't.",
 'Where can you find a [MASK] with no legs? [MASK] where you left him.']

In [31]:
ans1 = ["What would the printout of a VagFax be? An emissions test.",
"What is the magic number? My credit card number, call me please! #Please don't",
"Why does Tendulkar never sweat? Because he has huge fans!",
"How do you know if someone is a painter? They've got color on them.",
"How do they circumcise a whale? Four skin divers.",
"Where do you find happiness? At a dictionary",
"Why did the balloon prices keep rising? Because they had to adjust for inflation.",
"What do you get when a cat and a dog make a baby? Ask your veterinarian.",
"What do dwarfs call their mugs? Shot glasses.",
"What do you call a relaxed Italian chef? Spa-ghetto.",
'Did you know that all surgical equipment has to be sterilized before use? Clearly not a job for dirty minds!',
"What did the policeman say to his belly? You're under a vest.",
"Have you ever tried North Korean cuisine? That's ironic, neither have they.",
'What do you call a police who arrests a sketch artist? A Draw Officer!',
'What do you call a C-list celebrity from Texas? Mr. Lone Star',
'Why did the Indian mathematician divide by zero? To get NaN (Not a Number).',
'Which real-life figure was the best at parties? Napoleon BonaPARTAY!!!',
'What is a prank played done in October called? a trick or treatment.',
'What do Dothraki use to count their horses? A Khal-culator.',
'Why is it called cat grooming and not pawdicure?',
        'Who named the Big Whales? Small fish with a sense of humor.',
'What do you call the top chef for slicing fingers? Con-cutter.',
'How do you clear out a Japanese theater? B-29.',
'What do you call a basketball player on an airplane? Sky Curry.',
'What has two legs and bleeds? Half a chicken.',
"What is The Rock's favorite radio station? 'Dwayne's Tunes, 107.3FM for all your Classic Rock'.",
'What do you call a French workout coach? A Paris-trainer.',
"How do you know when Will Smith gets sad on set? They leave 'fresh prints'.",
'What do you call a creamy pasta? Fettuccine Alfred-oh!',
'Why should you never make fun of a scarecrow? Because corn has ears!',
'Why did the blind carpenter fail his test? He didn’t see the point.',
'What is it called when a chameleon eats his own tail? A reptile dysfunction.',
'Why is life a bad joke? Beats the purpose out of me...',
'How much space is needed for fungi to grow? Mush room as possible.',
'Why do women finish first during alphabet recitations? Because Adam came before Eve!',
'Why did all of Britain stop smoking tea? Because they all joined /r/teetotalers',
"What's the hardest part about having a party? Making it.",
'Who is the richest Jewish baker? Dough-nald Trump.',
'What’s Chris Brown’s favorite kind of pie? Hit and rum.',
'How much does a hipster weigh? A skele-ton.',
"Why was Mary a carpenter? Because Joseph the foreman worked his own miracles.",
"What is it called when Batman wants to leave Church early? Christian Bale.",
"How do you stop a dog humping your leg? Pick it up and pet it.",
"What's the killer thing about Jonestown cocktails? The punch line.",
"Why can't two elephants ever finish a game? They keep dropping the marbles.",
"How did the hipster burn his tongue? He ate his pizza before it was cool.",
"Mom, can you tell me what a solar eclipse is? No, son......",
"Why does Chuck Norris play Uno? Because he beats everyone!",
"Why did the atom wear a tiny hat? Because he was just a little element.",
"Have you heard of The Dough Killer? He was kneaded to death.",
        "What do you call an Alligator with a vest? An Investigator.",
    "What do you call a bear standing in the middle of a basketball court? An unexpected player.",
    "How do you get down from an elephant? You don't. You get down from a duck.",
    "What do you call a feline who likes to cook? A Sous-chef.",
    "What Is The Hardest Thing for A perpetual motion machine To Do? Nothing.",
    "Where do crows hang out on Saturday night? The crowbar.",
    "Which room in Donald Trump's house has the most books? The Twitter room.",
    "Why did the gay scarecrow get scared? They put the fear of straw in each other.",
    "How do you make a tissue punch? Give it boxing lessons!",
    "What do you call a Mexican magician who’s run out of magic? No Juan left to trick.",
    'What is the sound of one hand clapping? Fap, fap, fap',
    "What's the favourite beer of a mathematician? A pint of Pi!",
    "You guys ever try boiled pizza? It's not as cheesy but it's definitely soup-rising",
    "What's a ghost's favourite dessert? Boo-berry pie",
    'What do you call the money a prostitute makes? Incum',
    "What did one wall say to the other? Meet you at the corner!",
    'Social Media Dilemma? Post Bacon!!',
    "Wanna hear a construction joke? I'm still working on it",
    'Wanna see some black humor? Sorry, I meant dark roasted Magic',
    'What do you call a chubby Ewok that stays inside? An Endor-sed Weight Watcher',
        "What do you call a butter in a supermarket? I can't believe it's not shopper.",
    'What do you call an indecisive chicken? A peck-culiar.',
    'What vegetable is good at martial arts? Brocco Lee!',
    'What do you call a cat who doesn’t eat meat? A purr-sian lion',
    '"Why do you enjoy running words over in your car?" "It gives me grammar Bumps!"',
    'What do you call 100 drowning lawyers? A good start',
    "What is a ghost's favorite move in Super Smash Brothers? Boo-Knight",
    'Why did the chicken cross the road? God is the cluck-splanation.',
    "What does my dad have in common with Nemo? They both can't be found.",
    'Why did the chicken cross the playground? To get to the other slide!',
        'Why was the apple Isaac Newton buried at sea? Because he was gravity-defying.',
"How do a neckbeard's cells divide? By m'tosis",
'What do you call a weightlifting Pakistani dairy farmer? A moo-scle.',
"How do you tell if you're at a gay BBQ? The hot dogs taste like quiche.",
'Why do ghosts have sore throats? Because they have no body to cough for them.',
'I wonder what silence tastes like? It tastes like unsaid words.',
'Did you hear the autobiography of the Mime world came out? Neither did I.',
'Did you hear the joke about the broken pencil? It works on so many pointless levels.',
"Why couldn't the geologist part with her rock collection? It was gneiss.",
'Why did Serj Tankian cross the road? Because he wanted to System of a Down the other side.',
        'What does the Sun and my pizza have in common? They both circle the cravings.',
'What makes an ISIS joke bad? The execution.',
"What do you call a lazy alligator? A procrastigator (I'm snoring)",
"Why did Yoda turn the sentence's structure around? Because he always reverses the syntax.",
"What do you call a half Irish half Muslim person? O'Pressive.",
'What does Elastic Boy say when he dance-steps in a puddle? "This is stretching it!"',
'Which battery duo do robots like the most? Run the Currents',
"Why'd the pizza maker have a burnt crust? He ate it before it was cool.",
"Did you know Stevie Wonder was moving? He didn't.",
'Where can you find a dog with no legs? Right where you left him.'
        
       ]

In [32]:
len(ans1)

100

In [34]:
with open("gpt_output.pkl","wb") as f:
    pickle.dump(ans1, f)

In [43]:
masked_data[0]

("What's Louis C.K.'s favorite type of meat other than his own? Jerkey",
 "What's Louis C.K.'s [MASK] [MASK] of [MASK] other than his own? [MASK]")

In [45]:
temp_dic = {}
for i in masked_data:
    temp_dic[i[1]] = i[0]

In [46]:
final_dump = []
for (i,j,k),l in zip(final, ans1):
    final_dump.append({"original": temp_dic[i], "template":i, "bert":k, "zephy":j, "gpt4":l})

In [47]:
final_dump

[{'original': 'What would the male equivalent of a VagFax be? An emissions report.',
  'template': 'What would the [MASK] [MASK] of a VagFax be? An emissions [MASK].',
  'bert': 'What would the worst job of a VagFax be? An emissions driver.',
  'zephy': 'What would the printer cartridge of a VW fax machine be? An emissions toner.',
  'gpt4': 'What would the printout of a VagFax be? An emissions test.'},
 {'original': "What is the loneliest number? My phone number, call me please! #Please don't",
  'template': "What is the [MASK] number? My [MASK] number, call me please! #Please don't",
  'bert': "What is the wrong number? My pet number, call me please! # Please don't",
  'zephy': 'What is the butted number? My butt dial number, call me please!',
  'gpt4': "What is the magic number? My credit card number, call me please! #Please don't"},
 {'original': 'Why Sachin Tendulkar never sweat? Because he has huge fans!',
  'template': 'Why [MASK] Tendulkar [MASK] sweat? Because he has huge [MAS

In [48]:
with open("output_dump.pkl","wb") as f:
    pickle.dump(final_dump, f)

In [49]:
f = open("../../jokegen/train.txt")
data = f.readlines()
data = [i for i in data if len(i)>15]

In [50]:
len(data)

211184

In [53]:
import random
random.shuffle(data)

In [55]:
human_jokes = [i.strip() for i in data[:100]]

In [57]:
with open("human_dump.pkl","wb") as f:
    pickle.dump(human_jokes, f)