In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import sentencepiece

import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset

from transformers import AdamW, WarmUp, get_linear_schedule_with_warmup
import os
import pandas as pd
import numpy as np

from tqdm import tqdm

In [4]:

model_name_or_path = "t5-small" 

tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
model.load_state_dict(torch.load("T5-result/t5_joke_generator_2.pt"))

device = 'cuda:0'
model.to(device)

  model.load_state_dict(torch.load("T5-result/t5_joke_generator_2.pt"))


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [10]:
def generate_text(model, tokenizer, prompt, max_length=64, num_return_sequences=5, device="cuda"):
    model.eval() 
    model.to(device)

    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            do_sample=True, 
            top_k=50,  
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    return generated_texts


In [17]:
setup = "If life gives you melons"

input_text = setup + " <continue>:"

text = generate_text(model, tokenizer, input_text)

for str in text:
    print(setup, str)

If life gives you melons nays, I'm still the kiloma.
If life gives you melons the way I get to go and give the people I love.
If life gives you melons sours.
If life gives you melons sceptics scoffing you scuffs and scuffs.
If life gives you melons stale stale stale yelonea.


In [8]:
# Dataset
import os
import pandas as pd
import numpy as np

from tqdm import tqdm
from torch.utils.data import DataLoader,Dataset
class Jokesdataset(Dataset):
    '''
    This class builds the custom dataset for Dataloader
    '''
    def __init__(self,data):
        self.data = data
        self.eos_tok = "<|endoftext|>"
        #Adding JOKE: at the start and EOS TOKEN at end
        # self.data['Joke'] = self.data['Joke'].apply(lambda x: str(x) + self.eos_tok)

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        joke = self.data.iloc[idx,1]


        return joke
        
    def random_split_joke(self, idx):
        joke = joke = self.data.iloc[idx,1]
        words = joke.split()
        split_ratio = np.random.uniform(0.2, 0.7)
        split_index = int(len(words) * split_ratio)
        return " ".join(words[:split_index]), joke

jokes = pd.read_csv("/home/scxzc2/project/jokGen/reddit-cleanjokes.csv") #add the path to your Dataset in config File

dataset = Jokesdataset(jokes)

In [11]:
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu

num = 0
total_belu = 0
for i in range(10):
    input, joke = dataset.random_split_joke(i)  
    
    input = input.replace("JOKE:", "")
    
    outputs = []

    input_text = input + " <continue>:"
    text = generate_text(model, tokenizer, input_text)
    for str in text:
        outputs.append(input + str)
    
    print(joke)
    print(outputs[0])
    
        
    references = [[joke] for _ in range(len(outputs))]
    
    bleu_score = corpus_bleu(references, outputs)
    
    total_belu += bleu_score
    num = num + 1
    # print(bleu_score)

avg_score = total_belu / num

print(f"AVG BLEU score: {avg_score}")


What did the bartender say to the jumper cables? You better not try to start anything.
What did the bartender say to the jumper cables?'I'm not a feist'
Don't you hate jokes about German sausage? They're the wurst!
Don't youif you eat a fish, you're in the wrong place.
Two artists had an art contest... It ended in a draw
Two artists had ana satty.
Why did the chicken cross the playground? To get to the other slide.
Why did the chicken cross the
What gun do you use to hunt a moose? A moosecut!
What gun do you use to huntfrog? No, you can't use them.
If life gives you melons, you might have dyslexia.
Ifyou have to be at the same time.
Broken pencils... ...are pointless.
Brokena single roof.
What did one snowman say to the other snowman? 'Do you smell carrots?'
What did one snowman say toa snowman do? No, it was a snowman.
How many hipsters does it take to change a lightbulb? It's a really obscure number. You've probably never heard of it.
How many hipsters does it take to changea year? O

In [17]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

def calculate_perplexity(model, tokenizer, input, target):

    inputs = tokenizer(input, return_tensors="pt")
    label = tokenizer(target, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    label_ids = label["input_ids"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, labels=label_ids)
        loss = outputs.loss 

    perplexity = torch.exp(loss)
    return perplexity.item()

total = 0
for i in range(10):
    input, joke = dataset.random_split_joke(i)  
    input = input + " <continue>:"
    perplexity = calculate_perplexity(model, tokenizer, input, joke)
    total += perplexity
    
print(f"AVG: {total/10}")

AVG: 58.17746067047119
