In [1]:
import os
import numpy as np 
import pandas as pd

import torch
import torch.nn as nn

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and the causal-LM weights from a local checkpoint directory.
# The same path is used for both so the vocabulary matches the model.
model_name_or_path = "./Llama3.2-1B/"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # torch_dtype=torch.float16,
    # NOTE(review): with device_map="auto" the placement is decided at load time;
    # confirm it agrees with the `device` value used when building input tensors.
    device_map="auto", 
)

def choose_from_top(probs, n=5):
    """Sample one index from the ``n`` highest-scoring entries of ``probs``.

    The ``n`` largest values are renormalized so they sum to 1, and a single
    index is drawn from that truncated distribution with ``np.random.choice``.

    Args:
        probs: 1-D array-like of non-negative scores, one per candidate index.
        n: How many top candidates to sample from. Values larger than the
            number of entries in ``probs`` are clamped to that number.

    Returns:
        int: The index into ``probs`` of the sampled entry.

    Raises:
        ValueError: If ``n`` is smaller than 1 or ``probs`` is empty.
    """
    # float64 keeps the renormalized weights within np.random.choice's
    # sum-to-one tolerance even when the scores arrive in lower precision.
    probs = np.asarray(probs, dtype=np.float64)
    if n < 1:
        raise ValueError("n must be at least 1")
    if probs.size == 0:
        raise ValueError("probs must not be empty")

    # argpartition rejects a k larger than the array, so clamp first.
    n = min(n, probs.size)
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # Normalize
    return int(np.random.choice(ind, p=top_prob))

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because the tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token 




In [2]:

# Fine-tuned weights saved as a state dict; they overwrite the base model's parameters.
models_path = "./.Llama3.2-1B-Fine-Tuning/gpt2_joke_generator1.pt"
# map_location="cpu" lets the checkpoint load on hosts without the GPU it was saved
# from; load_state_dict then copies each tensor onto the model's own parameters.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust
# (consider weights_only=True if the installed torch version supports it).
model.load_state_dict(torch.load(models_path, map_location="cpu"))

# Take the device from the model itself (it was placed by device_map="auto")
# instead of assuming CUDA, so input tensors always land next to the weights.
device = model.device

In [17]:

def predict(input_text, length_of_joke,number_of_jokes):
    """Generate a joke continuation for ``input_text`` by top-N sampling.

    The prompt ``'JOKE:' + input_text`` is extended one token at a time. Each
    attempt samples at most ``length_of_joke`` tokens and is considered
    finished as soon as a sampled token id is one of the ids produced by
    encoding ``'<|endoftext|>'``. The first finished attempt is returned.

    Args:
        input_text: Beginning of the joke (without the ``JOKE:`` prefix).
        length_of_joke: Maximum number of tokens to sample per attempt.
        number_of_jokes: Maximum number of attempts to make.

    Returns:
        The decoded text (prompt plus generated tokens) of the first attempt
        that finished. If no attempt finished, the decoded text of the last
        attempt is returned so callers always get a string. ``None`` only when
        ``number_of_jokes`` is 0.
    """
    # Encoded once instead of on every step. The encoding may contain several
    # ids; sampling any one of them ends the attempt, as the per-step check did.
    # NOTE(review): saved outputs ending in '<|' suggest the marker is split into
    # several ordinary tokens and only its first piece is emitted — confirm.
    stop_ids = set(tokenizer.encode('<|endoftext|>'))
    prompt_ids = torch.tensor(tokenizer.encode('JOKE:' + input_text)).unsqueeze(0).to(device)

    output_text = None
    model.eval()
    with torch.no_grad():
        for _ in range(number_of_jokes):
            # torch.cat below builds new tensors, so the prompt can be shared across attempts.
            cur_ids = prompt_ids
            joke_finished = False

            for step in range(length_of_joke):
                # No labels are passed: only the logits are needed, not a loss.
                logits = model(cur_ids).logits
                # Single sequence in the batch; keep the distribution for the last position.
                # .float() is a no-op for float32 and keeps half-precision logits convertible to numpy.
                probs = torch.softmax(logits[0, -1].float(), dim=0)
                # Sample from a wider pool for the first few tokens, then a narrow one.
                n = 20 if step < 3 else 3
                next_token_id = choose_from_top(probs.to('cpu').numpy(), n=n)
                next_token = torch.tensor([[next_token_id]], device=cur_ids.device)
                cur_ids = torch.cat([cur_ids, next_token], dim=1)

                if next_token_id in stop_ids:
                    joke_finished = True
                    break

            output_text = tokenizer.decode(cur_ids.squeeze().tolist())
            if joke_finished:
                return output_text

    # No attempt reached the stop marker: fall back to the last (cut-off) attempt.
    return output_text



In [7]:
# Smoke test: sample a continuation for one prompt, with up to 64 new tokens and up to 5 attempts.
input_text = "(NSFW) A blind man walks past"

# predict returns a single decoded string, shown as the cell's output.
# NOTE(review): the saved output below lists five jokes and was produced under a different
# execution count than the `predict` cell, so it appears to be stale — re-run.
predict(input_text, 64, 5)

<|begin_of_text|>JOKE:(NSFW) A blind man walks past a bar... The blind man walks into the bar and asks for a drink. The bartender says "We don't serve you."<|

<|begin_of_text|>JOKE:(NSFW) A blind man walks past a brothel... He says "Wow! This is the best I've seen!" The prostitute responds, "I've got a better one for you."<|

<|begin_of_text|>JOKE:(NSFW) A blind man walks past a urinal... The blind man turns to the urinal and asks "Is that a dick in your pants?" The urinal replies "No, it's a dick in my pants."<|

<|begin_of_text|>JOKE:(NSFW) A blind man walks past a hooker... He turns to her and says, "Is that a hooker in here?"<

<|begin_of_text|>JOKE:(NSFW) A blind man walks past a nudist beach... He sees two women and asks, "Which one is the nudist?" The second one replies, "The one with the most hair."<|



In [5]:
from torch.utils.data import DataLoader, Dataset
class Jokesdataset(Dataset):
    '''
    Dataset of jokes for a DataLoader.

    Each joke from the 'Joke' column is wrapped as "JOKE:" + text + "<|endoftext|>"
    and tokenized to a fixed length of `max_length` ids on access.

    data:       DataFrame with a 'Joke' column. It is copied, so the caller's
                frame is left untouched and building a second dataset from the
                same frame does not add the prefix twice.
    tokenizer:  callable tokenizer returning "input_ids" and "attention_mask".
    max_length: number of token ids per example (padded or truncated to fit).
    '''
    def __init__(self,data,tokenizer,max_length=64):
        self.data = data.copy()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.eos_tok = "<|endoftext|>"
        #Adding JOKE: at the start and EOS TOKEN at end
        self.data['Joke'] = self.data['Joke'].apply(lambda x: "JOKE:" + str(x) + self.eos_tok)

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        '''
        Returns a dict of long tensors for row `idx`:
        'ids' (token ids), 'mask' (attention mask) and 'target' (a copy of the ids).
        '''
        # Look the column up by name, matching how __init__ addresses it.
        joke = self.data['Joke'].iloc[idx]

        # padding="max_length" replaces the deprecated pad_to_max_length flag;
        # truncation=True makes every example exactly max_length ids long so
        # examples can be stacked into batches.
        inputs = self.tokenizer(
            joke,
            add_special_tokens = True,
            max_length = self.max_length,
            padding = "max_length",
            truncation = True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {'ids':torch.tensor(ids,dtype=torch.long),
            'mask': torch.tensor(mask,dtype=torch.long),
            'target':torch.tensor(ids,dtype=torch.long)}

    def random_split_joke(self, idx):
        '''
        Returns (prefix, joke) for row `idx`: `joke` is the full wrapped text and
        `prefix` its first words, cut at a random 30%-70% of the word count.
        The prefix still carries the "JOKE:" marker and may be empty for very
        short jokes.
        '''
        joke = self.data['Joke'].iloc[idx]
        words = joke.split()
        split_ratio = np.random.uniform(0.3, 0.7)
        split_index = int(len(words) * split_ratio)
        return " ".join(words[:split_index]), joke

# Evaluation data is read from the validation CSV; the full jokes CSV is kept as a
# commented-out alternative.
# NOTE(review): these are machine-specific absolute paths — consider a configurable data directory.
# jokes = pd.read_csv("/home/scxzc2/project/jokGen/reddit-cleanjokes.csv")
jokes = pd.read_csv("/home/scxzc2/project/jokGen/val.csv")

# Wrap the frame in the dataset class; the loader yields single-example batches
# in shuffled order using 4 worker processes.
test_dataset = Jokesdataset(jokes,tokenizer)
test_dataloader = DataLoader(test_dataset,
                                batch_size=1,
                                shuffle=True,
                                num_workers=4)

In [18]:
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu

# Rows of the evaluation set used as a fixed sample for the BLEU check.
jokeId = [53, 67, 183, 1345, 489, 982, 322, 283, 432, 763]

# Markers that appear on only one side (decoded output vs. stored joke) and
# would otherwise glue themselves onto the first/last word.
BLEU_MARKERS = ("<|begin_of_text|>", "<|endoftext|>")


def _bleu_tokens(text):
    """Drop the special markers from ``text`` and split it on whitespace.

    corpus_bleu expects sequences of tokens; given raw strings it iterates
    over characters and reports a character-level score instead.
    """
    for marker in BLEU_MARKERS:
        text = text.replace(marker, "")
    return text.split()


bleu_scores = []
for joke_id in jokeId:
    # `prompt` (not `input`) so the builtin is not shadowed for later cells.
    prompt, joke = test_dataset.random_split_joke(joke_id)
    prompt = prompt.replace("JOKE:", "")

    output = predict(prompt, 64, 1)

    print(prompt)
    print(output)

    # One hypothesis with one reference; a missing generation counts as an
    # empty hypothesis rather than crashing the scorer.
    hypotheses = [_bleu_tokens(output or "")]
    references = [[_bleu_tokens(joke)]]

    bleu_scores.append(corpus_bleu(references, hypotheses))

avg_score = sum(bleu_scores) / len(bleu_scores)
print(f"AVG BLEU score: {avg_score}")

(NSFW) A blind man walks past a fish
<|begin_of_text|>JOKE:(NSFW) A blind man walks past a fish market. The fisherman shouts "Hey! You're fucking my wife!"<
How many Germans does it take to change a lightbulb? One. They're efficient,
<|begin_of_text|>JOKE:How many Germans does it take to change a lightbulb? One. They're efficient, efficient, efficient.<|
Do flashers have dreams where they leave the house
<|begin_of_text|>JOKE:Do flashers have dreams where they leave the house to see their friends?<|
What does the Interstellar soundtrack
<|begin_of_text|>JOKE:What does the Interstellar soundtrack has in common? They're both composed of a lot of spacey, but no music at all.<|
The ducks in Cern What does the
<|begin_of_text|>JOKE:The ducks in Cern What does the ducks in Cern eat for breakfast? CERN!<
What would you have if all autos in the
<|begin_of_text|>JOKE:What would you have if all autos in the world were black people? A black car.<|
Kennedy put a man on the moon.... Obama put
<|beg

In [22]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

def calculate_perplexity(model, tokenizer, text):
    """Return the perplexity of ``text`` under ``model``.

    The text is tokenized, passed through the model with the input ids also
    used as labels, and the exponential of the returned loss is reported.

    Args:
        model: Model called as ``model(input_ids, labels=...)`` whose output
            has a ``loss`` attribute, and which exposes a ``device`` attribute.
        tokenizer: Callable as ``tokenizer(text, return_tensors="pt")`` and
            returning a mapping with an ``"input_ids"`` tensor.
        text: The string to score.

    Returns:
        float: ``exp(loss)`` for this text.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Use the device of the model that was passed in rather than a notebook
    # global, so the function works for any model it is given.
    input_ids = inputs["input_ids"].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss

    perplexity = torch.exp(loss)
    return perplexity.item()


    
    
    
# Same fixed sample of evaluation rows as used for the BLEU check.
jokeId = [53, 67, 183, 1345, 489, 982, 322, 283, 432, 763]

perplexities = []
for joke_id in jokeId:
    # Only the full joke text is scored; the random prefix is not needed here.
    # (Named `prefix` rather than `input` so the builtin is not shadowed.)
    prefix, joke = test_dataset.random_split_joke(joke_id)
    perplexities.append(calculate_perplexity(model, tokenizer, joke))

# Divide by the actual sample size so the list above can be edited freely.
print(f"AVG perplexity: {sum(perplexities) / len(perplexities)}")
    

AVG perplexity: 12.45514485836029
