In [7]:
import torch
from torch import nn
import pandas as pd
from collections import Counter
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

In [8]:
device = torch.device("cuda:0")

In [9]:
device

device(type='cuda', index=0)

In [10]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium', 
    trust_remote_code=True, 
    use_auth_token=None, 
    local_files_only=False 
)
tokenizer.pad_token = tokenizer.eos_token

In [11]:
special_tokens_dict = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [12]:
MAX_LEN = 64

class Jokesdataset(Dataset):

    def __init__(self,data,tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.eos_tok = "<|endoftext|>"
        self.data['Joke'] = self.data['Joke'].apply(lambda x: str(x) + self.eos_tok)

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        joke = self.data.iloc[idx,1]

        inputs = self.tokenizer.encode_plus(
            joke,
            None,
            add_special_tokens = True,
            max_length = MAX_LEN,
            pad_to_max_length = True
        )

        ids = inputs["input_ids"]

        return torch.tensor(ids,dtype=torch.long)

In [13]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):

        super(LanguageModel, self).__init__()
        self.embeddedLayer = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim,  batch_first=True)
        
        self.fc = nn.Linear(hidden_dim, vocab_size)
        


    def forward(self, x):
        x = self.embeddedLayer(x)
        lstm_out, (hn, cn) = self.lstm(x)
        output = self.fc(lstm_out)
        return output

In [14]:
train_path = "../reddit-cleanjokes.csv"
jokes = pd.read_csv(train_path) 

dataset = Jokesdataset(jokes,tokenizer)
dataloader = DataLoader(dataset,
                                batch_size=32,
                                shuffle=True)

vocab_size = len(tokenizer)
embedding_dim = 512
hidden_dim = 1024

In [15]:
vocab_size

50258

In [16]:
model = LanguageModel(vocab_size, embedding_dim,hidden_dim)
model.to(device)

LanguageModel(
  (embeddedLayer): Embedding(50258, 512)
  (lstm): LSTM(512, 1024, batch_first=True)
  (fc): Linear(in_features=1024, out_features=50258, bias=True)
)

In [11]:
device

device(type='cuda', index=0)

In [12]:
from tqdm import tqdm



def train(model, dataloader, learning_rate, num_epochs):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0  
        loss_num = 0
        for sentence in tqdm(dataloader):
            
            for t in range(MAX_LEN - 3):
                input_t = sentence[:, :t+1].to(device) 
                target_t = sentence[:, t+1].to(device)  

                output_t = model(input_t)

                loss = criterion(output_t[:, -1], target_t)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                loss_num = loss_num + 1

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss/loss_num:.4f}")


learning_rate = 0.001
num_epochs = 10

train(model, dataloader, learning_rate, num_epochs)
torch.save(model.state_dict(), 'lstmv2-overtrain.pth')

  0%|          | 0/51 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


100%|██████████| 51/51 [01:46<00:00,  2.08s/it]


Epoch 1/10, Loss: 2.1399


100%|██████████| 51/51 [01:48<00:00,  2.13s/it]


Epoch 2/10, Loss: 1.2591


100%|██████████| 51/51 [01:48<00:00,  2.13s/it]


Epoch 3/10, Loss: 0.5741


100%|██████████| 51/51 [01:48<00:00,  2.13s/it]


Epoch 4/10, Loss: 0.3035


100%|██████████| 51/51 [01:48<00:00,  2.14s/it]


Epoch 5/10, Loss: 0.2207


100%|██████████| 51/51 [01:48<00:00,  2.14s/it]


Epoch 6/10, Loss: 0.1905


100%|██████████| 51/51 [01:49<00:00,  2.14s/it]


Epoch 7/10, Loss: 0.1718


100%|██████████| 51/51 [01:49<00:00,  2.14s/it]


Epoch 8/10, Loss: 0.1647


100%|██████████| 51/51 [01:49<00:00,  2.14s/it]


Epoch 9/10, Loss: 0.1539


100%|██████████| 51/51 [01:49<00:00,  2.14s/it]


Epoch 10/10, Loss: 0.1479


In [17]:

model.load_state_dict(torch.load("lstmv2-overtrain.pth"))

  model.load_state_dict(torch.load("lstmv2-overtrain.pth"))


<All keys matched successfully>

In [42]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

def predict(input_text, length_of_joke,number_of_jokes):
    joke_num = 0
    model.eval()
    with torch.no_grad():
        for joke_idx in range(number_of_jokes):
        
            joke_finished = False

            cur_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0).to(device)

            for i in range(length_of_joke):
                outputs = model(cur_ids)
                output = outputs[0][-1].argmax()
                
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * output], dim = 1)
                if output in tokenizer.encode('<|endoftext|>'):
                    joke_finished = True
                    break
                
                word = tokenizer.decode(output)
                
                input_text = input_text + word

                
            
            if joke_finished:
                
                joke_num = joke_num + 1
                
                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = tokenizer.decode(output_list)

                return output_text

# Start Predicting
input_text = "If life gives you melons"

predict(input_text, 64, 1)

'If life gives you melons, you might have dyslexia.<|endoftext|>'

In [43]:
input_text = "How did your mom know"

predict(input_text, 64, 1)

In [35]:
class Jokesdataset(Dataset):
    '''
    This class builds the custom dataset for Dataloader
    '''
    def __init__(self,data,tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.eos_tok = "<|endoftext|>"
        self.data['Joke'] = self.data['Joke'].apply(lambda x: str(x) + self.eos_tok)

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        joke = self.data.iloc[idx,1]
    
        inputs = self.tokenizer.encode_plus(
            joke,
            None,
            add_special_tokens = True,
            max_length = 64,
            pad_to_max_length = True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {'ids':torch.tensor(ids,dtype=torch.long),
            'mask': torch.tensor(mask,dtype=torch.long),
            'target':torch.tensor(ids,dtype=torch.long)}
        
    def random_split_joke(self, idx):
        joke = joke = self.data.iloc[idx,1]
        words = joke.split()
        split_ratio = np.random.uniform(0.3, 0.7)  
        split_index = int(len(words) * split_ratio)
        return " ".join(words[:split_index]), joke

# jokes = pd.read_csv("/home/scxzc2/project/jokGen/reddit-cleanjokes.csv") #add the path to your Dataset in config File
jokes = pd.read_csv("/home/scxzc2/project/jokGen/val.csv") 

test_dataset = Jokesdataset(jokes,tokenizer)
test_dataloader = DataLoader(test_dataset,
                                batch_size=1,
                                shuffle=True,
                                num_workers=4)

In [44]:
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu

jokeId = [53, 67, 183, 1345, 489, 982, 322, 283, 432, 763]

num = 0
total_belu = 0
for i in tqdm(range(10)):
    input, joke = test_dataset.random_split_joke(jokeId[i])  
    
    input = input.replace("JOKE:", "")
    
    outputs = []
    for j in range(1):
        output = predict(input, 64, 1)
        outputs.append(output)
    
    print(input)
    print(outputs[0])
        
    references = [[joke] for _ in range(len(outputs))]
    # print(references[0])
    
    bleu_score = corpus_bleu(references, outputs)
    
    total_belu += bleu_score
    num = num + 1
    # print(bleu_score)

avg_score = total_belu / num
print(f"AVG BLEU score: {avg_score}")

100%|██████████| 10/10 [00:00<00:00, 53.12it/s]

(NSFW) A blind man walks
(NSFW) A blind man walks into a bar.... So Nickelback walks into a bar, and there is no punchline, because ruining music isn't funny.<|endoftext|>
How many Germans does it take
How many Germans does it take to change a lightbulb? Just one, but it takes 5 episodes.<|endoftext|>
Do flashers have dreams where they leave the house
Do flashers have dreams where they leave the house? Intruder window<|endoftext|>
What does the Interstellar soundtrack and a porno film
What does the Interstellar soundtrack and a porno film? A cat-naaaaa<|endoftext|>
The ducks in Cern What does the ducks
The ducks in Cern What does the ducks haveman and Eyore had a baby. The baby's name? Supereyore<|endoftext|>
What would you have if all autos in
What would you have if all autos in the middle? "Is that you coffin?"<|endoftext|>
Kennedy put a man on
Kennedy put a man on my door and asked for a small donation towards the local swimming pool. I gave him a glass of water.<|endoftext|>
Girl l




torch.Size([1, 8])
What did the bartender say to the jumper cables? You better not try to start anything.<|endoftext|>



In [26]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import nn

criterion = nn.CrossEntropyLoss()
def calculate_perplexity(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt")
    sentence = inputs["input_ids"].to(device)
    
    print(sentence)

    with torch.no_grad():
        for t in range(len(sentence)):
                input_t = sentence[:, :t+1].to(device) 
                target_t = sentence[:, t+1].to(device)  

                output_t = model(input_t)

                loss = criterion(output_t[:, -1], target_t)

    perplexity = torch.exp(loss)
    return perplexity.item()


example_text = "Why don't scientists trust atoms? Because they make up everything!"
# example_text = "If life gives you melons, you might have dyslexia."

ppl = calculate_perplexity(model, tokenizer, example_text)
print(f"Perplexity: {ppl}")

tensor([[ 5195,   836,   470,  5519,  3774, 23235,    30,  4362,   484,   787,
           510,  2279,     0]], device='cuda:0')
Perplexity: 37.42204666137695
