In [1]:
import torch
from torch import nn
import pandas as pd
from collections import Counter
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

In [2]:
device = torch.device("cuda:0")

In [3]:
device

device(type='cuda', index=0)

In [4]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium', 
    trust_remote_code=True, 
    use_auth_token=None, 
    local_files_only=False 
)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
special_tokens_dict = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [6]:
MAX_LEN = 64

class Jokesdataset(Dataset):

    def __init__(self,data,tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.eos_tok = "<|endoftext|>"
        self.data['Joke'] = self.data['Joke'].apply(lambda x: str(x) + self.eos_tok)

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        joke = self.data.iloc[idx,1]

        inputs = self.tokenizer.encode_plus(
            joke,
            None,
            add_special_tokens = True,
            max_length = MAX_LEN,
            pad_to_max_length = True
        )

        ids = inputs["input_ids"]

        return torch.tensor(ids,dtype=torch.long)

In [26]:

train_path = "../reddit-cleanjokes.csv"
jokes = pd.read_csv(train_path) 

dataset = Jokesdataset(jokes,tokenizer)
dataloader = DataLoader(dataset,
                                batch_size=64,
                                shuffle=True)


vocab_size = len(tokenizer) 
print(vocab_size)

50258


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

class LSTMTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_heads, num_layers):
        super(LSTMTransformerModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.tgtEmbedd = nn.Embedding(vocab_size, 2*embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=n_heads), 
            num_layers=2
        )
        
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=hidden_dim * 2, nhead=n_heads), 
            num_layers=num_layers
        )
        
        self.fc_out = nn.Linear(hidden_dim * 2, vocab_size)
        
    def forward(self, src, tgt):
        x = self.embedding(src)
        lstm_out, (h_n, c_n) = self.lstm(x)
        transformer_encoder_out = self.transformer_encoder(x)
        lstm_last_hidden = h_n[-1].unsqueeze(1).repeat(1, src.size(1), 1)  # [batch_size, seq_len, hidden_dim]

        combined_input = torch.cat((lstm_last_hidden, transformer_encoder_out), dim=-1)  # [batch_size, seq_len, hidden_dim * 2]
        tgt_embedding = self.tgtEmbedd(tgt)

        decoder_output = self.transformer_decoder(tgt_embedding, combined_input)

        logits = self.fc_out(decoder_output)
        output = torch.softmax(logits, dim=-1)
        return output




In [28]:
embedding_dim = 512
hidden_dim = 512
n_heads = 8       
num_layers = 6    

model = LSTMTransformerModel(vocab_size, embedding_dim, hidden_dim, n_heads, num_layers)
model.to(device)



LSTMTransformerModel(
  (embedding): Embedding(50258, 512)
  (tgtEmbedd): Embedding(50258, 1024)
  (lstm): LSTM(512, 512, batch_first=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): Multihe

In [13]:
device

device(type='cuda', index=0)

In [None]:
from tqdm import tqdm



def train(model, dataloader, learning_rate, num_epochs):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0  
        loss_num = 0
        for sentence in tqdm(dataloader):
            
            for t in range(MAX_LEN - 3):
                input_t = sentence[:, :t+1].to(device) 
                target_t = sentence[:, t+1].to(device)  
                
                # print(sentence[0])
                # print(input_t[0])
                # print(target_t[0])

                output_t = model(input_t, target_t)

                loss = criterion(output_t[:, -1], target_t)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                loss_num = loss_num + 1

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss/loss_num:.4f}")


learning_rate = 0.001
num_epochs = 5

train(model, dataloader, learning_rate, num_epochs)
torch.save(model.state_dict(), 'lstm-transformer.pth')

In [17]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

def predict(input_text, length_of_joke,number_of_jokes):
    joke_num = 0
    model.eval()
    with torch.no_grad():
        for joke_idx in range(number_of_jokes):
        
            joke_finished = False

            cur_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0).to(device)

            for i in range(length_of_joke):
                outputs = model(cur_ids, cur_ids)
                output = outputs[0][-1].argmax()
                
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * output], dim = 1)
                if output in tokenizer.encode('<|endoftext|>'):
                    joke_finished = True
                    break
                
                word = tokenizer.decode(output)
                
                input_text = input_text + word

                
            
            # if joke_finished:
                
        joke_num = joke_num + 1
                
        output_list = list(cur_ids.squeeze().to('cpu').numpy())
        output_text = tokenizer.decode(output_list)

        return output_text

# Start Predicting
input_text = "If life gives you melons"

predict(input_text, 64, 1)

'If life gives you melons<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'

In [43]:
input_text = "How did your mom know"

predict(input_text, 64, 1)

In [35]:
class Jokesdataset(Dataset):
    '''
    This class builds the custom dataset for Dataloader
    '''
    def __init__(self,data,tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.eos_tok = "<|endoftext|>"
        #Adding JOKE: at the start and EOS TOKEN at end
        self.data['Joke'] = self.data['Joke'].apply(lambda x: "JOKE:" + str(x) + self.eos_tok)

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        joke = self.data.iloc[idx,1]
    
        inputs = self.tokenizer.encode_plus(
            joke,
            None,
            add_special_tokens = True,
            max_length = 64,
            pad_to_max_length = True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {'ids':torch.tensor(ids,dtype=torch.long),
            'mask': torch.tensor(mask,dtype=torch.long),
            'target':torch.tensor(ids,dtype=torch.long)}
        
    def random_split_joke(self, idx):
        joke = joke = self.data.iloc[idx,1]
        words = joke.split()
        split_ratio = np.random.uniform(0.3, 0.7)  # 随机比例
        split_index = int(len(words) * split_ratio)
        return " ".join(words[:split_index]), joke

# jokes = pd.read_csv("/home/scxzc2/project/jokGen/reddit-cleanjokes.csv") #add the path to your Dataset in config File
jokes = pd.read_csv("/home/scxzc2/project/jokGen/val.csv") #add the path to your Dataset in config File

test_dataset = Jokesdataset(jokes,tokenizer)
test_dataloader = DataLoader(test_dataset,
                                batch_size=1,
                                shuffle=True,
                                num_workers=4)

In [44]:
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu

jokeId = [53, 67, 183, 1345, 489, 982, 322, 283, 432, 763]

num = 0
total_belu = 0
for i in tqdm(range(10)):
    input, joke = test_dataset.random_split_joke(jokeId[i])  
    
    input = input.replace("JOKE:", "")
    
    outputs = []
    for j in range(1):
        output = predict(input, 64, 1)
        outputs.append(output)
    
    print(input)
    print(outputs[0])
        
    references = [[joke] for _ in range(len(outputs))]
    # print(references[0])
    
    bleu_score = corpus_bleu(references, outputs)
    
    total_belu += bleu_score
    num = num + 1
    # print(bleu_score)

avg_score = total_belu / num
print(f"AVG BLEU score: {avg_score}")

100%|██████████| 10/10 [00:00<00:00, 53.12it/s]

(NSFW) A blind man walks
(NSFW) A blind man walks into a bar.... So Nickelback walks into a bar, and there is no punchline, because ruining music isn't funny.<|endoftext|>
How many Germans does it take
How many Germans does it take to change a lightbulb? Just one, but it takes 5 episodes.<|endoftext|>
Do flashers have dreams where they leave the house
Do flashers have dreams where they leave the house? Intruder window<|endoftext|>
What does the Interstellar soundtrack and a porno film
What does the Interstellar soundtrack and a porno film? A cat-naaaaa<|endoftext|>
The ducks in Cern What does the ducks
The ducks in Cern What does the ducks haveman and Eyore had a baby. The baby's name? Supereyore<|endoftext|>
What would you have if all autos in
What would you have if all autos in the middle? "Is that you coffin?"<|endoftext|>
Kennedy put a man on
Kennedy put a man on my door and asked for a small donation towards the local swimming pool. I gave him a glass of water.<|endoftext|>
Girl l




In [22]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

def calculate_perplexity(model, tokenizer, text):
    # 编码输入文本
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    
    # 获取模型输出
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss  # 模型的交叉熵损失

    # 计算 Perplexity
    perplexity = torch.exp(loss)
    return perplexity.item()


torch.Size([1, 8])
What did the bartender say to the jumper cables? You better not try to start anything.<|endoftext|>



In [26]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import nn

criterion = nn.CrossEntropyLoss()
def calculate_perplexity(model, tokenizer, text):
    # 编码输入文本
    inputs = tokenizer(text, return_tensors="pt")
    sentence = inputs["input_ids"].to(device)
    
    print(sentence)
    
    # 获取模型输出
    with torch.no_grad():
        for t in range(len(sentence)):
                input_t = sentence[:, :t+1].to(device) 
                target_t = sentence[:, t+1].to(device)  

                output_t = model(input_t)

                loss = criterion(output_t[:, -1], target_t)

    # 计算 Perplexity
    perplexity = torch.exp(loss)
    return perplexity.item()


example_text = "Why don't scientists trust atoms? Because they make up everything!"
# example_text = "If life gives you melons, you might have dyslexia."

# 计算 Perplexity
ppl = calculate_perplexity(model, tokenizer, example_text)
print(f"Perplexity: {ppl}")

tensor([[ 5195,   836,   470,  5519,  3774, 23235,    30,  4362,   484,   787,
           510,  2279,     0]], device='cuda:0')
Perplexity: 37.42204666137695
