In [20]:
import torch
from torch import nn
import pandas as pd
from collections import Counter
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

In [21]:
device = torch.device("cuda")

In [22]:
device

device(type='cuda')

In [34]:

wordRange = 4

class MyDataset(Dataset):
    def __init__(self):
        self.listOfWords = self.loadWords()
        self.listOfUniqueWords = self.obtainUniqueWords()
        self.id2word = {i: w for i, w in enumerate(self.listOfUniqueWords)}
        self.word2id = {w: i for i, w in enumerate(self.listOfUniqueWords)}
        self.listOfIds = [self.word2id[w] for w in self.listOfWords]
 
    def loadWords(self):
        csvData = pd.read_csv('reddit-cleanjokes.csv') 
        return csvData['Joke'].str.cat(sep=' ').split(' ')

    def obtainUniqueWords(self):
        wordCounts = Counter(self.listOfWords)
        return sorted(wordCounts, key=wordCounts.get, reverse=True)

    def __len__(self):
        return len(self.listOfIds) - wordRange

    def __getitem__(self, index):
        return (torch.tensor(self.listOfIds[index:index+wordRange]).to(device), torch.tensor(self.listOfIds[index+1:index+wordRange+1]).to(device))

In [24]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        output = self.fc(lstm_out)
        
        return output   



In [35]:

dataset = MyDataset()

vocab_size = len(dataset.listOfUniqueWords)
 
embedding_dim = 128
hidden_dim = 256


In [26]:
model = LanguageModel(vocab_size, embedding_dim,hidden_dim)
model.to(device)

LanguageModel(
  (embedding): Embedding(6925, 128)
  (lstm): LSTM(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=6925, bias=True)
)

In [8]:
from tqdm import tqdm

learning_rate = 0.001
num_epochs = 8


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

for epoch in range(num_epochs):
    for batch in tqdm(dataloader):
        inputs, targets = batch
        optimizer.zero_grad()

        outputs = model(inputs)

        loss = criterion(outputs.transpose(1,2), targets)  

        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

100%|██████████| 748/748 [00:03<00:00, 247.51it/s]


Epoch 1/8, Loss: 4.456206321716309


100%|██████████| 748/748 [00:02<00:00, 314.19it/s]


Epoch 2/8, Loss: 3.2708797454833984


100%|██████████| 748/748 [00:02<00:00, 315.62it/s]


Epoch 3/8, Loss: 2.242448568344116


100%|██████████| 748/748 [00:02<00:00, 316.45it/s]


Epoch 4/8, Loss: 1.651957631111145


100%|██████████| 748/748 [00:02<00:00, 314.91it/s]


Epoch 5/8, Loss: 1.6460914611816406


100%|██████████| 748/748 [00:02<00:00, 313.92it/s]


Epoch 6/8, Loss: 0.970251739025116


100%|██████████| 748/748 [00:02<00:00, 315.65it/s]


Epoch 7/8, Loss: 1.1857290267944336


100%|██████████| 748/748 [00:02<00:00, 314.74it/s]

Epoch 8/8, Loss: 0.41014155745506287





In [9]:

torch.save(model.state_dict(), 'language_model.pth')

In [28]:
model.load_state_dict(torch.load('language_model.pth'))

  model.load_state_dict(torch.load('language_model.pth'))


<All keys matched successfully>

In [29]:
# 定义生成文本函数
def generate_text(model, start_text, max_words=20):
    model.eval()
    with torch.no_grad():
        input_ids = [dataset.word2id[word] for word in start_text.split()]
        for _ in range(max_words):
            input_tensor = torch.tensor(input_ids[-4:]).unsqueeze(0)
            output = model(input_tensor.to(device))
            next_word_id = output.argmax(dim=-1)[:, -1].item()
            input_ids.append(next_word_id)
    generated_text = ' '.join([dataset.id2word[i] for i in input_ids])
    return generated_text

In [30]:
input_text = "If life gives you melons"
generated_text = generate_text(model, input_text)
print(generated_text)

If life gives you melons like to get around Endor? Ewoks I don't have the faintest idea why I passed out Just a short pun


In [40]:
#  --------------------------------- EVAL --------------------------------------------

# Dataset
import os
import pandas as pd
import numpy as np

from tqdm import tqdm
from torch.utils.data import DataLoader,Dataset
class Jokesdataset(Dataset):
    '''
    This class builds the custom dataset for Dataloader
    '''
    def __init__(self,data):
        self.data = data
        self.eos_tok = "<|endoftext|>"
        #Adding JOKE: at the start and EOS TOKEN at end
        self.data['Joke'] = self.data['Joke'].apply(lambda x: str(x) + self.eos_tok)

        self.listOfWords = self.loadWords()
        self.listOfUniqueWords = self.obtainUniqueWords()
        self.id2word = {i: w for i, w in enumerate(self.listOfUniqueWords)}
        self.word2id = {w: i for i, w in enumerate(self.listOfUniqueWords)}
        self.listOfIds = [self.word2id[w] for w in self.listOfWords]
 
    def loadWords(self):
        csvData = pd.read_csv('reddit-cleanjokes.csv') 
        return csvData['Joke'].str.cat(sep=' ').split(' ')

    def obtainUniqueWords(self):
        wordCounts = Counter(self.listOfWords)
        return sorted(wordCounts, key=wordCounts.get, reverse=True)

    def __len__(self):
        return len(self.data)
        
    def random_split_joke(self, idx):
        joke = joke = self.data.iloc[idx,1]
        words = joke.split()
        split_ratio = np.random.uniform(0.3, 0.7)
        split_index = int(len(words) * split_ratio)
        return " ".join(words[:split_index]), joke

jokes = pd.read_csv("/home/scxzc2/project/jokGen/reddit-cleanjokes.csv") 

dataset = Jokesdataset(jokes)
dataloader = DataLoader(dataset,
                                batch_size=1,
                                shuffle=True,
                                num_workers=4)

In [43]:
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import corpus_bleu

jokeId = [13, 7, 183, 1345, 89, 982, 322, 83, 432, 363]

num = 0
total_belu = 0
total_rouge = 0
for i in tqdm(range(10)):
    input, joke = dataset.random_split_joke(jokeId[i])  
    
    input = input.replace("JOKE:", "")
    
    outputs = []
    for j in range(1):
        output = generate_text(model, input, 20)
        outputs.append(output)
    
    # print(input)
    print(output)
        
    references = [[joke] for _ in range(len(outputs))]
    
    bleu_score = corpus_bleu(references, outputs)

    
    total_belu += bleu_score
    num = num + 1
    # print(bleu_score)
    
    


avg_score = total_belu / num
print(f"AVG BLEU score: {avg_score}")


100%|██████████| 10/10 [00:00<00:00, 109.40it/s]

What did the owner of a brownie factory say when his factory caught fire? That he needed to address the situation Math problem: I had 10 chocolate bars and ate
What did one snowman say to the other frog? Time's fun when you're having flies. Why did the boy take a ladder to school? He wanted
Original physics joke. I'm very proud. I was organizing my desk the other day and I've come to this realization... Currently, this subreddit expecting jokes about soap. I am mildly disappointed. What game
What is black, white, and red all over? A Communist Propaganda film from the 1930s. [OC c/o my 9 y.o.] What holds up a bowl's pants? Suspoonders! I don't
What did Vincent van Gogh call himself when he joined the Justice League? The Starry Knight Why did the chicken cross the road naked? A: Because
I just found out I'm colorblind It came out of the yellow. Ever heard about that movie called Constipation? It never came out. How
Always put sunglasses on your tree. Then, you'll get the proper shade. To


