In [1]:
import torch
from torch import nn
import pandas as pd
from collections import Counter
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelForCausalLM


In [2]:
device = torch.device("cuda")

In [3]:
device

device(type='cuda')

In [2]:

class GPT2JokeDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=512):

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.listOfJokes = self.load_jokes(csv_file)
    
    def load_jokes(self, csv_file):
        csv_data = pd.read_csv(csv_file)
        jokes = csv_data['Joke'].astype(str).tolist()
        jokes = [joke + ' [END]' for joke in jokes] 
        return jokes
    
    def __len__(self):

        return len(self.listOfJokes)

    def __getitem__(self, idx):

        joke = self.listOfJokes[idx]

        encodings = self.tokenizer(
            joke,
            truncation=True,
            padding='max_length', 
            max_length=self.max_length,
            return_tensors="pt" 
        )

        input_ids = encodings['input_ids'].squeeze()
        attention_mask = encodings['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

In [4]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [24]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:

vocab = tokenizer.get_vocab()

sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
print(sorted_vocab[:300]) 

[('!', 0), ('"', 1), ('#', 2), ('$', 3), ('%', 4), ('&', 5), ("'", 6), ('(', 7), (')', 8), ('*', 9), ('+', 10), (',', 11), ('-', 12), ('.', 13), ('/', 14), ('0', 15), ('1', 16), ('2', 17), ('3', 18), ('4', 19), ('5', 20), ('6', 21), ('7', 22), ('8', 23), ('9', 24), (':', 25), (';', 26), ('<', 27), ('=', 28), ('>', 29), ('?', 30), ('@', 31), ('A', 32), ('B', 33), ('C', 34), ('D', 35), ('E', 36), ('F', 37), ('G', 38), ('H', 39), ('I', 40), ('J', 41), ('K', 42), ('L', 43), ('M', 44), ('N', 45), ('O', 46), ('P', 47), ('Q', 48), ('R', 49), ('S', 50), ('T', 51), ('U', 52), ('V', 53), ('W', 54), ('X', 55), ('Y', 56), ('Z', 57), ('[', 58), ('\\', 59), (']', 60), ('^', 61), ('_', 62), ('`', 63), ('a', 64), ('b', 65), ('c', 66), ('d', 67), ('e', 68), ('f', 69), ('g', 70), ('h', 71), ('i', 72), ('j', 73), ('k', 74), ('l', 75), ('m', 76), ('n', 77), ('o', 78), ('p', 79), ('q', 80), ('r', 81), ('s', 82), ('t', 83), ('u', 84), ('v', 85), ('w', 86), ('x', 87), ('y', 88), ('z', 89), ('{', 90), ('|', 9

In [29]:

input_text = "Why don't scientists trust atoms?"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(inputs.input_ids, max_length=20, num_return_sequences=1, do_sample=True, top_k=150)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(outputs)
for word in outputs[0]:
    print(tokenizer.decode(word))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 5195,   836,   470,  5519,  3774, 23235,    30,   383,  5654,   995,
          2125,   470,  1654,   644,  3446,  1838,   257,  7532,   357,   259]],
       device='cuda:0')
Why
 don
't
 scientists
 trust
 atoms
?
 The
 scientific
 world
 isn
't
 sure
 what
 exactly
 makes
 a
 protein
 (
in


In [5]:

tokenizer.pad_token = tokenizer.eos_token
dataset = GPT2JokeDataset(csv_file='reddit-cleanjokes.csv', tokenizer=tokenizer)

train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [7]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=lambda data: {
        'input_ids': torch.stack([f['input_ids'] for f in data]),
        'attention_mask': torch.stack([f['attention_mask'] for f in data]),
        'labels': torch.stack([f['input_ids'] for f in data]), 
    },
)

trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
10,2.3504
20,0.18
30,0.1636
40,0.1558
50,0.1608
60,0.1436
70,0.1506
80,0.1499
90,0.1537
100,0.1537




TrainOutput(global_step=1218, training_loss=0.14646127222989777, metrics={'train_runtime': 176.5044, 'train_samples_per_second': 27.569, 'train_steps_per_second': 6.901, 'total_flos': 1271447027712000.0, 'train_loss': 0.14646127222989777, 'epoch': 3.0})

In [30]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained('./results/checkpoint-1218')
model.to(device)

input_text = "If life gives you melons"

input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

model.eval()

output_ids = model.generate(
    input_ids=input_ids,
    max_length=100,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    num_return_sequences=10,
    do_sample=True
)

for output in output_ids:
    output_text = tokenizer.decode(output, skip_special_tokens=True)
    print(output_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


If life gives you melons, it gives you lemons! [END]
If life gives you melons you can't kill them with a stick? [END]
If life gives you melons, you get melons from life. [END]
If life gives you melons, you have a melon! [END]
If life gives you melons, what do you get when life gives you carrots? A sickle. [END]
If life gives you melons, it gives you melons! [END]
If life gives you melons, it gives you carrots. [END]
If life gives you melons, why not give me a melon? [END]
If life gives you melons, why should life give you melons? Because life gives you melons! [END]
If life gives you melons, why can't life give you carrots? Because life gives you carrots! [END]


In [25]:
device

device(type='cuda')