In [None]:
pip install git+https://github.com/huggingface/transformers@master

In [1]:
import gc
import torch
import pandas as pd
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
torch.manual_seed(42)

<torch._C.Generator at 0x7f2114858fb0>

### Loading GPT2-Medium Model from 🤗 Model Hub 

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 1024)

In [3]:
texts = pd.read_csv('elon_musk_tweets.csv')['text']

In [4]:
max_length = max([len(tokenizer.encode(text)) for text in texts])
print(max_length)

65


In [5]:
class TextDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.labels = []
        self.input_ids = []
        self.attn_masks = []        
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    def __len__(self): return len(self.input_ids)
    def __getitem__(self, idx): return self.input_ids[idx], self.attn_masks[idx]

In [6]:
dataset = TextDataset(texts, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
gc.collect()

25

In [7]:
torch.cuda.empty_cache()

In [8]:
training_args = TrainingArguments(output_dir='./results',
                                  num_train_epochs=1,
                                  logging_steps=10,
                                  save_steps=5000,
                                  per_device_train_batch_size=40,
                                  per_device_eval_batch_size=40,
                                  warmup_steps=10,
                                  weight_decay=0.05,
                                  logging_dir='./logs',
                                  report_to = 'none')


In [9]:
Trainer(model=model,
        args=training_args,
        train_dataset = train_dataset, 
        eval_dataset = val_dataset,
        data_collator = lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                      'attention_mask': torch.stack([f[1] for f in data]),
                                      'labels': torch.stack([f[0] for f in data])}).train()

Step,Training Loss
10,19.2384
20,2.5705
30,1.8266
40,1.5349
50,1.4638
60,1.383
70,1.4073
80,1.3346
90,1.2875
100,1.3062


TrainOutput(global_step=133, training_loss=2.812792770844653, metrics={'train_runtime': 134.1415, 'train_samples_per_second': 39.607, 'train_steps_per_second': 0.991, 'total_flos': 626410432327680.0, 'train_loss': 2.812792770844653, 'epoch': 1.0})

### GPT Generated Texts

In [None]:
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

In [None]:
sample_outputs = model.generate(generated,
                                do_sample = True,
                                top_k = 50, 
                                max_length = 300,
                                top_p = 0.95,
                                temperature = 1.0,
                                num_return_sequences = 20)

In [None]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

### Original Texts (Random)

In [None]:
pd.options.display.max_colwidth = 1000
texts.sample(10)