# Fine-tuning gpt-2 on Trump's tweets dataset

In [1]:
!pip install transformers

import pandas as pd
import numpy as np
import torch
import torch.nn
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel, GPT2Config, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, random_split
import random

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 16.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 54.5MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 56.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=1100cc2dbf706c4809

In [3]:
# seed everything
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

## Loading data

Dataset is tiny, but it still 

In [6]:
data = pd.read_csv('./data.csv')['tweet']
data = data.drop_duplicates()


## Loading pretrained gpt2 tokenizer & creating custom datasets and dataloaders

In [7]:
# specify technical tokens
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', 
                                          bos_token='<|BOS|>', 
                                          eos_token='<|EOS|>', 
                                          pad_token='<|PAD|>')



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.





In [8]:
class TRUMPdataset(Dataset):

    def __init__(self, corpus, tokenizer, gpt2_type="gpt2", max_length=40):

        self.tokenizer = tokenizer 
        self.input_ids = []
        self.attention_masks = []

        for sequence in corpus:
        # Loop through all corpus to add eos, bos and padding + truncate
            encodings_dict = tokenizer('<|BOS|>'+ sequence + '<|EOS|>', 
                                         truncation=True, 
                                         max_length=max_length,
                                         padding = 'max_length')

            # assign tokenizer output to id's and attention masks   
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attention_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
    # add standard datset methods
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx] 

In [9]:
alldataset = TRUMPdataset(data, tokenizer, max_length=60)
tr = int(0.8 * len(alldataset))
vl = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, (tr, vl))

In [10]:
batch_size = 32
train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size 
        )

## Loading pretrained model & building train/val loops

In [11]:
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda")
model = model.to(device)

# set some of the standard lr values from 5e-5 to 5e-4
optimizer = AdamW(model.parameters(), lr = 5e-4)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [12]:
def train(model, optimizer, train_loader, num_epochs =3, ):
    for epoch in range(num_epochs):

        print(f'Epoch: {epoch}')

        total_train_loss = 0
        model.train()

    for step, batch in enumerate(train_loader):

        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        model.zero_grad()

        output = model(input_ids, labels=labels, attention_mask = masks, token_type_ids=None)
        loss = output[0]
        batch_loss = loss.item() 
        total_train_loss += batch_loss  

        if step % 50 == 0:
            print(f'Batch {step} of {len(train_loader)}, loss = {batch_loss}')

        loss.backward()

        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)  



def evaluate(model, val_loader):
    total_eval_loss=0
    for batch in val_loader:

        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        with torch.no_grad():
          
          outputs  = model(input_ids,  
                             attention_mask = masks,
                             labels = labels)
          
          loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f'Validation loss is {avg_val_loss}')





## Training 

In [15]:
train(model, optimizer, train_dataloader)

Epoch: 0
Batch 0 of 142, loss = 68.0325927734375
Batch 50 of 142, loss = 3.5193979740142822
Batch 100 of 142, loss = 3.412783145904541
Epoch: 1
Batch 0 of 142, loss = 2.7614707946777344
Batch 50 of 142, loss = 2.785888671875
Batch 100 of 142, loss = 2.4498608112335205
Epoch: 2
Batch 0 of 142, loss = 2.271756172180176
Batch 50 of 142, loss = 2.420562267303467
Batch 100 of 142, loss = 2.0707485675811768


In [16]:
evaluate(model, validation_dataloader)

Validation loss is 2.6227709386083813


In [17]:
#save model
save_path = '/drive/TwitterGPT2/model'
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('/drive/TwitterGPT2/model/tokenizer_config.json',
 '/drive/TwitterGPT2/model/special_tokens_map.json',
 '/drive/TwitterGPT2/model/vocab.json',
 '/drive/TwitterGPT2/model/merges.txt',
 '/drive/TwitterGPT2/model/added_tokens.json')

In [18]:
#load model
save_path = '/drive/TwitterGPT2/model'
model = GPT2LMHeadModel.from_pretrained(save_path)
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
model = model.to(device)

## Results

### Greedy search
This model chooses the most probable next word. It has some problems (i.e. repeating phrases) and sounds not human-like.

In [23]:
ds = torch.tensor(tokenizer.encode("<|BOS|>")).unsqueeze(0)
ids = ids.to(device)

samples = model.generate(ids, max_length = 70)

for i, sample_output in enumerate(samples):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}\n\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: The Democrats are trying to make it impossible for the Republican Party to win the House. They are trying to make it impossible for the Republican Party to win the Senate. They are trying to make it impossible for the Republican Party to win the House. They are trying to make it impossible for the Republican Party to win the House. They are trying to




### Beam  search
The second approach to text generation is based on choosing k most probable words at each timestamp and iterate until the end of the sequence. Then, model outputs the most probable sequence out of them. Such method outperforms simple greedy search, which looks only on one next word. We also use n_gram penalty to prevent model from repeating same passages

In [19]:
ids = torch.tensor(tokenizer.encode("<|BOS|>")).unsqueeze(0)
ids = ids.to(device)

samples = model.generate(       ids, 
                                num_beams=5, 
                                early_stopping=True, 
                                max_length = 70,
                                no_repeat_ngram_size=2,
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(samples):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}\n\n"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: The failing @nytimes is a disgrace to journalism. They should be ashamed of themselves and their dishonesty. I hope they will be forced to apologize to the people of the U.S. for what they have done, and for the terrible things that have been said about them. https://t


1: The failing @nytimes is a disgrace to journalism. They should be ashamed of themselves and their dishonesty. I hope they will be forced to apologize to the people of the U.S. for what they have done, and for the terrible things that have been said about them. https://t


2: The failing @nytimes is a disgrace to journalism. They should be ashamed of themselves and their dishonesty. I hope they will be forced to apologize to the people of the U.S. for what they have done, and for the terrible things that have been said about me. https://t


3: The failing @nytimes is a disgrace to journalism. They should be ashamed of themselves and their dishonesty. I hope they will be forced to apologize to the people of the U.S.

### More complicated way to generate sequnces is the probabalistic Top-K sampling
This method randomly samples words from a set of K most probable ones at each time stamp.

In [24]:
model.eval()

ids = torch.tensor(tokenizer.encode("<|BOS|>")).unsqueeze(0)
ids = ids.to(device)

samples = model.generate(       ids, 
                                do_sample=True, 
                                top_k=50, 
                                max_length = 70, 
                                num_return_sequences=4
                                )

for i, sample_output in enumerate(samples):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}\n\n" )

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: So now we have more votes in the Supreme Court than is necessary to secure the Supreme Court, a number that has never been seen before by a Supreme Court Justices. So when will somebody who has voted for the most corrupt President in our history, or should have taken the Radical Left vote, actually get to the Supreme Court, perhaps with Justice


1: It is happening! https://t.co/s7cKZ2gLjQJ


2: How can Bill Barr have been in charge of the Justice Department for seven years without making this decision in favor of Mueller & more? But he doesn’t have the lawyers to do such a job!


3: The failing @nytimes just reported the Trump campaign is spending $35,000,000,000 on ads. The Times is totally biased & fraudulent!




### Top-p sampling
Previous method has a drawback - number K of most relevant words is constant, however probability distributions of the next words in fact vary greatly, i.e. after word 'I' or 'The' there is much more equaly likley candidates then after the word 'Airplane'. Top- p sampling calculate cumulative probablity of the most likely words and stops when it hits top_p parameter. Then, number of candidates at each timestamp is not constant anymore. 

In [25]:
model.eval()

ids = torch.tensor(tokenizer.encode("<|BOS|>")).unsqueeze(0)
ids = ids.to(device)

samples = model.generate(       ids, 
                                do_sample=True, 
                                top_k=50, 
                                max_length = 70,
                                top_p=0.92, 
                                num_return_sequences=4
                                )

for i, sample_output in enumerate(samples):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}\n\n" )

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Despite the many great things the President is doing, the Radical Left Democrats, who are working hard to obstruct and delay, are refusing to go to the Senate because they are afraid of losing the House in the coming Election. They are refusing to show up because our Constitution, the Constitution itself, is not up for them. I have had a long


1: We are getting ‘big and fast ‘piggybacking’ in Florida and elsewhere. The New York Times has taken down the Fake News in order to bring it back into relevancy. But what about @DACA? Why isn’t our country moving fast enough? The Dems want to take away everything I


2: Watching @CNN @CNN, @washingtonpost, @NBCNews on the big stage, and watching the press conference, like it’s a total double standard. Many things are saying wrong on the show, like the fact that the Lamestream Media and their bosses are not looking very good for the election - just want the


3: Crooked Hillary said "the Fake News is dead." What is that? Hillary said we shoul

We can also specify beginning of the sequence and let model complete it:

In [27]:
model.eval()

ids  = tokenizer.encode('Ice cream', return_tensors='pt').to(device)
ids = ids.to(device)
samples = model.generate(       ids, 
                                do_sample=True, 
                                top_k=50, 
                                max_length = 70,
                                top_p=0.92, 
                                num_return_sequences=4
                                )

for i, sample_output in enumerate(samples):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}\n\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Ice cream with my campaign manager, “Nancy Pelosi, for giving up on my campaign. She lost anyway. “I think she is losing to Crooked Hillary” & myself, and she’s not doing very well!” @foxandfriends @foxandfriends @greggwashington 
 https://t


1: Ice creamery in Florida is out of control. Fake stories, just as I have reported, are going to be a big source of trouble, and we are going to win big with the people and the “s.”

The Fake News Media is totally out of control, and they should be on notice! https://t.


2: Ice cream. #FakeNews. Not so great. A terrible day for America today. We are doing a GREAT job, & the Dems don’t have the power!


3: Ice cream? I heard ‘I’ve been doing it since I started (and it is amazing).’ What is going on?
 https://t.co/w7r4q4wKwTm




Data:
1. https://www.kaggle.com/ayushggarg/all-trumps-twitter-insults-20152021

Based on:
1. https://huggingface.co/blog/how-to-generate
2. https://medium.com/swlh/fine-tuning-gpt-2-for-magic-the-gathering-flavour-text-generation-3bafd0f9bb93