In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
! pip -q install transformers

[K     |████████████████████████████████| 2.8 MB 4.2 MB/s 
[K     |████████████████████████████████| 636 kB 46.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 20.4 MB/s 
[K     |████████████████████████████████| 895 kB 33.2 MB/s 
[K     |████████████████████████████████| 52 kB 1.5 MB/s 
[?25h

In [3]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [4]:
dialogue_df = pd.read_csv('./dialogue_df.csv')
quotes_df = pd.read_csv('./quotes_df.csv')

In [5]:
trump_names = ['Donald Trump', 'President Trump', 'President Donald Trump', 'President Donald J. Trump']

In [6]:
dialogue_df.loc[np.isin(dialogue_df['name'], trump_names), 'name'] = 'TRUMP'

In [7]:
class Args():
    def __init__(self):
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.block_size = 512
        self.per_gpu_train_batch_size = 1
        self.per_gpu_eval_batch_size = 1
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 5
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1',
        self.device = 'cuda'

args = Args()

In [8]:
import os
os.chdir("/content")

In [9]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelWithLMHead.from_pretrained('microsoft/DialoGPT-small')

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/351M [00:00<?, ?B/s]

# Dataset

In [10]:
tokenizer.eos_token

'<|endoftext|>'

In [11]:
# res = []
# history = []
# for _, row in dialogue_df.iterrows():
#   if row['name'] != "TRUMP":
#     history.append(row['text'])
#   elif history:
#     answer = row['text']
#     res.append([' '.join(history), answer])
#     history = []

# df = pd.DataFrame(res, columns=['context', 'response'])

In [12]:
res = []
history = []
for _, row in dialogue_df.iterrows():
  history.append(row['text'])
  if row['name'] == 'TRUMP' and len(history) > 5:
    response = row['text']
    res.append([f' {tokenizer.eos_token} '.join(history[-5:]), response])

df = pd.DataFrame(res, columns=['context', 'response'])

In [13]:
# df = dialogue_df.loc[dialogue_df['name'] == 'TRUMP', 'text'].append(quotes_df['quote'], ignore_index=True)

In [14]:
new_df = pd.DataFrame([''] * quotes_df.shape[0], columns=['context'])
new_df['context'] = ''
new_df['response'] = quotes_df['quote']

In [15]:
df = df.append(new_df, ignore_index=True)

In [16]:
df_train, df_test, _, _ = train_test_split(df, df, train_size=0.7)

In [17]:
class TrumpDataset(Dataset):
  def __init__(self, df, tokenizer):
    self.encoded_df = df.apply(lambda x: [tokenizer.encode(i)[:tokenizer.model_max_length] for i in x], axis=0)
    # self.encoded_df = df.apply(lambda x: tokenizer.encode(x)[:tokenizer.model_max_length])

  def __len__(self):
      return len(self.encoded_df)

  def __getitem__(self, i):
      row = self.encoded_df.iloc[i]
      tokens = row['context'] + [tokenizer.eos_token_id] + row['response']
      return torch.tensor(tokens, dtype=torch.long) 

In [18]:
dataset_train = TrumpDataset(df_train, tokenizer)
dataset_test = TrumpDataset(df_test, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1143 > 1024). Running this sequence through the model will result in indexing errors


In [19]:
dataloader_train = DataLoader(dataset_train, 
                              batch_size=args.per_gpu_train_batch_size,
                              collate_fn=lambda x: pad_sequence(x, batch_first=True),
                              shuffle=True)

dataloader_test = DataLoader(dataset_test, 
                              batch_size=args.per_gpu_eval_batch_size,
                              collate_fn=lambda x: pad_sequence(x, batch_first=True),
                              shuffle=False)

# Train

In [20]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

In [21]:
set_seed(args)

In [22]:
optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)

In [23]:
train_loss, test_loss = 0.0, 0.0

model.to(args.device)


for epoch in range(1, args.num_train_epochs + 1):
    model.train()
    lost_counter = 0

    for step, batch in enumerate(tqdm(dataloader_train, desc=f'Epoch {epoch}')):

        model.zero_grad()

        inputs, labels = (batch, batch)

        if inputs.shape[1] > 1024:
            lost_counter += 1
            continue

        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        outputs = model(inputs, labels=labels)
        loss = outputs[0]

        loss.backward()

        train_loss += loss.item()

        optimizer.step()

    model.eval()
    for step, batch in enumerate(tqdm(dataloader_train, desc=f'Epoch {epoch}')):

        inputs, labels = (batch, batch)

        if inputs.shape[1] > 1024: continue

        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        outputs = model(inputs, labels=labels)
        loss = outputs[0]

        test_loss += loss.item()
    
    print(train_loss / epoch)
    print(test_loss / epoch)
    print(lost_counter)




Epoch 1:   0%|          | 0/2165 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2165 [00:00<?, ?it/s]

6751.977539658546
4637.244629621506
12


Epoch 2:   0%|          | 0/2165 [00:00<?, ?it/s]

Epoch 2:   0%|          | 0/2165 [00:00<?, ?it/s]

5801.204813927412
4235.130446597934
12


Epoch 3:   0%|          | 0/2165 [00:00<?, ?it/s]

Epoch 3:   0%|          | 0/2165 [00:00<?, ?it/s]

5244.194776644309
3906.2488407492638
12


Epoch 4:   0%|          | 0/2165 [00:00<?, ?it/s]

Epoch 4:   0%|          | 0/2165 [00:00<?, ?it/s]

4828.275049686432
3624.1567386612296
12


Epoch 5:   0%|          | 0/2165 [00:00<?, ?it/s]

Epoch 5:   0%|          | 0/2165 [00:00<?, ?it/s]

4489.4482942610975
3380.0438812851908
12


# Dialog

In [24]:
def remove_zeroes(inputs):
  i = inputs.shape[-1] - 1
  while inputs[0][i] == 0 and i > -1:
    i -= 1
  
  return inputs[:, :i+1]

In [49]:
model.to('cpu')
model.eval()

step = 0

while True:
    msg = input(">> User:")
    if msg == 'stop':
        break
    new_user_input_ids = tokenizer.encode(msg + tokenizer.eos_token, return_tensors='pt')

    # bot_input_ids = torch.cat([remove_zeroes(chat_history_ids), new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    bot_input_ids = new_user_input_ids

    bot_input_ids = bot_input_ids[:, -50:]

    chat_history_ids = model.generate(
        bot_input_ids, max_length=100,
        pad_token_id=tokenizer.eos_token_id,  
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature = 0.9
    )

    step += 1

    print("TrumpBot: {}".format(tokenizer.decode(remove_zeroes(chat_history_ids[:, bot_input_ids.shape[-1]:])[0], skip_special_tokens=True)))

>> User:hey!
TrumpBot: I’m here with my old buddy Harris Faulkner. Harris is a great book. 
>> User:Trump? is it you?
TrumpBot:  It’s me. I’m the President. I’m the one that locked down Michigan. I did it in my own words, and I said, “Lock up the Governor.” And it was the Democrats. 


KeyboardInterrupt: ignored

In [50]:
torch.save(model, 'gptrum-small.model')

In [56]:
run()

>> User:hey Trump! How are you?
TrumpBot:  Trump is a laughingstock. He’s a laughingstock. And it blows my mind that he got elected. And it blows my mind that he got elected. I’ve never been able to understand him. I was wondering how he got elected, and I don’t know how he got elected. I don’t know how he got elected. I don’t know how he got elected. I don’t
>> User:Who are you?
TrumpBot:  I’m the President. I’m the President of the United States. 
>> User:stop
