In [3]:
!pip install -q transformers==4.17.0 datasets==2.0.0 rich[jupyter]
!pip install -q -U PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
[K     |████████████████████████████████| 325 kB 66.8 MB/s 
[K     |████████████████████████████████| 231 kB 72.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 49.5 MB/s 
[K     |████████████████████████████████| 880 kB 56.5 MB/s 
[K     |████████████████████████████████| 596 kB 49.6 MB/s 
[K     |████████████████████████████████| 84 kB 1.4 MB/s 
[K     |████████████████████████████████| 212 kB 76.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 58.4 MB/s 
[K     |████████████████████████████████| 136 kB 64.8 MB/s 
[K     |████████████████████████████████| 127 kB 71.0 MB/s 
[K     |████████████████████████████████| 94 kB 4.2 MB/s 
[K     |████████████████████████████████| 271 kB 72.5 MB/s 
[K     |████████████████████████████████| 144 kB 66.4 MB/s 
[K     |████████████████████████████████| 51 kB 8.9 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: pip's depe

#GPT-2 Decoder
The intention of this notebook is to build a GPT-2-based decoder that can be fine-tuned on style-transfer data to decode from a neutral paraphrased sentence to a sentence in a style that is represented in a style embedding that is passed in as a "word vector" to the decoder.

#GPT-2 Demo

In [3]:
# from transformers import pipeline, set_seed
# generator = pipeline('text-generation', model='gpt2')
# set_seed(42)
# generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

#Configuration Setup
Hardware accelerator + necessary imports along with GPT-2 Configuration object creation.

In [1]:
import torch

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


In [4]:
from transformers import GPT2LMHeadModel, GPT2Config, GPT2TokenizerFast, BertForSequenceClassification
from datasets import load_dataset
import pandas as pd
import numpy as np
import random

from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

configuration = GPT2Config(n_embd=768)

model = GPT2LMHeadModel.from_pretrained('gpt2')

model.to(device)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

#Dataset Loading

In [5]:
data_file = drive.CreateFile({'id': '1a72PS0BiFYHY6mQV2rukQs60B_8yEDJ1'})
data_file.GetContentFile('dev.csv')
print('validation set downloaded')

data_file = drive.CreateFile({'id': '1VnWao5bgr8LWa-YjdYS9vHZbTnmq2Par'})
data_file.GetContentFile('test.csv')
print('test set downloaded')

data_file = drive.CreateFile({'id': '1qroZT1nfXbutQMu3OTEUX11fvLXEgzn_'})
data_file.GetContentFile('train.csv')
print('training set downloaded')

validation set downloaded
test set downloaded
training set downloaded


In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('dev.csv')

#Tokenization and Preparation

Tokenization and preparation of text, including getting style embeddings from the BERT model that was trained as a style classifier.

In [7]:
train['label'] = train.label.replace(['aae','bible','coha_1810', 'coha_1890', 'coha_1990', 'joyce', 'lyrics', 'poetry', 'shakespeare', 'switchboard', 'tweets'],[0,1,2,3,4,5,6,7,8,9,10]) 
test['label'] = test.label.replace(['aae','bible','coha_1810', 'coha_1890', 'coha_1990', 'joyce', 'lyrics', 'poetry', 'shakespeare', 'switchboard', 'tweets'],[0,1,2,3,4,5,6,7,8,9,10]) 
val['label'] = val.label.replace(['aae','bible','coha_1810', 'coha_1890', 'coha_1990', 'joyce', 'lyrics', 'poetry', 'shakespeare', 'switchboard', 'tweets'],[0,1,2,3,4,5,6,7,8,9,10]) 


def sample_from_same_style_and_add_column(df):
  #this function assumes the same number of elements with each label: consistent with our dataset balancing
  grouped = df.groupby(by='label')
  num = int(grouped.count().text.values[0])
  style_col = grouped.sample(n=num).text.values
  ret_df = df.sort_values(by='label')
  ret_df['style_example'] = style_col
  ret_df = ret_df.sort_index()
  return ret_df
  

train = sample_from_same_style_and_add_column(train)
test = sample_from_same_style_and_add_column(test)
val = sample_from_same_style_and_add_column(val)

valid_text = val.text.values
valid_style = val.style_example.values
valid_label = val.label.values
test_text = test.text.values
test_style = test.style_example.values
test_label = test.label.values
train_text = train.text.values
train_style = train.style_example.values
train_label = train.label.values


In [9]:
testing_out = drive.CreateFile({'title': 'test.csv'})

test.to_csv('test_out.csv')

###GPT-2 Setup and Preprocessing

In [32]:
def generate_decoder_inputs(df):
  bos_token = "<|endoftext|>"
  result = []
  for t, p, s in zip(df.text.values, df.paraphrase.values, df.style_example.values):
    result.append(f"{s} {bos_token} {p} {bos_token} {t}")
  return result

train_dec_in = generate_decoder_inputs(train.head(10000))
test_dec_in = generate_decoder_inputs(test.head(1000))


In [11]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token="<|pad|>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
def tokenize(batch):
    return tokenizer.batch_encode_plus(
        batch,
        max_length=100,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
        )

ttr = tokenize(train_dec_in)
tte = tokenize(test_dec_in)
tr_input_ids, tr_mask = ttr['input_ids'], ttr['attention_mask']
te_input_ids, te_mask = tte['input_ids'], tte['attention_mask']

In [21]:
tr_input_ids

tensor([[ 1219,   345,   467,  ..., 50257, 50257, 50257],
        [ 8053,  3101,   284,  ..., 50257, 50257, 50257],
        [15086,  7670,   373,  ..., 50257, 50257, 50257],
        ...,
        [ 2164,   592,   684,  ..., 50257, 50257, 50257],
        [ 1212,   481,   307,  ..., 50257, 50257, 50257],
        [ 3666,  9197,   840,  ..., 50257, 50257, 50257]])

#Train

In [33]:
from torch.utils.data import Dataset

class TransferDataset(Dataset):  
    def __init__(self, sequences):

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token="<|endoftext|>")
        # self.input_ids = []
        # self.attn_masks = []
        self.items = []
        for s in sequences:
          d = self.tokenize(s)
          d['labels'] = d['input_ids']
          self.items.append(d)
          #if d['input_ids'].shape != d['attention_mask'].shape:
          #print(d['input_ids'].shape, d['attention_mask'].shape)
          # self.input_ids.append(torch.tensor(d['input_ids']))
          # self.attn_masks.append(torch.tensor(d['attention_mask']))

    def tokenize(self, batch):
      return self.tokenizer(
          batch,
          max_length=100,
          truncation=True,
          padding='max_length',
          return_tensors="pt"
        )
        
    def __len__(self):
        return len(self.items)

    def __getitem__(self, item):
        return self.items[item]
    
tr_dataset = TransferDataset(train_dec_in) 
te_dataset = TransferDataset(test_dec_in)   

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb1993821

In [35]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    warmup_steps=200,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tr_dataset,
    eval_dataset=te_dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [36]:
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 4
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 1
  Total optimization steps = 2000


Step,Training Loss
500,1.8681
1000,1.7626
1500,1.687
2000,1.6432


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2000, training_loss=1.7402291564941406, metrics={'train_runtime': 1128.2001, 'train_samples_per_second': 35.455, 'train_steps_per_second': 1.773, 'total_flos': 2041344000000000.0, 'train_loss': 1.7402291564941406, 'epoch': 4.0})

In [37]:
torch.save(model.state_dict(), 'decoder.pth')

In [62]:
model.cpu()

RuntimeError: ignored

In [58]:
def generate_output(style, para):
  
  bos = "<|endoftext|>"
  inputs = [f"{s} {bos} {p} {bos}" for s, p in zip(style, para)]
  outputs = []
  for input in inputs:
    input_tensor = tokenizer(input, return_tensors="pt", max_length=100)
    input_tensor = input_tensor.to(device)
    raw_out = tokenizer.decode(model.generate(**input_tensor, max_length=100)[0])
    try:
      lidx = raw_out.rindex(bos)
      outputs.append(' '.join(raw_out[lidx:].split(' ')[1:]))
    except:
      outputs.append("")
    return outputs

test_df = test.head(10)
generate_output(test_df.style_example.values, test_df.paraphrase.values)


RuntimeError: ignored