In [2]:
!pip install -q transformers==4.17.0 datasets==2.0.0 rich[jupyter]
!pip install -q -U PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user and build the PyDrive client (`drive`) that
# later cells use to download the dataset splits and the classifier checkpoint.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

success!


# GPT-2 Decoder
This notebook builds a GPT-2-based decoder that can be fine-tuned on style-transfer data. Given a neutral paraphrase of a sentence, the decoder generates that sentence in a target style. The target style is represented by a style embedding that is passed to the decoder as a "word vector".

# GPT-2 Demo

In [3]:
# from transformers import pipeline, set_seed
# generator = pipeline('text-generation', model='gpt2')
# set_seed(42)
# generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

# Configuration Setup
Hardware accelerator selection and the necessary imports, along with creation of the GPT-2 configuration object.

In [4]:
import torch

# Select the compute device. Prefer the GPU, but fall back to the CPU instead
# of raising when the runtime has no CUDA device, so the rest of the notebook
# can still be imported and inspected.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name()
else:
    device = torch.device("cpu")
    device_name = "cpu"
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: Tesla T4, n_gpu: 1


In [5]:
from transformers import GPT2LMHeadModel, GPT2Config, GPT2TokenizerFast, BertForSequenceClassification
from datasets import load_dataset
import pandas as pd
import numpy as np
import random

from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

# NOTE(review): `configuration` is not passed to `from_pretrained` below, so it
# has no effect on the model that is actually loaded.
configuration = GPT2Config(n_embd=768)

# Pretrained GPT-2 tokenizer and language-model head ('gpt2' checkpoint).
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Move the model to the selected device; as the cell's last expression this
# also displays the module structure.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

# Dataset Loading

In [6]:
# Download the three dataset splits from Google Drive into the working
# directory: (Drive file id, local file name, progress message).
for drive_id, local_name, done_message in (
    ('1a72PS0BiFYHY6mQV2rukQs60B_8yEDJ1', 'dev.csv', 'validation set downloaded'),
    ('1VnWao5bgr8LWa-YjdYS9vHZbTnmq2Par', 'test.csv', 'test set downloaded'),
    ('1qroZT1nfXbutQMu3OTEUX11fvLXEgzn_', 'train.csv', 'training set downloaded'),
):
    data_file = drive.CreateFile({'id': drive_id})
    data_file.GetContentFile(local_name)
    print(done_message)

validation set downloaded
test set downloaded
training set downloaded


In [7]:
# Read the downloaded splits into DataFrames (train, test, validation).
train, test, val = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv', 'dev.csv')
)

# Tokenization and Preparation

Tokenization and preparation of text, including getting style embeddings from the BERT model that was trained as a style classifier.

In [8]:
# Encode each style name as an integer class id (its position in this list),
# updating the `label` column of every split in place.
style_names = ['aae','bible','coha_1810', 'coha_1890', 'coha_1990', 'joyce', 'lyrics', 'poetry', 'shakespeare', 'switchboard', 'tweets']
style_ids = list(range(len(style_names)))
for split_df in (train, test, val):
    split_df['label'] = split_df.label.replace(style_names, style_ids)


def sample_from_same_style_and_add_column(df, random_state=None):
  """Pair every row with a randomly drawn text that carries the same label.

  The texts inside each label group are shuffled, and the shuffled texts are
  written to a new `style_example` column, so every row receives an example
  sentence of its own style (possibly the row's own text).

  Unlike the earlier version, this does not require every label to occur the
  same number of times: each group is permuted at its own size.

  Args:
    df: DataFrame with `text` and `label` columns. Labels are assumed to be
      non-null (groupby drops null keys, which would break the alignment).
    random_state: optional seed / RandomState forwarded to pandas for a
      reproducible shuffle. Defaults to None (non-deterministic).

  Returns:
    A copy of `df`, in its original index order, with the extra
    `style_example` column. `df` itself is not modified.
  """
  # frac=1 draws a full permutation of each group. The result is laid out
  # group by group, with the groups in ascending label order.
  shuffled = df.groupby(by='label').sample(frac=1, random_state=random_state)

  # Sorting by label gives the same group layout, so the shuffled texts can
  # be assigned positionally; sort_values returns a copy, leaving df untouched.
  ret_df = df.sort_values(by='label', kind='stable')
  ret_df['style_example'] = shuffled.text.values

  # Restore the caller's row order.
  return ret_df.sort_index()
  

# Attach a same-style example sentence to every row of each split
# (processed in the order train, test, val).
train, test, val = (
    sample_from_same_style_and_add_column(split_df)
    for split_df in (train, test, val)
)

# Pull the text, style-example and label columns out as numpy arrays.
train_text, train_style, train_label = (
    train.text.values, train.style_example.values, train.label.values
)
test_text, test_style, test_label = (
    test.text.values, test.style_example.values, test.label.values
)
valid_text, valid_style, valid_label = (
    val.text.values, val.style_example.values, val.label.values
)


### Processing for input to the BERT Model

In [9]:
from transformers import BertForSequenceClassification, BertTokenizer, BertModel

def tokenize_and_format(sentences, tokenizer=None, max_length=64):
  """Encode sentences one at a time for the BERT style classifier.

  Args:
    sentences: iterable of strings to encode.
    tokenizer: optional tokenizer exposing `encode_plus`. When omitted, the
      'bert-base-uncased' tokenizer is loaded. Pass one in to avoid loading
      it again on every call.
    max_length: length every sentence is padded or truncated to (default 64,
      the value that used to be hard-coded).

  Returns:
    `(input_ids, attention_masks)`: two lists with one entry per sentence,
    holding the `input_ids` and `attention_mask` values returned by
    `encode_plus`. With `return_tensors='pt'` these are presumably tensors of
    shape (1, max_length) -- TODO confirm against the tokenizer in use.
  """
  if tokenizer is None:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  input_ids = []
  attention_masks = []

  for sentence in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids.append(encoded_dict['input_ids'])
    # The attention mask differentiates padding from non-padding positions.
    attention_masks.append(encoded_dict['attention_mask'])

  return input_ids, attention_masks



In [10]:
# BERT-encode every split twice: once for the same-style example sentences
# (`*_style_*`) and once for the original texts (`*_text_*`).
train_style_ids, train_style_masks = tokenize_and_format(train_style)
train_text_ids, train_text_masks = tokenize_and_format(train_text)
test_style_ids, test_style_masks = tokenize_and_format(test_style)
test_text_ids, test_text_masks = tokenize_and_format(test_text)
val_style_ids, val_style_masks = tokenize_and_format(valid_style)
val_text_ids, val_text_masks = tokenize_and_format(valid_text)

In [12]:
def stk(lt):
  """Stack the first row of every tensor in `lt` into a single batch tensor."""
  first_rows = []
  for item in lt:
    first_rows.append(item[0])
  return torch.stack(first_rows)

#style_model(stk(val_input_ids[0:2]), attention_mask=stk(val_masks[0:2]))[1][0][:,0].shape

In [13]:

# BERT sequence classifier with 11 output classes (one per style label).
# Hidden states are requested so they can later serve as style embeddings.
style_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 11,
    output_attentions = False,
    output_hidden_states = True, # Whether the model returns all hidden-states.
)


# Download the trained style-classifier weights from Drive ...
data_file = drive.CreateFile({'id': '1pJjclb_Ht-fklPtK7baDj1dvdm2U8Cub'})
data_file.GetContentFile('style_classifier.pth')

# ... and load them over the pretrained initialisation.
# NOTE(review): `style_model` is not moved to `device` in this cell.
style_model.load_state_dict(torch.load('style_classifier.pth'))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [14]:
def get_style_embeddings(style_em_model, input_ids, masks, batch_size=64, layer=-1):
  """Compute one [CLS] style embedding per sentence with the style classifier.

  Changes from the earlier version:
    * The embedding is read from hidden-state `layer` (default -1, the last
      encoder layer). It used to read index 0, the embedding-layer output,
      whose position 0 is always the [CLS] token and so is the same vector
      for every sentence. Pass `layer=0` to get the old values.
    * Sentences are processed in mini-batches rather than one forward pass
      over the whole split, which keeps memory bounded.
    * Inputs are moved to the model's device, so the model may live on a GPU.

  Args:
    style_em_model: model called as `model(ids, attention_mask=mask)` whose
      output at index 1 is the tuple of hidden states (the classifier must be
      built with `output_hidden_states=True`).
    input_ids: sequence of tensors; the first row of each is stacked into the
      batch (matches the per-sentence output of `tokenize_and_format`).
    masks: attention masks, in the same layout as `input_ids`.
    batch_size: number of sentences per forward pass.
    layer: index into the hidden-state tuple to read the embedding from.

  Returns:
    Tensor with one row per sentence, on the same device as the stacked
    inputs.

  Raises:
    ValueError: if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  style_em_model.eval()

  input_id_tensors = torch.stack([data[0] for data in input_ids])
  input_mask_tensors = torch.stack([data[0] for data in masks])

  # Run on whatever device the model lives on; hand results back on the
  # device the inputs came from.
  source_device = input_id_tensors.device
  first_param = next(style_em_model.parameters(), None)
  model_device = source_device if first_param is None else first_param.device

  chunks = []
  with torch.no_grad():
    for start in range(0, input_id_tensors.shape[0], batch_size):
      b_input_ids = input_id_tensors[start:start + batch_size].to(model_device)
      b_input_mask = input_mask_tensors[start:start + batch_size].to(model_device)

      hidden_states = style_em_model(b_input_ids, attention_mask=b_input_mask)[1]
      # Position 0 of the chosen layer is the [CLS] representation.
      chunks.append(hidden_states[layer][:, 0].to(source_device))

  return torch.cat(chunks)

In [None]:
# Embed the same-style example sentences of the training split. The other
# split/variant combinations below are currently disabled.
tr_other_style_embeddings = get_style_embeddings(style_model, train_style_ids, train_style_masks)
# tr_same_style_embeddings = get_style_embeddings(style_model, train_text_ids, train_text_masks)
# te_other_style_embeddings = get_style_embeddings(style_model, test_style_ids, test_style_masks)
# te_same_style_embeddings = get_style_embeddings(style_model, test_text_ids, test_text_masks)
# val_other_style_embeddings = get_style_embeddings(style_model, val_style_ids, val_style_masks)
# val_same_style_embeddings = get_style_embeddings(style_model, val_text_ids, val_text_masks)

### GPT-2 Setup and Preprocessing

In [None]:
def get_text_and_paraphrase(df):
  """Return the `text` and `paraphrase` columns of `df` as a pair of arrays."""
  text_values = df['text'].values
  paraphrase_values = df['paraphrase'].values
  return text_values, paraphrase_values

# Each split is read from its own DataFrame. The test and validation arrays
# were previously extracted from `train` by mistake, so all three "splits"
# contained the training data.
train_text, train_paraphrase = get_text_and_paraphrase(train)
test_text, test_paraphrase = get_text_and_paraphrase(test)
val_text, val_paraphrase = get_text_and_paraphrase(val)

In [None]:
def tokenize(batch, max_length=50):
    """Encode a batch of strings with the notebook-level GPT-2 `tokenizer`.

    Args:
        batch: iterable of strings (a list or a numpy array such as the
            `.values` of a DataFrame column).
        max_length: length every sequence is padded or truncated to
            (default 50, the value that used to be hard-coded).

    Returns:
        Whatever `tokenizer.batch_encode_plus` returns for the batch, with
        PyTorch tensors requested via `return_tensors="pt"`.

    Side effects:
        If the tokenizer has no padding token, its end-of-sequence token is
        registered as the padding token on the shared tokenizer object.
    """
    # The pretrained GPT-2 tokenizer defines no pad token, so requesting
    # padding would raise; reusing EOS as PAD is the usual workaround.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return tokenizer.batch_encode_plus(
        # Hand the tokenizer a plain list even when the caller passes an array.
        list(batch),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
        )

# GPT-2 encodings of the styled texts and their paraphrases, split by split
# (train, then test, then validation).
texttok_tr = tokenize(train_text)
paratok_tr = tokenize(train_paraphrase)

texttok_te = tokenize(test_text)
paratok_te = tokenize(test_paraphrase)

texttok_va = tokenize(val_text)
paratok_va = tokenize(val_paraphrase)

In [None]:
##Converting tokenized values into embeddings


# Train

In [None]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune `model` as a causal language model with gradient accumulation.

    Entries are drawn one at a time from `dataset` and packed into longer
    sequences by `pack_tensor` (not defined in this part of the notebook --
    TODO confirm its contract). Each packed sequence is used as both input
    and label, and an optimizer step is taken every `batch_size` backward
    passes.

    NOTE(review): defining this function rebinds the notebook-level name
    `train`, which earlier cells use for the training DataFrame. Re-running
    those cells after this one will fail until the frame is reloaded.

    Args:
        dataset: dataset wrapped in a DataLoader with batch_size=1 and
            shuffling. Each entry is presumably a tensor of token ids --
            TODO confirm.
        model: called as `model(ids, labels=ids)`; element 0 of the output is
            taken as the loss. It is moved to CUDA.
        tokenizer: currently unused.
        batch_size: number of backward passes accumulated per optimizer step.
        epochs: number of passes over `dataset`.
        lr: AdamW learning rate.
        max_seq_len: currently unused; the packing limit passed to
            `pack_tensor` is the literal 768.
        warmup_steps: number of linear warm-up steps for the scheduler.
        gpt2_type: currently unused.
        output_dir: directory for per-epoch checkpoints.
        output_prefix: file-name prefix for per-epoch checkpoints.
        test_mode: currently unused.
        save_model_on_epoch: when true, save the state dict after every epoch
            to `<output_dir>/<output_prefix>-<epoch>.pt`.

    Returns:
        The trained model.
    """
    model = model.cuda()
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # The linear schedule needs the real number of optimizer steps. With the
    # previous value of -1 the learning-rate factor fell to 0 as soon as
    # warm-up ended, so training effectively stopped after `warmup_steps`.
    # At most one backward pass happens per dataloader entry, so this is an
    # upper bound on the step count (ceil division).
    max_backward_passes = epochs * len(train_dataloader)
    total_steps = max(1, -(-max_backward_passes // batch_size))

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        # Wrap the dataloader itself, not enumerate(), so tqdm knows the
        # total and can render a proper progress bar.
        for idx, entry in enumerate(tqdm(train_dataloader)):
            # NOTE(review): `remainder` is never used after this call.
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            # Keep packing until pack_tensor signals a full sequence, or the
            # last entry of the epoch is reached.
            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            # Count this backward pass before checking the accumulation
            # boundary. Checking first made the very first optimizer step
            # fire after a single sequence.
            accumulating_batch_count += 1
            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                # The optimizer holds every model parameter, so this clears
                # all accumulated gradients.
                optimizer.zero_grad()

            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model