**Note: Much of this code is taken from homework 1 for 685**

**Make sure we have a GPU**

In [None]:
import os

# Make CUDA kernel launches synchronous so that a device-side error is reported
# at the call that caused it. The variable has to be set in this kernel's own
# environment before CUDA is initialised: a `!export ...` line only changes a
# short-lived subshell and never reaches the Python process.
# This is a debugging aid and slows training down; remove it for timed runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch

# Confirm that the GPU is detected
assert torch.cuda.is_available(), "No CUDA device found; select a GPU runtime before running this notebook."

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# Device object used by later cells to move tensors onto the GPU.
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


**Install necessary packages**

In [None]:
!pip install folium==0.2.1
!pip install transformers
!pip install datasets
!pip install -U -q PyDrive

Collecting folium==0.2.1
  Downloading folium-0.2.1.tar.gz (69 kB)
[?25l[K     |████▊                           | 10 kB 38.0 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 44.2 MB/s eta 0:00:01[K     |██████████████                  | 30 kB 39.1 MB/s eta 0:00:01[K     |██████████████████▊             | 40 kB 42.3 MB/s eta 0:00:01[K     |███████████████████████▍        | 51 kB 34.1 MB/s eta 0:00:01[K     |████████████████████████████    | 61 kB 38.3 MB/s eta 0:00:01[K     |████████████████████████████████| 69 kB 7.9 MB/s 
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25l[?25hdone
  Created wheel for folium: filename=folium-0.2.1-py3-none-any.whl size=79808 sha256=4d75f582287f6ad0e36f07fbd43809f64b17d494d95fce164cee031ac1226c65
  Stored in directory: /root/.cache/pip/wheels/9a/f0/3a/3f79a6914ff5affaf50cabad60c9f4d565283283c97f0bdccf
Successfully built folium
Installing collected packages: folium
  Attempting 

**Tokenization Function**

In [None]:
import pandas as pd
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, random_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import sys
import numpy as np
import time
import datetime

def tokenize_and_format(sentences, max_length=64):
    """Encode sentences with the `bert-base-uncased` tokenizer.

    Args:
        sentences: iterable of strings to encode.
        max_length: number of token IDs every encoded sentence is padded or
            truncated to. Defaults to 64, the value this notebook was
            originally written with.

    Returns:
        A pair `(input_ids, attention_masks)` of lists with one entry per
        sentence. Each entry is the PyTorch tensor produced by `encode_plus`
        for that single sentence (one row of `max_length` values), so callers
        can join them with `torch.cat(..., dim=0)`.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`.
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Keep the encoded sentence and its attention mask (the mask simply
        # differentiates padding from non-padding).
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return input_ids, attention_masks

def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max equals the label.

    `preds` holds one row of scores per example; `labels` holds the expected
    class index for each example and is flattened before comparison.
    """
    expected = labels.flatten()
    chosen = np.argmax(preds, axis=1).flatten()
    num_hits = np.sum(chosen == expected)
    return num_hits / len(expected)


**Download data**

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# from datasets import load_dataset

# Authenticate this Colab session and build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# One entry per split: (Google Drive file id, local filename, name used in the
# progress message).
SPLIT_FILES = [
    ('1a72PS0BiFYHY6mQV2rukQs60B_8yEDJ1', 'dev.csv', 'validation'),
    ('1VnWao5bgr8LWa-YjdYS9vHZbTnmq2Par', 'test.csv', 'test'),
    ('1qroZT1nfXbutQMu3OTEUX11fvLXEgzn_', 'train.csv', 'training'),
]

# Download balanced dataset
for file_id, local_name, split_name in SPLIT_FILES:
    data_file = drive.CreateFile({'id': file_id})
    data_file.GetContentFile(local_name)
    print(f'{split_name} set downloaded')

# Style names in label-id order: the position of a name is its integer label.
STYLE_NAMES = ['aae', 'bible', 'coha_1810', 'coha_1890', 'coha_1990', 'joyce',
               'lyrics', 'poetry', 'shakespeare', 'switchboard', 'tweets']
STYLE_TO_ID = {name: idx for idx, name in enumerate(STYLE_NAMES)}


def load_split(csv_path):
    """Read one split from `csv_path` and encode its `label` column as integers.

    Known style names are replaced by their index in `STYLE_NAMES`; any other
    value is left untouched and then rejected, so a bad label fails here
    rather than later during tensor conversion or training.

    Raises:
        ValueError: if a label is neither a known style name nor a valid id.
    """
    split_df = pd.read_csv(csv_path)
    split_df['label'] = split_df['label'].map(lambda value: STYLE_TO_ID.get(value, value))
    is_valid = split_df['label'].isin(list(STYLE_TO_ID.values()))
    if not is_valid.all():
        bad_values = split_df.loc[~is_valid, 'label'].unique().tolist()
        raise ValueError(f'{csv_path}: unexpected label value(s) {bad_values}')
    return split_df


# Rows are kept in file order; nothing is shuffled in this cell.
valid_df = load_split('dev.csv')
test_df = load_split('test.csv')
train_df = load_split('train.csv')

valid_text = valid_df.text.values
valid_label = valid_df.label.values
test_text = test_df.text.values
test_label = test_df.label.values
train_text = train_df.text.values
train_label = train_df.label.values

validation set downloaded
test set downloaded
training set downloaded


**Tokenize Text**


In [None]:
# Encode all three splits in a single pass. They are stacked in the order
# train -> validation -> test, which is the order the later cell relies on
# when it cuts the encoded rows back into splits.
total_text = np.concatenate([train_text, valid_text, test_text])
total_label = np.concatenate([train_label, valid_label, test_label])

# `tokenize_and_format` returns two lists of per-sentence tensors; join each
# list along the first dimension to get one tensor for IDs and one for masks.
total_input_ids, total_attention_masks = (
    torch.cat(pieces, dim=0) for pieces in tokenize_and_format(total_text)
)
total_labels = torch.tensor(total_label)

# Print sentence 0, now as a list of IDs.
print('Original: ', total_text[0])
print('Token IDs:', total_input_ids[0])

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Original:  generally i think he he does not look forward to or anticipate there's some people that enjoy tinkering around on cars there's a lot of people and he just isn't one of them
Token IDs: tensor([  101,  3227,  1045,  2228,  2002,  2002,  2515,  2025,  2298,  2830,
         2000,  2030,  3424,  6895, 17585,  2045,  1005,  1055,  2070,  2111,
         2008,  5959,  9543,  5484,  2075,  2105,  2006,  3765,  2045,  1005,
         1055,  1037,  2843,  1997,  2111,  1998,  2002,  2074,  3475,  1005,
         1056,  2028,  1997,  2068,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [None]:
# Build one (input_ids, attention_mask, label) tuple per example. The encoded
# rows are ordered train -> validation -> test, so every split is a contiguous
# slice. Nothing is shuffled here: rows stay in the order of the CSV files.
num_train = len(train_text)
num_valid = len(valid_text)
num_test = len(test_text)

all_examples = list(zip(total_input_ids, total_attention_masks, total_labels))

valid_start = num_train
test_start = num_train + num_valid

train_set = all_examples[:valid_start]
val_set = all_examples[valid_start:test_start]
test_set = all_examples[test_start:test_start + num_test]

**Model**


In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=11,  # The number of output labels (one per style class).
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Run the model on the GPU selected by `device`. Assigning the result, rather
# than ending the cell with a bare `model.cuda()`, keeps the notebook from
# printing the entire module tree as the cell output.
model = model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

**Optimizer**

In [None]:
# Number of examples per batch, shared by the training and evaluation loops.
batch_size = 16

# Number of full passes over the training set.
epochs = 3

# PyTorch's own AdamW replaces the deprecated `transformers.AdamW`. It always
# applies bias correction, so the former `correct_bias=True` argument has no
# counterpart here.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.01,
    betas=(0.9, 0.999),
)



**Validation Performance Function**

In [None]:
import numpy as np
# function to get validation accuracy
def get_validation_performance(val_set):
    """Return the accuracy of the notebook-level `model` on `val_set`.

    Relies on the module-level `model`, `batch_size` and `device`.

    Args:
        val_set: list of `(input_ids, attention_mask, label)` tensor tuples,
            one per example.

    Returns:
        The fraction of examples whose highest-scoring logit matches the
        label, as a float. An empty `val_set` yields 0.0 instead of a
        division-by-zero error.
    """
    num_examples = len(val_set)
    if num_examples == 0:
        # Nothing to score, and dividing by the set size below would fail.
        return 0.0

    # Put the model in evaluation mode
    model.eval()

    total_correct = 0

    # Stepping by `batch_size` visits every example exactly once and never
    # produces an empty trailing batch.
    for start in range(0, num_examples, batch_size):
        batch = val_set[start:start + batch_size]

        # Stack the per-example tensors into batch tensors on the GPU.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        # Labels are not passed in: only the logits are needed for accuracy.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Count the correctly labeled examples in this batch.
        predictions = outputs.logits.argmax(dim=1)
        total_correct += (predictions == b_labels.flatten()).sum().item()

    # Report the final accuracy for this validation run.
    return total_correct / num_examples


**Train**

In [None]:
import random

# training loop

# For each epoch...
for epoch_i in range(epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # Visit the training examples in a fresh random order every epoch. The
    # examples are never shuffled before this point, so without this step
    # every epoch would see the same batches in file order. Shuffling a list
    # of indices leaves `train_set` itself untouched.
    order = list(range(len(train_set)))
    random.shuffle(order)

    # Number of non-empty batches (ceiling division).
    num_batches = (len(order) + batch_size - 1) // batch_size
    print(num_batches)

    # For each batch of training data...
    for start in range(0, len(order), batch_size):
        batch = [train_set[j] for j in order[start:start + batch_size]]

        # Stack the per-example tensors and move them to the GPU.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # Clear the previously calculated gradient
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")

print("")
print("Training complete!")


Training...
17086
Total loss: 8902.451559428126
Validation accuracy: 0.8457384199958458

Training...
17086
Total loss: 4989.977445257828
Validation accuracy: 0.8505850585058505

Training...
17086
Total loss: 3022.3443582059117
Validation accuracy: 0.8487156407948487

Training complete!


**Evaluate on the test set**

In [None]:
# Final evaluation: accuracy of the trained model on the held-out test split.
test_accuracy = get_validation_performance(test_set)
test_accuracy

0.8410321310553329