In [13]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# need torch 1.3.1 for elastic inference
!pip install torch==1.3.1
!pip install transformers

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [14]:
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification

In [15]:
######### Important Variables #########
# max length of the sentence
MAX_LEN = 128 

# BERT Tokenizer
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

Loading BERT tokenizer...


# 1. Prepare Dataloader

In [16]:
train_df = pickle.load(open('../data/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/test_df.pkl', 'rb'))

train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

In [17]:
from torch.utils.data import (
    Dataset,
    DataLoader,
    TensorDataset,
    random_split,
    RandomSampler,
    SequentialSampler,
)
import torch

In [18]:
def prep_dataloader(dataset):
    if dataset == 'train':
        data = pickle.load(open('../data/train_df.pkl', 'rb'))
    elif dataset == 'val':
        data = pickle.load(open('../data/val_df.pkl', 'rb'))#.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
    else:
        data = pickle.load(open('../data/test_df.pkl', 'rb'))
    data = data.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
    
    sentences = data.sentence.values
    labels = data.label.values
    
    # 2. Encode text
    input_ids = [tokenizer.encode(sent, add_special_tokens = True) for sent in tqdm(sentences)]
    
    # 3. Pad shorter sentences
    input_ids_padded = []
    for i in input_ids:
        while len(i) < MAX_LEN:
            i.append(0)
        input_ids_padded.append(i)
    input_ids = input_ids_padded
    
    # 4. Adding mask; mask; 0: added, 1: otherwise
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)
    
    # 5. Convert to PyTorch data types.
    inputs = torch.tensor(input_ids)
    labels = torch.tensor(labels)
    masks = torch.tensor(attention_masks)

    data = TensorDataset(inputs, masks, labels)
    if dataset == 'test':
        sampler = SequentialSampler(data)
    else:
        sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return dataloader

In [19]:
train_dataloader = prep_dataloader('train')
val_dataloader = prep_dataloader('val')
test_dataloader = prep_dataloader('test')

  0%|          | 39/35000 [00:00<01:30, 387.26it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2365 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 35000/35000 [01:50<00:00, 317.37it/s]


ValueError: expected sequence of length 144 at dim 1 (got 128)

# 2. Training

In [None]:
device = 'cpu'

In [None]:
# Model definition
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=args.num_labels,  # The number of output labels--2 for binary classification.
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)
model = model.to(device)

# 4Training settings (optimizer, distributed computing, etc.)
# if is_distributed and use_cuda:
#     # multi-machine multi-gpu case
#     model = torch.nn.parallel.DistributedDataParallel(model)
# else:
#     # single-machine multi-gpu case or single-machine or multi-machine cpu case
#     model = torch.nn.DataParallel(model)
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon - default is 1e-8.
)

In [None]:
def test(model, test_loader, device):
    model.eval()
    eval_accuracy = 0
    
    all_pred = []
    all_label = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()
            
            for pred in np.argmax(logits, axis=1).flatten():
                all_pred.append(pred)
            for label in label_ids.flatten():
                all_label.append(label)
            #tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            #eval_accuracy += tmp_eval_accuracy
    all_pred = np.array(all_pred)
    all_label = np.array(all_label)
    eval_accuracy = np.sum(all_pred == all_label) / len(all_label)
    return eval_accuracy

In [None]:
all_epoches = []
num_epoches = 4
for epoch in range(1, num_epoches + 1):
    total_loss = 0
    model.train()
    for step, batch in tqdm(enumerate(train_loader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        
    train_acc = test(model, train_loader, device)
    val_acc = test(model, val_loader, device)
    all_epoches.append({
        'epoch': epoch,
        'train_acc': train_acc,
        'val_acc': val_acc,
        'model': model
    })