In [1]:
# !pip install torch==1.5.0
# !pip install transformers

In [2]:
import copy
import pickle

import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import tqdm

In [3]:
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification

In [4]:
######### Important Variables #########
# Maximum number of token ids kept per sentence: prep_dataloader cuts longer
# encodings down to this length and zero-pads shorter ones up to it.
MAX_LEN = 256 

# Tokenizer for the "bert-base-uncased" checkpoint (input text is lowercased).
# It is a module-level global that prep_dataloader reads directly.
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

Loading BERT tokenizer...


# 1. Prepare Dataloader

In [5]:
# Load the three pickled DataFrame splits. Context managers make sure each file
# handle is closed as soon as the frame has been read.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../data/train_df.pkl', 'rb') as fh:
    train_df = pickle.load(fh)
with open('../data/val_df.pkl', 'rb') as fh:
    val_df = pickle.load(fh)
with open('../data/test_df.pkl', 'rb') as fh:
    test_df = pickle.load(fh)

# Normalise the column names to the 'label' / 'sentence' names used downstream.
train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

In [6]:
from torch.utils.data import (
    Dataset,
    DataLoader,
    TensorDataset,
    random_split,
    RandomSampler,
    SequentialSampler,
)
import torch

In [7]:
batch_size = 4


def prep_dataloader(dataset):
    """Build a DataLoader of fixed-length BERT inputs for one data split.

    Args:
        dataset: 'train' or 'val' select the matching pickle; any other value
            loads the test split.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask, labels)`` batches of
        size ``batch_size``. The test split is iterated sequentially, the
        other splits in random order.

    Relies on the module-level ``tokenizer``, ``MAX_LEN`` and ``batch_size``.
    """
    if dataset == 'train':
        path = '../data/train_df.pkl'
    elif dataset == 'val':
        path = '../data/val_df.pkl'
    else:
        path = '../data/test_df.pkl'

    # 1. Load the split. The context manager closes the file handle promptly.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        data = pickle.load(fh)
    data = data.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

    sentences = data.sentence.values
    labels = data.label.values

    # 2. Encode text and 3. bring every sequence to exactly MAX_LEN ids.
    # The tokenizer may warn about sequences longer than the model maximum;
    # that is harmless here because they are truncated before reaching the model.
    sep_id = tokenizer.sep_token_id
    input_ids = []
    for sent in tqdm(sentences):
        ids = tokenizer.encode(sent, add_special_tokens = True)
        if len(ids) > MAX_LEN:
            # Keep the closing [SEP] token: a plain slice would cut it off and
            # feed BERT a sequence without its end-of-segment marker.
            ids = ids[:MAX_LEN - 1] + [sep_id]
        else:
            # Pad with id 0 (the value the attention mask below treats as padding).
            ids = ids + [0] * (MAX_LEN - len(ids))
        input_ids.append(ids)

    # 4./5. Convert to tensors; mask is 1 for real tokens and 0 for padding.
    inputs = torch.tensor(input_ids)
    labels = torch.tensor(labels)
    masks = (inputs > 0).long()

    data = TensorDataset(inputs, masks, labels)
    if dataset == 'test':
        sampler = SequentialSampler(data)
    else:
        sampler = RandomSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return dataloader

In [8]:
# Build the training and validation loaders. Each call reloads its pickle and
# tokenizes every sentence, so this cell is slow (the recorded run below took
# several minutes in total).
train_dataloader = prep_dataloader('train')
val_dataloader = prep_dataloader('val')

  0%|          | 55/35000 [00:00<07:16, 80.10it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2365 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 35000/35000 [03:56<00:00, 148.01it/s]
100%|██████████| 10000/10000 [01:41<00:00, 98.81it/s] 


# 2. Training

In [9]:
import torch
# Report whether PyTorch can see a CUDA GPU (the recorded run displayed False).
torch.cuda.is_available()

False

In [10]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU, which is
# what the previous hard-coded value always selected.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Number of target classes handed to the classification head.
num_labels = 3

In [11]:
# Sequence classifier built on the pretrained 12-layer uncased BERT encoder.
# The head is sized by `num_labels`; attention maps and per-layer hidden states
# are not returned. The classifier is moved to `device` right after loading.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
).to(device)

# AdamW over all model parameters (learning rate 5e-5, epsilon 1e-8).
optimizer = AdamW(model.parameters(), eps=1e-8, lr=5e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
def test(model, test_dataloader, device):
    model.eval()
    eval_accuracy = 0
    
    all_pred = []
    all_label = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()
            
            for pred in np.argmax(logits, axis=1).flatten():
                all_pred.append(pred)
            for label in label_ids.flatten():
                all_label.append(label)
            #tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            #eval_accuracy += tmp_eval_accuracy
    all_pred = np.array(all_pred)
    all_label = np.array(all_label)
    eval_accuracy = np.sum(all_pred == all_label) / len(all_label)
    return eval_accuracy

In [13]:
# Fine-tune for a fixed number of epochs, recording accuracy after each one.
all_epoches = []
num_epoches = 4
for epoch in range(1, num_epoches + 1):
    total_loss = 0
    model.train()
    for batch in tqdm(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()

        # With labels supplied, the first output is the training loss.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameters are modified based on their gradients, the learning rate, etc.
        optimizer.step()

    # Mean per-batch loss for this epoch (guarded against an empty loader).
    avg_train_loss = total_loss / max(len(train_dataloader), 1)

    train_acc = test(model, train_dataloader, device)
    val_acc = test(model, val_dataloader, device)
    print(" => Epoch {} / {}:".format(epoch, num_epoches))
    print(" => Train acc: {}; Val acc: {}".format(train_acc, val_acc))
    print()
    all_epoches.append({
        'epoch': epoch,
        'train_loss': avg_train_loss,
        'train_acc': train_acc,
        'val_acc': val_acc,
        # Store an independent snapshot: keeping a reference to the live
        # `model` would make every epoch's entry point at the final weights.
        # This holds one full copy of the model per epoch in memory.
        'model': copy.deepcopy(model)
    })

  0%|          | 0/8750 [00:06<?, ?it/s]


RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 93763584 bytes. Error code 12 (Cannot allocate memory)


In [None]:
test_dataloader = prep_dataloader('test')
# Evaluate the model recorded for the epoch with the highest validation accuracy.
best_model = max(all_epoches, key=lambda record: record['val_acc'])['model']
test(best_model, test_dataloader, device)