In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [5]:
df = pd.read_csv("stock-news.csv")

In [13]:
text = df.headline.values
labels_string = df.label.values

In [14]:
labels = []
for label in labels_string:
    if label == "Positive":
        labels.append(0)
    elif label == "Neutral":
        labels.append(1)
    elif label == "Negative":
        labels.append(2)

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)

In [17]:
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  index = random.randint(0, len(text)-1)
  table = np.array([tokenizer.tokenize(text[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒═══════════╤═════════════╕
│ Tokens    │   Token IDs │
╞═══════════╪═════════════╡
│ net       │        5795 │
├───────────┼─────────────┤
│ ##f       │        2087 │
├───────────┼─────────────┤
│ ##lix     │       20711 │
├───────────┼─────────────┤
│ lays      │       22307 │
├───────────┼─────────────┤
│ offs      │       12822 │
├───────────┼─────────────┤
│ more      │        1167 │
├───────────┼─────────────┤
│ employees │        4570 │
├───────────┼─────────────┤
│ to        │        1106 │
├───────────┼─────────────┤
│ cut       │        2195 │
├───────────┼─────────────┤
│ costs     │        4692 │
╘═══════════╧═════════════╛


In [18]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                    )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
token_id[5]

tensor([  101,   190,   119,   188,   119,   175,  1810, 11080,   171,  7897,
         2083,   112,   188, 19192,  4442,  9148,  1112,  1211,  3021,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])

In [20]:
def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random text sample'''
  index = random.randint(0, len(text) - 1)
  tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
  token_ids = [i.numpy() for i in token_id[index]]
  attention = [i.numpy() for i in attention_masks[index]]

  table = np.array([tokens, token_ids, attention]).T
  print(tabulate(table, 
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒══════════╤═════════════╤══════════════════╕
│ Tokens   │   Token IDs │   Attention Mask │
╞══════════╪═════════════╪══════════════════╡
│ [CLS]    │         101 │                1 │
├──────────┼─────────────┼──────────────────┤
│ will     │        1209 │                1 │
├──────────┼─────────────┼──────────────────┤
│ health   │        2332 │                1 │
├──────────┼─────────────┼──────────────────┤
│ revenue  │        7143 │                1 │
├──────────┼─────────────┼──────────────────┤
│ growth   │        3213 │                1 │
├──────────┼─────────────┼──────────────────┤
│ b        │         171 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##uo     │       11848 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##y      │        1183 │                1 │
├──────────┼─────────────┼──────────────────┤
│ j        │         179 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##abi    │       23156 │        

In [21]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''

  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()

  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

In [22]:
torch.cuda.is_available()

False

In [None]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()