In [12]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix, classification_report

stopwords = set(stopwords.words("english"))
ps = PorterStemmer()

Load Data

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.
        
        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment). 
        If you like, you are free to use columns other than "text" for prediction.
    '''
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        df = df.loc[:,columns]
        print("Success")
        return df
    except:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

In [3]:
train_df = load_data('train', columns=['text', 'stars'], folder='data')
valid_df = load_data('valid', columns=['text', 'stars'], folder='data')
# the test set labels (the 'stars' column) are not available! So the following code will instead return all columns
test_df = load_data('test', columns=['text', 'stars'], folder='data')

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


Preprocessing
Requirment for BERT:
1. add beginning and endding tokens
2. Make sentences the same length
3. create an attention mask

Can also try (TO DO):
1. stopwords, stem, etc.
2. truncation/selection strategies of text
3. selectively choose data points: more 2 star data, ignore data with text length < n words, etc.
4. different max length

In [58]:
train_text = train_df.text.values
train_labels = train_df.stars.values
# select train set size
train_text = train_text[:50]
train_labels = train_labels[:50]

val_text = valid_df.text.values
val_labels = valid_df.stars.values
# select train set size
val_text = val_text[:32]
val_labels = val_labels[:32]

In [5]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [7]:
def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 128,
                        padding = 'max_length',
                        truncation='longest_first',
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

In [15]:
train_token_id = []
train_attention_masks = []

for sample in train_text:
  encoding_dict = preprocessing(sample, tokenizer)
  train_token_id.append(encoding_dict['input_ids']) 
  train_attention_masks.append(encoding_dict['attention_mask'])

train_token_id = torch.cat(train_token_id, dim = 0)
train_attention_masks = torch.cat(train_attention_masks, dim = 0)
train_labels = torch.tensor(train_labels)

  train_labels = torch.tensor(train_labels)


In [10]:
val_token_id = []
val_attention_masks = []

for sample in val_text:
  encoding_dict = preprocessing(sample, tokenizer)
  val_token_id.append(encoding_dict['input_ids']) 
  val_attention_masks.append(encoding_dict['attention_mask'])


val_token_id = torch.cat(val_token_id, dim = 0)
val_attention_masks = torch.cat(val_attention_masks, dim = 0)
val_labels = torch.tensor(val_labels)

other possible codes:

In [None]:
### select more 2, 3, 4 stars data

import random

for t, l in zip(train_text[5001:10000], train_labels[5001:10000]):
    if l==2:
        text=np.append(text, [t])
        labels=np.append(labels, [l])
    if l==3 and (random.randint(0,1)%2):
        text=np.append(text, [t])
        labels=np.append(labels, [l])
    if l==4 and (random.randint(0,3)%4):
        text=np.append(text, [t])
        labels=np.append(labels, [l])
print(text.size, labels.size)

In [28]:
#### select first and last words

# Split each long sentence into two parts and keep short sentences as they are
train_text_cut = []
for sentence in train_text:
    if len(sentence) > 126:
        first_part = sentence[:64]
        second_part = sentence[-62:]
        train_text_cut.append(first_part + second_part)
    else:
        train_text_cut.append(sentence)

train_token_id = []
train_attention_masks = []

for sample in train_text_cut:
  encoding_dict = preprocessing(sample, tokenizer)
  train_token_id.append(encoding_dict['input_ids']) 
  train_attention_masks.append(encoding_dict['attention_mask'])

train_token_id = torch.cat(train_token_id, dim = 0)
train_attention_masks = torch.cat(train_attention_masks, dim = 0)
train_labels = torch.tensor(train_labels)

In [49]:
val_text = valid_df.text.values[:32]
val_labels = valid_df.stars.values[:32]

In [50]:
# Split each long sentence into two parts and keep short sentences as they are
val_text_cut = []
for sentence in val_text:
    if len(sentence) > 126:
        first_part = sentence[:64]
        second_part = sentence[-62:]
        val_text_cut.append(first_part + second_part)
    else:
        val_text_cut.append(sentence)

val_token_id = []
val_attention_masks = []

for sample in val_text_cut:
  encoding_dict = preprocessing(sample, tokenizer)
  val_token_id.append(encoding_dict['input_ids']) 
  val_attention_masks.append(encoding_dict['attention_mask'])

val_token_id = torch.cat(val_token_id, dim = 0)
val_attention_masks = torch.cat(val_attention_masks, dim = 0)
val_labels = torch.tensor(val_labels)

prepare data for model

can try different batch size here

In [54]:
# Train and validation sets
batch_size = 16

train_set = TensorDataset(train_token_id, train_attention_masks, train_labels)

val_set = TensorDataset(val_token_id, val_attention_masks, val_labels)

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            shuffle=True,
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            shuffle=True,
            batch_size = batch_size
        )

Load pretrained BERT

can change:
1. different pretrained model (together with tokenizer, probably need to download the model and use another library)
2. hyperparameters in optimizer
3. add other possible optimizing strategies?

In [13]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 6,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

fine tune the model

can change number of epoch

In [69]:
epochs = 1

tr_loss_list = []

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = []
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        print("=============")
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss.append(train_output.loss.item())
        print(nb_tr_steps, train_output.loss.item())
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    tr_loss_list.append(tr_loss)
    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    test_pred = []
    test_ori = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids, 
                                token_type_ids = None, 
                                attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        test_ori.append(label_ids)
        test_pred.append(np.argmax(logits, axis = 1))
        print(np.argmax(logits, axis = 1))

    ########## if want to save the report, assign the value to some variables ##########
    print("accuracy for validation set")
    flatten_test_pred = [item for sublist in test_pred for item in sublist]
    flatten_test_ori = [item for sublist in test_ori for item in sublist]
    print(classification_report(flatten_test_ori, flatten_test_pred))
    print("\n\n")
    print(confusion_matrix(flatten_test_ori, flatten_test_pred))

    
    train_pred = []
    train_ori = []

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids, 
                                token_type_ids = None, 
                                attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        train_ori.append(label_ids)
        train_pred.append(np.argmax(logits, axis = 1))
        print(np.argmax(logits, axis = 1))

    ########## if want to save the report, assign the value to some variables ##########
    print("accuracy for training set")
    flatten_train_pred = [item for sublist in train_pred for item in sublist]
    flatten_train_ori = [item for sublist in train_ori for item in sublist]
    print(classification_report(flatten_train_ori, flatten_train_pred))
    print("\n\n")
    print(confusion_matrix(flatten_train_ori, flatten_train_pred))

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

0 1.5384200811386108
1 1.4080415964126587
2 1.360826849937439
3 1.7676968574523926
[5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
[5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
accuracy for validation set
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         9
           5       0.50      1.00      0.67        16

    accuracy                           0.50        32
   macro avg       0.10      0.20      0.13        32
weighted avg       0.25      0.50      0.33        32




[[ 0  0  0  0  1]
 [ 0  0  0  0  2]
 [ 0  0  0  0  4]
 [ 0  0  0  0  9]
 [ 0  0  0  0 16]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 5]
[5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]
[5 5 3 5 5 5 5 5 5 3 5 5 5 5 5 5]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Epoch: 100%|██████████| 1/1 [01:33<00:00, 93.40s/it]

[5 3]
accuracy for training set
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         2
           3       1.00      0.33      0.50        12
           4       0.00      0.00      0.00        13
           5       0.41      1.00      0.58        19

    accuracy                           0.46        50
   macro avg       0.28      0.27      0.22        50
weighted avg       0.40      0.46      0.34        50




[[ 0  0  0  0  4]
 [ 0  0  0  0  2]
 [ 0  0  4  0  8]
 [ 0  0  0  0 13]
 [ 0  0  0  0 19]]





save the model

In [None]:
model.save_pretrained('bert_model')
tokenizer.save_pretrained('bert_model')

Previous atempt:

1. train set size 5000, batch size 16, max length = 128, padding = 'max_length', truncation='longest_first'.
epoch = 2: accuracy = 0.67, f1-score = 0.59, epoch = 3: 0.64, 0.55 (no improvement, PROBABLY overfit)
2. train set size 8932(5000_1492), batch size 16, max length = 128, padding = 'max_length', truncation='longest_first'.
epoch = 2: accuracy = 0.61, f1-score = 0.57(compared with epoch 1, drop in star 5 score, improve in others), shuffle=False
3. first epoch: train set size 6492(5000+1492),0.64 0.58, second epoch: set size 5000, shuffle = False, 0.66 0.56
4. train set size 10000, 2 epoch, 
