# BERT on 10K

In [None]:
# Install transformers package from Huggingface
!pip install transformers



In [None]:
import torch
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader
from torch.nn import functional as F

In [None]:
# Choose the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected.
if device.type == "cuda":
    print('Using GPU ', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Using GPU  Tesla P100-PCIE-16GB


In [None]:
# Read the dataset from the CSV file in the working directory.
df = pd.read_csv('10K_text_price_label.csv')

# Report how many rows were loaded.
print("The length for 10K data: {}".format(df.shape[0]))

The length for 10K data: 944


In [None]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,Company,Year,Doc,Label
0,ACC,2017,10 k 1 acc2016123110k htm 10 k document unite ...,0
1,ACC,2016,10 k 1 acc2015123110k htm 10 k 10 k unite stat...,2
2,ACC,2015,10 k 1 acc2014123110k htm 10 k acc 2014 12 31 ...,1
3,ACC,2014,10 k 1 acc2013123110k htm 10 k acc 2013 12 31 ...,2
4,ACC,2013,10 k 1 t75648_10k htm form 10 k t75648_10k htm...,0


In [None]:
# Show the distinct values that occur in the Label column.
df['Label'].unique()

array([0, 2, 1])

In [None]:
# Split the data set
# Extract the document texts and their labels as flat 1-D arrays.
doc_data = df['Doc'].to_numpy()
labels = df['Label'].to_numpy()

# Hold out 20% of the rows for testing; shuffling uses a fixed random_state
# so the split is the same on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    doc_data,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint; it is
# used below to encode both the training and the test documents.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Upper bound on the number of training epochs (early stopping may end a run sooner).
EPOCHS = 20
# Mini-batch size passed to the DataLoaders.
BATCHES = 8
# Candidate learning rates.
# NOTE(review): this list is not referenced by any cell visible in this notebook,
# and the training cells below also try 1e-6 — confirm it is still up to date.
learning_rates = [1e-3, 1e-4, 1e-5]

In [None]:
# Turn labels and encodings into a Dataset object

class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name (for example ``'input_ids'``) to an
            indexable collection holding one entry per sample. Entries may be
            tensors or plain Python / NumPy data.
        labels: Indexable collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that does not share memory with the source.

        Existing tensors are copied with ``clone().detach()``; calling
        ``torch.tensor`` on a tensor emits a copy-construct ``UserWarning`` for
        every item that is fetched. Any other value goes through ``torch.tensor``.
        """
        if torch.is_tensor(value):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for sample ``idx`` plus its ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples, defined by the number of labels."""
        return len(self.labels)

In [None]:
# Learning rate used for this run.
best_lr = 1e-5

# Encode the training documents as PyTorch tensors, padding them to a common
# length and truncating every document to at most 30 tokens.
train_encoding = tokenizer(
    list(train_texts),
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=30,
)

# Wrap the encodings and labels in a Dataset object.
train_dataset = MyDataset(train_encoding, train_labels)

# Pretrained BERT from Hugging Face with a 3-class classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    return_dict=True,
    num_labels=3,
)
# Move the model onto the selected device.
model.to(device)

# Serve the training set in shuffled mini-batches.
train_loader = DataLoader(train_dataset, batch_size=BATCHES, shuffle=True)

# AdamW optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=best_lr)

# Early-stopping state: best epoch loss seen so far, number of consecutive
# epochs without improvement, and a flag recording whether training stopped early.
min_loss, epoch_count, early_stop = float('inf'), 0, False



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Fine-tune for at most EPOCHS epochs. The shared EPOCHS constant replaces the
# hard-coded 20 so this loop stays in step with the other training cells.
for epoch in range(EPOCHS):
    # Put the model in training mode
    model.train()

    # Running sum of the per-batch losses for this epoch.
    train_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # The labels are supplied so the loss can be read from the model output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print("Current Epoch: {}".format(epoch + 1))
    print("------------------------------------------")
    print("Train loss: {}".format(train_loss))
    print()

    # Early stopping on the summed training loss: update the best loss, then
    # check whether this epoch failed to match or beat it.
    min_loss = min(train_loss, min_loss)
    if min_loss < train_loss:
        # No improvement. Four earlier misses are already counted when
        # epoch_count == 4, so training stops on the fifth consecutive one.
        if epoch_count == 4:
            early_stop = True
            print("Stop training because of the early stop at epoch {}".format(epoch + 1))
            break
        else:
            epoch_count += 1
    else:
        # Reset the count
        epoch_count = 0

  if __name__ == '__main__':


Current Epoch: 1
------------------------------------------
Train loss: 99.15174627304077

Current Epoch: 2
------------------------------------------
Train loss: 98.35795950889587

Current Epoch: 3
------------------------------------------
Train loss: 98.35019940137863

Current Epoch: 4
------------------------------------------
Train loss: 98.1101348400116

Current Epoch: 5
------------------------------------------
Train loss: 96.75103312730789

Current Epoch: 6
------------------------------------------
Train loss: 95.22738939523697

Current Epoch: 7
------------------------------------------
Train loss: 92.62544578313828

Current Epoch: 8
------------------------------------------
Train loss: 90.41807186603546

Current Epoch: 9
------------------------------------------
Train loss: 85.6110614836216

Current Epoch: 10
------------------------------------------
Train loss: 83.17677301168442

Current Epoch: 11
------------------------------------------
Train loss: 76.94444304704666


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Encode the held-out documents with the same tokenizer settings as the
# training data (PyTorch tensors, padded, truncated to at most 30 tokens).
test_encoding = tokenizer(
    list(test_texts),
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=30,
)

# Wrap the test encodings and labels in a Dataset, and iterate it in order
# (no shuffling) so predictions line up with test_labels.
test_dataset = MyDataset(test_encoding, test_labels)
test_loader = DataLoader(test_dataset, batch_size=BATCHES, shuffle=False)

In [None]:
# Switch to evaluation mode before scoring the test set. Without this the model
# is still in training mode from the training loop, so dropout stays active and
# the predictions are noisy.
model.eval()

# Predicted class ids for the whole test set, concatenated batch by batch.
y_pred = None
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Only the logits are needed here, so no labels are passed and no loss is computed.
        output = model(input_ids, attention_mask=attention_mask)

        # The predicted class is the index of the largest logit.
        predicted_labels = torch.argmax(output.logits, dim=1)
        if y_pred is None:
            y_pred = predicted_labels
        else:
            y_pred = torch.cat((y_pred, predicted_labels), 0)

# NOTE: with average='micro' on single-label multiclass data, precision,
# recall and F1 all equal the accuracy.
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, y_pred.cpu(), average='micro')
acc = accuracy_score(test_labels, y_pred.cpu())
print('Precison: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 score: {}'.format(f1))
print('Accuracy: {}'.format(acc))

  if __name__ == '__main__':


Precison: 0.36507936507936506
Recall: 0.36507936507936506
F1 score: 0.36507936507936506
Accuracy: 0.36507936507936506


In [None]:
# Set current learning rate here
best_lr = 1e-4

# Fresh pretrained BERT from Hugging Face with a 3-class classification head,
# moved onto the selected device.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True, num_labels=3)
model.to(device)

# Set the optimizer AdamW
optimizer = AdamW(model.parameters(), lr=best_lr)

# Early-stopping state: best epoch loss so far, consecutive epochs without
# improvement, and a flag recording whether training stopped early.
min_loss = float('inf')
epoch_count = 0
early_stop = False

for epoch in range(EPOCHS):
    # Put the model in training mode
    model.train()

    # Running sum of the per-batch losses for this epoch.
    train_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # The labels are supplied so the loss can be read from the model output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print("Current Epoch: {}".format(epoch + 1))
    print("------------------------------------------")
    print("Train loss: {}".format(train_loss))
    print()

    # Early stopping on the summed training loss: update the best loss, then
    # check whether this epoch failed to match or beat it.
    min_loss = min(train_loss, min_loss)
    if min_loss < train_loss:
        # Training stops on the fifth consecutive epoch without improvement.
        if epoch_count == 4:
            early_stop = True
            print("Stop training because of the early stop at epoch {}".format(epoch + 1))
            break
        else:
            epoch_count += 1
    else:
        # Reset the count
        epoch_count = 0

# Switch to evaluation mode before scoring the test set; otherwise dropout
# from training mode stays active and the predictions are noisy.
model.eval()

# Predicted class ids for the whole test set, concatenated batch by batch.
y_pred = None
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Only the logits are needed here, so no labels are passed and no loss is computed.
        output = model(input_ids, attention_mask=attention_mask)

        # The predicted class is the index of the largest logit.
        predicted_labels = torch.argmax(output.logits, dim=1)
        if y_pred is None:
            y_pred = predicted_labels
        else:
            y_pred = torch.cat((y_pred, predicted_labels), 0)

# NOTE: with average='micro' on single-label multiclass data, precision,
# recall and F1 all equal the accuracy.
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, y_pred.cpu(), average='micro')
acc = accuracy_score(test_labels, y_pred.cpu())
print('Precison: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 score: {}'.format(f1))
print('Accuracy: {}'.format(acc))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Current Epoch: 1
------------------------------------------
Train loss: 101.31162589788437

Current Epoch: 2
------------------------------------------
Train loss: 100.32213014364243

Current Epoch: 3
------------------------------------------
Train loss: 100.84465849399567

Current Epoch: 4
------------------------------------------
Train loss: 99.94551229476929

Current Epoch: 5
------------------------------------------
Train loss: 100.50420427322388

Current Epoch: 6
------------------------------------------
Train loss: 99.48814398050308

Current Epoch: 7
------------------------------------------
Train loss: 99.67683762311935

Current Epoch: 8
------------------------------------------
Train loss: 99.68718737363815

Current Epoch: 9
------------------------------------------
Train loss: 99.74585354328156

Current Epoch: 10
------------------------------------------
Train loss: 99.94914871454239

Current Epoch: 11
------------------------------------------
Train loss: 99.462453603

In [None]:
# Set current learning rate here
best_lr = 1e-6

# Fresh pretrained BERT from Hugging Face with a 3-class classification head,
# moved onto the selected device.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True, num_labels=3)
model.to(device)

# Set the optimizer AdamW
optimizer = AdamW(model.parameters(), lr=best_lr)

# Early-stopping state: best epoch loss so far, consecutive epochs without
# improvement, and a flag recording whether training stopped early.
min_loss = float('inf')
epoch_count = 0
early_stop = False

for epoch in range(EPOCHS):
    # Put the model in training mode
    model.train()

    # Running sum of the per-batch losses for this epoch.
    train_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # The labels are supplied so the loss can be read from the model output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print("Current Epoch: {}".format(epoch + 1))
    print("------------------------------------------")
    print("Train loss: {}".format(train_loss))
    print()

    # Early stopping on the summed training loss: update the best loss, then
    # check whether this epoch failed to match or beat it.
    min_loss = min(train_loss, min_loss)
    if min_loss < train_loss:
        # Training stops on the fifth consecutive epoch without improvement.
        if epoch_count == 4:
            early_stop = True
            print("Stop training because of the early stop at epoch {}".format(epoch + 1))
            break
        else:
            epoch_count += 1
    else:
        # Reset the count
        epoch_count = 0

# Switch to evaluation mode before scoring the test set; otherwise dropout
# from training mode stays active and the predictions are noisy.
model.eval()

# Predicted class ids for the whole test set, concatenated batch by batch.
y_pred = None
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Only the logits are needed here, so no labels are passed and no loss is computed.
        output = model(input_ids, attention_mask=attention_mask)

        # The predicted class is the index of the largest logit.
        predicted_labels = torch.argmax(output.logits, dim=1)
        if y_pred is None:
            y_pred = predicted_labels
        else:
            y_pred = torch.cat((y_pred, predicted_labels), 0)

# NOTE: with average='micro' on single-label multiclass data, precision,
# recall and F1 all equal the accuracy.
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, y_pred.cpu(), average='micro')
acc = accuracy_score(test_labels, y_pred.cpu())
print('Precison: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 score: {}'.format(f1))
print('Accuracy: {}'.format(acc))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Current Epoch: 1
------------------------------------------
Train loss: 101.53323954343796

Current Epoch: 2
------------------------------------------
Train loss: 99.5491891503334

Current Epoch: 3
------------------------------------------
Train loss: 98.98423862457275

Current Epoch: 4
------------------------------------------
Train loss: 98.03639221191406

Current Epoch: 5
------------------------------------------
Train loss: 98.69284856319427

Current Epoch: 6
------------------------------------------
Train loss: 97.8411111831665

Current Epoch: 7
------------------------------------------
Train loss: 97.93597447872162

Current Epoch: 8
------------------------------------------
Train loss: 97.49109625816345

Current Epoch: 9
------------------------------------------
Train loss: 97.88804751634598

Current Epoch: 10
------------------------------------------
Train loss: 97.40900492668152

Current Epoch: 11
------------------------------------------
Train loss: 96.97042214870453

In [None]:
# Set current learning rate here
best_lr = 1e-3

# Fresh pretrained BERT from Hugging Face with a 3-class classification head,
# moved onto the selected device.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True, num_labels=3)
model.to(device)

# Set the optimizer AdamW
optimizer = AdamW(model.parameters(), lr=best_lr)

# Early-stopping state: best epoch loss so far, consecutive epochs without
# improvement, and a flag recording whether training stopped early.
min_loss = float('inf')
epoch_count = 0
early_stop = False

for epoch in range(EPOCHS):
    # Put the model in training mode
    model.train()

    # Running sum of the per-batch losses for this epoch.
    train_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # The labels are supplied so the loss can be read from the model output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print("Current Epoch: {}".format(epoch + 1))
    print("------------------------------------------")
    print("Train loss: {}".format(train_loss))
    print()

    # Early stopping on the summed training loss: update the best loss, then
    # check whether this epoch failed to match or beat it.
    min_loss = min(train_loss, min_loss)
    if min_loss < train_loss:
        # Training stops on the fifth consecutive epoch without improvement.
        if epoch_count == 4:
            early_stop = True
            print("Stop training because of the early stop at epoch {}".format(epoch + 1))
            break
        else:
            epoch_count += 1
    else:
        # Reset the count
        epoch_count = 0

# Switch to evaluation mode before scoring the test set; otherwise dropout
# from training mode stays active and the predictions are noisy.
model.eval()

# Predicted class ids for the whole test set, concatenated batch by batch.
y_pred = None
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Only the logits are needed here, so no labels are passed and no loss is computed.
        output = model(input_ids, attention_mask=attention_mask)

        # The predicted class is the index of the largest logit.
        predicted_labels = torch.argmax(output.logits, dim=1)
        if y_pred is None:
            y_pred = predicted_labels
        else:
            y_pred = torch.cat((y_pred, predicted_labels), 0)

# NOTE: with average='micro' on single-label multiclass data, precision,
# recall and F1 all equal the accuracy.
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, y_pred.cpu(), average='micro')
acc = accuracy_score(test_labels, y_pred.cpu())
print('Precison: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 score: {}'.format(f1))
print('Accuracy: {}'.format(acc))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Current Epoch: 1
------------------------------------------
Train loss: 108.65174180269241

Current Epoch: 2
------------------------------------------
Train loss: 103.48272258043289

Current Epoch: 3
------------------------------------------
Train loss: 106.42497771978378

Current Epoch: 4
------------------------------------------
Train loss: 109.76540422439575

Current Epoch: 5
------------------------------------------
Train loss: 107.67937356233597

Current Epoch: 6
------------------------------------------
Train loss: 110.05821073055267

Current Epoch: 7
------------------------------------------
Train loss: 107.99241816997528

Stop training because of the early stop at epoch 7
Precison: 0.2857142857142857
Recall: 0.2857142857142857
F1 score: 0.2857142857142857
Accuracy: 0.2857142857142857
