In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 28.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [2]:
!git clone https://github.com/indobenchmark/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 466, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 466 (delta 106), reused 116 (delta 102), pack-reused 316[K
Receiving objects: 100% (466/466), 5.16 MiB | 17.79 MiB/s, done.
Resolving deltas: 100% (226/226), done.


In [3]:

import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from nltk.tokenize import TweetTokenizer
from torch import optim
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [4]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    on the CPU and on all CUDA devices.

    Args:
        seed: Integer seed applied to each generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not only the currently selected device.
    torch.cuda.manual_seed_all(seed)
    
def count_param(module, trainable=False):
    """Return the number of scalar parameters held by ``module``.

    Args:
        module: Object exposing ``parameters()``.
        trainable: When true, count only parameters whose ``requires_grad``
            flag is set; otherwise count every parameter.

    Returns:
        Sum of ``numel()`` over the selected parameters.
    """
    return sum(
        param.numel()
        for param in module.parameters()
        if param.requires_grad or not trainable
    )
    
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing an iterable ``param_groups`` of dicts that
            carry an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are no
        parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metric_dict):
    """Render a metrics mapping as a single space-separated string.

    Each entry becomes ``name:value`` with the value formatted to two decimal
    places; entries appear in the mapping's iteration order.

    Args:
        metric_dict: Mapping from metric name to numeric value.

    Returns:
        The formatted entries joined by single spaces (empty string for an
        empty mapping).
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [5]:
# Fix the seed of every generator before any model or loader is created
set_seed(seed=26092020)

In [6]:
# Tokenizer, config and weights all come from the same pretrained checkpoint
checkpoint = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(checkpoint)

# The number of output labels is taken from the dataset class
config = BertConfig.from_pretrained(checkpoint)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Build the sequence classifier from the checkpoint using the adjusted config
model = BertForSequenceClassification.from_pretrained(checkpoint, config=config)

Downloading:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Tokenizer round trip on a sample sentence
text = "Budi suka bermain bola "

# encode() yields token ids, decode() maps them back to text, and calling
# the tokenizer directly returns the full model input dict
encoding = tokenizer.encode(text)
decoding = tokenizer.decode(encoding)
encoding_input = tokenizer(text)

print(encoding, decoding, encoding_input, sep='\n')

[2, 5103, 1506, 1326, 1522, 3]
[CLS] budi suka bermain bola [SEP]
{'input_ids': [2, 5103, 1506, 1326, 1522, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [8]:
# Inspect the loaded BertConfig
config

BertConfig {
  "_name_or_path": "indobenchmark/indobert-base-p1",
  "_num_labels": 5,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 5

In [9]:
# Inspect the module tree of the classifier
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
# Total parameter count (trainable defaults to False, so every parameter is counted)
count_param(model)

124443651

prepare dataset

In [11]:
# All three splits sit in the same folder of the cloned indonlu repository
dataset_dir = 'indonlu/dataset/smsa_doc-sentiment-prosa'
train_dataset_path = dataset_dir + '/train_preprocess.tsv'
valid_dataset_path = dataset_dir + '/valid_preprocess.tsv'
test_dataset_path = dataset_dir + '/test_preprocess_masked_label.tsv'

In [12]:
# Build one dataset per split with the same tokenizer and lowercasing option
train_dataset, valid_dataset, test_dataset = (
    DocumentSentimentDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Display the shape of each split's underlying data
tuple(split.data.shape for split in (train_dataset, valid_dataset, test_dataset))

((11000, 2), (1260, 2), (500, 2))

In [13]:
# Preview the first rows of the training data
train_dataset.data.head()

Unnamed: 0,text,sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,0
1,mohon ulama lurus dan k212 mmbri hujjah partai...,1
2,lokasi strategis di jalan sumatera bandung . t...,0
3,betapa bahagia nya diri ini saat unboxing pake...,0
4,duh . jadi mahasiswa jangan sombong dong . kas...,2


In [14]:
# Mapping from sentiment label name to class index
train_dataset.LABEL2INDEX

{'positive': 0, 'neutral': 1, 'negative': 2}

In [15]:
# Number of training rows per sentiment class index
train_dataset.data.sentiment.value_counts()

0    6416
2    3436
1    1148
Name: sentiment, dtype: int64

In [16]:
# Number of validation rows per sentiment class index
valid_dataset.data.sentiment.value_counts()

0    735
2    394
1    131
Name: sentiment, dtype: int64

In [17]:
# Cap the worker count at the CPUs actually present: requesting more makes
# DataLoader warn that loading may run slowly or even freeze.
num_workers = min(16, os.cpu_count() or 1)
loader_kwargs = dict(max_seq_len=512, batch_size=16, num_workers=num_workers)

# Only the training split is shuffled; validation and test keep file order
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

  cpuset_checked))


In [18]:
# Number of batches produced by each loader
len(train_loader), len(valid_loader), len(test_loader)

(688, 79, 32)

In [19]:
# Label name -> index and index -> label name tables from the dataset class
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
print(w2i, i2w, sep='\n')

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


fine-tuning & evaluation

In [20]:
# Move the model to the GPU first, then build the optimizer from the
# parameters as they exist on the device.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=5e-6)

In [21]:
# Fine-tune for a fixed number of epochs, validating after every epoch
n_epochs = 3
for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    # Re-enable autograd explicitly in case an earlier cell left it disabled
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed on)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Accumulate predictions and gold labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # The progress bar shows the running mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metrics over the whole epoch
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Validation phase ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    # no_grad() is scoped to this block, so the cell does not leave autograd
    # globally disabled once it finishes
    with torch.no_grad():
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            total_loss += loss.item()

            # Metrics are recomputed on everything seen so far to keep the
            # progress bar description up to date
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

  cpuset_checked))
(Epoch 1) TRAIN LOSS:0.2584 LR:0.00000500: 100%|██████████| 688/688 [03:00<00:00,  3.82it/s]


(Epoch 1) TRAIN LOSS:0.2584 ACC:0.90 F1:0.87 REC:0.85 PRE:0.89 LR:0.00000500


  cpuset_checked))
VALID LOSS:0.1795 ACC:0.93 F1:0.91 REC:0.91 PRE:0.91: 100%|██████████| 79/79 [00:07<00:00,  9.89it/s]


(Epoch 1) VALID LOSS:0.1795 ACC:0.93 F1:0.91 REC:0.91 PRE:0.91


  cpuset_checked))
(Epoch 2) TRAIN LOSS:0.1252 LR:0.00000500: 100%|██████████| 688/688 [02:54<00:00,  3.94it/s]


(Epoch 2) TRAIN LOSS:0.1252 ACC:0.96 F1:0.95 REC:0.94 PRE:0.95 LR:0.00000500


  cpuset_checked))
VALID LOSS:0.1828 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93: 100%|██████████| 79/79 [00:07<00:00,  9.97it/s]


(Epoch 2) VALID LOSS:0.1828 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93


  cpuset_checked))
(Epoch 3) TRAIN LOSS:0.0795 LR:0.00000500: 100%|██████████| 688/688 [02:54<00:00,  3.93it/s]


(Epoch 3) TRAIN LOSS:0.0795 ACC:0.98 F1:0.97 REC:0.97 PRE:0.97 LR:0.00000500


  cpuset_checked))
VALID LOSS:0.1889 ACC:0.93 F1:0.90 REC:0.90 PRE:0.92: 100%|██████████| 79/79 [00:08<00:00,  9.87it/s]

(Epoch 3) VALID LOSS:0.1889 ACC:0.93 F1:0.90 REC:0.90 PRE:0.92





In [23]:
# Predict labels for the test split
model.eval()

list_hyp = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# no_grad() is scoped to this block, so autograd is not left globally
# switched off for cells that run afterwards
with torch.no_grad():
    for batch_data in pbar:
        # Only the predicted labels are kept; the returned loss and labels are discarded
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save predictions; reset_index() adds an 'index' column with the row number
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('prediction.csv', index=False)

  cpuset_checked))
100%|██████████| 32/32 [00:03<00:00, 10.09it/s]


In [24]:
# Preview the first predictions
df.head()

Unnamed: 0,index,label
0,0,negative
1,1,negative
2,2,negative
3,3,negative
4,4,negative


In [25]:
# Number of test rows assigned to each predicted label
df.label.value_counts()

negative    227
positive    222
neutral      51
Name: label, dtype: int64