In [None]:
# Acknowledgement:
# https://www.kaggle.com/code/neerajmohan/fine-tuning-bert-for-text-classification

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import re
import string

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.metrics import recall_score, accuracy_score

import wandb

In [None]:
from nltk.corpus import stopwords

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer once; clean_text reuses this single instance
# for every word it stems.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
import transformers
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from transformers import AutoTokenizer

In [None]:
# The tokenizer and the classifier are loaded from the same pretrained
# checkpoint, so the name is spelled out only once.
checkpoint_name = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,  # classification head with two output labels
)

In [None]:
# Load the dataset; later cells read its `title` and `label_score` columns.
df = pd.read_csv('augment_subtitute.csv')
df.head()

In [None]:
# df = df.drop(['Unnamed: 0', 'sub-category', 'has_exclamation', 'has_question', 'has_number', 'mark_count', 'total_character', 'original_title'], axis=1)

In [None]:
# df = df[['title', 'label_score']]
# df.head()

In [None]:
# Stopwords are words that show up very often in sentences; clean_text drops
# every word that appears in this Indonesian NLTK list.
stop = stopwords.words('indonesian')
stop[:5]

In [None]:
# function from: https://www.kaggle.com/code/ahmadalqawasmeh/nlp-text-clustering-a-simple-guide
# modified by us to use Sastrawi stemmer 


## Cleans a single value taken from the text column and returns the cleaned string
def clean_text(text):
    """Normalise one piece of text for modelling.

    Steps, in order: lowercase, remove digits, remove square-bracketed spans,
    remove URLs, remove emoji/pictograph code points, remove ASCII
    punctuation, drop stopwords, and stem every remaining word.

    Relies on the module-level ``stop`` (stopword list) and ``stemmer``
    (object exposing ``stem(word)``).

    Args:
        text: Value to clean. It is passed through ``str()`` first, so a
            non-string value is cleaned as its string form.

    Returns:
        str: The stemmed, non-stopword words joined with spaces.
    """
    text = str(text).lower()

    # Raw strings keep the regex escapes (\d, \[, \S) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\d+', '', text)  # removes numbers
    text = re.sub(r'\[.*?\]', '', text)  # removes [square-bracketed] spans
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # removes urls
    text = re.sub(r"["
                  u"\U0001F600-\U0001F64F"  # emoticons
                  u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                  u"\U0001F680-\U0001F6FF"  # transport & map symbols
                  u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                  u"\U00002702-\U000027B0"
                  # NOTE: this last range is very wide and also covers many
                  # non-emoji code points (e.g. CJK characters).
                  u"\U000024C2-\U0001F251"
                  "]+", "", text)  # removes emojis
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # removes punctuations

    # Build the lookup set once per call so each membership test is O(1)
    # instead of a scan over the whole stopword list.
    stop_words = set(stop)

    # Remove stopwords first, then stem what is left.
    words = [stemmer.stem(word) for word in text.split(' ') if word not in stop_words]
    return ' '.join(words)

In [None]:
# Remember the raw first title so the effect of cleaning can be compared
first_title = df.loc[0, "title"]
print(f"Before: {first_title}")

# Run clean_text over every title, replacing the column with the cleaned text
df['title'] = df['title'].apply(clean_text)

# Same row again, now cleaned
first_title = df.loc[0, "title"]
print(f"After: {first_title}")

In [None]:
# Show what the tokenizer does with the first cleaned title.
sample_title = df.loc[0, "title"]

# Tokenize once and reuse the result for every print below.
sample_tokens = tokenizer.tokenize(sample_title)

print(' Original: ', sample_title)

# The sentence split into tokens.
print('Tokenized: ', sample_tokens)

# The same tokens mapped to their vocabulary ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

print('Length of the token:', len(sample_tokens))

In [None]:
# Find the longest encoded title, to guide the padding length used later.
# `encode` adds the `[CLS]` and `[SEP]` special tokens, so they are counted too.
encoded_lengths = [
    len(tokenizer.encode(sentence, add_special_tokens=True))
    for sentence in df["title"]
]

# default=0 mirrors the starting value used when there are no titles at all.
max_len = max(encoded_lengths, default=0)

print('Max array length: ', max_len)

In [None]:
# Every title is padded or truncated to this many token ids
# (the `[CLS]` and `[SEP]` special tokens are included in the count).
MAX_SEQ_LEN = 33

# Calling the tokenizer on the whole list of titles will, for each title:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate the sentence to `max_length`.
#   (6) Create attention masks for [PAD] tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
encoded = tokenizer(
    df["title"].tolist(),          # Sentences to encode.
    add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
    max_length=MAX_SEQ_LEN,        # Pad & truncate all sentences.
    truncation=True,
    padding='max_length',
    return_attention_mask=True,    # Construct attn. masks.
    return_tensors='pt',           # Return pytorch tensors.
)

# One row per title, MAX_SEQ_LEN columns each.
input_ids = encoded['input_ids']

# The attention mask simply differentiates padding from non-padding.
attention_masks = encoded['attention_mask']

labels = torch.tensor(df['label_score'].values, dtype=torch.int64)

In [None]:
# Notice that our dataset has 14757 rows: the first dimension of every tensor
# below is the number of rows.
for tensor_name, tensor_value in (
    ('input id', input_ids),
    ('attention_mask', attention_masks),
    ('labels', labels),
):
    print(f'{tensor_name} shape:', tensor_value.shape)

In [None]:
# Sample: the first row of input_ids (token ids of the first title)
input_ids[:1]

In [None]:
# Sample: the first row of attention_masks. A 0 means there is no word at that
# position; it is padding added so that every row has the same length.
attention_masks[:1]

In [None]:
# Sample: the label of the first title
labels[:1]

In [None]:
# Combine the inputs into a TensorDataset so that each item is an
# (input_ids, attention_mask, label) triple for one title.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Look at the tensors stored inside the dataset
dataset.tensors

In [None]:
# Split into train / validation / test sets (80-10-10).
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Train and validation sizes are truncated to whole samples; the test set
# takes whatever remains, so the three sizes always sum to len(dataset).
train_size = int(train_ratio * len(dataset))
val_size = int(val_ratio * len(dataset))
test_size = len(dataset) - train_size - val_size

# Seed the split so the same samples land in each subset on every run;
# without a fixed generator the test set changes between runs and results
# cannot be compared or reproduced.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Print the number of samples in each set
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

In [None]:
batch_size = 32

# Training samples are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,                          # The training samples.
    sampler=RandomSampler(train_dataset),   # Select batches randomly.
    batch_size=batch_size,                  # Trains with this batch size.
)

# For validation the order doesn't matter, so the samples are read sequentially.
validation_dataloader = DataLoader(
    val_dataset,                              # The validation samples.
    sampler=SequentialSampler(val_dataset),   # Pull out batches sequentially.
    batch_size=batch_size,                    # Evaluate with this batch size.
)

# The sampler must be built from test_dataset itself. A sampler built from
# val_dataset only yields indices 0..len(val_dataset)-1, which silently skips
# the end of the test set whenever it is longer than the validation set.
test_dataloader = DataLoader(
    test_dataset,                              # The test samples.
    sampler=SequentialSampler(test_dataset),   # Pull out batches sequentially.
    batch_size=batch_size,                     # Evaluate with this batch size.
)

In [None]:
# Device-agnostic setup: use the first CUDA GPU when one is available,
# otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Move the model to the selected device (GPU if available, otherwise CPU)
model = model.to(device)

# Display the model
model

In [None]:
def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               optimizer: torch.optim.Optimizer,
               scheduler,
               device=device):
    """Train ``model`` for one full pass over ``dataloader``.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)``; its output must expose ``.loss``.
        dataloader: Yields batches indexed as ``(input_ids, attention_mask,
            labels)``.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        scheduler: Learning-rate scheduler, stepped once per batch right
            after the optimizer.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        float: Mean of the per-batch losses over the pass.
    """
    # Running sum of the per-batch losses
    total_train_loss = 0

    # Switch the model to training mode
    model.train()

    # Iterate the dataloader that was passed in, not a notebook-level global,
    # so the function trains on whatever data the caller supplies.
    for batch in dataloader:
        # Unpack the batch and move it to the target device (gpu or cpu)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss as well
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss = output.loss
        total_train_loss += loss.item()

        # Backprop, then clip the gradient norm to 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    # Average of the accumulated loss over the number of batches
    avg_train_loss = total_train_loss / len(dataloader)

    return avg_train_loss

In [None]:
def val_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               optimizer: torch.optim.Optimizer,
               scheduler,
               device=device,
               save_path='indobert_fine-tuned_augmented'):
    """Evaluate ``model`` on ``dataloader`` and checkpoint it on a new best recall.

    The best recall seen so far is stored on the function object
    (``val_step.best_recall_score``) so that it survives between calls; the
    model is saved only when a call beats it. Re-running the cell that defines
    ``val_step`` resets that record.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)``; its output must expose ``.loss``
            and ``.logits``.
        dataloader: Yields batches indexed as ``(input_ids, attention_mask,
            labels)``. This is the data that gets evaluated.
        optimizer: Unused; accepted so the signature mirrors ``train_step``.
        scheduler: Unused; accepted so the signature mirrors ``train_step``.
        device: Device every batch tensor is moved to before the forward pass.
        save_path: Where the whole model is written with ``torch.save`` when
            recall improves. Pass ``None`` to evaluate without saving or
            touching the best-recall record (e.g. for a final test-set run).

    Returns:
        tuple: ``(avg_loss, accuracy, recall)`` where ``avg_loss`` is the mean
        per-batch loss and ``recall`` is the binary recall.
    """
    # Switch the model to evaluation mode
    model.eval()

    total_val_loss = 0
    val_predictions = []
    val_labels = []

    # Iterate the dataloader that was passed in, not a notebook-level global;
    # otherwise a test-set call would silently re-score the validation data.
    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Forward pass without gradient tracking
        with torch.inference_mode():
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        # Accumulate the batch loss
        total_val_loss += output.loss.item()

        logits = output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # The predicted label is the index of the largest logit
        val_predictions.extend(np.argmax(logits, axis=1).tolist())
        val_labels.extend(label_ids.tolist())

    avg_val_loss = total_val_loss / len(dataloader)

    # Accuracy and recall over every sample seen in this pass
    val_accuracy = accuracy_score(val_labels, val_predictions)
    val_recall = recall_score(val_labels, val_predictions, average='binary')

    # Compare against the best recall from earlier calls. A local variable
    # reset to 0 on every call would overwrite the checkpoint on each epoch.
    best_recall_score = getattr(val_step, 'best_recall_score', 0)
    if save_path is not None and val_recall > best_recall_score:
        torch.save(model, save_path)
        val_step.best_recall_score = val_recall

    return avg_val_loss, val_accuracy, val_recall

In [None]:
# Don't bother trying these values (presumably learning rates):
# 2e-7
# 2e-6 (= 0.000002)


epochs = 10
lr = 2e-5

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,     # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon  - default is 1e-8.
)

# Learning rate scheduler spanning the whole run, with no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

In [None]:
# Hyperparameters and run metadata to track with the run
run_config = {
    "learning_rate": lr,
    "architecture": "indobenchmark/indobert-base-p1",
    "dataset": "augment_subtitute",
    "epochs": epochs,
    "batch_size": batch_size,
}

# Start the run in the wandb project where it will be logged
wandb.init(project="IndoBert Finetuning", config=run_config)

In [None]:
# Per-epoch history, kept for graphing after training
epoch_list, train_loss_list, avg_val_loss_list = [], [], []
val_accuracy_list, val_recall_list = [], []

for epoch in tqdm(range(epochs)):
    print(f"epoch: {epoch}")
    print("-"*10)

    # One training pass, then one evaluation pass on the validation loader
    train_loss = train_step(
        model=model,
        dataloader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
    )
    avg_val_loss, val_accuracy, val_recall = val_step(
        model=model,
        dataloader=validation_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
    )

    # The printed numbers are the raw metrics multiplied by 100
    print(f"Train Loss: {train_loss*100:.4f} | Avg validation loss: {avg_val_loss*100:.4f} | Validation accuracy: {val_accuracy*100:.4f} | Validation Recall {val_recall*100:.6f}")

    # Log the unscaled metrics to wandb
    epoch_metrics = {
        "Train Loss": train_loss,
        "Avg validation loss": avg_val_loss,
        "Validation accuracy": val_accuracy,
        "Validation Recall": val_recall,
    }
    wandb.log(epoch_metrics)

    # Append this epoch's values to the matching history list
    for history, value in (
        (epoch_list, epoch),
        (train_loss_list, train_loss),
        (avg_val_loss_list, avg_val_loss),
        (val_accuracy_list, val_accuracy),
        (val_recall_list, val_recall),
    ):
        history.append(value)

wandb.finish()

In [None]:
# One x position per recorded epoch. Deriving it from the history instead of a
# hard-coded 10 keeps x and y the same length when the number of epochs changes.
epoch_list = list(range(len(train_loss_list)))

plt.figure(figsize=(10,7))

# The history lists hold plain numbers, so they can be plotted directly.
plt.plot(epoch_list, train_loss_list, label= "Train Loss")
plt.plot(epoch_list, avg_val_loss_list, label= "Average Validation Loss")
plt.plot(epoch_list, val_accuracy_list, label= "Validation Accuracy")
plt.plot(epoch_list, val_recall_list, label= "Validation Recall")

plt.xlabel("Epoch")
plt.legend()
plt.grid()
plt.title("Loss, Accuracy, and Recall Progress over fine tuning")
plt.show()

In [None]:
# Final evaluation: run val_step with the test dataloader
test_metrics = val_step(
    model=model,
    dataloader=test_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
)
avg_test_loss, test_accuracy, test_recall = test_metrics

# The printed numbers are the raw metrics multiplied by 100
print(f"Avg Test loss: {avg_test_loss*100:.4f} | Test accuracy: {test_accuracy*100:.4f} | Test Recall {test_recall*100:.4f}")