In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


# Initiate Dataset

In [2]:
# https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data
df = pd.read_excel("output_file.xlsx",sheet_name="Sheet1")
df.head()

Unnamed: 0,no,comments,bank,date,platform,label,topik,Unnamed: 7
0,1,keren sekali transmart bintang melanda bintang...,Bank Mega,2023-05-07,Instagram,0,service,
1,2,tidak pernah kabari menerapkan,Bank Mega,2023-05-07,Instagram,2,cc,
2,3,keren bht,Bank Mega,2023-05-07,Instagram,0,service,
3,4,transmart makssar masih kurang lengkap barangnya,Bank Mega,2023-05-07,Instagram,2,service,
4,5,dan transmart keren oke wajah tersenyum dengan...,Bank Mega,2023-05-07,Instagram,0,service,


In [3]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # print(self.texts, len(self.texts))
        # print(text, label, idx)
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

## Train Validation Split

In [4]:
label_encoder = LabelEncoder()
df['topik'] = label_encoder.fit_transform(df['topik'])
df = df.rename(columns={'label': 'sentiment'})

In [5]:
df.head()

Unnamed: 0,no,comments,bank,date,platform,sentiment,topik,Unnamed: 7
0,1,keren sekali transmart bintang melanda bintang...,Bank Mega,2023-05-07,Instagram,0,2,
1,2,tidak pernah kabari menerapkan,Bank Mega,2023-05-07,Instagram,2,0,
2,3,keren bht,Bank Mega,2023-05-07,Instagram,0,2,
3,4,transmart makssar masih kurang lengkap barangnya,Bank Mega,2023-05-07,Instagram,2,2,
4,5,dan transmart keren oke wajah tersenyum dengan...,Bank Mega,2023-05-07,Instagram,0,2,


In [6]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Reset index for both train and validation sets
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

# Extract texts and labels
train_texts, train_labels = train_df["comments"], train_df["topik"]
val_texts, val_labels = val_df["comments"], val_df["topik"]

# BERT Model

In [7]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes) #add classification layer

    def forward(self, input_ids, attention_mask):
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_states = outputs.last_hidden_state
            hidden_states = outputs.hidden_states
            attentions = outputs.attentions
            pooled_output = outputs.pooler_output
            x = self.dropout(pooled_output)
            logits = self.fc(x)
            # print(last_hidden_states)
            # print(hidden_states)
            # print(attentions)
            # print(pooled_output)
            # print(logits)
            return logits

In [8]:
# def train(model, data_loader, optimizer, scheduler, device):
#     model.train()
#     for batch in data_loader:
#         optimizer.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label'].to(device).long()
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         loss = nn.CrossEntropyLoss()(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         scheduler.step()

In [9]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device).long()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [10]:
import torch

def train(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs, patience, checkpoint_path):
    model.train()
    best_val_accuracy = 0.0
    consecutive_no_improvement = 0

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Training phase
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device).long()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = nn.CrossEntropyLoss()(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Validation phase
        val_accuracy, report = evaluate(model, val_loader, device)
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(report)

        # Check for early stopping
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            consecutive_no_improvement = 0

            # Save the checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_val_accuracy': best_val_accuracy,
                'consecutive_no_improvement': consecutive_no_improvement
            }, checkpoint_path)

        else:
            consecutive_no_improvement += 1

        if consecutive_no_improvement >= patience:
            print(f"Early stopping after {epoch + 1} epochs without improvement.")
            break

    print("Training completed.")

In [11]:
bert_model_name = 'indolem/indobert-base-uncased'
num_classes = 3
max_length = 128
batch_size = 16
num_epochs = 20
learning_rate = 2e-5
patience = 2
checkpoint_path = "best_indobert_model.pth"

In [12]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [14]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [15]:
train(model, train_dataloader, val_dataloader, optimizer, scheduler, device, num_epochs, patience, checkpoint_path)

Epoch 1/20


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.7660
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.00      0.00      0.00         8
           2       0.77      1.00      0.87       108

    accuracy                           0.77       141
   macro avg       0.26      0.33      0.29       141
weighted avg       0.59      0.77      0.66       141

Epoch 2/20
Validation Accuracy: 0.7943
              precision    recall  f1-score   support

           0       0.67      0.56      0.61        25
           1       0.33      0.88      0.48         8
           2       0.92      0.84      0.88       108

    accuracy                           0.79       141
   macro avg       0.64      0.76      0.66       141
weighted avg       0.84      0.79      0.81       141

Epoch 3/20
Validation Accuracy: 0.8440
              precision    recall  f1-score   support

           0       0.80      0.48      0.60        25
           1       0.67

## Save model

In [18]:
torch.save(model.state_dict(), 'bert_model_topic1.pth')

In [None]:
# Instantiate the BERT model architecture
loaded_model = BERTClassifier(bert_model_name, num_classes)

# Load the saved model state dictionary
loaded_model.load_state_dict(torch.load('bert_model1.pth'))

# Set the model to evaluation mode (important if you have dropout layers)
loaded_model.eval()

# Move the model to the desired device (CPU or GPU)
loaded_model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

## Test model

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        print(preds)
        print(preds.item())
    if preds.item() == 0:
        return "positive"
    elif preds.item() == 1:
        return "neutral"
    else:
        return "negative"

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(predict_sentiment("bank mega jelek banget pelayanannya", loaded_model, tokenizer, device))

tensor([[[-0.4350, -0.6659, -0.3739,  ..., -0.3584,  0.7349,  0.2229],
         [-0.2089,  0.4062, -1.0106,  ..., -0.5113,  0.1457, -0.1008],
         [-0.0333, -0.0485, -0.0718,  ...,  0.7889, -0.0395, -0.4712],
         ...,
         [-0.0241, -0.8481, -0.7842,  ...,  0.5589, -0.1591,  0.5199],
         [-0.0543, -0.9029, -0.9501,  ...,  0.5639, -0.1369,  0.4541],
         [ 0.0170, -0.7999, -0.9675,  ...,  0.4635, -0.5736,  0.4686]]])
None
None
tensor([[ 5.2712e-02,  1.2291e-01,  3.5079e-01, -5.2025e-02, -6.3245e-01,
          1.7005e-01,  6.3419e-01, -7.6902e-01,  2.6698e-01,  1.6044e-01,
         -4.6129e-02,  3.5555e-01, -6.5248e-01,  7.0153e-01,  7.0624e-02,
          1.9026e-01, -2.0505e-01, -5.7395e-01, -5.3928e-01,  2.5725e-01,
         -9.0643e-02,  5.0417e-01, -5.4447e-02, -4.1963e-01, -1.7511e-02,
         -1.1273e-01,  9.4036e-02,  7.6764e-03,  7.0238e-01,  4.0891e-01,
          1.3801e-02,  7.1326e-01, -8.5344e-02,  5.7809e-01, -2.9001e-01,
         -3.7979e-01,  2.7353e

# Check last BERT layer

In [None]:
last_layer_weights = model.bert.encoder.layer[-1].output.dense.weight
print(last_layer_weights)

Parameter containing:
tensor([[ 0.0174,  0.0182, -0.0121,  ..., -0.0242, -0.0634, -0.0537],
        [-0.0223,  0.0087,  0.0474,  ...,  0.0408, -0.0022, -0.0073],
        [-0.0057, -0.0232, -0.0267,  ...,  0.0102,  0.0136, -0.0225],
        ...,
        [-0.0093,  0.0008, -0.0089,  ...,  0.0272, -0.0265, -0.0443],
        [ 0.0066,  0.0147,  0.0247,  ..., -0.0567,  0.0437, -0.0308],
        [ 0.0071,  0.0157, -0.0321,  ...,  0.0467, -0.0337, -0.0027]],
       requires_grad=True)


In [None]:
def predict_sentiment2(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        last_hidden_states = outputs.last_hidden_state
        pooler_output = outputs.pooler_output
        hidden_states = outputs.hidden_states
        attentions = outputs.attentions
        print(last_hidden_states)
        print(pooler_output)
        print(hidden_states)
        print(attentions)
    if preds.item() == 0:
        return "positive"
    elif preds.item() == 1:
        return "neutral"
    else:
        return "negative"

In [None]:
print(predict_sentiment2("bank mega jelek banget pelayanannya", loaded_model, tokenizer, device))

AttributeError: 'Tensor' object has no attribute 'last_hidden_state'