# Import Library & Setup

In [1]:
!pip install -U "transformers==4.40.2" "huggingface-hub==0.23.5" -q

import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

# Matiin HuggingFace chat template warning (soalnya sempet ada error)
os.environ["HF_HUB_DISABLE_CHAT_TEMPLATES"] = "1"

# Cek device sekarang
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m402.8/402.8 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 4.1.1 requires huggingface-hub>=0.24.0, but you have huggingface-hub 0.23.5 which is incompatible.
datasets 4.1.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 

# Load Dataset

In [2]:
df = pd.read_parquet("/kaggle/input/yelp-bert-dataset/data bert 2.parquet")
print(df.head())

                                                text  sentiment
0  If you decide to eat here, just be aware it is...          1
1  A couple friends and I stopped by for some lat...          1
2  Sometimes this food is very very good.  Unfort...          1
3  After trying a few ramen places with crazy var...          1
4  Great food. Terrible customer service. I've be...          1


In [3]:
df['sentiment'] = df['sentiment'].astype('category').cat.codes

# Split train & validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42
)

print(f"Total data: {len(df)} | Train: {len(train_texts)} | Validation: {len(val_texts)}")
print(df['sentiment'].value_counts())

Total data: 60000 | Train: 48000 | Validation: 12000
sentiment
1    20000
2    20000
0    20000
Name: count, dtype: int64


# Tokenizer Preparation (Cased & Uncased)

In [4]:
tokenizer_cased = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer_uncased = BertTokenizer.from_pretrained('bert-base-uncased')

print("Tokenizer loaded successfully!")



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizer loaded successfully!


# Dataset & DataLoader

In [6]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        sentiment = self.labels[idx]
        enc = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].flatten(),
            'attention_mask': enc['attention_mask'].flatten(),
            'labels': torch.tensor(sentiment, dtype=torch.long)
        }

In [7]:
train_data_cased = TextDataset(train_texts, train_labels, tokenizer_cased)
val_data_cased = TextDataset(val_texts, val_labels, tokenizer_cased)
train_data_uncased = TextDataset(train_texts, train_labels, tokenizer_uncased)
val_data_uncased = TextDataset(val_texts, val_labels, tokenizer_uncased)

In [8]:
# DataLoader
train_loader_cased = DataLoader(train_data_cased, batch_size=32, shuffle=True)
val_loader_cased = DataLoader(val_data_cased, batch_size=32, shuffle=False)
train_loader_uncased = DataLoader(train_data_uncased, batch_size=32, shuffle=True)
val_loader_uncased = DataLoader(val_data_uncased, batch_size=32, shuffle=False)

# Model Initialization (BERT Cased & Uncased)

In [9]:
# BERT Cased
model_cased = BertForSequenceClassification.from_pretrained(
    'bert-base-cased', num_labels=len(set(df['sentiment']))
).to(device)

optimizer_cased = AdamW(model_cased.parameters(), lr=3e-5)
scheduler_cased = get_linear_schedule_with_warmup(
    optimizer_cased, num_warmup_steps=0, num_training_steps=len(train_loader_cased)*3
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# BERT Uncased
model_uncased = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(set(df['sentiment']))
).to(device)

optimizer_uncased = AdamW(model_uncased.parameters(), lr=3e-5)
scheduler_uncased = get_linear_schedule_with_warmup(
    optimizer_uncased, num_warmup_steps=0, num_training_steps=len(train_loader_uncased)*3
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# BERT Cased

## Training BERT Cased

In [11]:
epochs = 4

# BERT Cased
for epoch in range(epochs):
    model_cased.train()
    total_loss = 0
    for batch in tqdm(train_loader_cased, desc=f"[Cased] Epoch {epoch+1}"):
        optimizer_cased.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_cased(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer_cased.step()
        scheduler_cased.step()

    print(f"[Cased] Epoch {epoch+1} | Loss: {total_loss/len(train_loader_cased):.4f}")

[Cased] Epoch 1: 100%|██████████| 1500/1500 [18:34<00:00,  1.35it/s]


[Cased] Epoch 1 | Loss: 0.5184


[Cased] Epoch 2: 100%|██████████| 1500/1500 [18:35<00:00,  1.34it/s]


[Cased] Epoch 2 | Loss: 0.3621


[Cased] Epoch 3: 100%|██████████| 1500/1500 [18:41<00:00,  1.34it/s]


[Cased] Epoch 3 | Loss: 0.2240


[Cased] Epoch 4: 100%|██████████| 1500/1500 [18:42<00:00,  1.34it/s]

[Cased] Epoch 4 | Loss: 0.1605





## Matrix Evaluation BERT Cased

In [12]:
def evaluate(model, loader, name="Model"):
    model.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    print(f"\n{name} Accuracy: {accuracy_score(actuals, preds):.4f}")
    print(classification_report(actuals, preds))



In [13]:
evaluate(model_cased, val_loader_cased, name="BERT Cased")


BERT Cased Accuracy: 0.8120
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      4023
           1       0.71      0.74      0.73      3963
           2       0.88      0.86      0.87      4014

    accuracy                           0.81     12000
   macro avg       0.81      0.81      0.81     12000
weighted avg       0.81      0.81      0.81     12000



## Save Model BERT Cased

In [14]:
model_cased.save_pretrained("bert_cased_finetuned_yelp")
tokenizer_cased.save_pretrained("bert_cased_finetuned_yelp")

print("Model berhasil save yayyy")

Model berhasil save yayyy


# BERT Uncased

## Training BERT Uncased

In [15]:
# BERT Uncased
for epoch in range(epochs):
    model_uncased.train()
    total_loss = 0
    for batch in tqdm(train_loader_uncased, desc=f"[Uncased] Epoch {epoch+1}"):
        optimizer_uncased.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_uncased(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer_uncased.step()
        scheduler_uncased.step()

    print(f"[Uncased] Epoch {epoch+1} | Loss: {total_loss/len(train_loader_uncased):.4f}")

[Uncased] Epoch 1: 100%|██████████| 1500/1500 [18:42<00:00,  1.34it/s]


[Uncased] Epoch 1 | Loss: 0.5081


[Uncased] Epoch 2: 100%|██████████| 1500/1500 [18:43<00:00,  1.34it/s]


[Uncased] Epoch 2 | Loss: 0.3556


[Uncased] Epoch 3: 100%|██████████| 1500/1500 [18:43<00:00,  1.33it/s]


[Uncased] Epoch 3 | Loss: 0.2227


[Uncased] Epoch 4: 100%|██████████| 1500/1500 [18:43<00:00,  1.33it/s]

[Uncased] Epoch 4 | Loss: 0.1621





## Matrix Evaluation BERT Uncased

In [None]:
def evaluate(model, loader, name="Model"):
    model.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    print(f"\n{name} Accuracy: {accuracy_score(actuals, preds):.4f}")
    print(classification_report(actuals, preds))

In [None]:
evaluate(model_uncased, val_loader_uncased, name="BERT Uncased")


BERT Uncased Accuracy: 0.8159
              precision    recall  f1-score   support

           0       0.85      0.84      0.84      4023
           1       0.72      0.73      0.73      3963
           2       0.87      0.88      0.87      4014

    accuracy                           0.82     12000
   macro avg       0.82      0.82      0.82     12000
weighted avg       0.82      0.82      0.82     12000



## Save Model BERT Uncased

In [None]:
model_uncased.save_pretrained("bert_uncased_finetuned_yelp")
tokenizer_uncased.save_pretrained("bert_uncased_finetuned_yelp")

print("Model berhasil save yayyy")

Model berhasil save yayyy
