In [1]:
%pip install pandas numpy matplotlib seaborn nltk scikit-learn Sastrawi googletrans==4.0.0rc1 tensorflow scikit-learn joblib



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import googletrans

import time

import warnings
warnings.filterwarnings("ignore")

## TRAINING MODEL

In [2]:
import json
from ast import literal_eval
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import joblib

#### Load and prepare the balanced dataset

In [3]:

from collections import Counter
import random
from tqdm.auto import tqdm

# Ensure reproducibility across runs
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

data_path = Path('../2_preprocessing/data_imbalanced_downsampled.csv')
balanced_df = pd.read_csv(data_path)
balanced_df = balanced_df.dropna(subset=['stemmed_tokens', 'Labelling']).copy()


def tokens_to_text(value):
    """Convert list-like token data into whitespace separated text."""
    if isinstance(value, list):
        return ' '.join(value)
    if isinstance(value, str):
        try:
            parsed = literal_eval(value)
            if isinstance(parsed, list):
                return ' '.join(parsed)
        except (ValueError, SyntaxError):
            pass
        return value.translate(str.maketrans('', '', "[]'"))
    return ''

balanced_df['text_for_model'] = (
    balanced_df['stemmed_tokens']
    .apply(tokens_to_text)
    .str.replace(',', ' ', regex=False)
    .str.strip()
)

balanced_df = balanced_df[balanced_df['text_for_model'].str.len() > 0]

label_encoder = LabelEncoder()
balanced_df['label_id'] = label_encoder.fit_transform(balanced_df['Labelling'])

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    balanced_df['text_for_model'],
    balanced_df['label_id'],
    test_size=0.2,
    random_state=SEED,
    stratify=balanced_df['label_id']
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=SEED,
    stratify=temp_labels
)

#### Vocabulary building and encoding helpers


In [4]:
def build_vocab(text_series, max_vocab=20000, min_freq=2):
    counter = Counter()
    for text in text_series:
        counter.update(text.split())
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for token, freq in counter.most_common():
        if freq < min_freq:
            continue
        if len(vocab) >= max_vocab:
            break
        vocab[token] = len(vocab)
    return vocab


def encode_text(text, vocab_map, max_length):
    tokens = text.split()
    ids = [vocab_map.get(tok, vocab_map['<UNK>']) for tok in tokens[:max_length]]
    if len(ids) < max_length:
        ids += [vocab_map['<PAD>']] * (max_length - len(ids))
    return torch.tensor(ids, dtype=torch.long)


vocab = build_vocab(train_texts)
lengths = balanced_df['text_for_model'].str.split().str.len()
max_len = int(np.percentile(lengths, 95))
max_len = max(max_len, 10)
print(f'Vocab size: {len(vocab):,} | Sequence length: {max_len}')


class TokensDataset(Dataset):
    def __init__(self, texts, labels):
        self.samples = [encode_text(text, vocab, max_len) for text in texts]
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx], self.labels[idx]


def create_loader(texts, labels, batch_size=256, shuffle=False):
    dataset = TokensDataset(list(texts), np.array(labels))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


train_loader = create_loader(train_texts, train_labels, shuffle=True)
val_loader = create_loader(val_texts, val_labels)
test_loader = create_loader(test_texts, test_labels)

Vocab size: 6,678 | Sequence length: 34


#### Define the BiLSTM sentiment classifier

In [5]:
class SentimentBiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=64, dropout=0.4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        emb = self.embedding(x)
        _, (h_n, _) = self.lstm(emb)
        final_state = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.classifier(final_state).squeeze(1)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentBiLSTM(len(vocab)).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

#### Training helpers with progress bar feedback

In [6]:
def run_epoch(loader, train_mode):
    epoch_loss = 0.0
    correct = 0
    total = 0
    desc = 'Train' if train_mode else 'Val'
    model.train() if train_mode else model.eval()

    progress = tqdm(loader, desc=desc, leave=False)
    for batch_inputs, batch_labels in progress:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad(set_to_none=True)

        with torch.set_grad_enabled(train_mode):
            logits = model(batch_inputs)
            loss = criterion(logits, batch_labels)
            if train_mode:
                loss.backward()
                optimizer.step()

        preds = (torch.sigmoid(logits) > 0.5).float()
        total += batch_labels.size(0)
        epoch_loss += loss.item() * batch_labels.size(0)
        correct += (preds == batch_labels).sum().item()
        progress.set_postfix(loss=epoch_loss / total, acc=correct / total)

    return epoch_loss / total, correct / total


num_epochs = 15
patience = 3
best_val_loss = float('inf')
best_state = None
patience_counter = 0

for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = run_epoch(train_loader, train_mode=True)
    val_loss, val_acc = run_epoch(val_loader, train_mode=False)
    scheduler.step(val_loss)

    print(f'Epoch {epoch:02d}/{num_epochs} | '
          f'train_loss={train_loss:.4f} acc={train_acc:.4f} | '
          f'val_loss={val_loss:.4f} acc={val_acc:.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = model.state_dict()
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping triggered.')
            break

                                                                                

Epoch 01/15 | train_loss=0.3870 acc=0.8424 | val_loss=0.2652 acc=0.8979


                                                                                

Epoch 02/15 | train_loss=0.2522 acc=0.9078 | val_loss=0.2502 acc=0.9071


                                                                                

Epoch 03/15 | train_loss=0.2196 acc=0.9223 | val_loss=0.2492 acc=0.9092


                                                                                

Epoch 04/15 | train_loss=0.1981 acc=0.9317 | val_loss=0.2475 acc=0.9076


                                                                                

Epoch 05/15 | train_loss=0.1780 acc=0.9400 | val_loss=0.2590 acc=0.9069


                                                                                

Epoch 06/15 | train_loss=0.1571 acc=0.9478 | val_loss=0.2831 acc=0.9089


                                                                                

Epoch 07/15 | train_loss=0.1417 acc=0.9530 | val_loss=0.2807 acc=0.9020
Early stopping triggered.




#### Evaluation on the held-out test split

In [7]:
if best_state is not None:
    model.load_state_dict(best_state)

model.eval()
all_preds, all_probs, all_true = [], [], []
with torch.no_grad():
    progress = tqdm(test_loader, desc='Test', leave=False)
    for batch_inputs, batch_labels in progress:
        batch_inputs = batch_inputs.to(device)
        logits = model(batch_inputs)
        probs = torch.sigmoid(logits).cpu()
        preds = (probs > 0.5).int()
        all_probs.extend(probs.numpy().tolist())
        all_preds.extend(preds.numpy().tolist())
        all_true.extend(batch_labels.numpy().astype(int))

all_true = np.array(all_true)
all_preds = np.array(all_preds)
test_acc = (all_preds == all_true).mean()
print(f'Test accuracy: {test_acc:.4f}')
print(classification_report(all_true, all_preds, target_names=label_encoder.classes_))
print('Confusion matrix:', confusion_matrix(all_true, all_preds))

                                                      

Test accuracy: 0.9035
              precision    recall  f1-score   support

    negative       0.88      0.93      0.91      1996
    positive       0.93      0.87      0.90      1902

    accuracy                           0.90      3898
   macro avg       0.91      0.90      0.90      3898
weighted avg       0.90      0.90      0.90      3898

Confusion matrix: [[1863  133]
 [ 243 1659]]




#### Persist model artifacts for future inference

In [8]:
model_path = './LSTM-conf/sentiment_bilstm_model.pt'
vocab_path = './LSTM-conf/sentiment_vocab.json'
label_path = './LSTM-conf/sentiment_label_encoder.joblib'
config_path = './LSTM-conf/sentiment_config.json'

torch.save(model.state_dict(), model_path)
with open(vocab_path, 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False)
joblib.dump(label_encoder, label_path)
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump({'max_len': max_len, 'vocab_size': len(vocab)}, f)

print('Artifacts saved:', model_path, vocab_path, label_path, config_path)

Artifacts saved: sentiment_bilstm_model.pt sentiment_vocab.json sentiment_label_encoder.joblib sentiment_config.json


## K-Fold Cross-Validation
Menjalankan evaluasi StratifiedKFold untuk melihat stabilitas model di seluruh data latih.

In [10]:

from sklearn.model_selection import StratifiedKFold

texts = balanced_df['text_for_model'].reset_index(drop=True)
labels = balanced_df['label_id'].reset_index(drop=True)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
num_folds = skf.get_n_splits()
num_epochs_cv = 10
cv_patience = 2
cv_results = []


class FoldTokensDataset(Dataset):
    def __init__(self, texts, labels, vocab_map, max_length):
        self.samples = [encode_text(text, vocab_map, max_length) for text in texts]
        self.labels = torch.tensor(labels.to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx], self.labels[idx]


def make_loader(texts, labels, vocab_map, max_length, batch_size=256, shuffle=False):
    dataset = FoldTokensDataset(texts.reset_index(drop=True), labels.reset_index(drop=True), vocab_map, max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def run_epoch_fold(model, loader, criterion, optimizer=None, desc='Train'):
    train_mode = optimizer is not None
    model.train() if train_mode else model.eval()
    epoch_loss = 0.0
    correct = 0
    total = 0
    progress = tqdm(loader, desc=desc, leave=False)
    for batch_inputs, batch_labels in progress:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        if train_mode:
            optimizer.zero_grad(set_to_none=True)

        logits = model(batch_inputs)
        loss = criterion(logits, batch_labels)

        if train_mode:
            loss.backward()
            optimizer.step()

        preds = (torch.sigmoid(logits) > 0.5).float()
        batch_size = batch_labels.size(0)
        epoch_loss += loss.item() * batch_size
        correct += (preds == batch_labels).sum().item()
        total += batch_size
        progress.set_postfix(loss=epoch_loss / total, acc=correct / total)

    return epoch_loss / total, correct / total


for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels), 1):
    print(f"===== Fold {fold}/{num_folds} =====")
    fold_train_texts = texts.iloc[train_idx]
    fold_train_labels = labels.iloc[train_idx]
    fold_val_texts = texts.iloc[val_idx]
    fold_val_labels = labels.iloc[val_idx]

    fold_vocab = build_vocab(fold_train_texts)
    fold_lengths = fold_train_texts.str.split().str.len()
    fold_max_len = max(int(np.percentile(fold_lengths, 95)), 10)

    train_loader_cv = make_loader(fold_train_texts, fold_train_labels, fold_vocab, fold_max_len, shuffle=True)
    val_loader_cv = make_loader(fold_val_texts, fold_val_labels, fold_vocab, fold_max_len)

    fold_model = SentimentBiLSTM(len(fold_vocab)).to(device)
    fold_criterion = nn.BCEWithLogitsLoss()
    fold_optimizer = torch.optim.Adam(fold_model.parameters(), lr=1e-3)
    fold_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        fold_optimizer,
        mode='min',
        factor=0.5,
        patience=2,
        verbose=False
    )

    best_val_loss = float('inf')
    best_val_acc = 0.0
    patience_counter = 0

    for epoch in range(1, num_epochs_cv + 1):
        train_loss, train_acc = run_epoch_fold(
            fold_model,
            train_loader_cv,
            fold_criterion,
            optimizer=fold_optimizer,
            desc=f'Fold {fold} Train'
        )
        val_loss, val_acc = run_epoch_fold(
            fold_model,
            val_loader_cv,
            fold_criterion,
            optimizer=None,
            desc=f'Fold {fold} Val'
        )
        fold_scheduler.step(val_loss)

        print(
            f'Fold {fold} Epoch {epoch:02d} | '
            f'train_loss={train_loss:.4f} acc={train_acc:.4f} | '
            f'val_loss={val_loss:.4f} acc={val_acc:.4f}'
        )

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_acc = val_acc
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= cv_patience:
                print('Early stopping fold due to no validation improvement.')
                break

    cv_results.append({'fold': fold, 'val_loss': best_val_loss, 'val_acc': best_val_acc})
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

val_losses = [item['val_loss'] for item in cv_results]
val_accs = [item['val_acc'] for item in cv_results]

print('===== Cross-validation summary =====')
for item in cv_results:
    print(f"Fold {item['fold']}: val_loss={item['val_loss']:.4f} | val_acc={item['val_acc']:.4f}")
    
print(f"Average val loss: {np.mean(val_losses):.4f} +/- {np.std(val_losses):.4f}")
print(f"Average val acc : {np.mean(val_accs):.4f} +/- {np.std(val_accs):.4f}")

===== Fold 1/5 =====


                                                                                       

Fold 1 Epoch 01 | train_loss=0.3879 acc=0.8432 | val_loss=0.2702 acc=0.8969


                                                                                       

Fold 1 Epoch 02 | train_loss=0.2590 acc=0.9039 | val_loss=0.2536 acc=0.9047


                                                                                       

Fold 1 Epoch 03 | train_loss=0.2244 acc=0.9192 | val_loss=0.2440 acc=0.9117


                                                                                       

Fold 1 Epoch 04 | train_loss=0.2032 acc=0.9294 | val_loss=0.2378 acc=0.9125


                                                                                       

Fold 1 Epoch 05 | train_loss=0.1835 acc=0.9359 | val_loss=0.2503 acc=0.9126


                                                                                       

Fold 1 Epoch 06 | train_loss=0.1626 acc=0.9441 | val_loss=0.2545 acc=0.9093
Early stopping fold due to no validation improvement.
===== Fold 2/5 =====


                                                                                       

Fold 2 Epoch 01 | train_loss=0.3915 acc=0.8319 | val_loss=0.2699 acc=0.8983


                                                                                       

Fold 2 Epoch 02 | train_loss=0.2560 acc=0.9061 | val_loss=0.2460 acc=0.9079


                                                                                      

Fold 2 Epoch 03 | train_loss=0.2277 acc=0.9171 | val_loss=0.2485 acc=0.9126


                                                                                      

Fold 2 Epoch 04 | train_loss=0.2050 acc=0.9274 | val_loss=0.2433 acc=0.9142


                                                                                      

Fold 2 Epoch 05 | train_loss=0.1853 acc=0.9366 | val_loss=0.2495 acc=0.9098


                                                                                      

Fold 2 Epoch 06 | train_loss=0.1653 acc=0.9434 | val_loss=0.2512 acc=0.9116
Early stopping fold due to no validation improvement.
===== Fold 3/5 =====


                                                                                      

Fold 3 Epoch 01 | train_loss=0.3785 acc=0.8385 | val_loss=0.2843 acc=0.8899


                                                                                      

Fold 3 Epoch 02 | train_loss=0.2549 acc=0.9072 | val_loss=0.2539 acc=0.9013


                                                                                       

Fold 3 Epoch 03 | train_loss=0.2229 acc=0.9208 | val_loss=0.2482 acc=0.9055


                                                                                      

Fold 3 Epoch 04 | train_loss=0.2033 acc=0.9288 | val_loss=0.2515 acc=0.9051


                                                                                       

Fold 3 Epoch 05 | train_loss=0.1802 acc=0.9383 | val_loss=0.2618 acc=0.9065
Early stopping fold due to no validation improvement.
===== Fold 4/5 =====


                                                                                      

Fold 4 Epoch 01 | train_loss=0.3829 acc=0.8420 | val_loss=0.2792 acc=0.8961


                                                                                       

Fold 4 Epoch 02 | train_loss=0.2531 acc=0.9079 | val_loss=0.2638 acc=0.9017


                                                                                       

Fold 4 Epoch 03 | train_loss=0.2225 acc=0.9219 | val_loss=0.2732 acc=0.9036


                                                                                      

Fold 4 Epoch 04 | train_loss=0.1993 acc=0.9316 | val_loss=0.2615 acc=0.9048


                                                                                      

Fold 4 Epoch 05 | train_loss=0.1795 acc=0.9390 | val_loss=0.2706 acc=0.9052


                                                                                      

Fold 4 Epoch 06 | train_loss=0.1599 acc=0.9459 | val_loss=0.2728 acc=0.9006
Early stopping fold due to no validation improvement.
===== Fold 5/5 =====


                                                                                      

Fold 5 Epoch 01 | train_loss=0.3899 acc=0.8406 | val_loss=0.2727 acc=0.8953


                                                                                      

Fold 5 Epoch 02 | train_loss=0.2548 acc=0.9063 | val_loss=0.2511 acc=0.9058


                                                                                      

Fold 5 Epoch 03 | train_loss=0.2227 acc=0.9209 | val_loss=0.2535 acc=0.9052


                                                                                       

Fold 5 Epoch 04 | train_loss=0.2005 acc=0.9291 | val_loss=0.2475 acc=0.9083


                                                                                       

Fold 5 Epoch 05 | train_loss=0.1809 acc=0.9368 | val_loss=0.2565 acc=0.9058


                                                                                      

Fold 5 Epoch 06 | train_loss=0.1605 acc=0.9447 | val_loss=0.2621 acc=0.9075
Early stopping fold due to no validation improvement.
===== Cross-validation summary =====
Fold 1: val_loss=0.2378 | val_acc=0.9125
Fold 2: val_loss=0.2433 | val_acc=0.9142
Fold 3: val_loss=0.2482 | val_acc=0.9055
Fold 4: val_loss=0.2615 | val_acc=0.9048
Fold 5: val_loss=0.2475 | val_acc=0.9083
Average val loss: 0.2476 +/- 0.0079
Average val acc : 0.9090 +/- 0.0037


