1. Instalasi dan Setup Awal

In [2]:
import pandas as pd
import re
import nltk
import os
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'pandas'

2. Fungsi Preprocessing

In [4]:
# Lazily-filled cache so the stopword corpus is read and the stemmer is built
# once per kernel session instead of once per word.
_PREPROCESS_CACHE = {}


def _get_preprocess_tools():
    """Return the cached ``(stopword_set, stemmer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        try:
            stopword_list = stopwords.words('english')
        except LookupError:
            # The stopwords corpus is not installed (e.g. a fresh runtime):
            # fetch it once, then retry.
            nltk.download('stopwords', quiet=True)
            stopword_list = stopwords.words('english')
        # A frozenset gives constant-time membership tests in the filter below.
        _PREPROCESS_CACHE['stopwords'] = frozenset(stopword_list)
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stopwords'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; strip HTML tags; strip URLs; drop every
    character that is not an ASCII letter or whitespace; collapse runs of
    whitespace; remove English stopwords; apply Porter stemming.

    Args:
        text: The raw text (a ``str``) to clean.

    Returns:
        The surviving stemmed tokens joined by single spaces, or an empty
        string when no token survives.
    """
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)          # Remove HTML tags
    text = re.sub(r'http\S+', '', text)        # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)    # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()   # Collapse extra whitespace

    stop_set, stemmer = _get_preprocess_tools()
    return ' '.join(
        stemmer.stem(word)
        for word in text.split()
        if word not in stop_set
    )

# Quick sanity check of the function
print(preprocess_text("<p>I LOVEEEE this movie!!! https://example.com </p>"))  # Output: "loveee movi"

loveee movi


3. Load & Preprocess data

In [5]:
# Load dataset
# The CSV lives on Google Drive, so Drive has to be mounted before it can be
# read. Mounting on demand keeps this cell runnable on a fresh runtime.
DRIVE_MOUNT_POINT = "/content/drive"
DATASET_PATH = "/content/drive/MyDrive/dataset/IMDB Dataset.csv"  # Replace with your own path
if DATASET_PATH.startswith(DRIVE_MOUNT_POINT) and not os.path.exists(DATASET_PATH):
    # Colab-only module; imported here because only this branch needs it.
    from google.colab import drive
    drive.mount(DRIVE_MOUNT_POINT)

df = pd.read_csv(DATASET_PATH)
df['clean_review'] = df['review'].apply(preprocess_text)  # Apply preprocessing to every review

# Encode the sentiment strings as integer labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])  # positive=1, negative=0

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['clean_review'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Show the first 100 characters of one cleaned training example
print("Contoh data training:", train_texts[0][:100], "...")

Contoh data training: that kept ask mani fight scream match swear gener mayhem permeat minut comparison also stand think o ...


4. Tokenisasi dengan BERT

In [6]:
# Settings shared by both tokenizer calls and both batching calls.
_TOKENIZE_KWARGS = dict(truncation=True, padding=True, max_length=256)
_BATCH_SIZE = 16

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the train and test texts with identical settings
train_encodings = tokenizer(train_texts, **_TOKENIZE_KWARGS)
test_encodings = tokenizer(test_texts, **_TOKENIZE_KWARGS)


def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output (as a plain dict) with its labels in an unbatched tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


# Only the training data is shuffled; both splits are batched the same way
train_dataset = _as_tf_dataset(train_encodings, train_labels).shuffle(1000).batch(_BATCH_SIZE)
test_dataset = _as_tf_dataset(test_encodings, test_labels).batch(_BATCH_SIZE)

print("Jumlah batch training:", len(train_dataset))

Jumlah batch training: 2500


5. Training Model

In [None]:
# Number of passes over the training data. Defined once so the learning-rate
# schedule length and model.fit() cannot drift apart.
NUM_EPOCHS = 3

# Fine-tuning is very slow without an accelerator, so surface a missing GPU
# before the long-running fit() call starts.
if not tf.config.list_physical_devices('GPU'):
    print("Peringatan: GPU tidak terdeteksi, training di CPU akan sangat lambat.")

# Initialize the model with a 2-class classification head
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer; the schedule spans every batch of every epoch
num_train_steps = len(train_dataset) * NUM_EPOCHS
optimizer, _ = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=num_train_steps)

# Compile the model; the loss is told it receives raw logits (from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train
# NOTE(review): the test split doubles as validation data here, so the
# evaluate() call below repeats the last epoch's validation pass.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=NUM_EPOCHS
)

# Evaluate on the held-out split
loss, accuracy = model.evaluate(test_dataset)
print(f"\nAkurasi test: {accuracy:.4f}")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
 121/2500 [>.............................] - ETA: 34:40:46 - loss: 0.6061 - accuracy: 0.6627

6. Simpan Model

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Lokasi penyimpanan
save_dir = "/content/drive/MyDrive/bert_imdb_model"
os.makedirs(save_dir, exist_ok=True)

# Simpan model & tokenizer
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Alternatif download langsung
!zip -r bert_model.zip {save_dir}
from google.colab import files
files.download("bert_model.zip")

print(f"Model disimpan di: {save_dir}")

7. Fungsi Prediksi

In [None]:
def predict_sentiment(text, model, tokenizer):
    """Classify ``text`` as ``"positive"`` or ``"negative"``.

    The text is first cleaned with ``preprocess_text``, then tokenized and
    passed through ``model``; the class with the highest softmax probability
    decides the result.

    Args:
        text: Raw input text.
        model: Sequence-classification model; called on the tokenizer output
            and expected to expose ``.logits`` on its result.
        tokenizer: Tokenizer callable that accepts the keyword options below.

    Returns:
        ``"positive"`` when the predicted class index is 1, otherwise
        ``"negative"``.
    """
    tokenizer_options = {
        "return_tensors": "tf",
        "truncation": True,
        "padding": True,
        "max_length": 256,
    }
    encoded = tokenizer(preprocess_text(text), **tokenizer_options)

    logits = model(encoded).logits
    class_probabilities = tf.nn.softmax(logits, axis=-1)
    # Only one text is encoded, so the first row holds its prediction.
    predicted_class = tf.argmax(class_probabilities, axis=1).numpy()[0]

    if predicted_class == 1:
        return "positive"
    return "negative"

# Example usage: reload the saved model and tokenizer from disk, then classify one review
loaded_model = TFBertForSequenceClassification.from_pretrained(save_dir)
loaded_tokenizer = BertTokenizer.from_pretrained(save_dir)

sample_review = "This movie sucks!"
print(predict_sentiment(sample_review, loaded_model, loaded_tokenizer))  # Expected output: negative