In [13]:
!pip install transformers tensorflow pandas scikit-learn openpyxl




In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the Excel workbook holding the dataset; point this at your own copy.
DATA_PATH = r"Dataset Text Web.xlsx"

# The code below reads two columns from the sheet: "review" and "label".
df = pd.read_excel(DATA_PATH)

# Quick look at the first rows to confirm the structure of the data.
print(df.head())

# Hold out 20% of the rows as a test set, stratified on the label so both splits
# keep the same class proportions; the fixed random_state makes the split repeatable.
reviews = df["review"]
labels = df["label"]
train_texts, test_texts, train_labels, test_labels = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


                                              review  label
0  Q kira besar ,ternyata mini🤭🤭 tapi cocok sic d...      1
1  Koyok ngengek, yg dikirim minyak 1 mili iklan ...      0
2  Saya kira awalnya minyak wangi Fress beneran, ...      1
3  Baik, harum sekali, testyr kurang besar, murah...      2
4  Produk sesuai dengan pesanan pengiriman tepat ...      2


In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Checkpoint to load the tokenizer and base weights from.
MODEL_PATH = r"indobert-base-p1"
# Not referenced again in this cell.
SAVED_PATH = r"result"

# One output unit per distinct value found in the label column.
num_classes = len(df["label"].unique())

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=num_classes,
)






TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
import tensorflow as tf

# Both splits are tokenized with identical settings: truncate to at most 128 tokens,
# pad within each call, and return TensorFlow tensors.
tokenize_options = dict(
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="tf",
)

train_encodings = tokenizer(list(train_texts), **tokenize_options)
test_encodings = tokenizer(list(test_texts), **tokenize_options)


In [20]:
# Turn the label collections into tensors so they can be zipped with the encodings.
train_labels = tf.convert_to_tensor(list(train_labels))
test_labels = tf.convert_to_tensor(list(test_labels))

# Number of examples per batch for both training and evaluation.
BATCH_SIZE = 16

# Training pipeline. The shuffle buffer spans the whole training set: a buffer smaller
# than the dataset only shuffles within a sliding window, not across all examples.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(buffer_size=train_labels.shape[0])
    .batch(BATCH_SIZE)
)

# Evaluation pipeline. Deliberately not shuffled, so predictions stay aligned with
# the order of test_labels.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))
    .batch(BATCH_SIZE)
)


In [21]:
# The model outputs raw logits, hence from_logits=True on the loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Train for three epochs; the test split is passed as validation data, so its
# loss/accuracy are reported after every epoch.
history = model.fit(train_dataset, validation_data=test_dataset, epochs=3)


Epoch 1/3


Epoch 2/3
Epoch 3/3


In [35]:
# numpy is imported here because this cell uses it and no earlier cell imports it;
# without this the cell fails with NameError on a fresh kernel run from top to bottom.
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# ==============================
# Model evaluation
# ==============================

# 1. Predict on the test set and keep the raw logits.
y_pred_logits = model.predict(test_dataset).logits
# The predicted class is the index of the largest logit in each row.
y_pred = np.argmax(y_pred_logits, axis=1)

# 2. Collect the ground-truth labels by walking the same batched dataset,
#    so they come out in the same order as the predictions.
y_true = np.concatenate([batch_labels for _, batch_labels in test_dataset], axis=0)

# 3. Per-class precision / recall / F1.
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, digits=4))

# 4. (Optional) Confusion matrix.
print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_true, y_pred))


=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9704    0.9213    0.9452       178
           1     0.8069    0.9070    0.8540       129
           2     0.9217    0.8689    0.8945       122

    accuracy                         0.9021       429
   macro avg     0.8997    0.8991    0.8979       429
weighted avg     0.9074    0.9021    0.9034       429


=== Confusion Matrix ===
[[164  13   1]
 [  4 117   8]
 [  1  15 106]]


In [40]:
import numpy as np
from sklearn.metrics import accuracy_score

# Raw class scores (logits, not probabilities) for every training and test example.
y_train_pred_logits = model.predict(train_encodings).logits
y_test_pred_logits = model.predict(test_encodings).logits

# Predicted class = index of the highest score along the class axis.
y_train_pred = np.argmax(y_train_pred_logits, axis=-1)
y_test_pred = np.argmax(y_test_pred_logits, axis=-1)

# Accuracy on each split, computed the same way for both.
train_acc, test_acc = (
    accuracy_score(truth, predicted)
    for truth, predicted in (
        (train_labels, y_train_pred),
        (test_labels, y_test_pred),
    )
)

# The gap between the two accuracies is printed alongside them.
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Gap: {train_acc - test_acc:.4f}")


Training Accuracy: 0.9609
Test Accuracy: 0.9021
Gap: 0.0588


In [22]:
# Output directory for the fine-tuned model and the tokenizer files.
SAVE_PATH = r"result"

# The model and the tokenizer are both written to the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_PATH)

print(f"✅ Model berhasil disimpan di {SAVE_PATH}")


✅ Model berhasil disimpan di result


In [36]:
import numpy as np

def predict_sentiment(text):
    """Classify a piece of text with the fine-tuned model.

    The text is tokenized (truncated to at most 128 tokens), passed through the
    model, and the logits are turned into probabilities with a softmax.

    Args:
        text: Input handed to the tokenizer.

    Returns:
        A ``(label_id, probabilities)`` tuple for the first row of the model
        output: the index of the most probable class and the array of class
        probabilities for that row.
    """
    encoded = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="tf")
    probabilities = tf.nn.softmax(model(**encoded).logits, axis=-1)
    predicted_ids = tf.argmax(probabilities, axis=1).numpy()
    # Only the first row is returned.
    return predicted_ids[0], probabilities.numpy()[0]

sample_text = "Wanginya biasa aja"
label, prob = predict_sentiment(sample_text)
print("Teks:", sample_text)

# Output line for each known class id; any other id prints no label line.
label_messages = {
    2: "Prediksi Label: Positif",
    1: "Prediksi Label: Netral",
    0: "Prediksi Label: Negatif",
}
label_message = label_messages.get(label)
if label_message is not None:
    print(label_message)

print("Probabilitas:", prob)


Teks: Wanginya biasa aja
Prediksi Label: Netral
Probabilitas: [0.00315966 0.99166673 0.00517357]
