Langkah opsional: instal library di Colab (hanya jika dibutuhkan)

In [None]:
# Optional: uncomment and run this line if the `datasets` package is not
# installed in the current runtime.
# !pip install datasets

In [None]:
# Optional: uncomment both lines to remove and then reinstall `transformers`
# and `peft` (presumably to recover from a broken or mismatched install —
# TODO confirm; `peft` is not imported anywhere in the visible notebook).
# !pip uninstall transformers peft -y
# !pip install transformers peft

Langkah 1 Import library

In [None]:
import pandas as pd
from transformers import  AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy, EarlyStoppingCallback
import torch
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import numpy as np
import os
os.environ["WANDB_DISABLED"] = "true"

Langkah 2 Memasukkan dataset dan tahap pengolahan data

In [None]:
# Load the labelled comments and rename the spreadsheet columns to the
# `text` / `label` names used by the rest of the notebook.
df = pd.read_csv("ada labelnya - Sheet1.csv")
df = df.rename(columns={'Data Komentar TIKTOK': 'text', 'Labeling': 'label'})

# Report how many rows are missing a comment or a label before dropping them.
print(f"Nilai kosong pada kolom text: {df['text'].isna().sum()}")
print(f"Nilai kosong pada kolom label: {df['label'].isna().sum()}")

df = df.dropna(subset=['text', 'label'])

print(f"DataFrame shape after dropping NA: {df.shape}")

# Sentiment name -> integer class id expected by the classifier.
label_map = {'netral': 0, 'positif': 1, 'negatif': 2}

# Normalise casing and surrounding whitespace so values such as "Positif" or
# "negatif " still match label_map. A plain .map() would silently turn them
# into NaN, which makes the label column float and breaks training later.
normalized_labels = df['label'].astype(str).str.strip().str.lower()
unknown_labels = sorted(set(normalized_labels) - set(label_map))
if unknown_labels:
  # Fail here with a clear message rather than training on NaN labels.
  raise ValueError(
    f"Unrecognized label values: {unknown_labels}; expected one of {list(label_map)}"
  )

# Every value is known at this point, so the mapped column can be a true
# integer column.
df['label'] = normalized_labels.map(label_map).astype(int)
df

Langkah 3 Inisialisasi model

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "cahya/roberta-base-indonesian-522M"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from label_map so it has one output per
# sentiment class. Without num_labels the head uses the library/config default
# label count, which does not match the three classes defined in label_map and
# is inconsistent with the model re-created in the "Load Model" step.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))

Langkah 4 Tokenisasi Data

In [None]:
def tokenize_function(examples):
  """Tokenize one batch of examples for the sequence classifier.

  Reads the batch's 'text' column, replaces None entries with an empty string
  and converts every other entry with str(), then passes the list to the
  module-level `tokenizer` with truncation enabled and padding to a fixed
  length of 512.

  Returns whatever the tokenizer call returns for that list of strings.
  """
  cleaned = []
  for raw in examples['text']:
    # None becomes "", anything else (including non-strings) is stringified.
    cleaned.append("" if raw is None else str(raw))
  return tokenizer(cleaned, truncation=True, padding="max_length", max_length=512)

# Wrap the cleaned DataFrame in a Dataset and tokenize it batch by batch.
dataset = Dataset.from_pandas(df).map(tokenize_function, batched=True)

# Use the first example to preview what the tokenizer produced.
sample_idx = 0
sample_tokens = dataset[sample_idx]
sample_text = sample_tokens['text']

# Only the first 30 token ids are converted back to token strings for display.
token_ids = sample_tokens['input_ids']
token_words = tokenizer.convert_ids_to_tokens(token_ids[:30])

# One row per previewed position: token string, its id and its attention mask.
token_df = pd.DataFrame({
  'Position': range(len(token_words)),
  'Token': token_words,
  'ID': token_ids[:len(token_words)],
  'Attention Mask': sample_tokens['attention_mask'][:len(token_words)]
}).assign(
  # Strip the 'Ġ' marker from the tokens for readability (presumably the
  # tokenizer's leading-space marker — TODO confirm).
  Token=lambda frame: frame['Token'].str.replace('Ġ', '', regex=False)
)

print(f"Text asli sebelum tokenisasi: {sample_text}\n")
print("Hasil tokenisasi:")
display(token_df)

Langkah 5 Split data

In [None]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the
# shuffle reproducible, so re-running the notebook yields the same train/test
# rows and metrics that can be compared between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

Langkah 6 Load Model

In [None]:
# (Re)create the classifier with one output per entry in label_map.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
)

Langkah 7 Set training config & trainer

In [None]:
# Training configuration: evaluate and checkpoint once per epoch so that the
# checkpoint with the lowest eval_loss can be restored when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy=IntervalStrategy.EPOCH,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Upper bound only; the early-stopping callback below may end training sooner.
    num_train_epochs=50,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # or switch to whichever metric you track
    greater_is_better=False,  # a lower eval_loss is better
    save_strategy=IntervalStrategy.EPOCH,
    # Cap the number of checkpoints kept in output_dir. With one checkpoint per
    # epoch and up to 50 epochs, unlimited checkpoints can exhaust disk space.
    # Per the TrainingArguments docs, the best checkpoint is still retained
    # when load_best_model_at_end is enabled.
    save_total_limit=2,
)

# NOTE(review): eval_dataset is the held-out test split, so early stopping and
# best-checkpoint selection see the same data that is later reported as test
# performance. Consider carving a separate validation split from train_dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Stop when the monitored metric has not improved for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Langkah 8 Memulai training model

In [None]:
# Fine-tune the model with the Trainer configured above (including its
# early-stopping callback). As the cell's last expression, the value returned
# by train() is shown as the cell output.
trainer.train()

Langkah 9 Evaluasi Model

In [None]:
# Evaluate on the eval_dataset passed to the Trainer (the test split) and keep
# the returned metrics in `results`; the export step later iterates over it.
results = trainer.evaluate()
print(results)

Langkah 10 Simpan hasil sentimen ke file Excel baru

In [None]:
# Run the fine-tuned model on the held-out split; argmax over the last axis of
# the returned prediction scores picks one class id per example.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Class id -> sentiment name, plus an explicit, fixed class order. Passing this
# order to scikit-learn keeps the report and confusion matrix aligned with
# label_map even when a class is absent from the test split or is never
# predicted (without it, classification_report raises on the target_names
# length mismatch).
reverse_label_map = {v: k for k, v in label_map.items()}
class_ids = sorted(reverse_label_map)
class_names = [reverse_label_map[class_id] for class_id in class_ids]

y_true = np.array(test_dataset["label"])
y_pred = predicted_labels

# One row per test example. list(...) materialises the dataset column so the
# DataFrame receives a plain Python list.
sentiment_results = pd.DataFrame({
  "text": list(test_dataset["text"]),
  "true_label": y_true,
  "predicted_label": y_pred,
})
sentiment_results['true_label_text'] = sentiment_results['true_label'].map(reverse_label_map)
sentiment_results['predicted_label_text'] = sentiment_results['predicted_label'].map(reverse_label_map)

# Overall metrics, macro- and weighted-averaged across classes.
accuracy = accuracy_score(y_true, y_pred)
precision_macro = precision_score(y_true, y_pred, average='macro')
precision_weighted = precision_score(y_true, y_pred, average='weighted')
recall_macro = recall_score(y_true, y_pred, average='macro')
recall_weighted = recall_score(y_true, y_pred, average='weighted')
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_weighted = f1_score(y_true, y_pred, average='weighted')

# Per-class report. zero_division=0 scores a class with no samples or no
# predictions as 0 without emitting a warning.
report = classification_report(
  y_true,
  y_pred,
  labels=class_ids,
  target_names=class_names,
  zero_division=0,
  output_dict=True,
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (macro): {precision_macro:.4f}")
print(f"Precision (weighted): {precision_weighted:.4f}")
print(f"Recall (macro): {recall_macro:.4f}")
print(f"Recall (weighted): {recall_weighted:.4f}")
print(f"F1 Score (macro): {f1_macro:.4f}")
print(f"F1 Score (weighted): {f1_weighted:.4f}")

# Attach the Trainer evaluation metrics and the overall scores to the
# predictions table; each scalar is broadcast to every row.
for key, value in results.items():
  sentiment_results[key] = value

sentiment_results['accuracy'] = accuracy
sentiment_results['precision_macro'] = precision_macro
sentiment_results['precision_weighted'] = precision_weighted
sentiment_results['recall_macro'] = recall_macro
sentiment_results['recall_weighted'] = recall_weighted
sentiment_results['f1_macro'] = f1_macro
sentiment_results['f1_weighted'] = f1_weighted

# Rows are true classes and columns are predicted classes, both in class_ids order.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
print("\nConfusion Matrix:")
print(cm)

# Write three sheets: per-example predictions, overall metrics, per-class metrics.
with pd.ExcelWriter("sentiment_analysis_results.xlsx") as writer:
  sentiment_results.to_excel(writer, sheet_name="Predictions", index=False)

  metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision (macro)', 'Precision (weighted)',
          'Recall (macro)', 'Recall (weighted)',
          'F1 Score (macro)', 'F1 Score (weighted)'],
    'Value': [accuracy, precision_macro, precision_weighted,
         recall_macro, recall_weighted,
         f1_macro, f1_weighted]
  })
  metrics_df.to_excel(writer, sheet_name="Overall Metrics", index=False)

  # One column per class, taken by name so that aggregate entries of the report
  # (accuracy / averages) can never be mistaken for a class.
  class_metrics = pd.DataFrame({name: report[name] for name in class_names})
  class_metrics.to_excel(writer, sheet_name="Class Metrics", index=True)

print("Sentiment analysis results saved to sentiment_analysis_results.xlsx")