In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load datasets
df_eng = pd.read_csv('sentiment_clean_en.csv')
df_malay = pd.read_csv('sentiment_clean_ma.csv')

In [3]:
# ✅ Step 1: Balance English dataset
sample_size = 32000
df_eng_balanced = df_eng.groupby("sentiment").apply(lambda x: x.sample(n=min(sample_size, len(x)), random_state=42))
df_eng_balanced = df_eng_balanced.reset_index(drop=True)
df_malay_balanced = df_malay.groupby("label").apply(lambda x: x.sample(n=min(sample_size, len(x)), random_state=42))
df_malay_balanced = df_malay_balanced.reset_index(drop=True)

  df_eng_balanced = df_eng.groupby("sentiment").apply(lambda x: x.sample(n=min(sample_size, len(x)), random_state=42))
  df_malay_balanced = df_malay.groupby("label").apply(lambda x: x.sample(n=min(sample_size, len(x)), random_state=42))


In [4]:
# Step 1: Rename columns to match
df_eng_balanced = df_eng_balanced.rename(columns={"sentiment": "sentiment_label"})
df_malay_balanced = df_malay_balanced.rename(columns={"label": "sentiment_label"})
df_eng_balanced = df_eng_balanced.rename(columns={"text": "tweet_text"})
df_malay_balanced = df_malay_balanced.rename(columns={"text": "tweet_text"})

In [None]:
# Step 3: Combine both datasets
df_combined = pd.concat([df_eng_balanced, df_malay_balanced], ignore_index=True)

# Step 4: Shuffle dataset for randomness
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
# Step 1: Convert numerical labels back to text
df_combined["sentiment_label"] = df_combined["sentiment_label"].map({0: "negative", 1: "positive"})

# Check the mapping
print(df_combined["sentiment_label"].value_counts())


sentiment_label
positive    48310
negative    47702
Name: count, dtype: int64


In [7]:
# ✅ Step 3: Encode labels
label_encoder = LabelEncoder()
df_combined["sentiment_label"] = label_encoder.fit_transform(df_combined["sentiment_label"])

In [8]:
# Get the mapping of labels to numbers
label_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}
print(label_mapping)

{0: 'negative', 1: 'positive'}


In [9]:
# ✅ Step 4: Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_combined["tweet_text"], df_combined["sentiment_label"], test_size=0.2, random_state=42
)

In [10]:
# ✅ Step 5: Tokenization
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

def tokenize_function(examples):
    return tokenizer(list(map(str, examples["tweet_text"])), padding="max_length", truncation=True, max_length=128)


In [11]:
# ✅ Step 6: Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(pd.DataFrame({"tweet_text": train_texts, "labels": train_labels}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"tweet_text": test_texts, "labels": test_labels}))

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 76809/76809 [00:25<00:00, 3034.45 examples/s]
Map: 100%|██████████| 19203/19203 [00:06<00:00, 3039.39 examples/s]


In [12]:
# ✅ Step 7: Load Model
num_labels = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# ✅ Step 8: Training Arguments
training_args = TrainingArguments(
    output_dir="./sentiment_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_score": f1_score(labels, preds, average="weighted"),
    }



In [14]:
# ✅ Step 9: Train Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.3394,0.33508,0.848878,0.84887
2,0.2775,0.329704,0.859761,0.859728
3,0.209,0.375734,0.861688,0.861701


TrainOutput(global_step=14403, training_loss=0.2917973791767489, metrics={'train_runtime': 3195.2937, 'train_samples_per_second': 72.114, 'train_steps_per_second': 4.508, 'total_flos': 7631016317526528.0, 'train_loss': 0.2917973791767489, 'epoch': 3.0})

In [15]:
# 11. Evaluate the Model
trainer.evaluate()

{'eval_loss': 0.37573379278182983,
 'eval_accuracy': 0.861688277873249,
 'eval_f1_score': 0.8617007998438757,
 'eval_runtime': 88.7834,
 'eval_samples_per_second': 216.29,
 'eval_steps_per_second': 3.39,
 'epoch': 3.0}

In [16]:
# 12. Save the Model
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_tokenizer")

('sentiment_tokenizer\\tokenizer_config.json',
 'sentiment_tokenizer\\special_tokens_map.json',
 'sentiment_tokenizer\\vocab.txt',
 'sentiment_tokenizer\\added_tokens.json')

In [17]:
import joblib
joblib.dump(label_encoder, "sentiment_label_encoder.pkl")  # Save encoder

['sentiment_label_encoder.pkl']