In [1]:
from google.colab import drive
import os

# Mount Google Drive ke direktori default
drive.mount('/content/drive')

# Setel direktori kerja ke subdirektori yang diinginkan
subdir = '/content/drive/MyDrive/SatriaData2024'
os.makedirs(subdir, exist_ok=True)
os.chdir('/content/drive/MyDrive/SatriaData2024')

# Verifikasi direktori kerja saat ini
!pwd


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/SatriaData2024


In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score


In [4]:
data = pd.read_csv('data/dataset_penyisihan_bdc_2024.csv')
data.head()

Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik


In [5]:
dataset = Dataset.from_pandas(data)

# Kamus untuk memetakan label teks ke integer
label_to_id = {
    "Sumber Daya Alam": 0,
    "Politik": 1,
    "Demografi": 2,
    "Pertahanan dan Keamanan": 3,
    "Ideologi": 4,
    "Ekonomi": 5,
    "Sosial Budaya": 6,
    "Geografi": 7
}

# Ubah label di dataset menjadi integer
def map_labels(example):
    example['label'] = label_to_id[example['label']]
    return example

# Terapkan fungsi map_labels ke dataset
dataset = dataset.map(map_labels)

# Verifikasi bahwa label telah diubah menjadi integer
print(dataset)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})


In [6]:
# Split data training dan testing menggunakan stratify (karena data imbalance)
train_data, test_data = train_test_split(dataset, test_size=0.2, stratify=dataset['label'], random_state=42)

train_data = Dataset.from_dict(train_data)
test_data = Dataset.from_dict(test_data)


In [7]:
import nltk
import random
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from collections import Counter
from datasets import Dataset

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('indonesian')

def synonym_replacement(words, stop_words):
    new_words = []
    for word in words:
        if word not in stop_words:
            synonyms = wordnet.synsets(word)
            if synonyms:
                synonym = random.choice(synonyms[0].lemma_names()).lower()
                new_words.append(synonym)
            else:
                new_words.append(word)
        else:
            new_words.append(word)
    return new_words

def random_insertion(words, stop_words):
    new_words = words.copy()
    synonyms = random.sample(stop_words, random.randint(1, len(stop_words) // 5))
    for synonym in synonyms:
        random_index = random.randint(0, len(new_words))
        new_words.insert(random_index, synonym)
    return new_words

def random_deletion(words, stop_words):
    new_words = words.copy()
    if len(new_words) > 2:  # Avoid deleting everything
        deletion_index = random.randint(0, len(new_words) - 1)
        if new_words[deletion_index] not in stop_words:
            del new_words[deletion_index]
    return new_words

def augment_data(example):
    augmented_texts = []
    for text in example['text']:
        words = text.split()
        augmented_text = ' '.join(synonym_replacement(words, stop_words) + random_insertion(words.copy(), stop_words) + random_deletion(words.copy(), stop_words))
        augmented_texts.append(augmented_text)
    return {'text': augmented_texts, 'label': example['label']}

def balance_data(train_data, min_count=200):
    label_counter = Counter(train_data['label'])
    data_by_label = {label: [] for label in label_counter}

    for text, label in zip(train_data['text'], train_data['label']):
        data_by_label[label].append(text)

    augmented_train_data = {'text': [], 'label': []}

    for label, texts in data_by_label.items():
        while len(data_by_label[label]) < min_count:
            example = {'text': texts, 'label': [label] * len(texts)}
            augmented_example = augment_data(example)
            data_by_label[label].extend(augmented_example['text'])
        augmented_train_data['text'].extend(data_by_label[label])
        augmented_train_data['label'].extend([label] * len(data_by_label[label]))

    return Dataset.from_dict(augmented_train_data)

# Apply the balance_data function to your dataset
augmented_train_data = balance_data(train_data)

# Convert the augmented data back to a DataFrame
augmented_df = augmented_train_data.to_pandas()

# Calculate and display the label distribution
label_distribution = augmented_df['label'].value_counts().reset_index()
label_distribution.columns = ['label', 'count']
print(label_distribution)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


   label  count
0      1   2378
1      6    470
2      2    392
3      3    320
4      4    320
5      0    306
6      5    294
7      7    256


In [8]:
# Class Weighting
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(data['label']),
    y=data['label']
)

class_weights = torch.tensor(class_weights, dtype=torch.float)
print(class_weights)


tensor([10.0806,  1.7030, 31.2500,  1.5625,  1.5625,  0.2103,  1.0647,  3.2552])


In [9]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
# Import libraries
from datasets import Dataset as HFDataset
from transformers import BertTokenizer

# Pastikan augmented_train_data dan test_data adalah objek datasets.Dataset

# Tokenizer untuk model BERT
tokenizer = BertTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenisasi dataset
tokenized_train_dataset = augmented_train_data.map(tokenize_function, batched=True)
tokenized_test_dataset = test_data.map(tokenize_function, batched=True)

print(tokenized_train_dataset)
print(tokenized_test_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/4736 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4736
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})


In [11]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to(model.device)
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [12]:
# Inisialisasi training argument
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./model/baru",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-9,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


In [13]:
from sklearn.metrics import balanced_accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"balanced_accuracy": balanced_accuracy_score(labels, predictions)}


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np

# Load model
model = BertForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", num_labels=8)

# Path ke checkpoint
checkpoint_path = "./model/baru/checkpoint-1184"

# Jika tokenized_train_dataset dan tokenized_test_dataset sudah didefinisikan
# dan compute_metrics sudah didefinisikan

# Hyperparameter grid
learning_rates = [1e-5, 2e-5, 3e-5]
num_train_epochs = [2, 3, 4]
batch_sizes = [16, 32]

# Menyimpan skor terbaik dan parameter terbaik
best_score = 0
best_params = {}

for lr in learning_rates:
    for epochs in num_train_epochs:
        for batch_size in batch_sizes:
            training_args = TrainingArguments(
                output_dir='./model/baru',
                evaluation_strategy="epoch",
                save_strategy="epoch",
                learning_rate=lr,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=epochs,
                weight_decay=0.01,
                load_best_model_at_end=True,
                metric_for_best_model="balanced_accuracy",
                logging_dir='./logs',
                report_to="none"  # Disable reporting to WANDB or other external services
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_train_dataset,
                eval_dataset=tokenized_test_dataset,
                compute_metrics=compute_metrics
            )

            # Melanjutkan pelatihan dari checkpoint
            trainer.train(resume_from_checkpoint=checkpoint_path)

            # Evaluate the model
            eval_result = trainer.evaluate()
            if eval_result['eval_balanced_accuracy'] > best_score:
                best_score = eval_result['eval_balanced_accuracy']
                best_params = {
                    'learning_rate': lr,
                    'num_train_epochs': epochs,
                    'batch_size': batch_size
                }

print("Best Score:", best_score)
print("Best Parameters:", best_params)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


	per_device_train_batch_size: 32 (from args) != 16 (from trainer_state.json)
Could not locate the best model at ./model/baru/checkpoint-888/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Epoch,Training Loss,Validation Loss


Could not locate the best model at ./model/baru/checkpoint-888/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Epoch,Training Loss,Validation Loss


In [None]:
!cp -r ./results /content/drive/MyDrive/SatriaData2024/model
