In [1]:
from google.colab import drive
import os

# Mount Google Drive ke direktori default
drive.mount('/content/drive')

# Setel direktori kerja ke subdirektori yang diinginkan
subdir = '/content/drive/MyDrive/SatriaData2024'
os.makedirs(subdir, exist_ok=True)
os.chdir('/content/drive/MyDrive/SatriaData2024')

# Verifikasi direktori kerja saat ini
!pwd


Mounted at /content/drive
/content/drive/MyDrive/SatriaData2024


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score


2024-06-19 04:13:24.733427: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-19 04:13:24.775066: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
data = pd.read_csv('data/dataset_penyisihan_bdc_2024.csv')
data.head()

Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik


In [None]:
dataset = Dataset.from_pandas(data)

# Kamus untuk memetakan label teks ke integer
label_to_id = {
    "Sumber Daya Alam": 0,
    "Politik": 1,
    "Demografi": 2,
    "Pertahanan dan Keamanan": 3,
    "Ideologi": 4,
    "Ekonomi": 5,
    "Sosial Budaya": 6,
    "Geografi": 7
}

# Ubah label di dataset menjadi integer
def map_labels(example):
    example['label'] = label_to_id[example['label']]
    return example

# Terapkan fungsi map_labels ke dataset
dataset = dataset.map(map_labels)

# Verifikasi bahwa label telah diubah menjadi integer
print(dataset)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})


In [None]:
# Split data training dan testing menggunakan stratify (karena data imbalance)
train_data, test_data = train_test_split(dataset, test_size=0.2, stratify=dataset['label'], random_state=42)

train_data = Dataset.from_dict(train_data)
test_data = Dataset.from_dict(test_data)


In [None]:
import nltk
import random
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from collections import Counter
from datasets import Dataset

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('indonesian')

def synonym_replacement(words, stop_words):
    new_words = []
    for word in words:
        if word not in stop_words:
            synonyms = wordnet.synsets(word)
            if synonyms:
                synonym = random.choice(synonyms[0].lemma_names()).lower()
                new_words.append(synonym)
            else:
                new_words.append(word)
        else:
            new_words.append(word)
    return new_words

def random_insertion(words, stop_words):
    new_words = words.copy()
    synonyms = random.sample(stop_words, random.randint(1, len(stop_words) // 5))
    for synonym in synonyms:
        random_index = random.randint(0, len(new_words))
        new_words.insert(random_index, synonym)
    return new_words

def random_deletion(words, stop_words):
    new_words = words.copy()
    if len(new_words) > 2:  # Avoid deleting everything
        deletion_index = random.randint(0, len(new_words) - 1)
        if new_words[deletion_index] not in stop_words:
            del new_words[deletion_index]
    return new_words

def augment_data(example):
    augmented_texts = []
    for text in example['text']:
        words = text.split()
        augmented_text = ' '.join(synonym_replacement(words, stop_words) + random_insertion(words.copy(), stop_words) + random_deletion(words.copy(), stop_words))
        augmented_texts.append(augmented_text)
    return {'text': augmented_texts, 'label': example['label']}

def balance_data(train_data, min_count=200):
    label_counter = Counter(train_data['label'])
    data_by_label = {label: [] for label in label_counter}

    for text, label in zip(train_data['text'], train_data['label']):
        data_by_label[label].append(text)

    augmented_train_data = {'text': [], 'label': []}

    for label, texts in data_by_label.items():
        while len(data_by_label[label]) < min_count:
            example = {'text': texts, 'label': [label] * len(texts)}
            augmented_example = augment_data(example)
            data_by_label[label].extend(augmented_example['text'])
        augmented_train_data['text'].extend(data_by_label[label])
        augmented_train_data['label'].extend([label] * len(data_by_label[label]))

    return Dataset.from_dict(augmented_train_data)

# Apply the balance_data function to your dataset
augmented_train_data = balance_data(train_data)

# Convert the augmented data back to a DataFrame
augmented_df = augmented_train_data.to_pandas()

# Calculate and display the label distribution
label_distribution = augmented_df['label'].value_counts().reset_index()
label_distribution.columns = ['label', 'count']
print(label_distribution)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   label  count
0      1   2378
1      6    470
2      2    392
3      3    320
4      4    320
5      0    306
6      5    294
7      7    256


In [None]:
# Class Weighting
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(data['label']),
    y=data['label']
)

class_weights = torch.tensor(class_weights, dtype=torch.float)
print(class_weights)


tensor([10.0806,  1.7030, 31.2500,  1.5625,  1.5625,  0.2103,  1.0647,  3.2552])


In [None]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Import libraries
from datasets import Dataset as HFDataset
from transformers import BertTokenizer

# Pastikan augmented_train_data dan test_data adalah objek datasets.Dataset

# Tokenizer untuk model BERT
tokenizer = BertTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenisasi dataset
tokenized_train_dataset = augmented_train_data.map(tokenize_function, batched=True)
tokenized_test_dataset = test_data.map(tokenize_function, batched=True)

print(tokenized_train_dataset)
print(tokenized_test_dataset)


Map:   0%|          | 0/4736 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4736
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})


In [None]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to(model.device)
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Inisialisasi training argument
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score
import numpy as np

def compute_metrics(pred):
    labels = pred.label_ids
    predictions = np.argmax(pred.predictions, axis=1)
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, average='weighted'),
        'recall': recall_score(labels, predictions, average='weighted'),
        'f1': f1_score(labels, predictions, average='weighted')
    }

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", num_labels=len(class_weights))

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

# Train data
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

t = torch.tensor([1,2], device=device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:

import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Load your data
# data = pd.read_csv('path_to_your_data.csv')  # Uncomment and set your data path

# Initialize tokenizer and model from BERTweet
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
model = AutoModel.from_pretrained('vinai/bertweet-base')

# Function to clean Twitter text
def clean_text(text):
    text = text.replace('URL', '')  # remove URLs
    text = text.replace('USER', '') # remove user mentions
    return text

# Tokenize and extract features
def extract_features(text):
    cleaned_text = clean_text(text)
    encoded_input = tokenizer(cleaned_text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        features = model(**encoded_input)
    return features.last_hidden_state[:,0,:].squeeze().numpy()

# Apply the function on the dataset
# data['features'] = data['text'].apply(extract_features)  # Uncomment and adapt field names as needed
# data.head()  # To show some of the output
