In [1]:
# ! pip install sastrawi

In [2]:
import re
import nlp_id
from nlp_id.lemmatizer import Lemmatizer
from nlp_id.stopword import StopWord
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv("data/dataset_penyisihan_bdc_2024.csv", delimiter=";")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


In [5]:
data.head()

Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik


In [6]:
print(data['label'].unique())
print(data.label.value_counts())

['Sumber Daya Alam' 'Politik' 'Demografi' 'Pertahanan dan Keamanan'
 'Ideologi' 'Ekonomi' 'Sosial Budaya' 'Geografi']
Politik                    2972
Sosial Budaya               587
Pertahanan dan Keamanan     400
Ideologi                    400
Ekonomi                     367
Sumber Daya Alam            192
Demografi                    62
Geografi                     20
Name: label, dtype: int64


## Cleaning Data

In [7]:
# Inisialisasi stopword dan lemmatizer
stopword = StopWord()
lemmatizer = Lemmatizer()

# Fungsi membersihkan teks
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"http\S+|www.\S+", "", text)   # URL
    text = re.sub(r"@\w+", "", text)              # Mention
    text = re.sub(r"#\w+", "", text)              # Hashtag
    text = re.sub(r"\brt\b", "", text)            # Retweet
    text = re.sub(r"\d+", "", text)               # Angka
    text = re.sub(r"[^\w\s]", "", text)           # Tanda baca
    
    words = text.split()
    
    # Hapus stopword dan lakukan stemming/lemmatisasi
    clean_words = [
        lemmatizer.lemmatize(word) 
        for word in words 
        if word not in stopword.get_stopword()
    ]
    
    return ' '.join(clean_words)

data['clean_text'] = data['text'].apply(clean_text)

In [8]:
# Fungsi deteksi kata noise
def is_noise_word(word):
    if len(word) > 20:
        return True
    if re.search(r'[bcdfghjklmnpqrstvwxyz]{5,}', word):
        return True
    num_vowel = len(re.findall(r'[aeiou]', word))
    if len(word) > 8 and (num_vowel / len(word)) < 0.2:
        return True
    if re.search(r'[a-zA-Z]', word) and re.search(r'\d', word):
        return True
    return False

# Fungsi menghapus kata noise
def remove_noise_words(text):
    words = text.split()
    clean_words = [
        word for word in words
        if not is_noise_word(word) and not word.isdigit()
    ]
    return ' '.join(clean_words)

data['clean_text'] = data['clean_text'].apply(remove_noise_words)

## Augmentasi Data

In [9]:
tokenizer_para = AutoTokenizer.from_pretrained("cahya/bert2bert-indonesian-summarization")
model_para = AutoModelForSeq2SeqLM.from_pretrained("cahya/bert2bert-indonesian-summarization")

# Back translation models
model_id_en = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-id-en")
tokenizer_id_en = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")
model_en_id = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-id")
tokenizer_en_id = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-id")



In [10]:
def paraphrase_text(text):
    inputs = tokenizer_para(
        f"parafrase: {text}", 
        return_tensors="pt", 
        max_length=256, 
        truncation=True
    )
    outputs = model_para.generate(
        **inputs, 
        max_length=256, 
        num_return_sequences=1, 
        do_sample=True, 
        top_k=50
    )
    return tokenizer_para.decode(outputs[0], skip_special_tokens=True)

def back_translate(text):
    encoded = tokenizer_id_en(text, return_tensors="pt", padding=True)
    translated_en = model_id_en.generate(**encoded)
    text_en = tokenizer_id_en.decode(translated_en[0], skip_special_tokens=True)

    encoded_en = tokenizer_en_id(text_en, return_tensors="pt", padding=True)
    translated_id = model_en_id.generate(**encoded_en)
    return tokenizer_en_id.decode(translated_id[0], skip_special_tokens=True)

In [None]:
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]
augmented_rows = []

for label, count in data['label'].value_counts().items():
    if label == majority_class:
        continue
    
    subset = data[data['label'] == label]
    needed = target_count - count
    print(f"Augmentasi kelas {label} -> {needed} data")
    
    repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)
    
    for i, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label}"):
        text = row['clean_text']
        try:
            aug_text = paraphrase_text(text) if i % 2 == 0 else back_translate(text)
        except:
            aug_text = text
        
        augmented_rows.append({
            "text": row['text'],
            "clean_text": aug_text,
            "label": label
        })

augmented_df = pd.DataFrame(augmented_rows)

Augmentasi kelas Sosial Budaya -> 2385 data


Augmenting Sosial Budaya:   0%|          | 0/2385 [00:00<?, ?it/s]

Augmenting Sosial Budaya:   2%|▏         | 50/2385 [03:26<2:27:21,  3.79s/it]

In [None]:
# Gabungkan dataset asli dan augmented
final_df = pd.concat([data, augmented_df], ignore_index=True)
print("Distribusi baru:\n", final_df['label'].value_counts())

In [None]:
final_df.to_csv("data/prepData1.csv", index=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=10000,       # jumlah fitur maksimum
    ngram_range=(1,2),        # unigram + bigram
    min_df=5,                 # kata minimal muncul di 5 dokumen
)
X = vectorizer.fit_transform(data['clean_text'])

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(data['label'])


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)
xgb_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.793
              precision    recall  f1-score   support

           0       0.60      0.23      0.33        13
           1       0.73      0.70      0.71        73
           2       0.00      0.00      0.00         4
           3       0.88      0.53      0.66        80
           4       0.75      0.64      0.69        80
           5       0.80      0.94      0.87       594
           6       0.76      0.59      0.66       117
           7       0.85      0.44      0.58        39

    accuracy                           0.79      1000
   macro avg       0.67      0.51      0.56      1000
weighted avg       0.79      0.79      0.78      1000

