In [None]:
import re
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer
from tqdm import tqdm

In [None]:
# Load the raw dataset; the CSV file uses ';' as its field separator.
data = pd.read_csv("data/dataset_penyisihan_bdc_2024.csv", delimiter=";")

In [None]:
# Summarise columns, dtypes and non-null counts of the freshly loaded frame.
data.info()

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Show the distinct labels, then the number of rows per label.
print(data['label'].unique())
print(data['label'].value_counts())

In [None]:
import unicodedata
import string
# Text-cleaning function (no stemming / lemmatisation is applied).
def clean_text(text):
    """Normalise one raw text value into lower-case, ASCII-only text.

    The steps run in this order: lower-casing, removal of a leading
    "rt"/"via"/"cc" marker, removal of @mentions, URLs, hashtags,
    [bracketed] fragments and any token containing '=', then Unicode
    folding to ASCII.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string. Inner and trailing whitespace left behind by the
        removals is not collapsed. Missing input yields ``""`` so that the
        row can be filtered out as empty instead of becoming the literal
        text "nan" / "none".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    # 1. Drop a leading rt / via / cc marker (the text is already lower-case,
    #    so an upper-case alternative could never match).
    text = re.sub(r'^(?:rt|via|cc)\b', '', text).strip()

    # 2. Drop @username mentions.
    text = re.sub(r'@\w+', '', text)

    # 3. Drop URLs.
    text = re.sub(r'http\S+', '', text)

    # 4. Drop hashtags.
    text = re.sub(r'#\S+', '', text)

    # 5. Drop bracketed fragments such as "[RE ...]".
    text = re.sub(r'\[.*?\]', '', text)

    # 6. Drop stray encoded tokens (anything containing '=', e.g. "+ECNv...=").
    text = re.sub(r'\S*=\S*', '', text)

    # 7. Fold Unicode to ASCII. Decomposition must happen BEFORE non-ASCII
    #    characters are removed; otherwise an accented letter is deleted as a
    #    whole ("café" -> "caf ") instead of keeping its base letter ("cafe").
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # 8. Whatever is still non-ASCII (emoji, other scripts, odd symbols) is
    #    replaced by a space so the neighbouring words stay separated.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    return text

# Apply the cleaner to every value of the 'text' column and store the result
# in a new 'clean_text' column.
data['clean_text'] = data['text'].apply(clean_text)

In [None]:
# Keep only the rows whose 'clean_text' still holds something other than
# whitespace after cleaning.
has_content = data['clean_text'].astype(str).str.strip().ne("")
data = data[has_content]
del has_content

In [None]:
# Re-check the frame summary after the empty rows were dropped.
data.info()

# AUGMENTASI

## Sosial Budaya

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# =========================
# CONFIG
# =========================
label_target = "Sosial Budaya"
output_file = f"prepData_{label_target.replace(' ', '_')}.csv"
max_retries = 2  # extra generation attempts per source row before it is skipped

# Oversampling target: bring this class up to the size of the largest class.
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]

subset = data[data['label'] == label_target]
count = len(subset)
needed = target_count - count
print(f"Augmentasi kelas {label_target} -> {needed} data")

# Stop this cell early when the class is already as large as the largest one.
if needed <= 0:
    raise SystemExit("Tidak perlu augmentasi.")

# =========================
# Device setting (GPU when available)
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"📌 Menggunakan device: {device}")

# =========================
# Load models
# =========================
model_path = "data/indoT5-paraphrase"
tokenizer_para = AutoTokenizer.from_pretrained(model_path)
model_para = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

sim_model = SentenceTransformer('data/distiluse', device=device)

# =========================
# IndoT5 paraphrase function
# =========================
def paraphrase_text(text):
    """Sample paraphrases of ``text`` and return the one closest to it.

    Three candidates are sampled from ``model_para`` (top-k sampling, k=50).
    Candidates that are empty, or equal to ``text`` ignoring case and
    surrounding whitespace, are discarded first; the remaining ones are
    ranked by cosine similarity between their ``sim_model`` embedding and
    the embedding of ``text``.

    Args:
        text: The source sentence to paraphrase.

    Returns:
        The most similar remaining candidate, or ``None`` when every
        candidate was empty or a copy of the source.
    """
    inputs = tokenizer_para(
        f"parafrase: {text}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding=True
    ).to(device)

    outputs = model_para.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        num_return_sequences=3,
        do_sample=True,
        top_k=50
    )

    candidates = tokenizer_para.batch_decode(outputs, skip_special_tokens=True)

    # Filter BEFORE ranking: a verbatim copy always has the highest similarity,
    # so ranking first would pick the copy and discard the real paraphrases.
    source_key = text.strip().lower()
    usable = [c for c in candidates if c.strip() and c.strip().lower() != source_key]
    if not usable:
        return None

    embeddings = sim_model.encode([text] + usable, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return usable[torch.argmax(similarities).item()]

# =========================
# Augmentation loop
# =========================
augmented_rows = []
# Draw (with replacement) one source row per augmented row that is needed.
repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)

for _, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label_target}"):
    src_text = row['clean_text']

    aug_text = paraphrase_text(src_text)
    retry_count = 0

    # paraphrase_text reports failure as None, so None is the retry condition.
    while aug_text is None and retry_count < max_retries:
        aug_text = paraphrase_text(src_text)
        retry_count += 1

    # Still nothing usable after the retries: skip this source row.
    if aug_text is None:
        continue

    augmented_rows.append({
        "text": row['text'],
        "clean_text": aug_text,
        "label": label_target
    })

# Explicit columns keep the CSV header even when no row was generated, so the
# file stays readable by pd.read_csv.
augmented_df = pd.DataFrame(augmented_rows, columns=["text", "clean_text", "label"])
augmented_df.to_csv(output_file, index=False)
print(f"Jumlah data augmentasi: {len(augmented_df)} dari {needed} yang dibutuhkan")
print(f"✅ Augmentasi selesai, hasil disimpan di {output_file}")


## Pertahanan dan Keamanan

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# =========================
# CONFIG
# =========================
label_target = "Pertahanan dan Keamanan"
output_file = f"prepData_{label_target.replace(' ', '_')}.csv"
max_retries = 2  # extra generation attempts per source row before it is skipped

# Oversampling target: bring this class up to the size of the largest class.
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]

subset = data[data['label'] == label_target]
count = len(subset)
needed = target_count - count
print(f"Augmentasi kelas {label_target} -> {needed} data")

# Stop this cell early when the class is already as large as the largest one.
if needed <= 0:
    raise SystemExit("Tidak perlu augmentasi.")

# =========================
# Device setting (GPU when available)
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"📌 Menggunakan device: {device}")

# =========================
# Load models
# =========================
model_path = "data/indoT5-paraphrase"
tokenizer_para = AutoTokenizer.from_pretrained(model_path)
model_para = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

sim_model = SentenceTransformer('data/distiluse', device=device)

# =========================
# IndoT5 paraphrase function
# =========================
def paraphrase_text(text):
    """Sample paraphrases of ``text`` and return the one closest to it.

    Three candidates are sampled from ``model_para`` (top-k sampling, k=50).
    Candidates that are empty, or equal to ``text`` ignoring case and
    surrounding whitespace, are discarded first; the remaining ones are
    ranked by cosine similarity between their ``sim_model`` embedding and
    the embedding of ``text``.

    Args:
        text: The source sentence to paraphrase.

    Returns:
        The most similar remaining candidate, or ``None`` when every
        candidate was empty or a copy of the source.
    """
    inputs = tokenizer_para(
        f"parafrase: {text}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding=True
    ).to(device)

    outputs = model_para.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        num_return_sequences=3,
        do_sample=True,
        top_k=50
    )

    candidates = tokenizer_para.batch_decode(outputs, skip_special_tokens=True)

    # Filter BEFORE ranking: a verbatim copy always has the highest similarity,
    # so ranking first would pick the copy and discard the real paraphrases.
    source_key = text.strip().lower()
    usable = [c for c in candidates if c.strip() and c.strip().lower() != source_key]
    if not usable:
        return None

    embeddings = sim_model.encode([text] + usable, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return usable[torch.argmax(similarities).item()]

# =========================
# Augmentation loop
# =========================
augmented_rows = []
# Draw (with replacement) one source row per augmented row that is needed.
repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)

for _, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label_target}"):
    src_text = row['clean_text']

    aug_text = paraphrase_text(src_text)
    retry_count = 0

    # paraphrase_text reports failure as None, so None is the retry condition.
    while aug_text is None and retry_count < max_retries:
        aug_text = paraphrase_text(src_text)
        retry_count += 1

    # Still nothing usable after the retries: skip this source row.
    if aug_text is None:
        continue

    augmented_rows.append({
        "text": row['text'],
        "clean_text": aug_text,
        "label": label_target
    })

# Explicit columns keep the CSV header even when no row was generated, so the
# file stays readable by pd.read_csv.
augmented_df = pd.DataFrame(augmented_rows, columns=["text", "clean_text", "label"])
augmented_df.to_csv(output_file, index=False)
print(f"Jumlah data augmentasi: {len(augmented_df)} dari {needed} yang dibutuhkan")
print(f"✅ Augmentasi selesai, hasil disimpan di {output_file}")


## Ideologi

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# =========================
# CONFIG
# =========================
label_target = "Ideologi"
output_file = f"prepData_{label_target.replace(' ', '_')}.csv"
max_retries = 2  # extra generation attempts per source row before it is skipped

# Oversampling target: bring this class up to the size of the largest class.
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]

subset = data[data['label'] == label_target]
count = len(subset)
needed = target_count - count
print(f"Augmentasi kelas {label_target} -> {needed} data")

# Stop this cell early when the class is already as large as the largest one.
if needed <= 0:
    raise SystemExit("Tidak perlu augmentasi.")

# =========================
# Device setting (GPU when available)
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"📌 Menggunakan device: {device}")

# =========================
# Load models
# =========================
model_path = "data/indoT5-paraphrase"
tokenizer_para = AutoTokenizer.from_pretrained(model_path)
model_para = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

sim_model = SentenceTransformer('data/distiluse', device=device)

# =========================
# IndoT5 paraphrase function
# =========================
def paraphrase_text(text):
    """Sample paraphrases of ``text`` and return the one closest to it.

    Three candidates are sampled from ``model_para`` (top-k sampling, k=50).
    Candidates that are empty, or equal to ``text`` ignoring case and
    surrounding whitespace, are discarded first; the remaining ones are
    ranked by cosine similarity between their ``sim_model`` embedding and
    the embedding of ``text``.

    Args:
        text: The source sentence to paraphrase.

    Returns:
        The most similar remaining candidate, or ``None`` when every
        candidate was empty or a copy of the source.
    """
    inputs = tokenizer_para(
        f"parafrase: {text}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding=True
    ).to(device)

    outputs = model_para.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        num_return_sequences=3,
        do_sample=True,
        top_k=50
    )

    candidates = tokenizer_para.batch_decode(outputs, skip_special_tokens=True)

    # Filter BEFORE ranking: a verbatim copy always has the highest similarity,
    # so ranking first would pick the copy and discard the real paraphrases.
    source_key = text.strip().lower()
    usable = [c for c in candidates if c.strip() and c.strip().lower() != source_key]
    if not usable:
        return None

    embeddings = sim_model.encode([text] + usable, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return usable[torch.argmax(similarities).item()]

# =========================
# Augmentation loop
# =========================
augmented_rows = []
# Draw (with replacement) one source row per augmented row that is needed.
repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)

for _, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label_target}"):
    src_text = row['clean_text']

    aug_text = paraphrase_text(src_text)
    retry_count = 0

    # paraphrase_text reports failure as None, so None is the retry condition.
    while aug_text is None and retry_count < max_retries:
        aug_text = paraphrase_text(src_text)
        retry_count += 1

    # Still nothing usable after the retries: skip this source row.
    if aug_text is None:
        continue

    augmented_rows.append({
        "text": row['text'],
        "clean_text": aug_text,
        "label": label_target
    })

# Explicit columns keep the CSV header even when no row was generated, so the
# file stays readable by pd.read_csv.
augmented_df = pd.DataFrame(augmented_rows, columns=["text", "clean_text", "label"])
augmented_df.to_csv(output_file, index=False)
print(f"Jumlah data augmentasi: {len(augmented_df)} dari {needed} yang dibutuhkan")
print(f"✅ Augmentasi selesai, hasil disimpan di {output_file}")


## Ekonomi

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# =========================
# CONFIG
# =========================
label_target = "Ekonomi"
output_file = f"prepData_{label_target.replace(' ', '_')}.csv"
max_retries = 2  # extra generation attempts per source row before it is skipped

# Oversampling target: bring this class up to the size of the largest class.
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]

subset = data[data['label'] == label_target]
count = len(subset)
needed = target_count - count
print(f"Augmentasi kelas {label_target} -> {needed} data")

# Stop this cell early when the class is already as large as the largest one.
if needed <= 0:
    raise SystemExit("Tidak perlu augmentasi.")

# =========================
# Device setting (GPU when available)
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"📌 Menggunakan device: {device}")

# =========================
# Load models
# =========================
model_path = "data/indoT5-paraphrase"
tokenizer_para = AutoTokenizer.from_pretrained(model_path)
model_para = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

sim_model = SentenceTransformer('data/distiluse', device=device)

# =========================
# IndoT5 paraphrase function
# =========================
def paraphrase_text(text):
    """Sample paraphrases of ``text`` and return the one closest to it.

    Three candidates are sampled from ``model_para`` (top-k sampling, k=50).
    Candidates that are empty, or equal to ``text`` ignoring case and
    surrounding whitespace, are discarded first; the remaining ones are
    ranked by cosine similarity between their ``sim_model`` embedding and
    the embedding of ``text``.

    Args:
        text: The source sentence to paraphrase.

    Returns:
        The most similar remaining candidate, or ``None`` when every
        candidate was empty or a copy of the source.
    """
    inputs = tokenizer_para(
        f"parafrase: {text}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding=True
    ).to(device)

    outputs = model_para.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        num_return_sequences=3,
        do_sample=True,
        top_k=50
    )

    candidates = tokenizer_para.batch_decode(outputs, skip_special_tokens=True)

    # Filter BEFORE ranking: a verbatim copy always has the highest similarity,
    # so ranking first would pick the copy and discard the real paraphrases.
    source_key = text.strip().lower()
    usable = [c for c in candidates if c.strip() and c.strip().lower() != source_key]
    if not usable:
        return None

    embeddings = sim_model.encode([text] + usable, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return usable[torch.argmax(similarities).item()]

# =========================
# Augmentation loop
# =========================
augmented_rows = []
# Draw (with replacement) one source row per augmented row that is needed.
repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)

for _, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label_target}"):
    src_text = row['clean_text']

    aug_text = paraphrase_text(src_text)
    retry_count = 0

    # paraphrase_text reports failure as None, so None is the retry condition.
    while aug_text is None and retry_count < max_retries:
        aug_text = paraphrase_text(src_text)
        retry_count += 1

    # Still nothing usable after the retries: skip this source row.
    if aug_text is None:
        continue

    augmented_rows.append({
        "text": row['text'],
        "clean_text": aug_text,
        "label": label_target
    })

# Explicit columns keep the CSV header even when no row was generated, so the
# file stays readable by pd.read_csv.
augmented_df = pd.DataFrame(augmented_rows, columns=["text", "clean_text", "label"])
augmented_df.to_csv(output_file, index=False)
print(f"Jumlah data augmentasi: {len(augmented_df)} dari {needed} yang dibutuhkan")
print(f"✅ Augmentasi selesai, hasil disimpan di {output_file}")


## Sumber Daya Alam

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# =========================
# CONFIG
# =========================
label_target = "Sumber Daya Alam"
output_file = f"prepData_{label_target.replace(' ', '_')}.csv"
max_retries = 2  # extra generation attempts per source row before it is skipped

# Oversampling target: bring this class up to the size of the largest class.
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]

subset = data[data['label'] == label_target]
count = len(subset)
needed = target_count - count
print(f"Augmentasi kelas {label_target} -> {needed} data")

# Stop this cell early when the class is already as large as the largest one.
if needed <= 0:
    raise SystemExit("Tidak perlu augmentasi.")

# =========================
# Device setting (GPU when available)
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"📌 Menggunakan device: {device}")

# =========================
# Load models
# =========================
model_path = "data/indoT5-paraphrase"
tokenizer_para = AutoTokenizer.from_pretrained(model_path)
model_para = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

sim_model = SentenceTransformer('data/distiluse', device=device)

# =========================
# IndoT5 paraphrase function
# =========================
def paraphrase_text(text):
    """Sample paraphrases of ``text`` and return the one closest to it.

    Three candidates are sampled from ``model_para`` (top-k sampling, k=50).
    Candidates that are empty, or equal to ``text`` ignoring case and
    surrounding whitespace, are discarded first; the remaining ones are
    ranked by cosine similarity between their ``sim_model`` embedding and
    the embedding of ``text``.

    Args:
        text: The source sentence to paraphrase.

    Returns:
        The most similar remaining candidate, or ``None`` when every
        candidate was empty or a copy of the source.
    """
    inputs = tokenizer_para(
        f"parafrase: {text}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding=True
    ).to(device)

    outputs = model_para.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        num_return_sequences=3,
        do_sample=True,
        top_k=50
    )

    candidates = tokenizer_para.batch_decode(outputs, skip_special_tokens=True)

    # Filter BEFORE ranking: a verbatim copy always has the highest similarity,
    # so ranking first would pick the copy and discard the real paraphrases.
    source_key = text.strip().lower()
    usable = [c for c in candidates if c.strip() and c.strip().lower() != source_key]
    if not usable:
        return None

    embeddings = sim_model.encode([text] + usable, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return usable[torch.argmax(similarities).item()]

# =========================
# Augmentation loop
# =========================
augmented_rows = []
# Draw (with replacement) one source row per augmented row that is needed.
repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)

for _, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label_target}"):
    src_text = row['clean_text']

    aug_text = paraphrase_text(src_text)
    retry_count = 0

    # paraphrase_text reports failure as None, so None is the retry condition.
    while aug_text is None and retry_count < max_retries:
        aug_text = paraphrase_text(src_text)
        retry_count += 1

    # Still nothing usable after the retries: skip this source row.
    if aug_text is None:
        continue

    augmented_rows.append({
        "text": row['text'],
        "clean_text": aug_text,
        "label": label_target
    })

# Explicit columns keep the CSV header even when no row was generated, so the
# file stays readable by pd.read_csv.
augmented_df = pd.DataFrame(augmented_rows, columns=["text", "clean_text", "label"])
augmented_df.to_csv(output_file, index=False)
print(f"Jumlah data augmentasi: {len(augmented_df)} dari {needed} yang dibutuhkan")
print(f"✅ Augmentasi selesai, hasil disimpan di {output_file}")


## Demografi

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# =========================
# CONFIG
# =========================
label_target = "Demografi"
output_file = f"prepData_{label_target.replace(' ', '_')}.csv"
max_retries = 2  # extra generation attempts per source row before it is skipped

# Oversampling target: bring this class up to the size of the largest class.
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]

subset = data[data['label'] == label_target]
count = len(subset)
needed = target_count - count
print(f"Augmentasi kelas {label_target} -> {needed} data")

# Stop this cell early when the class is already as large as the largest one.
if needed <= 0:
    raise SystemExit("Tidak perlu augmentasi.")

# =========================
# Device setting (GPU when available)
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"📌 Menggunakan device: {device}")

# =========================
# Load models
# =========================
model_path = "data/indoT5-paraphrase"
tokenizer_para = AutoTokenizer.from_pretrained(model_path)
model_para = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

sim_model = SentenceTransformer('data/distiluse', device=device)

# =========================
# IndoT5 paraphrase function
# =========================
def paraphrase_text(text):
    """Sample paraphrases of ``text`` and return the one closest to it.

    Three candidates are sampled from ``model_para`` (top-k sampling, k=50).
    Candidates that are empty, or equal to ``text`` ignoring case and
    surrounding whitespace, are discarded first; the remaining ones are
    ranked by cosine similarity between their ``sim_model`` embedding and
    the embedding of ``text``.

    Args:
        text: The source sentence to paraphrase.

    Returns:
        The most similar remaining candidate, or ``None`` when every
        candidate was empty or a copy of the source.
    """
    inputs = tokenizer_para(
        f"parafrase: {text}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding=True
    ).to(device)

    outputs = model_para.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        num_return_sequences=3,
        do_sample=True,
        top_k=50
    )

    candidates = tokenizer_para.batch_decode(outputs, skip_special_tokens=True)

    # Filter BEFORE ranking: a verbatim copy always has the highest similarity,
    # so ranking first would pick the copy and discard the real paraphrases.
    source_key = text.strip().lower()
    usable = [c for c in candidates if c.strip() and c.strip().lower() != source_key]
    if not usable:
        return None

    embeddings = sim_model.encode([text] + usable, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return usable[torch.argmax(similarities).item()]

# =========================
# Augmentation loop
# =========================
augmented_rows = []
# Draw (with replacement) one source row per augmented row that is needed.
repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)

for _, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label_target}"):
    src_text = row['clean_text']

    aug_text = paraphrase_text(src_text)
    retry_count = 0

    # paraphrase_text reports failure as None, so None is the retry condition.
    while aug_text is None and retry_count < max_retries:
        aug_text = paraphrase_text(src_text)
        retry_count += 1

    # Still nothing usable after the retries: skip this source row.
    if aug_text is None:
        continue

    augmented_rows.append({
        "text": row['text'],
        "clean_text": aug_text,
        "label": label_target
    })

# Explicit columns keep the CSV header even when no row was generated, so the
# file stays readable by pd.read_csv.
augmented_df = pd.DataFrame(augmented_rows, columns=["text", "clean_text", "label"])
augmented_df.to_csv(output_file, index=False)
print(f"Jumlah data augmentasi: {len(augmented_df)} dari {needed} yang dibutuhkan")
print(f"✅ Augmentasi selesai, hasil disimpan di {output_file}")


## Geografi

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# =========================
# CONFIG
# =========================
label_target = "Geografi"
output_file = f"prepData_{label_target.replace(' ', '_')}.csv"
max_retries = 2  # extra generation attempts per source row before it is skipped

# Oversampling target: bring this class up to the size of the largest class.
majority_class = data['label'].value_counts().idxmax()
target_count = data['label'].value_counts()[majority_class]

subset = data[data['label'] == label_target]
count = len(subset)
needed = target_count - count
print(f"Augmentasi kelas {label_target} -> {needed} data")

# Stop this cell early when the class is already as large as the largest one.
if needed <= 0:
    raise SystemExit("Tidak perlu augmentasi.")

# =========================
# Device setting (GPU when available)
# =========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"📌 Menggunakan device: {device}")

# =========================
# Load models
# =========================
model_path = "data/indoT5-paraphrase"
tokenizer_para = AutoTokenizer.from_pretrained(model_path)
model_para = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

sim_model = SentenceTransformer('data/distiluse', device=device)

# =========================
# IndoT5 paraphrase function
# =========================
def paraphrase_text(text):
    """Sample paraphrases of ``text`` and return the one closest to it.

    Three candidates are sampled from ``model_para`` (top-k sampling, k=50).
    Candidates that are empty, or equal to ``text`` ignoring case and
    surrounding whitespace, are discarded first; the remaining ones are
    ranked by cosine similarity between their ``sim_model`` embedding and
    the embedding of ``text``.

    Args:
        text: The source sentence to paraphrase.

    Returns:
        The most similar remaining candidate, or ``None`` when every
        candidate was empty or a copy of the source.
    """
    inputs = tokenizer_para(
        f"parafrase: {text}",
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding=True
    ).to(device)

    outputs = model_para.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=256,
        num_return_sequences=3,
        do_sample=True,
        top_k=50
    )

    candidates = tokenizer_para.batch_decode(outputs, skip_special_tokens=True)

    # Filter BEFORE ranking: a verbatim copy always has the highest similarity,
    # so ranking first would pick the copy and discard the real paraphrases.
    source_key = text.strip().lower()
    usable = [c for c in candidates if c.strip() and c.strip().lower() != source_key]
    if not usable:
        return None

    embeddings = sim_model.encode([text] + usable, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return usable[torch.argmax(similarities).item()]

# =========================
# Augmentation loop
# =========================
augmented_rows = []
# Draw (with replacement) one source row per augmented row that is needed.
repeat_df = subset.sample(needed, replace=True).reset_index(drop=True)

for _, row in tqdm(repeat_df.iterrows(), total=needed, desc=f"Augmenting {label_target}"):
    src_text = row['clean_text']

    aug_text = paraphrase_text(src_text)
    retry_count = 0

    # paraphrase_text reports failure as None, so None is the retry condition.
    while aug_text is None and retry_count < max_retries:
        aug_text = paraphrase_text(src_text)
        retry_count += 1

    # Still nothing usable after the retries: skip this source row.
    if aug_text is None:
        continue

    augmented_rows.append({
        "text": row['text'],
        "clean_text": aug_text,
        "label": label_target
    })

# Explicit columns keep the CSV header even when no row was generated, so the
# file stays readable by pd.read_csv.
augmented_df = pd.DataFrame(augmented_rows, columns=["text", "clean_text", "label"])
augmented_df.to_csv(output_file, index=False)
print(f"Jumlah data augmentasi: {len(augmented_df)} dari {needed} yang dibutuhkan")
print(f"✅ Augmentasi selesai, hasil disimpan di {output_file}")


In [None]:
import glob

# Gather every per-class augmentation file from the working directory.
# glob returns matches in arbitrary order, so sort them to make the row order
# of the combined frame reproducible between runs.
aug_files = sorted(glob.glob("prepData_*.csv"))

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when no augmentation file is present.
if not aug_files:
    raise FileNotFoundError(
        "Tidak ada file prepData_*.csv; jalankan sel augmentasi terlebih dahulu."
    )

augmented_list = [pd.read_csv(file) for file in aug_files]
augmented_all = pd.concat(augmented_list, ignore_index=True)

In [None]:
# Stack the original rows on top of the augmented rows with a fresh index,
# then write the combined frame to disk without the index column.
final_dataset = pd.concat(
    objs=[data, augmented_all],
    ignore_index=True,
)
final_dataset.to_csv(path_or_buf="augmentedDataset1.csv", index=False)

print("Dataset asli + augmentasi semua kelas disimpan ke augmentedDataset1.csv")