In [1]:
import pandas as pd
import re
import os
import csv
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# ========== OPSIONAL: progress bar ==========
# Hapus 3 baris ini setelah selesai pengujian
from tqdm import tqdm
tqdm.pandas()
# ============================================


# ======== 0. Fungsi Perbaikan CSV Tidak Rapi ========

def perbaiki_csv_tidak_rapi(input_path, output_path):
    """
    Merge CSV rows whose quoted content was split across physical lines.

    Example input::

        Judul,"Baris pertama
        baris kedua"

    is written out as a single record: ``Judul`` / ``Baris pertama baris kedua``.

    Every logical record is expected to look like ``judul,"konten"``. A line
    is split at its first comma: the left part becomes ``judul`` and the right
    part ``konten``. Continuation lines are joined with a single space, and
    leading/trailing double quotes are stripped from ``konten``.

    A quoted field is considered "open" when the line holds an odd number of
    double quotes, and "closed" once the accumulated content holds an even
    number. Using parity for both checks keeps escaped quote pairs (``""``)
    on the first line from being mistaken for a complete record.

    Lines that are skipped:
      * blank lines;
      * lines without any double quote (this includes a ``judul,konten``
        header line; the header is written explicitly to the output);
      * a line with an unbalanced quote but no comma while no record is open.

    A record that is still open when the file ends is discarded.

    Args:
        input_path: Path of the untidy CSV file, read as UTF-8.
        output_path: Path of the repaired CSV file, written as UTF-8 with a
            ``judul,konten`` header row.
    """
    hasil = []
    buffer = ''  # content of the record currently being assembled
    judul = ''

    with open(input_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip('\n')
            if not line:
                continue

            # A record is open: keep appending until the quotes balance.
            if buffer:
                buffer += ' ' + line
                if buffer.count('"') % 2 == 0:
                    hasil.append([judul.strip(), buffer.strip('"').strip()])
                    buffer = ''
                continue

            quote_count = line.count('"')

            # Odd number of quotes: a quoted field starts here but does not
            # end on this line, so start buffering its content.
            if quote_count % 2 == 1:
                if ',' in line:
                    judul, buffer = line.split(',', 1)
                continue

            # Complete record on a single line.
            if quote_count >= 2 and ',' in line:
                judul, konten = line.split(',', 1)
                hasil.append([judul.strip(), konten.strip('"').strip()])

    # Write the repaired records; csv.writer takes care of re-quoting.
    with open(output_path, 'w', encoding='utf-8', newline='') as f_out:
        writer = csv.writer(f_out)
        writer.writerow(['judul', 'konten'])
        writer.writerows(hasil)

    print(f"üß© CSV diperbaiki ‚Üí {output_path} ({len(hasil)} data)")


# ======== 1. Fungsi Preprocessing ========

def casefolding(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def normalisasi(text):
    """
    Keep only ``a-z``, ``0-9`` and whitespace, then tidy the spacing.

    Every other character (including uppercase letters) is replaced by a
    space; runs of whitespace are collapsed to one space and the result is
    trimmed at both ends.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', cleaned).strip()

def tokenisasi(text):
    """Split ``text`` on whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def filter_token(tokens):
    """Keep only tokens that are purely alphabetic and longer than two characters."""
    kept = []
    for token in tokens:
        # Same test order as before: alphabetic check first, then length.
        if not token.isalpha() or len(token) <= 2:
            continue
        kept.append(token)
    return kept


# ======== 2. Stopword Removal ========

def load_stopwords(file_path="stopwords_indo.txt"):
    """
    Build the stopword set used by the pipeline.

    The set always contains ``ENGLISH_STOP_WORDS``. When ``file_path`` can be
    opened, each non-blank line of it (stripped of surrounding whitespace) is
    added as well. When the file does not exist a warning is printed and only
    the English stopwords are returned.

    Args:
        file_path: UTF-8 text file with one stopword per line.

    Returns:
        A new ``set`` of stopword strings.
    """
    stopwords = set(ENGLISH_STOP_WORDS)
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            for raw in handle:
                word = raw.strip()
                if word:
                    stopwords.add(word)
    except FileNotFoundError:
        print(f"‚ö†Ô∏è File stopwords tidak ditemukan: {file_path}. Menggunakan default ENGLISH_STOP_WORDS saja.")
    return stopwords

# Stopword set built once when this code runs, using load_stopwords' default
# path ("stopwords_indo.txt"); hapus_stopword filters tokens against it.
STOPWORDS = load_stopwords()

def hapus_stopword(tokens):
    """Return the tokens that are not in the module-level ``STOPWORDS`` set, in order."""
    remaining = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        remaining.append(token)
    return remaining


# ======== 3. Pipeline Preprocessing ========

def preprocess_text(text):
    """
    Run the full preprocessing pipeline on one text.

    Steps, in order: lowercase, normalise characters/whitespace, split into
    tokens, drop non-alphabetic or short tokens, drop stopwords.

    Returns:
        The list of remaining tokens.
    """
    normalized = normalisasi(casefolding(text))
    candidates = filter_token(tokenisasi(normalized))
    return hapus_stopword(candidates)


# ======== 4. Proses Dataset ========

def preprocess_csv(file_path):
    """
    Repair one CSV file, then tokenise its first text column.

    The input is first passed through ``perbaiki_csv_tidak_rapi``, which
    writes a repaired copy next to it named ``<name>_fixed<ext>``. That copy
    is loaded with pandas, and ``preprocess_text`` is applied to every value
    of the first text column (missing values are treated as empty strings).

    NOTE(review): the repaired file is written with the columns ``judul``,
    ``konten`` in that order, so the first text column is normally ``judul``.
    Confirm that this, and not ``konten``, is the column meant to be tokenised.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        A DataFrame with a single ``tokens`` column holding one token list
        per row.

    Raises:
        ValueError: If the repaired file has no text column.
    """
    # Derive the repaired path from the real extension. A plain
    # str.replace(".csv", ...) would touch every ".csv" in the path and, when
    # the path has no lowercase ".csv" at all, would leave the path unchanged
    # so the repaired output overwrote the input.
    root, ext = os.path.splitext(file_path)
    fixed_path = f"{root}_fixed{ext}"
    perbaiki_csv_tidak_rapi(file_path, fixed_path)

    # Only read with pandas once the file has been repaired.
    df = pd.read_csv(fixed_path)

    # Pick the first text column. Object dtype is accepted as before; pandas'
    # dedicated string dtypes are accepted too, so the lookup does not depend
    # on which dtype read_csv infers for text.
    text_col = None
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            text_col = col
            break

    if text_col is None:
        raise ValueError(f"Tidak ada kolom teks dalam {file_path}")

    texts = df[text_col].fillna("")
    # progress_apply only exists after tqdm.pandas() has been called; fall
    # back to plain apply so the optional progress-bar lines can be removed.
    apply_fn = getattr(texts, "progress_apply", texts.apply)
    df['tokens'] = apply_fn(preprocess_text)
    return df[['tokens']]


# ======== 5. Jalankan untuk semua dataset ========

# Directory that receives the tokenised output files.
os.makedirs("clean_dataset", exist_ok=True)

datasets = [
    'dataset/etd_ugm_fixed.csv',
    'dataset/etd_usk_fixed.csv',
    'dataset/kompas_fixed.csv',
    'dataset/mojok_fixed.csv',
    'dataset/tempo_fixed.csv'
]

for ds in datasets:
    try:
        hasil = preprocess_csv(ds)
        # Build the output path from the file name alone so it always lands in
        # clean_dataset/, whatever directory the input lives in. Prefixing the
        # whole input path with "clean_" only worked because every input
        # happens to start with "dataset/".
        output_path = f"clean_dataset/{os.path.basename(ds)}"
        hasil.to_csv(output_path, index=False)
        print(f"‚úÖ {ds} selesai diproses ‚Üí {output_path}")
    except Exception as e:
        # Top-level boundary: report the failure and continue with the next
        # dataset instead of aborting the whole run.
        print(f"‚ö†Ô∏è Gagal memproses {ds}: {e}")


üß© CSV diperbaiki ‚Üí dataset/etd_ugm_fixed_fixed.csv (8578 data)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8578/8578 [00:00<00:00, 124907.53it/s]


‚úÖ dataset/etd_ugm_fixed.csv selesai diproses ‚Üí clean_dataset/etd_ugm_fixed.csv
üß© CSV diperbaiki ‚Üí dataset/etd_usk_fixed_fixed.csv (9796 data)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9796/9796 [00:00<00:00, 130969.22it/s]


‚úÖ dataset/etd_usk_fixed.csv selesai diproses ‚Üí clean_dataset/etd_usk_fixed.csv
üß© CSV diperbaiki ‚Üí dataset/kompas_fixed_fixed.csv (10000 data)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 206939.11it/s]


‚úÖ dataset/kompas_fixed.csv selesai diproses ‚Üí clean_dataset/kompas_fixed.csv
üß© CSV diperbaiki ‚Üí dataset/mojok_fixed_fixed.csv (9484 data)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9484/9484 [00:00<00:00, 192030.72it/s]


‚úÖ dataset/mojok_fixed.csv selesai diproses ‚Üí clean_dataset/mojok_fixed.csv
üß© CSV diperbaiki ‚Üí dataset/tempo_fixed_fixed.csv (7560 data)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7560/7560 [00:00<00:00, 184094.21it/s]

‚úÖ dataset/tempo_fixed.csv selesai diproses ‚Üí clean_dataset/tempo_fixed.csv



