# Import Libraries

In [None]:
import ast
import os
import re
import string

import pandas as pd
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


# STEP 1 — Case Folding

In [None]:
def case_folding(text):
    """Return ``text`` in lowercase; a missing value (per ``pd.isna``) yields ``""``."""
    return "" if pd.isna(text) else text.lower()

def step1_casefolding(input_file="merge_datasets/cleandataset.csv", save_to="step_data"):
    """Lowercase the 'judul' and 'konten' columns of a CSV and save the result.

    Args:
        input_file: Path of the CSV to read. It must contain the columns
            'judul' and 'konten'.
        save_to: Directory that receives ``step1_casefolding.csv``. It is
            created (only once the input has been validated) if missing.

    Returns:
        None. When the input file or a required column is missing, a message
        is printed and nothing is written.
    """
    if not os.path.exists(input_file):
        print(f"❌ File {input_file} tidak ditemukan.")
        return

    df = pd.read_csv(input_file)
    print(f"🔹 Melakukan Case Folding pada file: {input_file}")

    if 'judul' not in df.columns or 'konten' not in df.columns:
        print("⚠️ File tidak memiliki kolom 'judul' dan 'konten'. Proses dihentikan.")
        return

    for column in ('judul', 'konten'):
        # Empty CSV cells are read as NaN. Casting the whole column with
        # astype(str) would turn them into the literal text "nan", so missing
        # cells are mapped to "" before the value reaches case_folding.
        df[column] = df[column].apply(
            lambda value: case_folding("" if pd.isna(value) else str(value))
        )

    os.makedirs(save_to, exist_ok=True)
    output_path = os.path.join(save_to, "step1_casefolding.csv")
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"✅ Step 1 selesai — hasil disimpan di: {output_path}")

# Run step 1 with its default paths when this code is executed directly
# (as a script or a notebook cell) rather than imported.
if __name__ == "__main__":
    step1_casefolding()


# STEP 2 — Cleaning

In [None]:

def cleaning(text):
    """Remove digits and ``string.punctuation`` characters, then trim the ends.

    A missing value (per ``pd.isna``) yields ``""``. Removed characters are
    dropped, not replaced by spaces, so inner whitespace is left as it is.
    """
    if pd.isna(text):
        return ""
    without_digits = re.sub(r'\d+', '', text)
    punctuation_table = str.maketrans('', '', string.punctuation)
    return without_digits.translate(punctuation_table).strip()

def step2_cleaning(input_file="step_data/step1_casefolding.csv", output_file="step_data/step2_cleaning.csv"):
    """Strip digits and punctuation from the 'judul' and 'konten' columns.

    Args:
        input_file: Path of the CSV to read. It must contain the columns
            'judul' and 'konten'.
        output_file: Path of the CSV to write. Its parent directory is created
            if it has one and it does not exist yet.

    Returns:
        None. When the input file or a required column is missing, a message
        is printed and nothing is written.
    """
    if not os.path.exists(input_file):
        print(f"❌ File {input_file} tidak ditemukan.")
        return

    print(f"🔹 Membaca file: {input_file}")
    df = pd.read_csv(input_file)

    if 'judul' not in df.columns or 'konten' not in df.columns:
        print("⚠️ File tidak memiliki kolom 'judul' dan 'konten'. Proses dihentikan.")
        return

    print("🧹 Membersihkan teks (hapus angka & tanda baca)...")
    for column in ('judul', 'konten'):
        # Empty CSV cells are read as NaN; map them to "" instead of letting
        # astype(str) turn them into the literal text "nan".
        df[column] = df[column].apply(
            lambda value: cleaning("" if pd.isna(value) else str(value))
        )

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises, so the directory is only created when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"✅ Step 2 selesai — hasil disimpan di: {output_file}")

# Run step 2 with its default paths when this code is executed directly
# (as a script or a notebook cell) rather than imported.
if __name__ == "__main__":
    step2_cleaning()


# STEP 3 — Tokenizing

In [None]:

def tokenizing(text):
    """Split ``text`` on whitespace; a missing value (per ``pd.isna``) yields ``[]``."""
    return [] if pd.isna(text) else text.split()

def step3_tokenizing(input_file="step_data/step2_cleaning.csv", output_file="step_data/step3_tokenizing.csv"):
    """Split the 'judul' and 'konten' columns into whitespace-separated tokens.

    Each cell becomes a Python list; ``to_csv`` stores it as the list's text
    form (e.g. ``['harga', 'naik']``).

    Args:
        input_file: Path of the CSV to read. It must contain the columns
            'judul' and 'konten'.
        output_file: Path of the CSV to write. Its parent directory is created
            if it has one and it does not exist yet.

    Returns:
        None. When the input file or a required column is missing, a message
        is printed and nothing is written.
    """
    if not os.path.exists(input_file):
        print(f"❌ File {input_file} tidak ditemukan.")
        return

    print(f"🔹 Membaca file: {input_file}")
    df = pd.read_csv(input_file)

    if 'judul' not in df.columns or 'konten' not in df.columns:
        print("⚠️ File tidak memiliki kolom 'judul' dan 'konten'. Proses dihentikan.")
        return

    print("✂️ Melakukan tokenisasi teks...")
    for column in ('judul', 'konten'):
        # Empty CSV cells are read as NaN; tokenize them as "" (-> []) instead
        # of letting astype(str) produce the bogus token "nan".
        df[column] = df[column].apply(
            lambda value: tokenizing("" if pd.isna(value) else str(value))
        )

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises, so the directory is only created when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"✅ Step 3 selesai — hasil disimpan di: {output_file}")

# Run step 3 with its default paths when this code is executed directly
# (as a script or a notebook cell) rather than imported.
if __name__ == "__main__":
    step3_tokenizing()


# STEP 4 — Stopword Removal

In [None]:
# Build the Sastrawi stop-word factory once and keep its stop words in a set,
# so the per-token membership test in remove_stopwords is a hash lookup.
stop_factory = StopWordRemoverFactory()
stopwords_id = set(stop_factory.get_stop_words())

def remove_stopwords(tokens, stopwords=None):
    """Remove stop words from a token list.

    Args:
        tokens: A list/tuple of tokens, or the text form of a list as stored
            in a CSV cell (e.g. ``"['harga', 'naik']"``). A string that is not
            a list literal is split on whitespace. ``None``/NaN yields ``[]``.
        stopwords: Collection of words to drop. Defaults to the module-level
            ``stopwords_id`` set.

    Returns:
        A new list with the tokens that are not in ``stopwords``, in their
        original order.
    """
    if stopwords is None:
        stopwords = stopwords_id

    if isinstance(tokens, str):
        # Parse the stored list literal properly. Stripping brackets and
        # splitting on ", " by hand turns "[]" into [''] and breaks on tokens
        # that contain quotes or commas.
        try:
            parsed = ast.literal_eval(tokens)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            tokens = [str(token) for token in parsed]
        else:
            tokens = tokens.split()
    elif tokens is None or (isinstance(tokens, float) and pd.isna(tokens)):
        # A missing CSV cell arrives as NaN, which is not iterable.
        return []

    return [word for word in tokens if word not in stopwords]

def step4_stopword(input_file="step_data/step3_tokenizing.csv", output_file="step_data/step4_stopword.csv"):
    """Remove stop words from the 'judul' and 'konten' token columns.

    Args:
        input_file: Path of the CSV to read. It must contain the columns
            'judul' and 'konten', each cell holding a token list in text form.
        output_file: Path of the CSV to write. Its parent directory is created
            if it has one and it does not exist yet.

    Returns:
        None. When the input file or a required column is missing, a message
        is printed and nothing is written.
    """
    if not os.path.exists(input_file):
        print(f"❌ File {input_file} tidak ditemukan.")
        return

    print(f"🔹 Membaca file: {input_file}")
    df = pd.read_csv(input_file)

    if 'judul' not in df.columns or 'konten' not in df.columns:
        print("⚠️ File tidak memiliki kolom 'judul' dan 'konten'. Proses dihentikan.")
        return

    print("🧹 Menghapus stopword...")
    for column in ('judul', 'konten'):
        # An empty CSV cell is read as NaN (a float), which cannot be iterated
        # as a token list; treat it as an empty list.
        df[column] = df[column].apply(
            lambda value: [] if pd.isna(value) else remove_stopwords(value)
        )

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises, so the directory is only created when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"✅ Step 4 selesai — hasil disimpan di: {output_file}")

# Run step 4 with its default paths when this code is executed directly
# (as a script or a notebook cell) rather than imported.
if __name__ == "__main__":
    step4_stopword()


# STEP 6 — Detokenisasi

In [None]:
import pandas as pd
import ast

def detokenize_csv(input_path: str, output_path: str,
                   col_judul: str = "judul", col_konten: str = "konten") -> pd.DataFrame:
    """Join stored token lists back into plain text and save the result.

    Args:
        input_path: CSV to read. Cells of the two text columns are expected to
            hold a list in text form (e.g. ``"['harga', 'naik']"``).
        output_path: CSV to write. Its parent directory is created if it has
            one and it does not exist yet.
        col_judul: Name of the title column; skipped when absent.
        col_konten: Name of the content column; skipped when absent.

    Returns:
        The DataFrame that was written to ``output_path``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist (from ``pd.read_csv``).
    """

    def parse_list(text):
        """Turn one cell into text: list -> space-joined, missing -> ""."""
        if not isinstance(text, str):
            # Empty CSV cells arrive as NaN; str(NaN) would store "nan".
            return "" if pd.isna(text) else str(text)
        try:
            tokens = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal (e.g. already plain text): keep it as is.
            return text
        if isinstance(tokens, list):
            # str() each item so a non-string token (e.g. a number) cannot
            # make join() fail and send the raw list text to the output.
            return " ".join(str(token) for token in tokens)
        return str(tokens)

    print(f"📂 Membaca data dari: {input_path}")
    df = pd.read_csv(input_path)

    for column in (col_judul, col_konten):
        if column in df.columns:
            df[column] = df[column].apply(parse_list)

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises, so the directory is only created when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"✅ File hasil disimpan ke: {output_path}")

    print("\n🧠 Contoh hasil detokenisasi:")
    print(df.head(3).to_string(index=False))

    return df

# Run detokenization when this code is executed directly (as a script or a
# notebook cell) rather than imported.
if __name__ == "__main__":
    # NOTE(review): presumably written by a stemming step (step 5) — confirm it runs before this cell.
    input_file = "./step_data/step5_stemming_token.csv"  
    output_file = "./step_data/step6_detokenized.csv"    
    detokenize_csv(input_file, output_file)


📂 Membaca data dari: ./step_data/step5_stemming_token.csv
✅ File hasil disimpan ke: ./step_data/step6_detokenized.csv

🧠 Contoh hasil detokenisasi:
                                                                                                     judul                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          