# Data Cleaning - Enhanced

Notebook ini melakukan data cleaning untuk analisis sentimen Twitter tentang sertifikasi halal.

**Tahapan Cleaning:**
1. Load dan filter data
2. Hapus URL
3. Hapus Mention (@username)
4. Hapus Hashtag (#topic)
5. Hapus Emoji
6. Hapus Angka
7. Hapus karakter khusus (hanya huruf dan spasi yang dipertahankan)
8. Hapus Whitespace berlebih
9. Case folding (lowercase)
10. Hapus baris yang kosong setelah cleaning, lalu hapus Duplikat

In [None]:
# Import libraries
import pandas as pd
import re
import os

In [None]:
# Read the raw dataset; the first CSV column is used as the DataFrame index
df = pd.read_csv("data/dataSertifikasiHalal.csv", index_col=0)

# Report the initial size and the available columns before any filtering
initial_rows = len(df)
initial_columns = df.columns.tolist()
print(f"Jumlah data awal: {initial_rows}")
print(f"Kolom: {initial_columns}")
df.head()

In [None]:
# Keep only the tweet text and its URL; take an explicit copy so later
# column assignments do not operate on a view of the original frame
df = df.loc[:, ['full_text', 'tweet_url']].copy()
selected_columns = df.columns.tolist()
print(f"Kolom yang dipilih: {selected_columns}")
df.head()

## Fungsi-fungsi Cleaning

In [None]:
def remove_url(text):
    """Strip URLs from the text.

    Matches ``http://``/``https://`` links, ``www.`` addresses, and bare
    ``bit.ly/`` or ``t.co/`` short links, each running up to the next
    whitespace character.

    Args:
        text: Value to clean; it is converted with ``str()`` before matching.

    Returns:
        The text with every matched URL removed, or ``''`` when
        ``pd.isna(text)`` is true.
    """
    if pd.isna(text):
        return ''
    url_forms = (
        r'https?://\S+',  # full links with a scheme
        r'www\.\S+',      # scheme-less www addresses
        r'bit\.ly/\S+',   # bare bit.ly short links
        r't\.co/\S+',     # bare t.co short links
    )
    return re.sub('|'.join(url_forms), '', str(text))


def remove_mention(text):
    """Strip mentions (``@username``) from the text.

    A mention is ``@`` followed by one or more ASCII letters, digits or
    underscores.

    Args:
        text: Value to clean; it is converted with ``str()`` before matching.

    Returns:
        The text without mentions, or ``''`` for any falsy input
        (``None``, ``''``, ``0``).
    """
    return re.sub(r'@[A-Za-z0-9_]+', '', str(text)) if text else ''


def remove_hashtag(text):
    """Strip hashtags (``#topic``) from the text.

    A hashtag is ``#`` followed by one or more ASCII letters, digits or
    underscores; the whole tag, including its word, is removed.

    Args:
        text: Value to clean; it is converted with ``str()`` before matching.

    Returns:
        The text without hashtags, or ``''`` for any falsy input
        (``None``, ``''``, ``0``).
    """
    if text:
        hashtag_pattern = r'#[A-Za-z0-9_]+'
        return re.sub(hashtag_pattern, '', str(text))
    return ''


def remove_emoji(text):
    """Strip emoji and other symbol characters from the text.

    NOTE: the character class is much broader than emoji alone. The
    ``U+24C2-U+1F251`` and ``U+10000-U+10FFFF`` ranges together cover every
    code point from U+24C2 upward, so non-emoji characters in that span
    (for example CJK text) are removed as well.

    Args:
        text: Value to clean; it is converted with ``str()`` before matching.

    Returns:
        The text with all matched characters removed, or ``''`` for any
        falsy input (``None``, ``''``, ``0``).
    """
    if not text:
        return ''
    char_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    symbol_class = "[" + "".join(char_ranges) + "]+"
    return re.sub(symbol_class, '', str(text), flags=re.UNICODE)


def remove_numbers(text):
    """Strip every run of digits from the text.

    Args:
        text: Value to clean; it is converted with ``str()`` before matching.

    Returns:
        The text without digits, or ``''`` for any falsy input
        (``None``, ``''``, ``0``).
    """
    return '' if not text else re.sub(r'\d+', '', str(text))


def remove_special_chars(text):
    """Replace everything except ASCII letters and whitespace with a space.

    Each run of disallowed characters (punctuation, digits, non-ASCII
    symbols) becomes a single space instead of being deleted outright.
    Deleting them would glue together words that are separated only by
    punctuation, e.g. ``"undang-undang"`` -> ``"undangundang"`` or
    ``"halal,haram"`` -> ``"halalharam"``, which corrupts later
    tokenisation. The result may contain repeated or trailing spaces;
    collapse them with a whitespace-normalisation step afterwards.

    Args:
        text: Value to clean; it is converted with ``str()`` before matching.

    Returns:
        A string containing only ASCII letters and whitespace, or ``''``
        for any falsy input (``None``, ``''``, ``0``).
    """
    if not text:
        return ''
    # Substitute a space (not '') so word boundaries survive the removal
    return re.sub(r'[^a-zA-Z\s]+', ' ', str(text))


def remove_extra_whitespace(text):
    """Collapse whitespace runs into single spaces and trim both ends.

    Args:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        The text with each whitespace run replaced by one space and no
        leading or trailing whitespace, or ``''`` for any falsy input
        (``None``, ``''``, ``0``).
    """
    if not text:
        return ''
    # split() with no separator drops leading/trailing whitespace and
    # treats every whitespace run as one delimiter
    words = str(text).split()
    return ' '.join(words)


def clean_text(text):
    """Run the full cleaning pipeline on a single text value.

    The steps run in this order: remove URLs, mentions, hashtags, emoji,
    digits and special characters, then normalise whitespace, and finally
    lowercase the result (case folding happens last).

    Args:
        text: Raw value to clean.

    Returns:
        The cleaned, lowercased text. Returns ``''`` when the input is
        missing (``pd.isna``), an empty string, or its string form equals
        ``"false"`` (case-insensitive).
    """
    is_blank = pd.isna(text) or text == '' or str(text).lower() == 'false'
    if is_blank:
        return ''

    # Order matters: URLs/mentions/hashtags must go before the generic
    # special-character pass strips the symbols that identify them
    pipeline = (
        remove_url,
        remove_mention,
        remove_hashtag,
        remove_emoji,
        remove_numbers,
        remove_special_chars,
        remove_extra_whitespace,
    )
    cleaned = str(text)
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned.lower()  # case folding at the very end

In [None]:
# Smoke-test the cleaning pipeline on a few representative tweets.
# The emoji in the first sample is written as an escape (U+1F60A, smiling
# face) so it survives re-encoding of this file; the previous literal was
# mojibake of that character and no longer exercised emoji removal.
test_texts = [
    "@user1 Produk ini #halal \U0001F60A cek https://example.com nomor 12345!!!",
    "SERTIFIKASI HALAL sangat PENTING untuk masyarakat Indonesia @MUI",
    "Batas waktu sertifikasi halal 2026 #UMKM #BPJPH https://t.co/abc123",
]

print("Test Fungsi Cleaning:")
print("="*70)
# Show each sample before and after cleaning for a visual check
for text in test_texts:
    result = clean_text(text)
    print(f"\nBefore: {text}")
    print(f"After:  {result}")

## Apply Cleaning ke Dataset

In [None]:
# Keep the untouched tweet text alongside the cleaned version
df['original_text'] = df['full_text'].copy()

# Run the cleaning pipeline on every row
print("Memulai proses cleaning...")
df['cleaned_text'] = df['full_text'].apply(clean_text)
print("Cleaning selesai!")

# Preview up to the first three rows, truncating long texts to 80 characters
print("\nContoh hasil cleaning:")
preview_rows = zip(df['original_text'].iloc[:3], df['cleaned_text'].iloc[:3])
for row_no, (orig, clean) in enumerate(preview_rows, start=1):
    orig, clean = str(orig), str(clean)
    orig_shown = f"{orig[:80]}..." if len(orig) > 80 else orig
    clean_shown = f"{clean[:80]}..." if len(clean) > 80 else clean
    print(f"\n--- Data {row_no} ---")
    print(f"Original: {orig_shown}")
    print(f"Cleaned:  {clean_shown}")

In [None]:
# Drop rows whose cleaned text ended up empty
rows_before_filter = len(df)
print(f"Jumlah data sebelum filter kosong: {rows_before_filter}")
has_text = df['cleaned_text'].str.len() > 0
df = df[has_text]
rows_after_filter = len(df)
print(f"Jumlah data setelah filter kosong: {rows_after_filter}")

In [None]:
# De-duplicate on the cleaned text, keeping the first occurrence of each
rows_before_dedup = len(df)
print(f"Jumlah data sebelum hapus duplikat: {rows_before_dedup}")
df = df.drop_duplicates(subset='cleaned_text', keep='first')
rows_after_dedup = len(df)
print(f"Jumlah data setelah hapus duplikat: {rows_after_dedup}")

In [None]:
# Renumber the rows from 0 and discard the old index values
df = df.reset_index(drop=True)

# Summarise the final dataset
final_rows = len(df)
final_columns = df.columns.tolist()
print("\nDataset Info:")
print(f"Jumlah data final: {final_rows}")
print(f"Kolom: {final_columns}")
df.info()

In [None]:
# Show the first ten rows, original text next to its cleaned counterpart
df.head(10)[['original_text', 'cleaned_text']]

In [None]:
# Make sure the output folder exists before writing
output_path = "data/hasil_cleaning.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the cleaned dataset without the DataFrame index column
df.to_csv(output_path, index=False)
print("Data berhasil disimpan ke data/hasil_cleaning.csv")

## Statistik Cleaning

In [None]:
# Derive per-row length metrics from the cleaned text
cleaned_series = df['cleaned_text']
df['text_length'] = cleaned_series.str.len()              # characters
df['word_count'] = cleaned_series.str.split().str.len()   # whitespace-separated words

# Print descriptive statistics for each metric under its heading
stat_sections = (
    ("Statistik Panjang Teks (karakter):", 'text_length'),
    ("\nStatistik Jumlah Kata:", 'word_count'),
)
for heading, column in stat_sections:
    print(heading)
    print(df[column].describe())