# ***ANALISIS SENTIMEN DATA SERTIFIKASI HALAL***
# ***DENGAN PENERAPAN K-MEANS CLUSTERING***

# ***LOAD DATA***

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the halal-certification dataset from CSV and take a first look at it.
data = pd.read_csv('data/dataSertifikasiHalal.csv')
data.info()  # prints the column summary of the raw frame
data.head(5)  # last expression of the cell: displays the first 5 rows

In [None]:
# Parse 'created_at' into datetimes (safe even if the column is still text).
created_at = pd.to_datetime(data['created_at'])
data['created_at'] = created_at

# Split the timestamp into a calendar-date column and a time-of-day column.
data['tanggal'] = created_at.dt.date
data['waktu'] = created_at.dt.time

data.head()

In [None]:
# Build the working frame: keep only the needed source columns (dict keys)
# and rename them to the shorter names used downstream (dict values).
column_map = {
    'tanggal': 'tanggal',
    'waktu': 'waktu',
    'full_text': 'content',
    'favorite_count': 'likes',
    'retweet_count': 'retweets',
    'username': 'username',
}
df = pd.DataFrame(data[list(column_map)])
df.columns = list(column_map.values())
df.head(5)

# ***PREPROCESSING DATA***

**PROSES HAPUS DATA DUPLIKAT**

---

In [None]:
# Inspect the working frame before duplicates are removed.
df.info()
df.head()

In [None]:
# Count the rows whose 'content' occurs more than once; keep=False flags
# every copy, including the first occurrence.
duplicate_mask = df.duplicated(subset='content', keep=False)
data_duplikat = df[duplicate_mask]
print(f'Jumlah data duplikat: {len(data_duplikat)}')
data_duplikat.head()

In [None]:
# Remove duplicate rows, keeping the first occurrence of each 'content' value.
df = df.drop_duplicates(subset='content', keep='first')
print(f'Jumlah data setelah hapus duplikat: {len(df)}')
df.info()
df.head()

In [None]:
# Remove rows whose 'content' is missing.
df = df.dropna(subset=['content'])
print(f'Jumlah data setelah hapus null: {len(df)}')
df.info()

**PROSES CASE FOLDING (lowercase)**

---

In [None]:
# Case folding: convert every letter of the text to lowercase.
df['content_lower'] = df['content'].str.lower()
df[['content', 'content_lower']].head()  # compare before/after

**PROSES CLEANING (Hapus URL, mention, hashtag, angka, tanda baca)**

---

In [None]:
import re
import string

# Patterns are compiled once because clean_text runs on every row.
_URL_RE = re.compile(r'http\S+|www\S+')    # 'http\S+' already covers https
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_DIGIT_RE = re.compile(r'\d+')
_SYMBOL_RE = re.compile(r'[^\w\s]|_')      # punctuation, symbols, underscore
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Strip Twitter noise from one piece of text.

    Removes URLs, @mentions, #hashtags and digits, replaces punctuation and
    other non-word symbols with a space, then collapses all whitespace
    (including newlines and tabs) to single spaces.

    Args:
        text: The raw text; a missing value (NaN/None) is accepted.

    Returns:
        The cleaned string, or "" for a missing value.
    """
    if pd.isna(text):
        return ""

    # Non-string cells (e.g. numbers) would otherwise crash the regex calls.
    text = str(text)

    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = _DIGIT_RE.sub('', text)

    # Substitute a space rather than deleting, so that words joined only by
    # punctuation ("halal,haram") are not fused into one token.
    text = _SYMBOL_RE.sub(' ', text)

    # Newlines and tabs count as whitespace, so one pass normalises them too.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean the lowercased text row by row and compare it with the raw content.
df['content_clean'] = df['content_lower'].apply(clean_text)
df[['content', 'content_clean']].head()

**PROSES TOKENIZING**

---

In [None]:
import nltk
# Check every required NLTK resource on its own and download only the ones
# that are missing. Probing just 'punkt' would skip the other downloads
# whenever 'punkt' happens to be installed already.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """Split cleaned text into word tokens; missing or empty text gives []."""
    is_blank = pd.isna(text) or text == ""
    return [] if is_blank else word_tokenize(text)

# Tokenize the cleaned text of every row into a list of words.
df['content_tokens'] = df['content_clean'].apply(tokenize_text)
df[['content_clean', 'content_tokens']].head()

**PROSES STOPWORD REMOVAL**

---

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords

# Combine the Indonesian stopword lists from NLTK and Sastrawi.
stop_words_nltk = set(stopwords.words('indonesian'))
stop_factory = StopWordRemoverFactory()
stop_words_sastrawi = set(stop_factory.get_stop_words())

# Custom stopwords for the halal-certification context: common function
# words, informal particles/abbreviations and Twitter residue ('rt', 'amp').
# NOTE(review): 'tidak' is removed here, yet it is also listed in
# kata_negatif and the lexicon labelling runs on the stopword-filtered text,
# so that lexicon entry cannot match as a standalone word - confirm intent.
custom_stopwords = {
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk', 'dengan',
    'adalah', 'pada', 'juga', 'tidak', 'ada', 'akan', 'bisa', 'sudah',
    'ya', 'ga', 'gak', 'ngga', 'nggak', 'aja', 'saja', 'kan', 'dong',
    'sih', 'nih', 'tuh', 'deh', 'yuk', 'yg', 'dgn', 'utk', 'dlm', 'krn',
    'rt', 'amp', 'via'
}

# Union of the three sources; used for O(1) membership tests per token.
all_stopwords = stop_words_nltk.union(stop_words_sastrawi).union(custom_stopwords)

def remove_stopwords(tokens):
    """Return the tokens that are not stopwords and are longer than 2 chars."""
    kept = []
    for token in tokens:
        if token in all_stopwords:
            continue
        if len(token) > 2:
            kept.append(token)
    return kept

# Filter stopwords and very short tokens out of every token list.
df['content_no_stopword'] = df['content_tokens'].apply(remove_stopwords)
df[['content_tokens', 'content_no_stopword']].head()

**PROSES STEMMING**

---

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Tell the user up front: the message says stemming takes a while.
print("Memulai proses stemming (ini membutuhkan waktu)...")

# Build one Sastrawi stemmer and reuse it for every token.
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

def stem_tokens(tokens):
    """Stem each token with the shared Sastrawi stemmer, preserving order."""
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

# Stem every remaining token and compare with the pre-stemming lists.
df['content_stemmed'] = df['content_no_stopword'].apply(stem_tokens)
df[['content_no_stopword', 'content_stemmed']].head()

In [None]:
# Join the stemmed tokens back into one space-separated string per row.
df['content_final'] = df['content_stemmed'].apply(' '.join)

# Drop rows that preprocessing left empty. `.copy()` turns the filtered
# slice into an independent frame, so the columns assigned to `df` later
# are written to a real DataFrame rather than to a view-like slice.
df = df[df['content_final'].str.len() > 0].copy()
print(f"Data setelah preprocessing: {len(df)} baris")

df[['content', 'content_final']].head()

# ***PELABELAN DATA SENTIMEN***

Karena data tweet tidak memiliki rating/score seperti data Play Store, kita akan melakukan pelabelan sentimen menggunakan:
1. Lexicon-based labeling (kata kunci positif/negatif)
2. Validasi dengan K-Means Clustering

In [None]:
# Seed lexicon of positive and negative words for the initial labelling.
# NOTE(review): 'halal' and 'sertifikat' probably occur in most rows of a
# halal-certification corpus, which would push labels towards 'Positif';
# affixed entries such as 'berkualitas' may also never match stemmed text.
# Confirm both against the data.
kata_positif = [
    'bagus', 'baik', 'senang', 'puas', 'mantap', 'keren', 'hebat',
    'excellent', 'good', 'great', 'aman', 'terjamin', 'percaya',
    'halal', 'berkualitas', 'resmi', 'sertifikat', 'jelas', 'setuju',
    'dukung', 'support', 'wajib', 'penting', 'benar', 'betul'
]

kata_negatif = [
    'buruk', 'jelek', 'kecewa', 'marah', 'kesal', 'tidak', 'gagal',
    'ribet', 'susah', 'lama', 'mahal', 'curang', 'haram', 'palsu',
    'bohong', 'tipu', 'masalah', 'komplain', 'protes', 'tolak',
    'salah', 'kontroversi', 'konstitusi', 'diskriminasi', 'babi'
]


def label_sentimen(text):
    """Label one text as 'Positif', 'Negatif' or 'Netral' from the lexicon.

    Counts how many distinct lexicon words of each polarity occur in the
    text as whole, whitespace-separated tokens and returns the polarity
    with the larger count.

    Args:
        text: Space-separated text; a missing value (NaN/None) is accepted.

    Returns:
        'Positif', 'Negatif', or 'Netral' on a tie or a missing value.
    """
    if pd.isna(text):
        return 'Netral'

    # Compare whole tokens, not substrings: 'aman' must not match 'zaman',
    # and 'masalah' must not also be counted as 'salah'.
    tokens = set(str(text).lower().split())
    pos_count = sum(1 for kata in kata_positif if kata in tokens)
    neg_count = sum(1 for kata in kata_negatif if kata in tokens)

    if pos_count > neg_count:
        return 'Positif'
    if neg_count > pos_count:
        return 'Negatif'
    return 'Netral'

# Label every preprocessed text with the lexicon rule.
df['sentimen_lexicon'] = df['content_final'].apply(label_sentimen)

# Show how many rows received each label.
print("Distribusi Sentimen (Lexicon-based):")
print(df['sentimen_lexicon'].value_counts())

In [None]:
# Bar chart of the lexicon-based sentiment distribution.
import os

import matplotlib.pyplot as plt

# savefig raises if the target folder does not exist yet.
os.makedirs('output', exist_ok=True)

sentimen_counts = df['sentimen_lexicon'].value_counts()

# value_counts() orders labels by frequency, so colours are looked up per
# label; a positional colour list would paint whichever label is most
# frequent green, regardless of its sentiment.
warna_sentimen = {'Positif': '#6bcb77', 'Netral': '#ffd93d', 'Negatif': '#ff6b6b'}
colors = [warna_sentimen.get(label, '#b0b0b0') for label in sentimen_counts.index]

plt.figure(figsize=(10, 6))
plt.bar(sentimen_counts.index, sentimen_counts.values, color=colors, edgecolor='black')
plt.xlabel('Sentimen', fontsize=12)
plt.ylabel('Jumlah', fontsize=12)
plt.title('Distribusi Sentimen (Lexicon-based)\nData Sertifikasi Halal', fontsize=14, fontweight='bold')

# Write each count just above its bar.
for i, v in enumerate(sentimen_counts.values):
    plt.text(i, v + 10, str(v), ha='center', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('output/distribusi_sentimen_lexicon.png', dpi=300, bbox_inches='tight')
plt.show()

# ***TF-IDF VECTORIZATION***

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer over unigrams and bigrams, capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(df['content_final'])

print(f"TF-IDF Matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")

# Show the top features ranked by total TF-IDF weight over the corpus.
# get_feature_names_out() is ordered by feature index, not by weight, so
# slicing its first 20 entries would not give the "top" terms.
feature_names = tfidf.get_feature_names_out()
feature_weights = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
top_feature_idx = np.argsort(feature_weights)[::-1][:20]
print(f"\nTop 20 features: {feature_names[top_feature_idx].tolist()}")

# ***PENERAPAN K-MEANS CLUSTERING***

**MENENTUKAN JUMLAH CLUSTER OPTIMAL (Elbow & Silhouette)**

---

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sweep K and record, for each value, the inertia (elbow method) and the
# silhouette score of a K-Means fit on the TF-IDF matrix.
K_range = range(2, 11)
inertias, silhouette_scores = [], []

print("Menghitung inertia dan silhouette score untuk K=2 hingga K=10...")
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(tfidf_matrix)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(tfidf_matrix, kmeans.labels_))
    print(f"K={k}: Inertia={kmeans.inertia_:.2f}, Silhouette={silhouette_scores[-1]:.4f}")

In [None]:
# Plot the elbow curve and the silhouette curve side by side.
import os

# savefig raises if the target folder does not exist yet.
os.makedirs('output', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Elbow plot: inertia for each K.
axes[0].plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
axes[0].set_xlabel('Jumlah Cluster (K)', fontsize=12)
axes[0].set_ylabel('Inertia (SSE)', fontsize=12)
axes[0].set_title('Elbow Method', fontsize=14)
axes[0].grid(True, alpha=0.3)

# Silhouette plot: silhouette score for each K.
axes[1].plot(K_range, silhouette_scores, 'go-', linewidth=2, markersize=8)
axes[1].set_xlabel('Jumlah Cluster (K)', fontsize=12)
axes[1].set_ylabel('Silhouette Score', fontsize=12)
axes[1].set_title('Silhouette Score', fontsize=14)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('output/elbow_silhouette_kmeans.png', dpi=300, bbox_inches='tight')
plt.show()

# Report the K with the highest silhouette score.
optimal_k = list(K_range)[np.argmax(silhouette_scores)]
print(f"\nK optimal berdasarkan Silhouette Score: {optimal_k}")

**CLUSTERING DENGAN K=3 (Negatif, Netral, Positif)**

---

In [None]:
# Use K=3 so the clusters can be mapped to Negatif / Netral / Positif.
# K is fixed here; it is not taken from the silhouette-based optimal_k.
K = 3
print(f"Menggunakan K={K} untuk clustering sentimen (Negatif, Netral, Positif)")

# Fit the final model and store each row's cluster id on the frame.
kmeans_final = KMeans(n_clusters=K, random_state=42, n_init=10)
df['cluster'] = kmeans_final.fit_predict(tfidf_matrix)

# Cluster sizes, ordered by cluster id.
print(f"\nDistribusi Cluster:")
print(df['cluster'].value_counts().sort_index())

In [None]:
# Labeling sentimen berdasarkan cluster
def hitung_skor_cluster(cluster_id):
    """Return the net lexicon score of one cluster, averaged per document.

    Counts every whole-token occurrence of the positive and negative lexicon
    words in the cluster's 'content_final' texts and returns
    (positive hits - negative hits) / number of documents. Testing for
    substring presence in the concatenated text would cap each word at one
    hit and match partial words, so clusters would be hard to tell apart.

    Args:
        cluster_id: Value of df['cluster'] that selects the cluster.

    Returns:
        The average net score as a float; 0.0 for an empty cluster.
    """
    from collections import Counter  # local import keeps the cell standalone

    texts = df.loc[df['cluster'] == cluster_id, 'content_final']
    if len(texts) == 0:
        return 0.0

    token_counts = Counter(
        token for text in texts for token in str(text).split()
    )
    positif_count = sum(token_counts[kata] for kata in kata_positif)
    negatif_count = sum(token_counts[kata] for kata in kata_negatif)

    # Normalise by cluster size so a large cluster does not win on volume.
    return (positif_count - negatif_count) / len(texts)

# Score every cluster with the lexicon.
cluster_scores = {}
for cluster_id in range(K):
    cluster_scores[cluster_id] = hitung_skor_cluster(cluster_id)

print("Skor sentimen per cluster:")
for cluster_id, score in cluster_scores.items():
    print(f"  Cluster {cluster_id}: {score}")

# Rank clusters from lowest to highest score and hand out the labels in
# that order: lowest score -> 'Negatif', highest -> 'Positif'.
labels = ['Negatif', 'Netral', 'Positif']
sorted_clusters = sorted(cluster_scores.items(), key=lambda item: item[1])
label_mapping = {
    cluster_id: labels[rank]
    for rank, (cluster_id, _score) in enumerate(sorted_clusters)
}

print(f"\nLabel Mapping: {label_mapping}")

# Translate each row's cluster id into its sentiment label.
df['sentimen_kmeans'] = df['cluster'].map(label_mapping)

print("\nDistribusi Sentimen (K-Means):")
print(df['sentimen_kmeans'].value_counts())

In [None]:
# Pie chart of the K-Means sentiment distribution.
import os

# savefig raises if the target folder does not exist yet.
os.makedirs('output', exist_ok=True)

sentimen_counts = df['sentimen_kmeans'].value_counts()

# value_counts() orders labels by frequency, so colours are looked up per
# label; a positional colour list would paint whichever label is most
# frequent red, regardless of its sentiment.
warna_sentimen = {'Negatif': '#ff6b6b', 'Netral': '#ffd93d', 'Positif': '#6bcb77'}
colors = [warna_sentimen.get(label, '#b0b0b0') for label in sentimen_counts.index]

plt.figure(figsize=(10, 8))
plt.pie(sentimen_counts.values,
        labels=sentimen_counts.index,
        autopct='%1.1f%%',
        colors=colors,
        explode=[0.02] * len(sentimen_counts),
        shadow=True,
        startangle=90)

plt.title('Distribusi Sentimen Data Sertifikasi Halal\n(K-Means Clustering)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('output/distribusi_sentimen_kmeans.png', dpi=300, bbox_inches='tight')
plt.show()

# ***WORDCLOUD***

In [None]:
from wordcloud import WordCloud

# Fix for numpy 2.0 compatibility: wrap np.asarray so a `copy=` keyword
# passed by a caller is dropped instead of reaching numpy.
# NOTE(review): this silently ignores `copy` for every np.asarray caller in
# the session, not just wordcloud.

# Save the real np.asarray only once. If the cell is re-run, np.asarray is
# already the wrapper; saving it again would make the wrapper call itself
# and recurse without end.
if not hasattr(np, 'asarray_orig'):
    np.asarray_orig = np.asarray


def asarray_fix(*args, **kwargs):
    """Call the original np.asarray after discarding any `copy` keyword."""
    kwargs.pop('copy', None)
    return np.asarray_orig(*args, **kwargs)


np.asarray = asarray_fix

def generate_wordcloud(text, title, color, filename):
    """Render a word cloud for `text`, show it and save it under output/.

    Args:
        text: Space-separated words to draw; passed to WordCloud.generate.
        title: Title shown above the figure.
        color: Matplotlib colormap name used for the words.
        filename: File name (not a path) of the PNG written to 'output/'.
    """
    import os  # local import keeps the function self-contained

    # savefig raises if the target folder does not exist yet.
    os.makedirs('output', exist_ok=True)

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=color,
        max_words=100,
        min_font_size=10,
        max_font_size=100,
        random_state=42  # fixed seed so the layout is reproducible
    ).generate(text)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.savefig(f'output/{filename}', dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Word cloud disimpan: output/{filename}")

In [None]:
# Word cloud over the whole corpus: concatenate every preprocessed text.
all_text = ' '.join(df['content_final'].dropna().astype(str))
generate_wordcloud(all_text, 'Word Cloud - Semua Data Sertifikasi Halal', 'viridis', 'wordcloud_all.png')

In [None]:
# Word Cloud per sentimen
sentimen_colors = {
    'Negatif': 'Reds',
    'Netral': 'Greys',
    'Positif': 'Greens'
}

for sentimen, color in sentimen_colors.items():
    text = ' '.join(df[df['sentimen_kmeans'] == sentimen]['content_final'].dropna().astype(str))
    if text.strip():
        generate_wordcloud(
            text, 
            f'Word Cloud - Sentimen {sentimen}', 
            color, 
            f'wordcloud_{sentimen.lower()}.png'
        )

# ***FREKUENSI KATA***

In [None]:
from collections import Counter

def get_top_words(df, sentimen, n=20):
    """Return the n most frequent words among rows labelled `sentimen`.

    Rows are selected on 'sentimen_kmeans'; words come from splitting the
    non-null 'content_final' texts on whitespace. The result is a list of
    (word, count) pairs, most frequent first.
    """
    selected = df.loc[df['sentimen_kmeans'] == sentimen, 'content_final']
    tally = Counter()
    for doc in selected.dropna().astype(str):
        tally.update(doc.split())
    return tally.most_common(n)

# Print a ranked table of the 15 most frequent words for each sentiment.
for sentimen in ['Positif', 'Netral', 'Negatif']:
    print(f"\n{'='*50}")
    print(f"Top 15 Kata - Sentimen {sentimen}:")
    print(f"{'='*50}")
    top_words = get_top_words(df, sentimen, 15)
    # Rank starts at 1; the word is padded to 20 columns for alignment.
    for i, (word, count) in enumerate(top_words, 1):
        print(f"  {i:2}. {word:20} : {count}")

In [None]:
# Visualisasi Top 10 Kata per Sentimen
import os

# savefig raises if the target folder does not exist yet.
os.makedirs('output', exist_ok=True)

# One horizontal bar chart per sentiment, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

colors_bar = {'Positif': '#6bcb77', 'Netral': '#ffd93d', 'Negatif': '#ff6b6b'}

for idx, sentimen in enumerate(['Positif', 'Netral', 'Negatif']):
    top_words = get_top_words(df, sentimen, 10)
    words = [w[0] for w in top_words]
    counts = [w[1] for w in top_words]

    # Reverse both lists so the most frequent word is drawn at the top.
    axes[idx].barh(words[::-1], counts[::-1], color=colors_bar[sentimen], edgecolor='black')
    axes[idx].set_xlabel('Frekuensi', fontsize=11)
    axes[idx].set_title(f'Top 10 Kata - {sentimen}', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('output/frekuensi_kata_per_sentimen.png', dpi=300, bbox_inches='tight')
plt.show()

# ***SIMPAN HASIL***

In [None]:
# Simpan hasil ke CSV
import os

# to_csv raises if the target folder does not exist yet.
os.makedirs('output', exist_ok=True)

# Export the original text, the preprocessed text and both sentiment labels.
output_df = df[['tanggal', 'waktu', 'content', 'username', 'content_final', 'cluster', 'sentimen_lexicon', 'sentimen_kmeans']]
output_df.to_csv('output/hasil_analisis_sentimen_sertifikasi_halal.csv', index=False, encoding='utf-8-sig')
print("Hasil disimpan ke: output/hasil_analisis_sentimen_sertifikasi_halal.csv")

In [None]:
# Ringkasan
# Final summary: row counts before/after preprocessing, the number of
# clusters, the silhouette score of the final model and the label counts.
print("="*60)
print("RINGKASAN HASIL ANALISIS")
print("="*60)
print(f"Total Data Awal: {len(data)}")
print(f"Total Data Setelah Preprocessing: {len(df)}")
print(f"Jumlah Cluster: {K}")
# The silhouette score is recomputed here from the final model's labels.
print(f"Silhouette Score: {silhouette_score(tfidf_matrix, kmeans_final.labels_):.4f}")
print(f"\nDistribusi Sentimen (K-Means):")
print(df['sentimen_kmeans'].value_counts())
print("\n" + "="*60)
print("ANALISIS SELESAI!")
print("="*60)