In [3]:
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


In [4]:
# Ganti dengan nama file lo sendiri
data1 = pd.read_csv("dataset_cnn_10k_cleaned.csv")
data2 = pd.read_csv("dataset_kompas_4k_cleaned.csv")
data3 = pd.read_csv("dataset_tempo_6k_cleaned.csv")
data4 = pd.read_csv("dataset_turnbackhoax_10_cleaned.csv")

# Gabung semua dataset
data = pd.concat([data1, data2, data3, data4], ignore_index=True)

print(data.head())
print(data.columns)


   Unnamed: 0                                              Title  \
0           0  Anies di Milad BKMT: Pengajian Menghasilkan Ib...   
1           1  Edy Soal Pilgub Sumut: Kalau yang Maju Abal-ab...   
2           2  PKB Bakal Daftarkan Menaker Ida Fauziyah Jadi ...   
3           3    Gede Pasek Doakan AHY Jadi Capres atau Cawapres   
4           4  PKN Siapkan Jabatan Khusus Buat Anas Urbaningr...   

                       Timestamp  \
0  Selasa, 21 Feb 2023 21:22 WIB   
1  Selasa, 21 Feb 2023 20:46 WIB   
2  Selasa, 21 Feb 2023 20:33 WIB   
3  Selasa, 21 Feb 2023 19:58 WIB   
4  Selasa, 21 Feb 2023 18:56 WIB   

                                            FullText  \
0  Jakarta, CNN Indonesia -- Mantan Gubernur DKI ...   
1  Medan, CNN Indonesia -- Gubernur Sumatera Utar...   
2  Jakarta, CNN Indonesia -- Partai Kebangkitan B...   
3  Jakarta, CNN Indonesia -- Ketua Umum Partai Ke...   
4  Jakarta, CNN Indonesia -- Dewan Pimpinan Pusat...   

                                     

In [5]:
import re
from functools import lru_cache
from tqdm import tqdm
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Setup stemmer & stopwords
stemmer = StemmerFactory().create_stemmer()
stop_factory = StopWordRemoverFactory()
extra_stopwords = ["dengan","bahwa","karena","sudah","juga","akan","untuk"]
stop_words = set(stop_factory.get_stop_words() + extra_stopwords)

# Cache untuk stemming -> hemat waktu banget
@lru_cache(maxsize=200000)
def stem_cache(word):
    return stemmer.stem(word)

# Compile regex sekali (lebih cepat)
url_re = re.compile(r"http\S+|www\S+")
nonalpha_re = re.compile(r"[^a-z\s]")
multi_sp_re = re.compile(r"\s+")

def clean_text_cached(text):
    if pd.isna(text):
        return ""
    s = text.lower()
    s = url_re.sub(" ", s)
    s = nonalpha_re.sub(" ", s)
    s = multi_sp_re.sub(" ", s).strip()
    tokens = [w for w in s.split() if w not in stop_words]
    tokens = [stem_cache(w) for w in tokens]  # stemming pakai cache
    return " ".join(tokens)

# Fungsi apply per chunk biar ga berat
def apply_in_chunks(series, func, chunk=2000):
    out_chunks = []
    for i in tqdm(range(0, len(series), chunk)):
        out = series.iloc[i:i+chunk].apply(func)
        out_chunks.append(out)
    return pd.concat(out_chunks)

# Terapkan ke kolom Narasi
data['clean_text'] = apply_in_chunks(data['Narasi'].astype(str), clean_text_cached, chunk=2000)

# Cek hasil
print(data[['Narasi','clean_text','hoax']].head())


100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [43:15<00:00, 162.23s/it]

  Narasi clean_text  hoax
0    NaN        nan     0
1    NaN        nan     0
2    NaN        nan     0
3    NaN        nan     0
4    NaN        nan     0





In [10]:
vectorizer = TfidfVectorizer(
    ngram_range=(1,3),   # unigram, bigram, trigram
    max_features=15000,  # fitur lebih banyak
    sublinear_tf=True
)

X = vectorizer.fit_transform(data['clean_text'])
y = data['hoax'].values   # ganti dari 'label' ke 'hoax'

print("Shape fitur:", X.shape)
print("Jumlah target:", len(y))
print("Contoh label unik:", data['hoax'].unique())


Shape fitur: (31353, 15000)
Jumlah target: 31353
Contoh label unik: [0 1]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [12]:
model = MultinomialNB(alpha=0.1)  # alpha kecil biar lebih tajam
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=["REAL","HOAX"]))


Accuracy: 0.8721097113697975
              precision    recall  f1-score   support

        REAL       0.84      1.00      0.91      4195
        HOAX       1.00      0.61      0.76      2076

    accuracy                           0.87      6271
   macro avg       0.92      0.81      0.84      6271
weighted avg       0.89      0.87      0.86      6271



In [13]:
def prediksi_berita(teks):
    teks_clean = clean_text(teks)
    fitur = vectorizer.transform([teks_clean])
    probas = model.predict_proba(fitur)[0]
    pred = model.predict(fitur)[0]

    print("Teks:", teks[:100], "...")
    print("Probabilitas => REAL:", probas[0], " | HOAX:", probas[1])
    print("Prediksi:", "HOAX" if pred == 1 else "REAL")
    print("-"*60)
    return "HOAX" if pred == 1 else "REAL"

# Contoh test berita
uji_berita = [
    "Presiden Joko Widodo meresmikan tol baru yang menghubungkan Jakarta–Semarang hari ini...",
    "Badan Meteorologi Klimatologi dan Geofisika (BMKG) mengumumkan potensi hujan deras...",
    "Minum air rebusan kabel listrik dipercaya bisa menyembuhkan penyakit jantung tanpa obat dokter.",
    "Pemerintah akan membagikan uang tunai Rp10 juta kepada semua warga yang memiliki KTP elektronik..."

]

for teks in uji_berita:
    prediksi_berita(teks)


NameError: name 'clean_text' is not defined