In [1]:
# ===== Cell 1: Setup & Cek Environment =====
import os, sys
import platform
import pandas as pd
import numpy as np

# Cek versi paket penting (untuk debugging)
def version_str(pkg):
    try:
        return pkg.__version__
    except Exception:
        return "tidak tersedia"

print("Python:", platform.python_version())
print("OS:", platform.system(), platform.release())
print("pandas:", version_str(pd))
print("numpy :", version_str(np))

# Cek scikit-learn, imbalanced-learn, Sastrawi, tqdm, joblib
sklearn_v = "tidak tersedia"
imblearn_v = "tidak tersedia"
sastrawi_v = "tidak tersedia"
tqdm_v = "tidak tersedia"
joblib_v = "tidak tersedia"

try:
    import sklearn
    sklearn_v = sklearn.__version__
except:
    pass

try:
    import imblearn
    imblearn_v = imblearn.__version__
except:
    pass

try:
    import Sastrawi
    sastrawi_v = "terinstal"
except:
    pass

try:
    from tqdm import tqdm
    tqdm_v = "terinstal"
except:
    pass

try:
    import joblib
    joblib_v = joblib.__version__
except:
    pass

print("scikit-learn:", sklearn_v)
print("imbalanced-learn:", imblearn_v)
print("Sastrawi:", sastrawi_v)
print("tqdm:", tqdm_v)
print("joblib:", joblib_v)

# Tampilkan working dir dan daftar file CSV/XLSX
cwd = os.getcwd()
print("\nWorking directory:", cwd)
files = os.listdir(cwd)
csv_files = [f for f in files if f.lower().endswith((".csv",".xlsx"))]

print(f"\nJumlah file CSV/XLSX di folder: {len(csv_files)}")
for f in csv_files:
    print(" -", f)


Python: 3.12.6
OS: Windows 11
pandas: 2.2.2
numpy : 2.1.1
scikit-learn: 1.5.2
imbalanced-learn: 0.14.0
Sastrawi: terinstal
tqdm: terinstal
joblib: 1.4.2

Working directory: C:\Users\muham\Skripsi

Jumlah file CSV/XLSX di folder: 5
 - dataset_cnn_10k_cleaned.csv
 - dataset_kompas_4k_cleaned.csv
 - dataset_sosmed_hoax.csv
 - dataset_tempo_6k_cleaned.csv
 - dataset_turnbackhoax_10_cleaned.csv


In [2]:
# ===== Cell 2a: Load & Gabungkan Dataset Berita =====
import pandas as pd

# Baca dataset berita
data1 = pd.read_csv("dataset_cnn_10k_cleaned.csv")
data2 = pd.read_csv("dataset_kompas_4k_cleaned.csv")
data3 = pd.read_csv("dataset_tempo_6k_cleaned.csv")
data4 = pd.read_csv("dataset_turnbackhoax_10_cleaned.csv")

# Fungsi standarisasi dataset berita
def standardize_dataset(df, sumber="unknown"):
    # Pilih teks utama
    if "text_new" in df.columns:
        teks = df["text_new"].fillna("")
    elif "FullText" in df.columns:
        teks = df["FullText"].fillna("")
    elif "Narasi" in df.columns:
        teks = df["Narasi"].fillna("")
    elif "text" in df.columns:
        teks = df["text"].fillna("")
    else:
        raise ValueError(f"Tidak ada kolom teks valid di dataset {sumber}")
    
    # Pilih label
    if "hoax" in df.columns:
        label = df["hoax"].astype(int)
    elif "label" in df.columns:
        label = df["label"].astype(int)
    else:
        raise ValueError(f"Tidak ada kolom label di dataset {sumber}")
    
    return pd.DataFrame({
        "text": teks,
        "label": label,
        "sumber": sumber
    })

# Standarisasi semua dataset
datasets_berita = []
datasets_berita.append(standardize_dataset(data1, "CNN"))
datasets_berita.append(standardize_dataset(data2, "Kompas"))
datasets_berita.append(standardize_dataset(data3, "Tempo"))
datasets_berita.append(standardize_dataset(data4, "Turnbackhoax"))

# Gabungkan
data_berita = pd.concat(datasets_berita, ignore_index=True)

# Info dataset gabungan
print("Ukuran dataset berita:", data_berita.shape)
print("\nDistribusi label (0=REAL, 1=HOAX):")
print(data_berita["label"].value_counts())

print("\nContoh data gabungan:")
print(data_berita.sample(5))


Ukuran dataset berita: (31353, 3)

Distribusi label (0=REAL, 1=HOAX):
label
0    20972
1    10381
Name: count, dtype: int64

Contoh data gabungan:
                                                    text  label        sumber
24617  Hasil Periksa Fakta Renanda Dwina Putri (Anggo...      1  Turnbackhoax
26750  Pernyataan bahwa virus Corona baru penyebab Co...      1  Turnbackhoax
10558  Mengapa Hoaks dan Isu PKI Masih Laku untuk Pro...      0        Kompas
7570   Menag: Umrah Desember Jadi Uji Coba untuk Buka...      0           CNN
28119  Screenshot berita yang berjudul ‚ÄúKALAU SAYA ME...      1  Turnbackhoax


In [3]:
# ===== Cell 2b: Load & Standarisasi Dataset Sosmed =====
import pandas as pd

# Baca dataset sosmed
data_sosmed_raw = pd.read_csv("dataset_sosmed_hoax.csv")

# Fungsi standarisasi dataset sosmed
def standardize_sosmed(df, sumber="Twitter"):
    # Pilih teks utama
    if "text" in df.columns:
        teks = df["text"].fillna("")
    else:
        raise ValueError("‚ö†Ô∏è Tidak ada kolom 'text' di dataset sosmed")

    # Pastikan label ada
    if "label" in df.columns:
        label = df["label"].astype(int)
    else:
        raise ValueError("‚ö†Ô∏è Tidak ada kolom 'label' di dataset sosmed")
    
    return pd.DataFrame({
        "text": teks,
        "label": label,
        "sumber": sumber
    })

# Standarisasi dataset sosmed
data_sosmed = standardize_sosmed(data_sosmed_raw)

# Info dataset sosmed
print("Ukuran dataset sosmed:", data_sosmed.shape)
print("\nDistribusi label (0=REAL, 1=HOAX):")
print(data_sosmed["label"].value_counts())

print("\nContoh data sosmed:")
print(data_sosmed.sample(5))


Ukuran dataset sosmed: (1126, 3)

Distribusi label (0=REAL, 1=HOAX):
label
0    563
1    563
Name: count, dtype: int64

Contoh data sosmed:
                                                  text  label   sumber
921  Manusia tidak akan memahami satu sama lain,seb...      1  Twitter
206  Ada yang diperpanjang tapi bukan extra time #ppkm      0  Twitter
99     Hiburan dulu guys #ppkm https://t.co/OVfw4jB530      0  Twitter
397  Diselundupkan saat PPKM Darurat, Rokok Bodong ...      0  Twitter
421  Halo-halo Jakarta @DKIJakarta &amp; Pak Gub @a...      0  Twitter


In [4]:
# ===== Cell 3: Preprocessing untuk Berita & Sosmed =====
import re
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tqdm import tqdm

# Setup stemmer & stopwords
factory = StemmerFactory()
stemmer = factory.create_stemmer()

stop_factory = StopWordRemoverFactory()
extra_stopwords = ["dengan","bahwa","karena","sudah","juga","akan","untuk"]
stop_words = set(stop_factory.get_stop_words() + extra_stopwords)

# Cache buat stemming (biar lebih cepat, ga ulang-ulang)
stem_cache = {}

def clean_text(text):
    if pd.isna(text):
        return ""
    s = text.lower()
    s = re.sub(r"http\S+|www\S+", " ", s)   # hapus URL
    s = re.sub(r"[^a-zA-Z\s]", " ", s)      # hapus angka & simbol
    s = re.sub(r"\s+", " ", s).strip()      # hapus spasi ganda
    
    tokens = [w for w in s.split() if w not in stop_words]
    tokens = [stem_cache[w] if w in stem_cache else stemmer.stem(w) for w in tokens]
    for w in tokens:
        if w not in stem_cache:
            stem_cache[w] = stemmer.stem(w)
    return " ".join(tokens)

# Fungsi apply per chunk (biar ga berat kalau data banyak)
def apply_in_chunks(series, func, chunk=2000):
    out_chunks = []
    for i in tqdm(range(0, len(series), chunk)):
        out = series.iloc[i:i+chunk].apply(func)
        out_chunks.append(out)
    return pd.concat(out_chunks)

# Preprocessing data berita
print("‚ö° Preprocessing dataset berita...")
data_berita['clean_text'] = apply_in_chunks(data_berita['text'].astype(str), clean_text, chunk=2000)

# Preprocessing data sosmed
print("‚ö° Preprocessing dataset sosmed...")
data_sosmed['clean_text'] = apply_in_chunks(data_sosmed['text'].astype(str), clean_text, chunk=2000)

print("‚úÖ Preprocessing selesai!")
print("\nContoh hasil preprocessing data berita:")
print(data_berita[['text','clean_text','label']].head())

print("\nContoh hasil preprocessing data sosmed:")
print(data_sosmed[['text','clean_text','label']].head())


‚ö° Preprocessing dataset berita...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [3:49:14<00:00, 859.65s/it]


‚ö° Preprocessing dataset sosmed...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [02:05<00:00, 125.58s/it]

‚úÖ Preprocessing selesai!

Contoh hasil preprocessing data berita:
                                                text  \
0  Anies di Milad BKMT: Pengajian Menghasilkan Ib...   
1  Edy Soal Pilgub Sumut: Kalau yang Maju Abal-ab...   
2  PKB Bakal Daftarkan Menaker Ida Fauziyah Jadi ...   
3  Gede Pasek Doakan AHY Jadi Capres atau Cawapre...   
4  PKN Siapkan Jabatan Khusus Buat Anas Urbaningr...   

                                          clean_text  label  
0  anies milad bkmt aji hasil ibu ibu tahu mantan...      0  
1  edy soal pilgub sumut kalau maju abal abal pak...      0  
2  pkb bakal daftar menaker ida fauziyah jadi cal...      0  
3  gede pasek doa ahy jadi capres cawapres ketua ...      0  
4  pkn siap jabat khusus buat anas urbaningrum us...      0  

Contoh hasil preprocessing data sosmed:
                                                text  \
0  Aturan 20 Menit Makan di Tempat Tak Terpantau ...   
1       #BeritaTerkini #PPKM https://t.co/sPOewNypIu   
2  Laju penyeb




In [5]:
# ===== Cell 4a: Vectorization & Split untuk Dataset Berita =====
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Vectorizer TF-IDF untuk berita
vectorizer_berita = TfidfVectorizer(
    ngram_range=(1,2),   # unigram + bigram
    max_features=20000,  # ambil max 20k fitur
    sublinear_tf=True
)

# Pakai clean_text dari Cell 3
X_berita = vectorizer_berita.fit_transform(data_berita["clean_text"])
y_berita = data_berita["label"].values

# Ambil nama fitur (kata & bigram)
feature_names = np.array(vectorizer_berita.get_feature_names_out())

# Hitung rata-rata bobot TF-IDF tiap fitur
tfidf_mean = X_berita.mean(axis=0).A1  # ubah ke array 1D

# Ambil 10 fitur dengan nilai TF-IDF tertinggi
top_n = 10
top_idx = np.argsort(tfidf_mean)[-top_n:][::-1]

# Buat tabel
tfidf_table = pd.DataFrame({
    "Kata / Frasa": feature_names[top_idx],
    "Nilai TF-IDF": tfidf_mean[top_idx]
})

tfidf_table


# Split train-test
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_berita, y_berita, test_size=0.2, random_state=42, stratify=y_berita
)

print("=== Dataset Berita ===")
print("Shape X_train:", Xb_train.shape)
print("Shape X_test :", Xb_test.shape)
print("Distribusi train:", dict(zip(*np.unique(yb_train, return_counts=True))))
print("Distribusi test :", dict(zip(*np.unique(yb_test, return_counts=True))))


=== Dataset Berita ===
Shape X_train: (25082, 20000)
Shape X_test : (6271, 20000)
Distribusi train: {np.int64(0): np.int64(16777), np.int64(1): np.int64(8305)}
Distribusi test : {np.int64(0): np.int64(4195), np.int64(1): np.int64(2076)}


In [6]:
# ===== Cell 4a.1: Tabel Contoh Hasil TF-IDF =====
import numpy as np
import pandas as pd
from IPython.display import display

# Ambil nama fitur (kata & bigram)
feature_names = np.array(vectorizer_berita.get_feature_names_out())

# Hitung rata-rata bobot TF-IDF tiap fitur
tfidf_mean = X_berita.mean(axis=0).A1  # konversi sparse ‚Üí array 1D

# Tentukan jumlah kata yang ingin ditampilkan
top_n = 10

# Ambil indeks TF-IDF tertinggi
top_idx = np.argsort(tfidf_mean)[-top_n:][::-1]

# Buat tabel
tfidf_table = pd.DataFrame({
    "Kata / Frasa": feature_names[top_idx],
    "Nilai TF-IDF": tfidf_mean[top_idx]
})

# Tampilkan tabel (WAJIB pakai display)
display(tfidf_table)


Unnamed: 0,Kata / Frasa,Nilai TF-IDF
0,sebut,0.022581
1,kata,0.021812
2,partai,0.019775
3,jadi,0.019381
4,politik,0.017351
5,jokowi,0.016882
6,presiden,0.016825
7,indonesia,0.015083
8,jelas,0.014453
9,laku,0.014391


In [7]:
# ===== Cell 5a: Training Multinomial Naive Bayes untuk Berita =====
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Inisialisasi model
mnb_berita = MultinomialNB()

# Training
mnb_berita.fit(Xb_train, yb_train)

# Prediksi di data test
yb_pred = mnb_berita.predict(Xb_test)

# Evaluasi
print("=== Evaluasi Model (MNB - Berita) ===")
print("Akurasi:", accuracy_score(yb_test, yb_pred))
print("\nClassification Report:")
print(classification_report(yb_test, yb_pred, target_names=["REAL","HOAX"]))
print("\nConfusion Matrix:")
print(confusion_matrix(yb_test, yb_pred))


=== Evaluasi Model (MNB - Berita) ===
Akurasi: 0.9811832243661298

Classification Report:
              precision    recall  f1-score   support

        REAL       0.99      0.99      0.99      4195
        HOAX       0.97      0.97      0.97      2076

    accuracy                           0.98      6271
   macro avg       0.98      0.98      0.98      6271
weighted avg       0.98      0.98      0.98      6271


Confusion Matrix:
[[4137   58]
 [  60 2016]]


In [10]:
# ===== Cell 5a.1: Analisis Probabilitas Kata per Kelas (MNB - Berita) =====
import pandas as pd
import numpy as np

# Ambil nama fitur dari TF-IDF
feature_names = np.array(vectorizer_berita.get_feature_names_out())

# Ambil probabilitas log P(kata|kelas)
log_prob = mnb_berita.feature_log_prob_

# Ubah ke probabilitas asli
prob = np.exp(log_prob)

# Buat DataFrame
df_prob = pd.DataFrame({
    "Kata": feature_names,
    "P(Kata | Real)": prob[0],
    "P(Kata | Hoax)": prob[1]
})

# Tambahkan selisih probabilitas
df_prob["Selisih (Hoax - Real)"] = df_prob["P(Kata | Hoax)"] - df_prob["P(Kata | Real)"]

# Ambil 10 kata paling khas Hoax & Real
top_hoax = df_prob.sort_values("Selisih (Hoax - Real)", ascending=False).head(5)
top_real = df_prob.sort_values("Selisih (Hoax - Real)").head(5)

# Gabungkan
tabel_mnb = pd.concat([top_real, top_hoax]).reset_index(drop=True)

tabel_mnb


Unnamed: 0,Kata,P(Kata | Real),P(Kata | Hoax),Selisih (Hoax - Real)
0,partai,0.002526,0.00015,-0.002376
1,politik,0.002186,0.000178,-0.002009
2,kata,0.002395,0.000793,-0.001602
3,milu,0.001656,0.000107,-0.001549
4,ketua,0.001705,0.00025,-0.001456
5,narasi,5.5e-05,0.002229,0.002174
6,referensi,1.7e-05,0.002074,0.002057
7,konten,3.5e-05,0.002086,0.002051
8,kategori,3.9e-05,0.002065,0.002026
9,sumber,0.000108,0.002118,0.00201


In [9]:
# ===== Cell 4b: Vectorization & Split untuk Sosmed =====
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# Vectorizer TF-IDF untuk sosmed
vectorizer_sosmed = TfidfVectorizer(
    ngram_range=(1,2),   # unigram + bigram
    max_features=20000,  # max 20k fitur
    sublinear_tf=True
)

# Transformasi teks sosmed
Xs = vectorizer_sosmed.fit_transform(data_sosmed["clean_text"])
ys = data_sosmed["label"].values

# Split train-test
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    Xs, ys, test_size=0.2, random_state=42, stratify=ys
)

print("Shape Xs_train:", Xs_train.shape)
print("Shape Xs_test :", Xs_test.shape)
print("Distribusi train:", dict(zip(*np.unique(ys_train, return_counts=True))))
print("Distribusi test :", dict(zip(*np.unique(ys_test, return_counts=True))))


Shape Xs_train: (900, 19466)
Shape Xs_test : (226, 19466)
Distribusi train: {np.int64(0): np.int64(450), np.int64(1): np.int64(450)}
Distribusi test : {np.int64(0): np.int64(113), np.int64(1): np.int64(113)}


In [9]:
# ===== Cell 5b: Training Multinomial Naive Bayes untuk Sosmed =====
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Inisialisasi model MNB khusus sosmed
mnb_sosmed = MultinomialNB()

# Training
mnb_sosmed.fit(Xs_train, ys_train)

# Prediksi di data test
ys_pred = mnb_sosmed.predict(Xs_test)

# Evaluasi
print("=== Evaluasi Model Sosmed (MNB) ===")
print("Akurasi:", accuracy_score(ys_test, ys_pred))
print("\nClassification Report:")
print(classification_report(ys_test, ys_pred, target_names=["REAL","HOAX"]))
print("\nConfusion Matrix:")
print(confusion_matrix(ys_test, ys_pred))


=== Evaluasi Model Sosmed (MNB) ===
Akurasi: 0.8141592920353983

Classification Report:
              precision    recall  f1-score   support

        REAL       0.83      0.79      0.81       113
        HOAX       0.80      0.84      0.82       113

    accuracy                           0.81       226
   macro avg       0.82      0.81      0.81       226
weighted avg       0.82      0.81      0.81       226


Confusion Matrix:
[[89 24]
 [18 95]]


In [10]:
# ===== Cell 6a: Fungsi Prediksi Berita =====

def prediksi_berita(teks, model=mnb_berita, vec=vectorizer_berita):
    teks_clean = clean_text(teks)
    fitur = vec.transform([teks_clean])
    
    probas = model.predict_proba(fitur)[0]
    pred = model.predict(fitur)[0]
    
    print("üì∞ [BERITA]")
    print("Teks (potongan):", teks[:120], "...")
    print("Probabilitas => REAL:", probas[0], " | HOAX:", probas[1])
    print("Prediksi:", "HOAX ‚ùå" if pred == 1 else "REAL ‚úÖ")
    print("-"*80)
    return "HOAX" if pred == 1 else "REAL"

# Contoh tes berita
prediksi_berita("Presiden Joko Widodo meresmikan proyek kereta cepat Jakarta‚ÄìBandung.")


üì∞ [BERITA]
Teks (potongan): Presiden Joko Widodo meresmikan proyek kereta cepat Jakarta‚ÄìBandung. ...
Probabilitas => REAL: 0.7775386911818677  | HOAX: 0.22246130881812903
Prediksi: REAL ‚úÖ
--------------------------------------------------------------------------------


'REAL'

In [11]:
# ===== Cell 6b: Fungsi Prediksi Sosmed =====

def prediksi_sosmed(teks, model=mnb_sosmed, vec=vectorizer_sosmed):
    teks_clean = clean_text(teks)
    fitur = vec.transform([teks_clean])
    
    probas = model.predict_proba(fitur)[0]
    pred = model.predict(fitur)[0]
    
    print("üì± [SOSMED]")
    print("Teks (potongan):", teks[:120], "...")
    print("Probabilitas => REAL:", probas[0], " | HOAX:", probas[1])
    print("Prediksi:", "HOAX ‚ùå" if pred == 1 else "REAL ‚úÖ")
    print("-"*80)
    return "HOAX" if pred == 1 else "REAL"

# Contoh tes sosmed
prediksi_sosmed("Minum air rebusan kabel listrik terbukti ampuh menyembuhkan segala penyakit, segera sebarkan!")


üì± [SOSMED]
Teks (potongan): Minum air rebusan kabel listrik terbukti ampuh menyembuhkan segala penyakit, segera sebarkan! ...
Probabilitas => REAL: 0.3772036056280858  | HOAX: 0.6227963943719159
Prediksi: HOAX ‚ùå
--------------------------------------------------------------------------------


'HOAX'

In [11]:
# ===== Cell 7: Simpan Model & Vectorizer =====
import joblib

# Simpan model berita & vectorizer
joblib.dump(mnb_berita, "model_hoax_berita.pkl")
joblib.dump(vectorizer_berita, "tfidf_vectorizer_berita.pkl")

# Simpan model sosmed & vectorizer
joblib.dump(mnb_sosmed, "model_hoax_sosmed.pkl")
joblib.dump(vectorizer_sosmed, "tfidf_vectorizer_sosmed.pkl")

print("‚úÖ Semua model & vectorizer berhasil disimpan!")
print(" - model_hoax_berita.pkl")
print(" - tfidf_vectorizer_berita.pkl")
print(" - model_hoax_sosmed.pkl")
print(" - tfidf_vectorizer_sosmed.pkl")


‚úÖ Semua model & vectorizer berhasil disimpan!
 - model_hoax_berita.pkl
 - tfidf_vectorizer_berita.pkl
 - model_hoax_sosmed.pkl
 - tfidf_vectorizer_sosmed.pkl


In [12]:
# ===== Cell 8: Load Model & Vectorizer =====
import joblib

# Load model & vectorizer berita
mnb_berita_loaded = joblib.load("model_hoax_berita.pkl")
vectorizer_berita_loaded = joblib.load("tfidf_vectorizer_berita.pkl")

# Load model & vectorizer sosmed
mnb_sosmed_loaded = joblib.load("model_hoax_sosmed.pkl")
vectorizer_sosmed_loaded = joblib.load("tfidf_vectorizer_sosmed.pkl")

print("‚úÖ Semua model & vectorizer berhasil diload kembali!")


‚úÖ Semua model & vectorizer berhasil diload kembali!


In [13]:
# ===== Cell 9: Fungsi Prediksi dengan Model yang Dilooad =====

def prediksi_berita_loaded(teks):
    teks_clean = clean_text(teks)  # pakai fungsi preprocessing Cell 3
    fitur = vectorizer_berita_loaded.transform([teks_clean])
    pred = mnb_berita_loaded.predict(fitur)[0]
    return "HOAX ‚ùå" if pred == 1 else "REAL ‚úÖ"

def prediksi_sosmed_loaded(teks):
    teks_clean = clean_text(teks)
    fitur = vectorizer_sosmed_loaded.transform([teks_clean])
    pred = mnb_sosmed_loaded.predict(fitur)[0]
    return "HOAX ‚ùå" if pred == 1 else "REAL ‚úÖ"

# Coba tes
print(prediksi_berita_loaded("Presiden Jokowi meresmikan kereta cepat Jakarta-Bandung"))
print(prediksi_sosmed_loaded("awas jangan sampai viral vaksin ini langsung sembuh katanya"))


REAL ‚úÖ
HOAX ‚ùå
