In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pickle # To save model

In [156]:
# Random seed for consistency
np.random.seed(42)

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ikhsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [157]:
# Label Encoder use to Encode target labels with value between 0 and n_classes-1
Encoder = LabelEncoder()

In [158]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [159]:
# TfidfVectorizer Convert a collection of raw documents to a matrix of TF-IDF features.
Tfidf_vect = TfidfVectorizer()

In [160]:
data = pd.read_excel("dataset/dataset_berita.xlsx")
data.head(5)
data['label'].value_counts()

label
1    3464
0    1017
Name: count, dtype: int64

In [201]:
data2 = pd.read_csv("dataset/clean.csv")
data2.head(5)
data2['label'].value_counts()

label
1    4382
0    2693
Name: count, dtype: int64

In [209]:
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data2)

In [227]:
df1.head(5)

Unnamed: 0,ID,judul,narasi,label
0,71,Pemakaian Masker Menyebabkan Penyakit Legionna...,A caller to a radio talk show recently shared ...,0
1,461,Instruksi Gubernur Jateng tentang penilangan ...,Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...,1
2,495,Foto Jim Rohn: Jokowi adalah presiden terbaik ...,Jokowi adalah presiden terbaik dlm sejarah ban...,1
3,550,"ini bukan politik, tapi kenyataan Pak Jokowi b...","Maaf Mas2 dan Mbak2, ini bukan politik, tapi k...",1
4,681,Foto Kadrun kalo lihat foto ini panas dingin,Kadrun kalo lihat foto ini panas dingin . .,1


In [261]:
dropcol = df1.drop(["ID",'judul'], axis='columns')

In [253]:
deleted = df2.drop('ntoken',axis='columns')

In [255]:
clean = deleted.rename(columns={"text":"narasi"})

In [257]:
clean.head(5)

Unnamed: 0,narasi,label
0,Hakim Wahyu Iman Santoso Alami Kecelakaan Tung...,1
1,MEGAWATI DAN PUAN BERMAIN SLOT Nenek lampir pe...,1
2,JONATHAN LATUMAHINA SEORANG NASRANI DAN PENYUS...,1
3,"PDI-P Diblacklist dari Peserta Pilpres, Tak Bi...",1
4,Presiden Joe Biden dan Volodymyr Zelenskyy Ber...,1


In [266]:
frames = [dropcol,clean]

In [272]:
df = pd.concat(frames)
df['label'].value_counts()

label
1    7846
0    3710
Name: count, dtype: int64

In [274]:
feature = df['narasi']
label = df['label']

In [276]:
feature

0       A caller to a radio talk show recently shared ...
1       Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...
2       Jokowi adalah presiden terbaik dlm sejarah ban...
3       Maaf Mas2 dan Mbak2, ini bukan politik, tapi k...
4             Kadrun kalo lihat foto ini panas dingin . .
                              ...                        
7070    Fajar Alfian dan Muhammad Rian Ardianto berhas...
7071    Bambang Sidik Achmadi, mantan Kasat Samapta Po...
7072    Kapal MT Kristin milik Pertamina mengalami keb...
7073    Pencegahan Penyakit yang dipicu oleh cacing in...
7074    bus yang mengangkut Calon Jamaah Haji asal Ind...
Name: narasi, Length: 11556, dtype: object

In [278]:
label

0       0
1       1
2       1
3       1
4       1
       ..
7070    0
7071    0
7072    0
7073    0
7074    1
Name: label, Length: 11556, dtype: int64

In [280]:
# melakukan stemming pada setiap baris
lower = [stemmer.stem(row.lower()) for row in feature]

# Hasil stem dan lower
lower[:5]

['a caller to a radio talk show recently shared that his wife was hospitalized n told she had covid n only a couple of days left to live a doctor friend suggested she be tested for legionnaires disease because she wore the same mask every day all day long turns out it was legionnaires disease from the moisture n bacteria in her mask she was given antibiotics n within two days was better what if these spikes in covid are really something else due to mask induced infections',
 'yth seluruh anggota grup sesuai instruksi gubernur jawa tengah hasil rapat tim gugus tugas covid 19 jateng sbb',
 'jokowi adalah presiden baik dlm sejarah bangsa indonesia jim rohn motivator baik dunia',
 'maaf mas2 dan mbak2 ini bukan politik tapi nyata pak jokowi hasil pulang 11 000 triliun uang negara dari swiss 11ribu triliun siap di bawa balik ke indonesia ruu treaty on mutual legal assistance in criminal matters between the republic of indonesia and the swiss confederation mla ri-swiss resmi sah dpr ri tangg

In [282]:
# Melakukan tokenisasi untuk setiap baris dataset
tokens = [word_tokenize(element) for element in lower]

# Hasil tokenisasi
tokens[:5]

[['a',
  'caller',
  'to',
  'a',
  'radio',
  'talk',
  'show',
  'recently',
  'shared',
  'that',
  'his',
  'wife',
  'was',
  'hospitalized',
  'n',
  'told',
  'she',
  'had',
  'covid',
  'n',
  'only',
  'a',
  'couple',
  'of',
  'days',
  'left',
  'to',
  'live',
  'a',
  'doctor',
  'friend',
  'suggested',
  'she',
  'be',
  'tested',
  'for',
  'legionnaires',
  'disease',
  'because',
  'she',
  'wore',
  'the',
  'same',
  'mask',
  'every',
  'day',
  'all',
  'day',
  'long',
  'turns',
  'out',
  'it',
  'was',
  'legionnaires',
  'disease',
  'from',
  'the',
  'moisture',
  'n',
  'bacteria',
  'in',
  'her',
  'mask',
  'she',
  'was',
  'given',
  'antibiotics',
  'n',
  'within',
  'two',
  'days',
  'was',
  'better',
  'what',
  'if',
  'these',
  'spikes',
  'in',
  'covid',
  'are',
  'really',
  'something',
  'else',
  'due',
  'to',
  'mask',
  'induced',
  'infections'],
 ['yth',
  'seluruh',
  'anggota',
  'grup',
  'sesuai',
  'instruksi',
  'gubernur'

In [290]:
print(tokens[2])

['jokowi', 'adalah', 'presiden', 'baik', 'dlm', 'sejarah', 'bangsa', 'indonesia', 'jim', 'rohn', 'motivator', 'baik', 'dunia']


In [292]:
X_train, X_test, y_train, y_test = train_test_split(tokens, label, test_size=0.2, stratify=label)

In [294]:
X_train

[['kristen', 'ortodoks', 'syiria'],
 ['mohon',
  'bisa',
  'terus',
  'ke',
  'para',
  'ortu',
  'dari',
  'dir',
  'dik',
  'bapak',
  'ibu',
  'untuk',
  'kita',
  'tahu',
  'sama',
  'menteri',
  'didik',
  'dan',
  'budaya',
  'kemendikbud',
  'lalu',
  'kanal',
  'sahabat',
  'keluarga',
  'keluar',
  'daftar',
  '16',
  'game',
  'yg',
  'anggap',
  'bahaya',
  'bagi',
  'anak',
  'yaitu'],
 ['orang',
  'wanita',
  'nama',
  'ayu',
  'indraswari',
  'jadi',
  'korban',
  'bunuh',
  'dan',
  'mutilasi',
  'di',
  'wisma',
  'jalan',
  'kaliurang',
  'yogyakarta',
  'oleh',
  'laku',
  'inisial',
  'hp',
  'alias',
  'p',
  'laku',
  'dan',
  'korban',
  'belum',
  'sempat',
  'hubung',
  'badan',
  'laku',
  'sali',
  'buat',
  'dan',
  'minta',
  'maaf',
  'lalu',
  'surat',
  'sesal',
  'laku',
  'hasil',
  'tangkap',
  'oleh',
  'polisi',
  'telah',
  'lari',
  'diri',
  'dan',
  'jual',
  'salah',
  'satu',
  'handphone',
  'milik',
  'korban'],
 ['area',
  'proyek',
  'banda

In [296]:
len(X_train)

9244

In [298]:
len(X_test)

2312

In [300]:
# Encoder for Data Label
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

y_train

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [302]:
# Fitting dataset terhadap tf-idf
Tfidf_vect.fit(["".join(row) for row in X_train])

In [304]:
# Save the vectorizer using pickle
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(Tfidf_vect, file)

In [306]:
# Mentransformasikan hasil fitting terhadap data X_train dan X_test
X_train_Tfidf = Tfidf_vect.transform([" ".join(row) for row in X_train])
X_test_Tfidf = Tfidf_vect.transform([" ".join(row) for row in X_test])

In [340]:
model = MultinomialNB()
model.fit(X_train_Tfidf, y_train)

akurasi = model.score(X_test_Tfidf, y_test)
print("Akurasi dari model :"+str((akurasi)*100))

Akurasi dari model :70.02595155709342


In [310]:
y_pred = model.predict(X_test_Tfidf)
report = classification_report(y_test,y_pred)

In [312]:
print(report)

              precision    recall  f1-score   support

           0       0.90      0.07      0.14       742
           1       0.69      1.00      0.82      1570

    accuracy                           0.70      2312
   macro avg       0.80      0.54      0.48      2312
weighted avg       0.76      0.70      0.60      2312



In [314]:
len(y_test)*0.94554

2186.08848

In [322]:
with open("mytext.txt","w", encoding="utf-8") as f:
  f.write(feature.iloc[10])

In [324]:
with open("mytext.txt","r",encoding="utf-8") as f:
  text=f.read()

In [326]:
text

'Akhirnya, Jokowi bersedia mengundurkan diri.\n Gue harus siap2 nih'

In [328]:
vectorized_ip = Tfidf_vect.transform([text])

In [330]:
output_array = model.predict(vectorized_ip)
result = int(output_array[0])
if(result==1):
  print("Teks tersebut merupakan berita hoax!")
else:
  print("Teks tersebut merupakan berita tidak hoax!")

Teks tersebut merupakan berita hoax!


In [332]:
# Save the model to a file
with open('model_berita.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)