In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

In [2]:
np.random.seed(42)
nltk.download('punk')

[nltk_data] Error loading punk: Package 'punk' not found in index


False

In [3]:
# Create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [4]:
# Label Encoder use to Encode target labels with value between 0 and n_classes-1
Encoder = LabelEncoder()

In [5]:
# TfidfVectorizer Convert a collection of raw documents to a matrix of TF-IDF features.
Tfidf_vect = TfidfVectorizer()

In [6]:
df=pd.read_csv("train.csv")

In [7]:
df.head()

Unnamed: 0,ID,label,tanggal,judul,narasi,nama file gambar
0,71,1,17-Aug-20,Pemakaian Masker Menyebabkan Penyakit Legionna...,A caller to a radio talk show recently shared ...,71.jpg
1,461,1,17-Jul-20,Instruksi Gubernur Jateng tentang penilangan ...,Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...,461.png
2,495,1,13-Jul-20,Foto Jim Rohn: Jokowi adalah presiden terbaik ...,Jokowi adalah presiden terbaik dlm sejarah ban...,495.png
3,550,1,8-Jul-20,"ini bukan politik, tapi kenyataan Pak Jokowi b...","Maaf Mas2 dan Mbak2, ini bukan politik, tapi k...",550.png
4,681,1,24-Jun-20,Foto Kadrun kalo lihat foto ini panas dingin,Kadrun kalo lihat foto ini panas dingin . .,681.jpg


In [8]:
df.describe()

Unnamed: 0,ID,label
count,4231.0,4231.0
mean,495577.536753,0.818955
std,288916.337205,0.385101
min,71.0,0.0
25%,244075.5,1.0
50%,496444.0,1.0
75%,749050.0,1.0
max,999860.0,1.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4231 entries, 0 to 4230
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                4231 non-null   int64 
 1   label             4231 non-null   int64 
 2   tanggal           4231 non-null   object
 3   judul             4231 non-null   object
 4   narasi            4231 non-null   object
 5   nama file gambar  4231 non-null   object
dtypes: int64(2), object(4)
memory usage: 198.5+ KB


In [10]:
df.isnull().sum()

ID                  0
label               0
tanggal             0
judul               0
narasi              0
nama file gambar    0
dtype: int64

In [11]:
df=df.fillna('')

In [12]:
df.isnull().sum()

ID                  0
label               0
tanggal             0
judul               0
narasi              0
nama file gambar    0
dtype: int64

In [13]:
df.columns

Index(['ID', 'label', 'tanggal', 'judul', 'narasi', 'nama file gambar'], dtype='object')

In [14]:
df=df.drop(['ID', 'judul', 'tanggal'], axis=1)

In [15]:
df.head()

Unnamed: 0,label,narasi,nama file gambar
0,1,A caller to a radio talk show recently shared ...,71.jpg
1,1,Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...,461.png
2,1,Jokowi adalah presiden terbaik dlm sejarah ban...,495.png
3,1,"Maaf Mas2 dan Mbak2, ini bukan politik, tapi k...",550.png
4,1,Kadrun kalo lihat foto ini panas dingin . .,681.jpg


In [16]:
feature = df['narasi']
label = df['label']
port_stem=PorterStemmer()
port_stem
port_stem.stem("Hi thIs is chando * % %@@@")

'hi this is chando * % %@@@'

In [17]:
def stemming(content):
    con=re.sub('[^a-zA-Z]', ' ', content)
    con=con.lower()
    con=con.split()
    con=[port_stem.stem(word) for word in con if not word in stopwords.words('english')]
    con=' '.join(con)
    return con

In [18]:
import nltk
nltk.download('stopwords')

stemming('Hi this is chando')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'hi chando'

In [19]:
df['narasi']= df['narasi'].apply(stemming)

In [20]:
x=df['narasi']

In [21]:
y=df['label']

In [22]:
y.shape

(4231,)

In [23]:
# train_test_split digunakan untuk memecah dataset menjadi 2 bagian
# X_train dan y_train mewakili data yang akan dilakukan pada fitting model(Training model)
# X_test dan y_test  mewakili data yang akan dilakukan pada evaluasi model
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.3, stratify=label)

In [24]:
# Melihat ukuran data latih dan data uji
print('X_train : ', len(x_train))
print('X_test : ', len(x_test))

X_train :  2961
X_test :  1270


In [25]:
# Encoder for Data Features
df = pd.DataFrame(x_test)
df.head()

Unnamed: 0,narasi
994,bacot
710,inilah wajah uang kerta kita tahun renungkanlah
938,assallamuallaikum sampaikan ke rekan tgl febru...
3576,massa yang terdiri ata usia anak anak hingga r...
399,assalamualaikum menginformasikan pelaksanaan p...


In [26]:
# Encoder for Data Label
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.transform(y_test) 

In [27]:
y_train

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [28]:
# Fitting dataset terhadap tf-idf
Tfidf_vect.fit(["".join(row) for row in x_train])

In [29]:
# Mentransformasikan hasil fitting terhadap data X_train dan X_test
x_train_Tfidf = Tfidf_vect.transform([" ".join(row) for row in x_train])
x_test_Tfidf = Tfidf_vect.transform([" ".join(row) for row in x_test])

In [30]:
# Classifier - Algorithm - SVM
# fitting/training datasets pada algoritma SVM(Support Vector Machine)
SVM = svm.SVC(C=1.0, kernel='linear', degree=1, gamma="auto", verbose=True)
SVM.fit(x_train_Tfidf, y_train)  # predict the labels on validation dataset

[LibSVM]

In [31]:
# Menggunakan metrics accuracy untuk melihat performa model
predictions_SVM = SVM.predict(x_test_Tfidf)
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, y_test)*100)
rf = RandomForestClassifier()
rf.fit(x_train_Tfidf, y_train)

SVM Accuracy Score ->  81.88976377952756


In [32]:
prediction_rf = rf.predict(x_test_Tfidf)
print("RandomForest Accuracy Score -> ", accuracy_score(prediction_rf, y_test)*100)

RandomForest Accuracy Score ->  81.88976377952756


In [33]:
y_train

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [34]:
import pickle

In [35]:
pickle.dump(Tfidf_vect, open('vector.pkl', 'wb'))

In [36]:
pickle.dump(SVM, open('model.pkl', 'wb'))

In [37]:
vector_form=pickle.load(open('vector.pkl', 'rb'))

In [38]:
load_model=pickle.load(open('model.pkl', 'rb'))

In [39]:
def fake_news(news):
    news = stemming(news)  # Preprocess the input
    input_data = [news]
    vector_form1 = vector_form.transform(input_data)  # Vectorize
    prediction = load_model.predict(vector_form1)  # Predict using the model
    return prediction

In [40]:
val=fake_news("""Yusuf Ridwan mengatakan, kantong plastik kresek berwarna hitam mengandung zat karsinogen dan zat pewarna yang bisa tercampur dengan daging. Akibat kontaminasi zat ini dapat menyebabkan kanker dalam waktu lama.Yusuf Ridwan mengatakan, kantong plastik kresek berwarna hitam mengandung zat karsinogen dan zat pewarna yang bisa tercampur dengan daging. Akibat kontaminasi zat ini dapat menyebabkan kanker dalam waktu lama. """)

In [41]:
if val==[0]:
    print('reliable')
else:
    print('unreliable')

unreliable
