In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

df = pd.read_csv('../input/covid-fake-news/covid_fake_news.csv')

In [2]:
print(df.shape)
df.head()

(1164, 4)


Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


# Handling missing values (and filling with empty string)

In [3]:
df.isnull().sum()

title     82
text      10
source    20
label      5
dtype: int64

In [4]:
df = df.fillna('')
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


# Label preprocessing

In [5]:
df['label'].value_counts()

TRUE    584
Fake    345
fake    230
          5
Name: label, dtype: int64

In [6]:
df.loc[df['label']=='fake', 'label'] = 'FAKE'
df.loc[df['label']=='Fake', 'label'] = 'FAKE'
df['label'].value_counts()

TRUE    584
FAKE    575
          5
Name: label, dtype: int64

In [7]:
print('Sebelum drop:', df.shape)
df = df[df['label']!='']
print('Setelah drop:', df.shape)

Sebelum drop: (1164, 4)
Setelah drop: (1159, 4)


In [8]:
df['label'].value_counts()

TRUE    584
FAKE    575
Name: label, dtype: int64

In [9]:
def convert_label(label):
    if label=='FAKE':
        return 0
    elif label=='TRUE':
        return 1

df['label'] = df['label'].apply(convert_label)
df['label'].value_counts()

1    584
0    575
Name: label, dtype: int64

# Text cleaning (text dan title)

In [10]:
def clean(text):
    # Set of stopwords dalam bhs Inggris 
    # (diambil dari library nltk)
    stop_words = set(stopwords.words('english'))
    
    # Jadiin huruf kecil semua
    text = text.lower()
    
    # Hapus html tags
    text = re.sub(r'<[^>]*>', '', text)
    
    # Hapus twitter usernames
    text = re.sub(r'@[A-Za-z0-9]+','',text)
    
    # Hapus urls
    text = re.sub('https?://[A-Za-z0-9]','',text)
    
    # Hapus angka
    text = re.sub('[^a-zA-Z]',' ',text)
    
    # String dipisah per kata
    word_tokens = word_tokenize(text)
    
    # Buang stopwords
    filtered_sentence = []
    for word_token in word_tokens:
        if word_token not in stop_words:
            filtered_sentence.append(word_token)
    
    # Gabungin lagi kata yg udah dipisah
    text = (' '.join(filtered_sentence))
    return text

In [11]:
word_tokenize("Saya lagi ngoding covid fake news detection nih!")

['Saya', 'lagi', 'ngoding', 'covid', 'fake', 'news', 'detection', 'nih', '!']

In [12]:
# Contoh doang
clean('Hello World 22 <html> <p> and Ardi,  or if they 3878, I am @hehe')

'hello world ardi'

In [13]:
for column in df.columns:
    if column != 'label':
        df[column] = df[column].apply(clean)
df.head()

Unnamed: 0,title,text,source,label
0,due recent outbreak coronavirus covid world he...,need add water drugs vaccines ready administer...,coronavirusmedicalkit com,0
1,,hydroxychloroquine shown effective rate treati...,rudygiuliani,0
2,,fact hydroxychloroquine shown effective rate t...,charliekirk,0
3,,corona virus man made virus created wuhan labo...,joannewrightforcongress,0
4,,finance research wuhan lab corona virus create...,joannewrightforcongress,0


# Train test split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:2], df['label'],
                                                    test_size=0.2, 
                                                    random_state=11)

print(X_train.shape)
print(X_test.shape)

(927, 2)
(232, 2)


# Vectorize semua kata

In [15]:
# Inisialisasi objek Count Vectorizer
title_vectorizer = CountVectorizer()
text_vectorizer = CountVectorizer()

In [16]:
X_train_title = title_vectorizer.fit_transform(X_train['title']).toarray()
X_train_text = text_vectorizer.fit_transform(X_train['text']).toarray()

print("Shape nya title\t:", X_train_title.shape)
print("Shape nya text\t:", X_train_text.shape)

Shape nya title	: (927, 1941)
Shape nya text	: (927, 19345)


In [17]:
X_test_title = title_vectorizer.transform(X_test['title']).toarray()
X_test_text = text_vectorizer.transform(X_test['text']).toarray()

print(X_test_title.shape)
print(X_test_text.shape)

(232, 1941)
(232, 19345)


In [18]:
X_train_title_text = np.hstack((X_train_title, X_train_text))
X_test_title_text = np.hstack((X_test_title, X_test_text))

print("Bentuk keseluruhan data train\t:", X_train_title_text.shape)
print("Bentuk keseluruhan data test\t:", X_test_title_text.shape)

Bentuk keseluruhan data train	: (927, 21286)
Bentuk keseluruhan data test	: (232, 21286)


# Machine learning pakai multinomial naive bayes

In [19]:
clf = MultinomialNB()
clf.fit(X_train_title_text, y_train)
print("Accuracy on train data\t:", clf.score(X_train_title_text, y_train))
print("Accuracy on test data\t:", clf.score(X_test_title_text, y_test))

Accuracy on train data	: 0.9676375404530745
Accuracy on test data	: 0.9267241379310345


# Test pakai data sendiri

In [20]:
test_title = 'Is Covid real?'
test_text = 'Coronavirus is a man-made virus created in Wuhan laboratory'

In [22]:
test_title = clean(test_title)
test_text = clean(test_text)

print(test_title)
print(test_text)

covid real
coronavirus man made virus created wuhan laboratory


In [23]:
test_title_vec = title_vectorizer.transform([test_title]).toarray()
test_text_vec = text_vectorizer.transform([test_text]).toarray()

print(test_title_vec.shape)
print(test_text_vec.shape)

(1, 1941)
(1, 19345)


In [24]:
test_title_text = np.hstack((test_title_vec, test_text_vec))
print(test_title_text.shape)

(1, 21286)


In [25]:
print('Prediksi akhir\t\t:', clf.predict(test_title_text))
print('Probabilitas tiap class\t:', clf.predict_proba(test_title_text))

Prediksi akhir		: [0]
Probabilitas tiap class	: [[0.98234128 0.01765872]]
