In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from common import *
import os


In [18]:
if not os.path.exists('data.csv'):
    if not os.path.exists('Fake.csv') and not os.path.exists('True.csv'):
        download_data()
    label_data('Fake.csv', 'fake_labeled.csv', 1)
    label_data('True.csv', 'true_labeled.csv', 0)
    os.system("awk '(NR == 1) || (FNR > 1)' *labeled.csv > data.csv")

In [19]:
df = pd.read_csv('data.csv', usecols=['text', 'label'], encoding='utf-8')
df = df.sample(n=NUM_SAMPLES, random_state=RANDOM_SEED)
df.head()

Unnamed: 0,text,label
4528,Donald Trump is calling for one of the most co...,1
31727,WASHINGTON (Reuters) - Former Republican U.S. ...,0
10937,You re never to young to commit jihad Teachers...,1
13470,Laura Ingraham reminds the Never Trump people ...,1
40814,BERLIN/HANOVER (Reuters) - Germany s Social De...,0


# BoW

In [20]:
countVectorizer = CountVectorizer()
embeddings_countVectorizer = preprocess_data(df['text'], countVectorizer)

X_tr, X_ts, y_tr, y_ts= train_test_split(embeddings_countVectorizer, df['label'], test_size=TEST_SIZE, random_state=RANDOM_SEED)

X_tr = X_tr.toarray()
X_ts = X_ts.toarray()

feature_size = X_tr.shape[1]

print(f'Train shape: {X_tr.shape}')
print(f'Test shape: {X_ts.shape}')

Train shape: (1500, 49912)
Test shape: (1500, 49912)


In [21]:
def train_models(X_tr, y_tr, X_ts, y_ts, models):
    for model in models:
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_ts)

        score1 = metrics.accuracy_score(y_ts, y_pred)
        print("accuracy:   %0.3f" % score1)
        print(metrics.classification_report(y_ts, y_pred,
                                            target_names=['Positive', 'Negative']))
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_ts, y_pred))
        
        y_pred = model.predict_proba(X_ts)
        print("Prediction probability:")
        print(y_pred)

In [22]:
proba_models = [MultinomialNB()]

In [23]:
train_models(X_tr, y_tr, X_ts, y_ts, proba_models)

accuracy:   0.922
              precision    recall  f1-score   support

    Positive       0.89      0.95      0.92       718
    Negative       0.95      0.90      0.92       782

    accuracy                           0.92      1500
   macro avg       0.92      0.92      0.92      1500
weighted avg       0.92      0.92      0.92      1500

confusion matrix:
[[682  36]
 [ 81 701]]
Prediction probability:
[[8.66357871e-01 1.33642129e-01]
 [1.00000000e+00 9.20608421e-22]
 [6.41169980e-20 1.00000000e+00]
 ...
 [4.09715794e-19 1.00000000e+00]
 [1.24574664e-16 1.00000000e+00]
 [5.12786000e-11 1.00000000e+00]]


# TF-IDF

In [24]:
tfidfVectorizer = TfidfVectorizer()
embeddings_Tfidf = preprocess_data(df['text'], tfidfVectorizer)

X_tr, X_ts, y_tr, y_ts= train_test_split(embeddings_Tfidf, df['label'], test_size=TEST_SIZE, random_state=RANDOM_SEED)

X_tr = X_tr.toarray()
X_ts = X_ts.toarray()

feature_size = X_tr.shape[1]

print(f'Train shape: {X_tr.shape}')
print(f'Test shape: {X_ts.shape}')

Train shape: (1500, 49912)
Test shape: (1500, 49912)


In [25]:
train_models(X_tr, y_tr, X_ts, y_ts, proba_models)

accuracy:   0.899
              precision    recall  f1-score   support

    Positive       0.85      0.95      0.90       718
    Negative       0.95      0.85      0.90       782

    accuracy                           0.90      1500
   macro avg       0.90      0.90      0.90      1500
weighted avg       0.90      0.90      0.90      1500

confusion matrix:
[[684  34]
 [117 665]]
Prediction probability:
[[0.56971403 0.43028597]
 [0.65121106 0.34878894]
 [0.2992888  0.7007112 ]
 ...
 [0.12051147 0.87948853]
 [0.34054657 0.65945343]
 [0.49025386 0.50974614]]
