In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import (RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from common import *
import os
from sklearn.metrics import classification_report


In [18]:
# Build the combined, labeled dataset once; later runs reuse the cached data.csv.
if not os.path.exists('data.csv'):
    # Both raw files are required by the labeling step below, so fetch them
    # when EITHER one is missing (the previous check only downloaded when
    # both were absent, leaving a half-present dataset to fail in label_data).
    if not (os.path.exists('Fake.csv') and os.path.exists('True.csv')):
        download_data()

    # label_data comes from `common`; presumably it writes a copy of the input
    # CSV with a label column set to the third argument -- TODO confirm.
    label_data('Fake.csv', 'fake_labeled.csv', 1)
    label_data('True.csv', 'true_labeled.csv', 0)

    # Concatenate the two labeled files, keeping only the first file's header
    # line. The inputs are named explicitly (instead of the `*labeled.csv`
    # glob) so stray files in the working directory cannot be merged in.
    awk_status = os.system(
        "awk '(NR == 1) || (FNR > 1)' fake_labeled.csv true_labeled.csv > data.csv"
    )
    if awk_status != 0:
        # The shell redirect creates data.csv even when awk fails; remove the
        # partial file so the next run does not mistake it for a valid cache.
        if os.path.exists('data.csv'):
            os.remove('data.csv')
        raise RuntimeError(
            "Merging labeled CSV files into data.csv failed "
            "(exit status {0})".format(awk_status)
        )

In [19]:
# Read only the `text` and `label` columns, then keep a seeded random subset.
# NUM_SAMPLES and RANDOM_SEED are not defined in this notebook cell
# (presumably provided by `from common import *` -- confirm).
df = (
    pd.read_csv('data.csv', usecols=['text', 'label'], encoding='utf-8')
      .sample(n=NUM_SAMPLES, random_state=RANDOM_SEED)
)
df.head()

Unnamed: 0,text,label
4528,Donald Trump is calling for one of the most co...,1
31727,WASHINGTON (Reuters) - Former Republican U.S. ...,0
10937,You re never to young to commit jihad Teachers...,1
13470,Laura Ingraham reminds the Never Trump people ...,1
40814,BERLIN/HANOVER (Reuters) - Germany s Social De...,0


In [20]:
def train_models(X_tr, y_tr, X_ts, y_ts, models):
    """Cross-validate, fit and evaluate each estimator, printing a report.

    For every estimator in ``models`` (in order) this:

    1. runs ``cross_val_score`` with ``cv=5`` on the training split and
       prints the mean and standard deviation of the fold scores;
    2. fits the estimator on the full training split (the estimator object
       passed in is fitted in place);
    3. predicts on the evaluation split and prints the accuracy, the
       ``predict_proba`` output and the classification report.

    Parameters
    ----------
    X_tr, y_tr : training features and labels.
    X_ts, y_ts : evaluation features and labels.
    models : iterable of estimators exposing ``fit``, ``predict`` and
        ``predict_proba``.

    Returns
    -------
    None. All results are printed; an empty ``models`` prints nothing.
    """
    cv_template = ("{0}:\n\tMean accuracy on development set\t= {1:.3f} "
                   "(+/- {2:.3f})")
    eval_template = "\tAccuracy on evaluation set\t\t= {0:.3f}"

    for estimator in models:
        # Development-set estimate from cross-validation on the training split.
        fold_scores = cross_val_score(estimator, X_tr, y_tr, cv=5)
        print(cv_template.format(estimator.__class__.__name__,
                                 fold_scores.mean(),
                                 fold_scores.std()))

        # Fit on the whole training split, then score the held-out split.
        estimator.fit(X_tr, y_tr)
        predictions = estimator.predict(X_ts)
        print(eval_template.format(accuracy_score(y_ts, predictions)))

        print("Probability: ")
        print(estimator.predict_proba(X_ts))
        print(classification_report(y_ts, predictions))


In [21]:
# One instance of each ensemble classifier, all created with random_state=1.
# The list name (including its spelling) is kept because other cells refer to it.
ensamble_models = [
    estimator_cls(random_state=1)
    for estimator_cls in (
        RandomForestClassifier,
        GradientBoostingClassifier,
        AdaBoostClassifier,
    )
]

In [22]:
# Features from a CountVectorizer, produced by the `preprocess_data` helper
# (defined outside this notebook cell).
# NOTE(review): the vectorizer is applied to all of df['text'] before the
# train/test split; if preprocess_data fits it there, the evaluation split
# influences the features -- confirm whether that is intended.
countVectorizer = CountVectorizer()
embeddings_countVectorizer = preprocess_data(df['text'], countVectorizer)

# Seeded split of features and labels; TEST_SIZE / RANDOM_SEED are not
# defined in this cell (presumably from `common`).
X_tr, X_ts, y_tr, y_ts = train_test_split(
    embeddings_countVectorizer,
    df['label'],
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

print(f'Train shape: {X_tr.shape}', f'Test shape: {X_ts.shape}', sep='\n')

Train shape: (1500, 49912)
Test shape: (1500, 49912)


In [23]:
# Evaluate every ensemble model on the split currently held in
# X_tr / y_tr / X_ts / y_ts (in notebook order: the CountVectorizer features).
train_models(
    X_tr=X_tr,
    y_tr=y_tr,
    X_ts=X_ts,
    y_ts=y_ts,
    models=ensamble_models,
)

RandomForestClassifier:
	Mean accuracy on development set	= 0.966 (+/- 0.012)
	Accuracy on evaluation set		= 0.968
Probability: 
[[0.45 0.55]
 [0.56 0.44]
 [0.38 0.62]
 ...
 [0.27 0.73]
 [0.25 0.75]
 [0.38 0.62]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       718
           1       0.98      0.96      0.97       782

    accuracy                           0.97      1500
   macro avg       0.97      0.97      0.97      1500
weighted avg       0.97      0.97      0.97      1500

GradientBoostingClassifier:
	Mean accuracy on development set	= 0.991 (+/- 0.006)
	Accuracy on evaluation set		= 0.989
Probability: 
[[0.00390776 0.99609224]
 [0.99657346 0.00342654]
 [0.00697744 0.99302256]
 ...
 [0.00390776 0.99609224]
 [0.00390776 0.99609224]
 [0.00506603 0.99493397]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       718
           1       1.00      0.98      0.99       782

    acc

In [24]:
# Features from a TfidfVectorizer, produced by the `preprocess_data` helper
# (defined outside this notebook cell).
# NOTE(review): the vectorizer is applied to all of df['text'] before the
# train/test split; if preprocess_data fits it there, the evaluation split
# influences the learned weights -- confirm whether that is intended.
tfidfVectorizer = TfidfVectorizer()
embeddings_Tfidf = preprocess_data(df['text'], tfidfVectorizer)

# Seeded split of features and labels; this rebinds X_tr / X_ts / y_tr / y_ts.
X_tr, X_ts, y_tr, y_ts = train_test_split(
    embeddings_Tfidf,
    df['label'],
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

print(f'Train shape: {X_tr.shape}', f'Test shape: {X_ts.shape}', sep='\n')

Train shape: (1500, 49912)
Test shape: (1500, 49912)


In [25]:
# Evaluate every ensemble model on the split currently held in
# X_tr / y_tr / X_ts / y_ts (in notebook order: the TfidfVectorizer features).
# ensamble_models is the same list object as before, so train_models calls
# fit again on those same estimator instances.
train_models(
    X_tr=X_tr,
    y_tr=y_tr,
    X_ts=X_ts,
    y_ts=y_ts,
    models=ensamble_models,
)

RandomForestClassifier:
	Mean accuracy on development set	= 0.966 (+/- 0.008)
	Accuracy on evaluation set		= 0.967
Probability: 
[[0.39 0.61]
 [0.54 0.46]
 [0.39 0.61]
 ...
 [0.25 0.75]
 [0.29 0.71]
 [0.37 0.63]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       718
           1       0.98      0.96      0.97       782

    accuracy                           0.97      1500
   macro avg       0.97      0.97      0.97      1500
weighted avg       0.97      0.97      0.97      1500

GradientBoostingClassifier:
	Mean accuracy on development set	= 0.993 (+/- 0.006)
	Accuracy on evaluation set		= 0.990
Probability: 
[[0.00342275 0.99657725]
 [0.99720467 0.00279533]
 [0.00547589 0.99452411]
 ...
 [0.00297852 0.99702148]
 [0.00297852 0.99702148]
 [0.0039143  0.9960857 ]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       718
           1       0.99      0.99      0.99       782

    acc