In [2]:
#This file creates the classifier to detect fake news
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, average_precision_score, recall_score, f1_score
import pickle
import spacy
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [3]:
def remove_nouns(texts):
    output = []
    for doc in nlp.pipe(texts):
        noun_text = " ".join(token.lemma_ for token in doc if token.pos_ == 'NOUN')
        output.append(noun_text)
    return output

In [4]:
#load csv of data

##Load Fake Data set and give it a label of 1 
fake_df = pd.read_csv('datasets/Fake.csv')
fake_df['label'] = 1

true_df = pd.read_csv('datasets/True.csv')
true_df['label'] = 0

labeled_df = pd.concat([fake_df, true_df])
labeled_df = labeled_df.sample(frac=1).reset_index(drop=True)
labeled_df['text'] = remove_nouns(labeled_df['text'])
##Save a copy of labled dataset for ease of use next time
labeled_df.to_csv('datasets/labeled.csv')


In [5]:
#grab labels
labels = labeled_df.label

In [6]:
#Split dataset
x_train, x_test, y_train, y_test = train_test_split(labeled_df['text'], labels, test_size = 0.2, random_state = 7)

In [7]:
#initialize TFIDF
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

In [8]:
#Vectorize test and train set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [9]:
#Create the Classfier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [10]:
#Test Classfier
y_pred = pac.predict(tfidf_test)
accuracy = accuracy_score(y_test,y_pred)
precision = average_precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
print(f'Accuracy: {round(accuracy*100,2)}%')
print('Precision: {}'.format(round(precision,4)))
print('Recall: {}'.format(round(recall,4)))
print('F1 Score: {}'.format(round(f1,4)))

Accuracy: 94.34%
Precision: 0.9181
Recall: 0.9538
F1 Score: 0.9456


In [11]:
#Create confusion Matrix
#FORMAT:
#   True Pos, False Neg
#   False Pos, True Neg
confusion_matrix(y_test,y_pred, labels=[1, 0])

array([[4418,  214],
       [ 294, 4054]])

In [12]:
#Save PAC classifer
pickle.dump(pac, open('pac_nouns_removed.pkl', 'wb'))

In [13]:
#Save Vectorizer
pickle.dump(tfidf_vectorizer, open('tfidf_nouns_removed.pkl', 'wb'))