# Fake News Detector
Building a system to identify unreliable news articles.

In [None]:
# Make necessary imports
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Load the labelled training set from the Kaggle "fake-news" input directory
df = pd.read_csv('../input/fake-news/train.csv')

In [None]:
# Dimensions of the training data as a (rows, columns) tuple
df.shape

In [None]:
# Preview the first few rows of the training data
df.head()

In [None]:
# Summarize the columns: dtypes, non-null counts and memory usage
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Discard the columns the model does not use ...
df = df.drop(columns=['title', 'author'])
# ... then remove every row that still contains a missing value
df = df.dropna()

In [None]:
# Re-count missing values after cleaning; every column should now report 0
df.isna().sum()

In [None]:
# Target column. 1: unreliable, 0: reliable
labels = df['label']
labels.head()

In [None]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=7,
)

In [None]:
# TF-IDF features: skip English stop words and any term that appears in
# more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary and IDF weights from the training split only,
# then reuse them unchanged on the held-out split
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

## Pipeline

In [None]:
# Single-step pipeline; the 'clf' step is a placeholder that the grid search swaps out
pipe = Pipeline(steps=[('clf', PassiveAggressiveClassifier())])

# One grid per candidate classifier, each tried with its default settings
search_space = [
    {'clf': [candidate]}
    for candidate in (
        PassiveAggressiveClassifier(),
        MultinomialNB(),
        BernoulliNB(),
        RandomForestClassifier(),
    )
]

# Cross-validated comparison of the candidates, ranked by accuracy
gridsearch = GridSearchCV(estimator=pipe, param_grid=search_space, scoring='accuracy')

# fit() returns the fitted GridSearchCV itself, so best_model is the search
# object (exposing best_score_, best_params_ and predict), not a bare classifier
best_model = gridsearch.fit(tfidf_train, y_train)

In [None]:
# Report the best mean cross-validated accuracy and the classifier that achieved it
best_summary = 'Best accuracy: %f using %s' % (best_model.best_score_, best_model.best_params_)
print(best_summary)

In [None]:
# Predict the held-out split with the best estimator found by the grid search
y_pred = best_model.predict(tfidf_test)

# Confusion matrix: rows are true labels, columns are predicted labels,
# both ordered [1, 0]. 1: unreliable, 0: reliable
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])

## Test Data

In [None]:
# Load the test set from the Kaggle "fake-news" input directory
test_data = pd.read_csv('../input/fake-news/test.csv')

In [None]:
# Keep the article ids aside: the 'id' column is dropped during cleaning
# but is needed again to build the Kaggle submission
test_id = test_data['id']

In [None]:
# Dimensions of the test data as a (rows, columns) tuple
test_data.shape

In [None]:
# Preview the first few rows of the test data
test_data.head()

In [None]:
# Summarize the columns: dtypes, non-null counts and memory usage
test_data.info()

In [None]:
# Count the missing values in each column
test_data.isna().sum()

In [None]:
# Keep only the article text; the ids were saved separately beforehand
test_data = test_data.drop(columns=['id', 'title', 'author'])
# Fill missing text with a placeholder instead of dropping rows, because the
# submission must contain one prediction per original test row
test_data = test_data.fillna('fake and unreliable')

In [None]:
# Re-count missing values after filling; every column should now report 0
test_data.isna().sum()

In [None]:
# Confirm that the cleaned test data still has as many rows as the original
# (expected: 5200); only the number of columns should have changed
test_data.shape

In [None]:
# Vectorize the test text with the TF-IDF vocabulary already fitted on the
# training split (transform only, no refit)
test_vectorized = tfidf_vectorizer.transform(test_data['text'])

In [None]:
# Predict labels for the Kaggle test set with the model selected by the grid search.
# `best_model` is the fitted GridSearchCV from the Pipeline section; its predict()
# delegates to the best estimator, refit on the whole training split (refit=True
# is the GridSearchCV default). The previous `pac` name was never defined in this
# notebook and raised a NameError.
test_predictions = best_model.predict(test_vectorized)

In [None]:
# Pair each test article id with its predicted label in a two-column
# ('id', 'label') frame for the submission
submission = pd.DataFrame({'id':test_id, 'label':test_predictions})
# Expect one row per test article and two columns
submission.shape

In [None]:
# Preview the first few rows of the submission
submission.head()

In [None]:
# Write the submission to submission.csv; index=False keeps the DataFrame
# index out of the file so it contains only the 'id' and 'label' columns
submission.to_csv('submission.csv', index=False)