In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## 1. Read the dataset

In [2]:
news = pd.read_csv('fake_or_real_news.csv', index_col=0)

In [3]:
news.head()

Unnamed: 0,title,text,label
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6335 entries, 8476 to 4330
Data columns (total 3 columns):
title    6335 non-null object
text     6335 non-null object
label    6335 non-null object
dtypes: object(3)
memory usage: 198.0+ KB


In [5]:
news.describe().T

Unnamed: 0,count,unique,top,freq
title,6335,6256,OnPolitics | 's politics blog,5
text,6335,6060,"Killing Obama administration rules, dismantlin...",58
label,6335,2,REAL,3171


In [6]:
news.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

Two classes are basically balanced

## 2. Create dataset

#### 2.1 Convert label from string to int

In [7]:
le2 = LabelEncoder()
news['binary_label']=le2.fit_transform(news['label'])

#### 2.2 Reset index

In [8]:
news = news.reset_index(drop=True)

In [9]:
news.head()

Unnamed: 0,title,text,label,binary_label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1


#### 2.3 Split the data

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news['text'], news.binary_label , random_state=10)

## 3. Extract Features and Build Models

#### 3.1 TfidfVectorizer

#### 3.1.1 default stop words 

In [11]:
# Transform dataset
tfidf_vectorizer_default = TfidfVectorizer(stop_words='english', max_df=0.7) 
tfidf_train_default = tfidf_vectorizer_default.fit_transform(X_train) 
tfidf_test_default = tfidf_vectorizer_default.transform(X_test)

# Build model
clf_default = MultinomialNB() 
clf_default.fit(tfidf_train_default, y_train)

# Validate
pred_default = clf_default.predict(tfidf_test_default)
score_default = accuracy_score(y_test, pred_default)
print ("accuracy:   %0.3f" % score_default)
print(classification_report(y_test, pred_default, target_names=['FAKE', 'REAL']))

accuracy:   0.840
             precision    recall  f1-score   support

       FAKE       0.96      0.70      0.81       769
       REAL       0.77      0.97      0.86       815

avg / total       0.86      0.84      0.84      1584



#### 3.1.2 stop words down load from http://www.lextek.com/manuals/onix/stopwords1.html

In [12]:
# Download the stopwords file from the website
import requests
import bs4

res = requests.get('http://www.lextek.com/manuals/onix/stopwords1.html')
soup = bs4.BeautifulSoup(res.text,"lxml")
stop_words=[tag.text for tag in soup.select('pre')]
stop_words=[l.split('\n\n') for l in stop_words]
stop_words=stop_words[0][6:]

In [13]:
# Transform dataset
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.7) 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 
tfidf_test = tfidf_vectorizer.transform(X_test)

# Build model
clf = MultinomialNB() 
clf.fit(tfidf_train, y_train)

# Validate
pred = clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print ("accuracy:   %0.3f" % score)
print(classification_report(y_test, pred, target_names=['FAKE', 'REAL']))

accuracy:   0.847
             precision    recall  f1-score   support

       FAKE       0.95      0.72      0.82       769
       REAL       0.79      0.97      0.87       815

avg / total       0.87      0.85      0.84      1584



#### Since the recall score of FAKE calss and the accuracy of the model all improved, the stop words file down loaded is more suitable for the dataset used in this project

#### 3.1.3 Improve the number of gram

* ngram_range=(2,2)

In [14]:
# Transform dataset
tfidf_vectorizer_2 = TfidfVectorizer(stop_words=stop_words, ngram_range=(2,2) ,max_df=0.7) 
tfidf_train_2 = tfidf_vectorizer_2.fit_transform(X_train) 
tfidf_test_2 = tfidf_vectorizer_2.transform(X_test)

# Build model
clf_2 = MultinomialNB() 
clf_2.fit(tfidf_train_2, y_train)

# Validate
pred_2 = clf_2.predict(tfidf_test_2)
score_2 = accuracy_score(y_test, pred_2)
print ("accuracy:   %0.3f" % score_2)
print(classification_report(y_test, pred_2, target_names=['FAKE', 'REAL']))

accuracy:   0.903
             precision    recall  f1-score   support

       FAKE       0.98      0.82      0.89       769
       REAL       0.85      0.99      0.91       815

avg / total       0.91      0.90      0.90      1584



* ngram_range=(3,3)

In [15]:
# Transform dataset
tfidf_vectorizer_3 = TfidfVectorizer(stop_words=stop_words, ngram_range=(3,3) ,max_df=0.7) 
tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train) 
tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)

# Build model
clf_3 = MultinomialNB() 
clf_3.fit(tfidf_train_3, y_train)

# Validate
pred_3 = clf_3.predict(tfidf_test_3)
score_3 = accuracy_score(y_test, pred_3)
print ("accuracy:   %0.3f" % score_3)
print(classification_report(y_test, pred_3, target_names=['FAKE', 'REAL']))

accuracy:   0.924
             precision    recall  f1-score   support

       FAKE       0.95      0.90      0.92       769
       REAL       0.91      0.95      0.93       815

avg / total       0.93      0.92      0.92      1584



#### The accuracy and recall score improve a lot after the number of gram improved;
* However, when the ngram_range grows larger, the model need too long time to run. So I choose ngram_range=(3,3).

#### 3.1.4 Tuning parameters for MultinomialNB

In [16]:
clf_3_tune = MultinomialNB() 

para = {'alpha': np.arange(0.1,1.1,0.1)}

kfold=KFold(n_splits=5, random_state=10)

clf_3_grid = GridSearchCV(clf_3_tune, para, cv=kfold, scoring= 'accuracy')

#Fitting
clf_3_grid.fit(tfidf_train_3, y_train)

#Printing reports
print("Best Parameters: {}".format(clf_3_grid.best_params_))
print("Best cross-validation Accuracy: {:.4f}".format(clf_3_grid.best_score_))
print("Best estimator:\n{}".format(clf_3_grid.best_estimator_))

Best Parameters: {'alpha': 0.8}
Best cross-validation Accuracy: 0.8272
Best estimator:
MultinomialNB(alpha=0.8, class_prior=None, fit_prior=True)


#### 3.1.5 Build the model with the best parameters

In [17]:
# Transform dataset
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(3,3) ,max_df=0.7) 
tfidf_train = tfidf_vectorizer_3.fit_transform(X_train) 
tfidf_test = tfidf_vectorizer_3.transform(X_test)

# Build model
clf_tfidf = MultinomialNB(alpha=0.8) 
clf_tfidf.fit(tfidf_train, y_train)

# Validate
pred_tfidf = clf_tfidf.predict(tfidf_test)
score_tfidf = accuracy_score(y_test, pred_tfidf)
print ("accuracy:   %0.3f" % score_tfidf)
print(classification_report(y_test, pred_tfidf, target_names=['FAKE', 'REAL']))

accuracy:   0.925
             precision    recall  f1-score   support

       FAKE       0.95      0.90      0.92       769
       REAL       0.91      0.95      0.93       815

avg / total       0.93      0.92      0.92      1584



#### 3.2 CountVectorizer 

#### 3.2.1 Build models with different stop words

* stop_words='english'

In [18]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train) 
count_test = count_vectorizer.transform(X_test)

clf2 = MultinomialNB() 
clf2.fit(count_train, y_train)
pred2 = clf2.predict(count_test)

score = accuracy_score(y_test, pred2)
print ("accuracy:   %0.3f" % score)
print(classification_report(y_test, pred2, target_names=['FAKE', 'REAL']))

accuracy:   0.888
             precision    recall  f1-score   support

       FAKE       0.92      0.84      0.88       769
       REAL       0.86      0.93      0.90       815

avg / total       0.89      0.89      0.89      1584



* stop_words=stop_words

In [19]:
count_vectorizer = CountVectorizer(stop_words=stop_words)
count_train = count_vectorizer.fit_transform(X_train) 
count_test = count_vectorizer.transform(X_test)

clf2 = MultinomialNB() 
clf2.fit(count_train, y_train)
pred2 = clf2.predict(count_test)

score = accuracy_score(y_test, pred2)
print ("accuracy:   %0.3f" % score)
print(classification_report(y_test, pred2, target_names=['FAKE', 'REAL']))

accuracy:   0.887
             precision    recall  f1-score   support

       FAKE       0.92      0.84      0.88       769
       REAL       0.86      0.93      0.89       815

avg / total       0.89      0.89      0.89      1584



#### Since the preformance difference between the two models is small, I still choose to use the stop words down loaded

#### 3.2.2 Improve the number of 'ngram_range'

In [20]:
count_vectorizer_2 = CountVectorizer(stop_words=stop_words, ngram_range=(2, 2))
count_train_2 = count_vectorizer_2.fit_transform(X_train) 
count_test_2 = count_vectorizer_2.transform(X_test)

clf_2 = MultinomialNB() 
clf_2.fit(count_train_2, y_train)
pred_2 = clf_2.predict(count_test_2)

score_2 = accuracy_score(y_test, pred_2)
print ("accuracy:   %0.3f" % score_2)
print(classification_report(y_test, pred_2, target_names=['FAKE', 'REAL']))

accuracy:   0.910
             precision    recall  f1-score   support

       FAKE       0.98      0.83      0.90       769
       REAL       0.86      0.98      0.92       815

avg / total       0.92      0.91      0.91      1584



In [21]:
count_vectorizer_3 = CountVectorizer(stop_words=stop_words, ngram_range=(3, 3))
count_train_3 = count_vectorizer_3.fit_transform(X_train) 
count_test_3 = count_vectorizer_3.transform(X_test)

clf_3 = MultinomialNB() 
clf_3.fit(count_train_3, y_train)
pred_3 = clf_3.predict(count_test_3)

score_3 = accuracy_score(y_test, pred_3)
print ("accuracy:   %0.3f" % score_3)
print(classification_report(y_test, pred_3, target_names=['FAKE', 'REAL']))

accuracy:   0.924
             precision    recall  f1-score   support

       FAKE       0.96      0.88      0.92       769
       REAL       0.90      0.96      0.93       815

avg / total       0.93      0.92      0.92      1584



#### 3.2.3 Apply grid search to find the best 'alpha'

In [22]:
clf_tune_count = MultinomialNB() 

para = {'alpha': np.arange(0.1,1.1,0.1)}

kfold=KFold(n_splits=5, random_state=10)

clf_grid_count = GridSearchCV(clf_tune_count, para, cv=kfold, scoring= 'accuracy')

#Fitting
clf_grid_count.fit(count_train_3, y_train)

#Printing reports
print("Best Parameters: {}".format(clf_grid_count.best_params_))
print("Best cross-validation Accuracy: {:.4f}".format(clf_grid_count.best_score_))
print("Best estimator:\n{}".format(clf_grid_count.best_estimator_))

Best Parameters: {'alpha': 1.0}
Best cross-validation Accuracy: 0.7649
Best estimator:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


#### 3.2.4 Build the model with best parameter

In [23]:
count_vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(3, 3))
count_train = count_vectorizer.fit_transform(X_train) 
count_test = count_vectorizer.transform(X_test)

clf_count = MultinomialNB(alpha=1.0) 
clf_count.fit(count_train, y_train)
pred_count = clf_3.predict(count_test)

score_count = accuracy_score(y_test, pred_count)
print ("accuracy:   %0.3f" % score_count)
print(classification_report(y_test, pred_count, target_names=['FAKE', 'REAL']))

accuracy:   0.924
             precision    recall  f1-score   support

       FAKE       0.96      0.88      0.92       769
       REAL       0.90      0.96      0.93       815

avg / total       0.93      0.92      0.92      1584



## 4. Conclusion

In [24]:
print ("accuracy:   %0.3f" % score_tfidf)
print('TfidfVectorizer:\n\n',classification_report(y_test, pred_tfidf, target_names=['FAKE', 'REAL']),'\n\n')
print ("accuracy:   %0.3f" % score_count)
print('CountVectorizer:\n\n',classification_report(y_test, pred_count, target_names=['FAKE', 'REAL']))

accuracy:   0.925
TfidfVectorizer:

              precision    recall  f1-score   support

       FAKE       0.95      0.90      0.92       769
       REAL       0.91      0.95      0.93       815

avg / total       0.93      0.92      0.92      1584
 


accuracy:   0.924
CountVectorizer:

              precision    recall  f1-score   support

       FAKE       0.96      0.88      0.92       769
       REAL       0.90      0.96      0.93       815

avg / total       0.93      0.92      0.92      1584



#### With naive bayes classifier, TfidfVectorizer feature extraction is more suitable for this dataset