# Detection of Fake News

In [1]:
import pandas as pd
import re

all_data = pd.read_csv("fake_news_train.csv", sep = ",", usecols=range(5))
all_data['label'] = all_data['label'].astype(int)
all_data = all_data.drop(columns = "id")
all_data.head()

Unnamed: 0,title,author,text,label
0,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
1,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
2,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
3,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
4,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0


Check whether dataset contains any NaN values and drop rows that contains NaN values

In [2]:
all_data.isna().any()
all_data = all_data.dropna()
all_data.isna().any()

title     False
author    False
text      False
label     False
dtype: bool

Split data to train and test (%20)

In [3]:
train = all_data.sample(frac = 0.8) 
test = all_data.drop(train.index)

In [4]:
from sklearn.feature_extraction import stop_words
stops = stop_words.ENGLISH_STOP_WORDS

**Delete stop words and make all words lowercase for bag-of-words. Also, using regular expressions discard numbers and punctuations.**

In [5]:
train['title'] = train['title'].apply(lambda x: ' '.join([re.sub("[^a-zA-Z]"," ",word.lower()) for word in x.split() if word.lower() not in stops]))
train['text'] = train['text'].apply(lambda x: ' '.join([re.sub("[^a-zA-Z]"," ",word.lower()) for word in x.split() if word.lower() not in stops]))
test['title'] = test['title'].apply(lambda x: ' '.join([re.sub("[^a-zA-Z]"," ",word.lower()) for word in x.split() if word.lower() not in stops]))
test['text'] = test['text'].apply(lambda x: ' '.join([re.sub("[^a-zA-Z]"," ",word.lower()) for word in x.split() if word.lower() not in stops]))
train.head()

Unnamed: 0,title,author,text,label
4454,trump spokesperson katrina pierson caught blat...,Andrew Bradford,trump spokesperson katrina pierson caught blat...,1
12742,experts isis root true islam defeated guns,dailouk,home world experts isis root true islam d...,1
1421,peak millennial cities can t assume continued...,Conor Dougherty,past decade american cities transformed young...,0
11098,singer kaya jones shares support trump thanks...,Daniel Nussbaum,singer dj kaya jones took social media week do...,0
6988,nut job new york times collaborates deep st...,Joel B. Pollak,new york times waited president donald trump s...,0


In [6]:
counts = train.groupby(by = "label", as_index=False).count()
number_of_real_news = counts.at[0, "title"]
number_of_fake_news = counts.at[1, "title"]
number_of_total_news = number_of_fake_news + number_of_real_news

In [7]:
# Separate fake and real news to different datasets
train_fake = train.drop(train[train.label == 0].index)
train_real = train.drop(train[train.label == 1].index)

Function to return n-gram version of the string s.

In [8]:
def make_ngram(string, n):
    string = string.lower()
    tokens = [token for token in string.split(" ") if token != ""]  
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

n-gram Naive bayes implementation with Laplace smoothing and logarithmic calculation to avoid underflow while multiplication.

In [9]:
import math

def naive_bayes(text, vocabulary, type_matrix_sum, type_total_news, type_total_words, n):
    text_split = make_ngram(text, n)
    type_prob = type_total_news / number_of_total_news      # Fake or real news probability across all news
    denominator = type_total_words + total_unique_words     # Denominator for laplace smoothing
    multiplied_log_probability = math.log(type_prob)
    for word in text_split:                                 # Find each word's probability
        try:
            word_index = vocabulary[word]
            number_of_appearances = type_matrix_sum[0,word_index]
        except KeyError:
            number_of_appearances = 0
        number_of_appearances += 1  # Laplace smoothing 
        prob = number_of_appearances / denominator
        prob = math.log(prob)       # avoid underflow
        multiplied_log_probability += prob

    return multiplied_log_probability

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Prediction from Titles

titles_cv = CountVectorizer(strip_accents="ascii", ngram_range=(1, 1))
titles_real_cv = CountVectorizer(strip_accents="ascii", ngram_range=(1, 1))
titles_fake_cv = CountVectorizer(strip_accents="ascii", ngram_range=(1, 1))

title_fitted = titles_cv.fit(train["title"])
total_unique_words = len(title_fitted.vocabulary_)

fake_news_words_matrix = titles_fake_cv.fit_transform(train_fake["title"])
total_fake_words = fake_news_words_matrix.count_nonzero()
real_news_words_matrix = titles_real_cv.fit_transform(train_real["title"])
total_real_words = real_news_words_matrix.count_nonzero()

sum_words_fake = fake_news_words_matrix.sum(axis = 0)
fake_words_freq = [(word, sum_words_fake[0, idx]) for word, idx in titles_fake_cv.vocabulary_.items()]
fake_words_freq = sorted(fake_words_freq, key = lambda x: x[1], reverse=True)

sum_words_real = real_news_words_matrix.sum(axis = 0)
real_words_freq = [(word, sum_words_real[0, idx]) for word, idx in titles_real_cv.vocabulary_.items()]
real_words_freq = sorted(real_words_freq, key = lambda x: x[1], reverse=True)

In [11]:
# Find title prediction accuracy

total_test = len(test["title"])
correct = 0

for i in range(total_test):
    correct_label = test["label"].iloc[i]
    fake_prob = naive_bayes(test["title"].iloc[i], titles_fake_cv.vocabulary_, sum_words_fake, number_of_fake_news, total_fake_words, 1) 
    real_prob = naive_bayes(test["title"].iloc[i], titles_real_cv.vocabulary_, sum_words_real, number_of_real_news, total_real_words, 1)
    if real_prob > fake_prob:
        result_label = 0
    else:
        result_label = 1
    if result_label == correct_label:
        correct += 1
        

In [12]:
print("PREDICTION FROM TITLE ACCURACY : " + str(100 * (correct / total_test)))

PREDICTION FROM TITLE ACCURACY : 90.07529089664614


  
**Most 10 Frequent Words in Fake News Headlines:**

In [13]:
print(fake_words_freq[:10])

[('trump', 793), ('hillary', 577), ('clinton', 480), ('election', 242), ('new', 238), ('video', 211), ('comment', 211), ('war', 192), ('fbi', 178), ('world', 158)]


**Most 10 Frequent Words in Real News Headlines:**

In [14]:
print(real_words_freq[:10])  


[('new', 4286), ('york', 4064), ('times', 4029), ('breitbart', 1499), ('trump', 1474), ('donald', 413), ('obama', 197), ('clinton', 192), ('says', 183), ('briefing', 156)]


It is clear to see that 3 most common words which appear in reliable news titles are **"new"**(4272 times), **"york"**(4045), 
**"times"**(4012). Which make sense when we think about New York Times is a very reputable newspaper, so most of the news which include these words 
are reliable news. These 3 words together appeared **more than 4000 times** in real news headlines. However, they almost never appeared together in 
a fake news headline.

For fake news headlines, majority of the words are about politics and elections. "trump", "hillary" and "clinton" words are ahead
of other words by far. However, "trump" word appears in real news headlines two times more than in fake news headlines. So, it
is not possible to make sense from "trump" word. On the other hand, **"hillary"**(587 times), **"clinton"**(486) and **"election"**(259) 
words are appear in fake news headlines way more than real news headlines.   

When we make predictions only on news titles, accuracy score is almost %90 which is really well. This can be a result of
above inferences. Titles are very short sentences and there are some very weighted words which specified above.
With this words, it is easier to make predictions about news labels. **So it is very feasible to predict whether 
a headline is from a real or fake news from words that appear in the headline.**

In [15]:
# Prediction from texts 1-Gram

texts_cv = CountVectorizer(strip_accents="ascii", ngram_range=(0, 1))
texts_real_cv = CountVectorizer(strip_accents="ascii", ngram_range=(0, 1))
texts_fake_cv = CountVectorizer(strip_accents="ascii", ngram_range=(0, 1))

text_fitted = texts_cv.fit(train["text"])
total_unique_words = len(text_fitted.vocabulary_)

fake_news_words_matrix_text = texts_fake_cv.fit_transform(train_fake["text"])
total_fake_words_text = fake_news_words_matrix_text.count_nonzero()

real_news_words_matrix_text = texts_real_cv.fit_transform(train_real["text"])
total_real_words_text = real_news_words_matrix_text.count_nonzero()

fake_news_words_matrix_text_sum = fake_news_words_matrix_text.sum(axis = 0)
real_news_words_matrix_text_sum = real_news_words_matrix_text.sum(axis = 0)

In [16]:
# Find 1-Gram text prediction accuracy
correct = 0
for i in range(total_test):
    correct_label = test["label"].iloc[i]
    fake_prob = naive_bayes(test["text"].iloc[i], texts_fake_cv.vocabulary_, fake_news_words_matrix_text_sum, number_of_fake_news, total_fake_words_text, 1) 
    real_prob = naive_bayes(test["text"].iloc[i], texts_real_cv.vocabulary_, real_news_words_matrix_text_sum, number_of_real_news, total_real_words_text, 1)
    if real_prob > fake_prob:
        result_label = 0
    else:
        result_label = 1
    if result_label == correct_label:
        correct += 1
     
print("1-GRAM TEXT ACCURACY : " + str(100 * (correct / total_test)))

1-GRAM TEXT ACCURACY : 90.99931553730322


In [17]:
# Prediction from texts 2-Gram

texts_cv = CountVectorizer(strip_accents="ascii", ngram_range=(2, 2))
texts_real_cv = CountVectorizer(strip_accents="ascii", ngram_range=(2, 2))
texts_fake_cv = CountVectorizer(strip_accents="ascii", ngram_range=(2, 2))

text_fitted = texts_cv.fit(train["text"])
total_unique_words = len(text_fitted.vocabulary_)
fake_news_words_matrix_text = texts_fake_cv.fit_transform(train_fake["text"])
total_fake_words_text = fake_news_words_matrix_text.count_nonzero()

real_news_words_matrix_text = texts_real_cv.fit_transform(train_real["text"])
total_real_words_text = real_news_words_matrix_text.count_nonzero()

fake_news_words_matrix_text_sum = fake_news_words_matrix_text.sum(axis = 0)
real_news_words_matrix_text_sum = real_news_words_matrix_text.sum(axis = 0)

In [18]:
# Find 2-gram text prediction accuracy

correct = 0
for i in range(total_test):
    correct_label = test["label"].iloc[i]
    fake_prob = naive_bayes(test["text"].iloc[i], texts_fake_cv.vocabulary_, fake_news_words_matrix_text_sum, number_of_fake_news, total_fake_words_text, 2) 
    real_prob = naive_bayes(test["text"].iloc[i], texts_real_cv.vocabulary_, real_news_words_matrix_text_sum, number_of_real_news, total_real_words_text, 2)
    if real_prob > fake_prob:
        result_label = 0
    else:
        result_label = 1
    if result_label == correct_label:
        correct += 1
     
print("2-GRAM TEXT ACCURACY : " + str(100 * (correct / total_test)))

2-GRAM TEXT ACCURACY : 90.86242299794661


As expected, when we use bi-gram instead of uni-gram, accuracy score is little bit higher. When we use bi-gram, our word matrix is trying to understand word groups
such as "new york", "donald trump" etc. . This way it is more likely to predict the correct class for articles. 

Also, it is possible to make accuracy score even higher with use of 3-gram or 4-gram. For example, when using 3-gram, CountVectorizer will make sense from
"new york times" or "national football league". 

In [19]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

texts_tf_cv = TfidfVectorizer(strip_accents="ascii", ngram_range=(1, 1), max_features=100000)
texts_real_tf_cv = TfidfVectorizer(strip_accents="ascii", ngram_range=(1, 1), max_features=100000)
texts_fake_tf_cv = TfidfVectorizer(strip_accents="ascii", ngram_range=(1, 1), max_features=100000)

text_fitted_tf = texts_tf_cv.fit(train["text"])
total_unique_words_tf = len(text_fitted_tf.vocabulary_)

fake_news_words_matrix_text_tf = texts_fake_tf_cv.fit_transform(train_fake["text"])
total_fake_words_text_tf = fake_news_words_matrix_text_tf.count_nonzero()
real_news_words_matrix_text_tf = texts_real_tf_cv.fit_transform(train_real["text"])
total_real_words_text_tf = real_news_words_matrix_text_tf.count_nonzero()

fake_news_words_matrix_text_sum_tf = fake_news_words_matrix_text_tf.sum(axis = 0)
real_news_words_matrix_text_sum_tf = real_news_words_matrix_text_tf.sum(axis = 0)

In [20]:
# Find TF-IDF text prediction accuracy

correct = 0
for i in range(total_test):
    correct_label = test["label"].iloc[i]
    fake_prob = naive_bayes(test["text"].iloc[i], texts_fake_tf_cv.vocabulary_, fake_news_words_matrix_text_sum_tf, number_of_fake_news, total_fake_words_text_tf, 1) 
    real_prob = naive_bayes(test["text"].iloc[i], texts_real_tf_cv.vocabulary_, real_news_words_matrix_text_sum_tf, number_of_real_news, total_real_words_text_tf, 1)
    if real_prob > fake_prob:
        result_label = 0
    else:
        result_label = 1
    if result_label == correct_label:
        correct += 1
     
print("TEXT ACCURACY TF-IDF: " + str(100 * (correct / total_test)))

TEXT ACCURACY TF-IDF: 80.66392881587954


When narrowing our features with **max_features** parameter, accuracy of TF-IDF is a little bit higher than before. This way, only counting
most weighted 100.000 words in our texts. That confirms **classification results can be improved by selecting a subset of extremely effective 
words for the dictionary.**

In [21]:
fake_weight = fake_news_words_matrix_text_tf.sum(axis = 0)
fake_words_weights = [(word, fake_weight[0, idx]) for word, idx in texts_fake_tf_cv.vocabulary_.items()]
fake_words_weights = sorted(fake_words_weights, key = lambda x: x[1], reverse=True)

**10 words whose presence most strongly predicts that the news is fake:**

In [22]:
print(fake_words_weights[:10])

[('trump', 194.9549280508746), ('clinton', 163.9016865652972), ('hillary', 126.00582193245671), ('people', 97.4289833401401), ('it', 90.9163909334322), ('election', 85.99562267619844), ('said', 83.22597802510128), ('new', 70.76061452954936), ('fbi', 70.5707168328888), ('obama', 68.76582479942536)]


**10 words whose absence most strongly predicts that the news is fake:**

In [23]:
print(fake_words_weights[-10:])

[('shean', 0.004446212888346782), ('beisan', 0.004446212888346782), ('mortared', 0.004446212888346782), ('unbending', 0.004446212888346782), ('palmah', 0.004446212888346782), ('omelet', 0.004446212888346782), ('defensively', 0.004446212888346782), ('inhabitancy', 0.004446212888346782), ('fialkoff', 0.004446212888346782), ('jeremyrhammond', 0.004446212888346782)]


In [24]:
real_weight = real_news_words_matrix_text_tf.sum(axis = 0)
real_words_weights = [(word, real_weight[0, idx]) for word, idx in texts_fake_tf_cv.vocabulary_.items()]
real_words_weights = sorted(real_words_weights, key = lambda x: x[1], reverse=True)

**10 words whose presence most strongly predicts that the news is real:**

In [25]:
print(real_words_weights[:10])

[('narcisstic', 375.73381148089294), ('santelli', 308.7544867750158), ('unheard', 297.5939964871302), ('prohibited', 135.27438595627038), ('inviting', 124.24292018909804), ('philth', 119.9454329006608), ('norman', 114.53135995459839), ('narrates', 100.2017257583236), ('collins', 92.86192143269939), ('liita', 90.79738769803623)]


**10 words whose absence most strongly predicts that the news is real:**

In [26]:
print(real_words_weights[-10:])

[('captivity', 0.004952348433626786), ('fff', 0.004952348433626786), ('derma', 0.004952348433626786), ('tester', 0.004952348433626786), ('amerikkkan', 0.004952348433626786), ('escuchando', 0.004952348433626786), ('subterr', 0.004952348433626786), ('resealed', 0.004952348433626786), ('veganer', 0.004952348433626786), ('machu', 0.004952348433626786)]


### Analyzing effect of the stopwords
Stopwords are most common words in languages. Such as "the","and","or","with" etc. In CountVectorizer method, if stop words are dont get deleted 
they will be most recent words, so it would be impossible to make any valuable predictions. However, in TF-IDF, because of all words have a 
weight, stopwords will have least weights. In that situation, stopwords won't be as important as in CountVectorizer.