In [1]:
# importing all the required libraries

import pandas as pd
import numpy as np
import re
from collections import defaultdict # explained why was imported when used
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Loading the two datasets and adding the column stating whether the news instance is true or false

df_real = pd.read_csv('True.csv')
df_real['RealNews?'] = True
df_fake = pd.read_csv('Fake.csv')
df_fake['RealNews?'] = False

In [3]:
# Using pd.concat instead of df.append, as newer pandas version do not support .append for dataframes

df = pd.concat([df_real, df_fake], ignore_index=True)

In [4]:
# Verifying the length of the dataframe (44,898 rows)

len(df)

44898

In [5]:
# Verifying that df is a dataframe with title, text, subject, date, and RealNews? as its columns | (RealNews? -> target variable)

df.head()

Unnamed: 0,title,text,subject,date,RealNews?
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [6]:
# Creating a new column document for the entire df, where df['document']=(title + text)

df['document'] = df[['title', 'text']].agg(' '.join, axis=1)

In [7]:
# Converting the df['document'] into lower-case for simplicity

df['document'] = df['document'].apply(lambda x: x.lower())

In [8]:
df.head()

Unnamed: 0,title,text,subject,date,RealNews?,document
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True,"as u.s. budget fight looms, republicans flip t..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True,u.s. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True,senior u.s. republican senator: 'let mr. muell...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True,fbi russia probe helped by australian diplomat...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True,trump wants postal service to charge 'much mor...


In [9]:
# Split df into a training set and test set with 20% for testing and with shuffle=True we ensure randomness while splitting the data

df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True)

## Naive Baiyes Classification Process
This table shows the words and occurrence of that word in a particular class

    Term   |  T  |  F
    -------------------
    Word-1 | 36  | 23  
    Word-2 | 12  | 31  
    Word-3 | 43  | 11  
    -------------------
    Total  | 200 | 87

Now, if we want to find the class for a phrase that contains **Word-1** and **Word-3**, we will find the likelihood of that phrase belonging to a particular class and then classify it to the class with the maximum likelihood, which can be found by using the **Baye's Theorem** given as

    P(y|X) = P(X|y) * P(y)
            --------------
                P(X)

    if,
    X = x1, x2, x3, ..., xn
    then,
    P(y|x1, x2, x3, ..., xn) =  P(x1|y) * P(x2|y) * P(x3|y) * ... * P(xn|y) * P(y) * P(y)* P(y)* P(y)
                               -----------------------------------------------------------------------
                                             P(x1) * P(x2) * P(x3) * ... * P(xn)
    
    where,
    P(X|y) = P(x1|y) * P(x2|y) * P(x3|y) * ... * P(xn|y), and
    P(X) = P(x1) * P(x2) * P(x3) * ... * P(xn)

Coming back to our classification, we are considering the words (features i.e. X) to be independent of each other, and therefore we can calculate the likelihood by:

    P(y|X) = P(X|y) * P(y)

                                (!Word-2)      (Word-1)        (Word-3)            (Total T)   
    T-Likelihood (P(T|X))   =   ---------  x  -----------  x  -----------  x  -------------------
                                (Total T)      (Total T)       (Total T)      (Total T + Total F)


                                (!Word-2)      (Word-1)        (Word-3)            (Total F)   
    F-Likelihood (P(F|X))   =   ---------  x  -----------  x  -----------  x  -------------------
                                (Total F)      (Total F)       (Total F)      (Total T + Total F)

We are trying to implement the **Multinomial Naive Bayes** since this approach ensures that all relevant features, both present and absent, contribute to the final probability calculation for each class, and therefore we will also account for the absence of any word(s)


In [11]:
# Tokenizing words

# defaultdict from the collections library creates dictionaries with default values, making it easier to count words with Laplace smoothing.
# from collections import defaultdict

# Tokenize function
# This function tokenize splits a document into individual words based on non-word characters (e.g., spaces, punctuation) using a regular expression.

def tokenize(document):
    return re.split(r"\W+", document)


In [12]:
# Count word frequencies for each class with,
# Each word’s count is initialized to 1 (Laplace smoothing), which prevents zero probability for unseen words during classification.

# realWordCount dictionaries to count word occurrences in real news documents

realWordCount = defaultdict(lambda: 1)  # Laplace smoothing with initial count of 1

# fakeWordCount dictionaries to count word occurrences in fake news documents

fakeWordCount = defaultdict(lambda: 1) # Laplace smoothing with initial count of 1

# This loop iterates through each row in the training dataset.
# It tokenizes the document column of each row.
# For each word, if the document is RealNews? = True, it increments realWordCount[word]; otherwise, it increments fakeWordCount[word].

for _, row in df_train.iterrows():
    for word in tokenize(row['document']):
        if row['RealNews?']:
            realWordCount[word] += 1
        else:
            fakeWordCount[word] += 1


In [13]:
# Calculate total word counts

# totalRealWords store the total number of words in real news documents, by summing all word counts

totalRealWords = sum(realWordCount.values())

# totalFakeWords store the total number of words in fake news documents, by summing all word counts

totalFakeWords = sum(fakeWordCount.values())


In [14]:
# Probability functions

# Calculate the probability of a given word in real news by dividing the word’s count by the total word count for real

def prob_word_given_real(word):
    return realWordCount[word] / totalRealWords

# Calculate the probability of a given word in fake news by dividing the word count by the total word count for fake news.

def prob_word_given_fake(word):
    return fakeWordCount[word] / totalFakeWords


In [15]:
# Class priors i.e. P(y)

# prob_real represents the prior probability for real news

real_count = len(df_train[df_train['RealNews?']==True])
prob_real = real_count / len(df_train)

# prob_fake represents the prior probability for fake news

fake_count = len(df_train[df_train['RealNews?']==False])
prob_fake = fake_count / len(df_train)


In [16]:
# Vocabulary: all unique words in both realWordCount and fakeWordcount

vocab = set(realWordCount.keys()).union(set(fakeWordCount.keys()))

In [17]:
# Multinomial Naive Bayes Classifier Function

# log_prob_real and log_prob_fake are initialized as the logarithm of the prior probabilities for real and fake news.
# Using logarithms helps avoid underflow issues when multiplying small probabilities and won't really affect the actual result of the classifier
# Also [log(a*b) = log(a)+log(b)]

def classify(document):
    # Log of class priors
    log_prob_real = np.log(prob_real)
    log_prob_fake = np.log(prob_fake)
    
    # Tokenize the document
    
    document_words = set(tokenize(document))
    
    # Calculating log of the probabilities for each word in the vocabulary
    # If the word is in the document then we compute P(word | class), otherwise we compute P(!word | class) = 1 - P(word | class)
    
    for word in vocab:
        if word in document_words:
            log_prob_real += np.log(prob_word_given_real(word))
            log_prob_fake += np.log(prob_word_given_fake(word))
        
        else:
            log_prob_real += np.log(1 - prob_word_given_real(word))
            log_prob_fake += np.log(1 - prob_word_given_fake(word))
    
    # True if real news (log_prob_real > log_prob_fake), else False for fake news
    return log_prob_real > log_prob_fake


In [18]:
df_train.shape, df_test.shape

((35918, 6), (8980, 6))

In [19]:
# Predictions

# y_true stores the actual labels (real or fake) from the testing dataset.

y_true = df_test['RealNews?']

# y_pred stores the predicted labels generated by applying the classify function to each document in the testing dataset.

y_pred = df_test['document'].apply(classify)

In [20]:
# Report precision, recall, and F1 score for the classification performance.
# We can also set average='binary' as an attribute that will make the function to focus the metrics on the binary classification problem 

precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
print(f"Precision:{precision}, \nRecall: {recall}, \nF1 Score: {f1}")

Precision:0.967483506126296, 
Recall: 0.944994246260069, 
F1 Score: 0.9561066480381883
