# CS5830 Project 5: Naive Bayes

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Models
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Download dataset from https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification
# and save the CSV under data/ before running this cell.
# The first CSV column becomes the index; rows with any missing value are dropped.
df = pd.read_csv('data/WELFake_Dataset.csv', index_col=0).dropna()
df

In [None]:
### FUNCTIONS ###

def get_scores(y_label, y_pred):
    """Print precision, recall and F1 score for the positive class.

    Args:
        y_label: True labels.
        y_pred: Predicted labels, in the same order as ``y_label``.

    The scores are computed with ``average='binary'`` and ``pos_label=1``,
    so they only describe how well label 1 (treated as real news in this
    notebook) is predicted. Each value is rounded to four decimals before
    printing. Nothing is returned.
    """
    scores = precision_recall_fscore_support(
        y_label, y_pred, average='binary', pos_label=1
    )

    # The fourth element of `scores` (support) is not reported; zip stops
    # after the three named metrics.
    for metric_name, value in zip(('Precision', 'Recall', 'f1-score'), scores):
        print(f'{metric_name}: {round(value, 4)}')


def print_confusion_matrix(y_label, y_pred):
    """Draw a confusion-matrix heatmap annotated with counts and percentages.

    Args:
        y_label: True labels (shown on the 'Actual' axis).
        y_pred: Predicted labels (shown on the 'Predicted' axis).

    Each cell shows the raw count and, in parentheses on a second line, that
    count as a percentage of all samples rounded to two decimals. The heatmap
    is drawn with seaborn; nothing is returned.
    """
    counts = pd.crosstab(y_label, y_pred, rownames=['Actual'], colnames=['Predicted'])

    # Share of the grand total held by each cell, rendered as e.g. "12.34%".
    grand_total = counts.sum().sum()
    percent_text = np.round(counts / grand_total * 100, 2).astype(str) + '%'

    # fmt='' tells seaborn to print the annotation strings as-is.
    cell_text = counts.astype(str) + '\n(' + percent_text + ')'
    sns.heatmap(counts, annot=cell_text, fmt='', cmap='Oranges')


def get_news_probs(mnb: MultinomialNB, cv: CountVectorizer, X_train: np.ndarray, col_title: str):
    """Build a per-word table of class-conditional probabilities.

    Args:
        mnb: Fitted classifier; its ``feature_log_prob_`` rows are read as
            row 0 = fake news, row 1 = real news.
        cv: Fitted vectorizer supplying the vocabulary, in the same column
            order as ``mnb.feature_log_prob_``.
        X_train: Document-term count matrix the model was trained on. In this
            notebook it is the matrix returned by ``cv.fit_transform``.
        col_title: Name of the text column (e.g. "title"); capitalized and
            embedded in the probability column headers.

    Returns:
        A DataFrame indexed by word with the columns
        ``P(word | Fake <Col>)``, ``P(word | Real <Col>)``, ``Difference``
        (fake minus real), ``More Likely`` ('Fake' when the difference is
        positive, otherwise 'Real') and ``Occurences`` (column sums of
        ``X_train``).
    """
    words = cv.get_feature_names_out()
    header = col_title.capitalize()

    # feature_log_prob_ stores log-probabilities; exponentiate to get P(word | class).
    log_probs = mnb.feature_log_prob_
    p_fake = pd.Series(np.exp(log_probs[0]), index=words)
    p_real = pd.Series(np.exp(log_probs[1]), index=words)
    gap = p_fake - p_real

    # Summing over rows gives one total per vocabulary entry; flatten to 1-D
    # because the sum may come back as a 1 x n matrix.
    totals = np.asarray(X_train.sum(axis=0)).ravel()

    return pd.DataFrame({
        f'P(word | Fake {header})': p_fake,
        f'P(word | Real {header})': p_real,
        'Difference': gap,
        'More Likely': np.where(gap > 0, 'Fake', 'Real'),  # ties fall to 'Real'
        'Occurences': totals
    })

In [None]:
# Hold out a test set. The same split is reused by the text model and the
# combined vote in later cells. A fixed seed keeps the split, and therefore
# every reported score and word table, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df[['title', "text"]], df['label'], random_state=42
)

# Use count vectorizer for title, ignoring NLTK's English stop words.
cv = CountVectorizer(stop_words=stopwords.words('english'))
X_train_counts = cv.fit_transform(X_train["title"])  # vocabulary is learned from training titles only
X_test_counts = cv.transform(X_test["title"])
display(X_train_counts)

# Fit Naive Bayes on the title counts and predict the held-out titles.
mnb = MultinomialNB()
mnb.fit(X_train_counts, y_train)
y_pred_title = mnb.predict(X_test_counts)

get_scores(y_test, y_pred_title)
print_confusion_matrix(y_test, y_pred_title)
plt.title('Article Title Confusion Matrix')

In [None]:
# Per-word probabilities from the model currently held in `mnb` / `cv`
# (the title model, when the cells are run in order).
news_probs_title = get_news_probs(mnb, cv, X_train_counts, "title")

# Rank words by how often they occur in the training data, then keep the ten
# most frequent words leaning towards each class.
sorted_occuring = news_probs_title.sort_values(by='Occurences', ascending=False)
most_occuring_real = sorted_occuring.loc[lambda t: t['More Likely'] == 'Real'].head(10)
most_occuring_fake = sorted_occuring.loc[lambda t: t['More Likely'] == 'Fake'].head(10)

display(most_occuring_real)
display(most_occuring_fake)

In [None]:
# Re-fit the shared vectorizer and classifier on the article bodies.
# NOTE: this overwrites the title vocabulary, counts and model held in `cv`,
# `X_train_counts`, `X_test_counts` and `mnb`; the title predictions already
# stored in `y_pred_title` are unaffected.
X_train_counts = cv.fit_transform(X_train["text"])
X_test_counts = cv.transform(X_test["text"])

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_text = mnb.fit(X_train_counts, y_train).predict(X_test_counts)

get_scores(y_test, y_pred_text)
print_confusion_matrix(y_test, y_pred_text)
plt.title('Article Text Confusion Matrix')

In [None]:
# Per-word probabilities from the model currently held in `mnb` / `cv`
# (the text model, when the cells are run in order).
news_probs_text = get_news_probs(mnb, cv, X_train_counts, "text")

# Rank words by how often they occur in the training data, then keep the ten
# most frequent words leaning towards each class.
sorted_occuring = news_probs_text.sort_values(by='Occurences', ascending=False)
most_occuring_real = sorted_occuring.loc[lambda t: t['More Likely'] == 'Real'].head(10)
most_occuring_fake = sorted_occuring.loc[lambda t: t['More Likely'] == 'Fake'].head(10)

display(most_occuring_real)
display(most_occuring_fake)

In [None]:
# Vote using a simple logical OR: an article is predicted as 1 when either the
# title model or the text model predicts 1 (assumes both prediction arrays
# hold 0/1 integer labels for the same test rows).
y_pred_combined = np.bitwise_or(y_pred_title, y_pred_text)
get_scores(y_test, y_pred_combined)
print_confusion_matrix(y_test, y_pred_combined)
plt.title('Combined Confusion Matrix')