# CS5830 Project 5: Naive Bayes

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Dataset source: https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification
# The first CSV column becomes the index, and any row with a missing value is
# discarded before the frame is displayed.
df = pd.read_csv('data/WELFake_Dataset.csv', index_col=0).dropna()
df

In [None]:
# average='binary' reports scores for a single class only; pos_label=0 selects
# the class this notebook treats as fake news.
def get_scores(y_label, y_pred, pos_label=0):
    """Print precision, recall and F1 score for one class of a binary problem.

    Args:
        y_label: True labels.
        y_pred: Predicted labels, in the same order as ``y_label``.
        pos_label: Label value whose scores are reported. Defaults to 0, the
            label this notebook treats as fake news. Without it scikit-learn
            falls back to scoring label 1.

    Returns:
        None. The three scores are printed, each rounded to 4 decimal places.
    """
    # The fourth returned value (support) is not reported, so it is discarded.
    precision, recall, f1score, _ = precision_recall_fscore_support(
        y_label, y_pred, average='binary', pos_label=pos_label
    )
    print(f'Precision: {round(precision, 4)}')
    print(f'Recall: {round(recall, 4)}')
    print(f'f1-score: {round(f1score, 4)}')

def print_confusion_matrix(y_label, y_pred):
    """Draw a heatmap of actual versus predicted labels.

    Cell colours follow the raw counts. The text written in each cell is that
    count's share of its row (the actual class), as a percentage rounded to
    two decimal places.

    Args:
        y_label: True labels (heatmap rows, titled 'Actual').
        y_pred: Predicted labels (heatmap columns, titled 'Predicted').
    """
    counts = pd.crosstab(y_label, y_pred, rownames=['Actual'], colnames=['Predicted'])

    # Divide every row by its own total so each row sums to 100%.
    row_totals = counts.sum(axis=1)
    row_percentages = counts.div(row_totals, axis=0) * 100
    cell_labels = row_percentages.round(2).astype(str) + '%'

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(counts, annot=cell_labels, fmt='', cmap='Oranges')

In [None]:
# Hold out a test split. The fixed seed keeps the split -- and therefore every
# score and word list computed from it -- identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df[['title', 'text']], df['label'], random_state=42
)

# Use a count vectorizer for the title. The vocabulary is learned from the
# training titles only and then applied unchanged to the test titles.
cv = CountVectorizer(stop_words='english')
X_train_counts = cv.fit_transform(X_train["title"])
X_test_counts = cv.transform(X_test["title"])
display(X_train_counts)

# Fit Naive Bayes on the title word counts and predict the held-out titles.
mnb = MultinomialNB()
mnb.fit(X_train_counts, y_train)
y_pred_title = mnb.predict(X_test_counts)

get_scores(y_test, y_pred_title)
print_confusion_matrix(y_test, y_pred_title)

In [None]:
# How many of the highest-ranked words are considered for each class.
TOP_N = 100

vocabulary = cv.get_feature_names_out()

# One row of per-word log probabilities per class; sorting the negated values
# orders each row's word indices from most to least probable.
coefficients = mnb.feature_log_prob_
most_influential_word_indices = np.argsort(-coefficients, axis=1)
most_influential_words = [vocabulary[idx] for idx in most_influential_word_indices]

# NOTE(review): row 0 is taken to be fake news and row 1 real news, i.e. the
# labels are assumed to be 0 = fake and 1 = real -- confirm against the dataset.
fake_news_top = list(most_influential_words[0][:TOP_N])
real_news_top = list(most_influential_words[1][:TOP_N])
fake_news_words = set(fake_news_top)
real_news_words = set(real_news_top)

# Words that appear in one class's top list but not in the other's.
fake_news_unique_words = fake_news_words - real_news_words
real_news_unique_words = real_news_words - fake_news_words

# Sets have no defined order, so rebuild rank-ordered lists for printing:
# the most probable word comes first and the output is stable across runs.
fake_news_ranked = [word for word in fake_news_top if word in fake_news_unique_words]
real_news_ranked = [word for word in real_news_top if word in real_news_unique_words]

print(f'Most influential fake news words: {fake_news_ranked}')
print(f'Most influential real news words: {real_news_ranked}')

In [None]:
# Use a dedicated vectorizer and classifier for the article body. Refitting
# the shared `cv` / `mnb` objects here would overwrite the title model, so any
# cell that inspects them afterwards would silently describe the text model.
text_cv = CountVectorizer(stop_words='english')
X_train_counts = text_cv.fit_transform(X_train["text"])
X_test_counts = text_cv.transform(X_test["text"])

# Fit Naive Bayes on the body word counts and predict the held-out articles.
text_mnb = MultinomialNB()
text_mnb.fit(X_train_counts, y_train)
y_pred_text = text_mnb.predict(X_test_counts)

get_scores(y_test, y_pred_text)
print_confusion_matrix(y_test, y_pred_text)

In [None]:
# Combine both models with an element-wise OR of their predictions: with 0/1
# labels, a sample is labelled 1 when either the title model or the text model
# predicts 1.
y_pred_combined = np.bitwise_or(y_pred_title, y_pred_text)

get_scores(y_test, y_pred_combined)