# CS5830 Project 5: Naive Bayes

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB



In [3]:
# Data source (WELFake): https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification
# The CSV is read from data/, its first column becomes the index, and every
# row containing a missing value is discarded before any modelling.
df = pd.read_csv('data/WELFake_Dataset.csv', index_col=0).dropna()
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [4]:
# average='binary' == only report scores for the single class named by pos_label
def get_scores(y_label, y_pred, pos_label=1):
    """Print precision and F1 score for one class of a binary classifier.

    Parameters
    ----------
    y_label : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_label``.
    pos_label : int or str, default=1
        Label of the class whose scores are reported. The default of 1 is
        what scikit-learn uses when ``pos_label`` is omitted, so existing
        calls print the same numbers as before.

    Returns
    -------
    None
        The scores are printed rather than returned.
    """
    # NOTE(review): the previous header comment claimed pos_label=0 ("fake
    # news"), but the call never passed it, so class 1 was being scored.
    # Confirm which label encodes fake news in this dataset and pass
    # pos_label explicitly at the call sites if class 0 is the one intended.
    precision, _recall, f1score, _support = precision_recall_fscore_support(
        y_label, y_pred, average='binary', pos_label=pos_label
    )
    print(f'Precision: {precision}')
    print(f'f1-score: {f1score}')

In [8]:
# Pin the shuffle seed so the train/test split -- and therefore every score
# reported below -- is reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df[['title', 'text']], df['label'], random_state=42
)

# Use count vectorizer for title: the vocabulary is learned from the training
# titles only and then applied unchanged to the test titles.
cv = CountVectorizer(stop_words='english')
X_train_counts = cv.fit_transform(X_train["title"])
X_test_counts = cv.transform(X_test["title"])
display(X_train_counts)

# Multinomial Naive Bayes on the title word counts.
mnb = MultinomialNB()
mnb.fit(X_train_counts, y_train)
y_pred_title = mnb.predict(X_test_counts)

get_scores(y_test, y_pred_title)

<53652x27658 sparse matrix of type '<class 'numpy.int64'>'
	with 462594 stored elements in Compressed Sparse Row format>

Precision: 0.8804737585569923
f1-score: 0.88465527594301


In [49]:
# Vocabulary of the fitted vectorizer and the per-class log-probabilities of
# the fitted model (one row per class, one column per vocabulary term).
vocabulary = cv.get_feature_names_out()
coefficients = mnb.feature_log_prob_

# Negating before argsort orders each row from highest to lowest
# log-probability; the indices are then mapped back to the actual words.
most_influential_word_indices = np.argsort(-coefficients, axis=1)
most_influential_words = [
    vocabulary[ranked_indices] for ranked_indices in most_influential_word_indices
]

# Row 0 is treated as the fake-news class and row 1 as the real-news class.
# NOTE(review): the recorded output lists terms such as 'reuters' under
# "fake", and the sample rows whose text starts with "(Reuters)" carry
# label 0 -- confirm the label encoding before trusting these names.
fake_top_terms = most_influential_words[0][:100]
real_top_terms = most_influential_words[1][:100]
fake_news_words = set(fake_top_terms)
real_news_words = set(real_top_terms)

# Keep only the terms that appear in one class's top 100 but not the other's.
fake_news_unique_words = fake_news_words.difference(real_news_words)
real_news_unique_words = real_news_words.difference(fake_news_words)

print(f'Most influential fake news words: {fake_news_unique_words}')
print(f'Most influential real news word: {real_news_unique_words}')

Most influential fake news words: {'ria', 'inquiry', 'backed', 'pakistan', 'kurds', 'strike', 'focus', 'mexican', 'strategy', 'coal', 'asylum', 'philippines', 'israeli', 'budget', 'russians', 'hold', 'past', 'sources', 'indonesia', 'boost', 'afghanistan', 'policies', 'asia', 'tillerson', 'hurricane', 'illinois', 'reuters', 'reforms', 'referendum', 'address', 'path', 'quit', 'thursday', 'arms', 'replace', 'duterte', 'poland', 'international', 'worker', 'abuse', 'meets', 'fears', 'donors', 'kerry', 'request', 'fires', 'heads', 'veto', 'draws', 'education', 'december', 'steps', 'committee', 'agrees', 'reform', 'japan', 'irish', 'remain', 'congo', 'passes', 'looms', 'chairman', 'efforts', 'coalition', 'watchdog', '18', 'puerto', 'ireland', 'capital', 'spanish', 'held', 'industry', 'libya', 'britain', 'ruling', 'hampshire', 'short', 'shutdown', 'rate', 'tensions', 'embassy', 'brazil', 'signals', 'spain', 'erdogan', 'probes', 'needed', 'bombing', 'detained', 'kremlin', 'conflict', 'jeb', 'ko

In [None]:
# Fit a dedicated vectorizer and classifier on the article bodies instead of
# refitting the title ones in place. Reusing `cv` / `mnb` here would overwrite
# the title model, so re-running any cell that inspects it (e.g. the
# influential-words analysis) would silently report text-model results.
cv_text = CountVectorizer(stop_words='english')
X_train_text_counts = cv_text.fit_transform(X_train["text"])
X_test_text_counts = cv_text.transform(X_test["text"])

mnb_text = MultinomialNB()
mnb_text.fit(X_train_text_counts, y_train)
y_pred_text = mnb_text.predict(X_test_text_counts)

get_scores(y_test, y_pred_text)

Precision: 0.9045718846197697
f1-score: 0.8870255464251775


In [None]:
# Combine the two classifiers with an element-wise OR of their predictions:
# a sample gets label 1 as soon as either the title model or the text model
# predicts 1, and label 0 only when both predict 0.
y_pred_combined = np.bitwise_or(y_pred_title, y_pred_text)

get_scores(y_test, y_pred_combined)

Precision: 0.8536298618595082
f1-score: 0.9017335058214747
