# Making the model (ignore)

In [None]:
!pip install kagglehub

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("emineyetm/fake-news-detection-datasets")

print("Path to dataset files:", path)

In [2]:
path_fake = path+'/News _dataset/Fake.csv'
path_real = path+'/News _dataset/True.csv'

In [3]:
import pandas as pd
import numpy as np

In [4]:
df_fake = pd.read_csv(path_fake)
df_real = pd.read_csv(path_real)

In [5]:
print("Lengths of this datasets:")
print("Fake:", len(df_fake))
print("Real:", len(df_real))

Lengths of this datasets:
Fake: 23481
Real: 21417


# Current approach: Use the entire data set. Train a Naive-Bayes model. (ignore)

In [10]:
df_fake['label'] = ['Fake' for i in range(len(df_fake))]
df_real['label'] = ['Real' for i in range(len(df_real))]
df_total = pd.concat([df_fake, df_real])

In [16]:
from sklearn.model_selection import train_test_split

x = df_total['text']
y = df_total['label']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

training_data = pd.DataFrame()
training_data['text'] = X_train
training_data['label'] = y_train
test_data = pd.DataFrame()
test_data['text'] = X_test
test_data['label'] = y_test

In [27]:
from collections import Counter
fake_fd = Counter()
real_fd = Counter()
fake_count = 0
real_count = 0

for row in training_data.iterrows():
  if row[1]['label'] == 'Fake': # For whatever reason all the info is in row[1]
    fake_count += 1
    for word in row[1]['text'].split():
      fake_fd[word] += 1
  if row[1]['label'] == 'Real':
    real_count += 1
    for word in row[1]['text'].split():
      real_fd[word] += 1

print(fake_fd.most_common(10))
print(real_fd.most_common(10))

[('the', 384632), ('to', 228062), ('of', 185759), ('and', 169501), ('a', 162009), ('in', 121691), ('that', 110339), ('s', 101856), ('is', 84173), ('for', 70399)]
[('the', 337221), ('to', 194431), ('of', 163017), ('a', 149018), ('and', 142920), ('in', 135237), ('on', 83598), ('that', 65814), ('for', 61912), ('said', 57525)]


In [29]:
# prior probabilities
fake_prior = fake_count / (len(training_data))
real_prior = real_count / (len(training_data))

print(fake_prior)
print(real_prior)

0.5239155855003063
0.47608441449969374


In [32]:
from math import log

# Prepares for iterations
real_total = real_fd.total()
fake_total = fake_fd.total()
predicted_real = 0
predicted_fake = 0
predictions = []

# Loops through each review in the development data
for row in test_data.iterrows():

  # Uses log of prior score as base probability
  fake_score = log(fake_prior)
  real_score = log(real_prior)

  # Loops through each word in the development data, computing it's real and fake score. (With log probabilities)
  for word in row[1]['text'].split():

    # Add-one smoothing implementation for positive
    if real_fd[word] == 0:
      real_score += log((real_fd[word]+1)/(real_total+len(real_fd)))
    else:
      real_score += log(real_fd[word]/real_total)

    # Add-one smoothing implementation for negative
    if fake_fd[word] == 0:
      fake_score += log((fake_fd[word]+1)/(fake_total+len(fake_fd)))
    else:
      fake_score += log(fake_fd[word]/fake_total)

  # Predicts label
  predicted_label = 'Real'
  if fake_score > real_score:
    predicted_label = 'Fake'
    predicted_fake += 1
  else:
    predicted_real += 1

  # Appends to
  predictions.append(predicted_label)

# Compute accuracy
true_prediction = 0
for prediction, row in zip(predictions, test_data.iterrows()):
  if prediction == row[1]['label']:
    true_prediction += 1
accuracy = true_prediction / len(test_data)

# Print the model accuracy
print("Model Accuracy: " + str(accuracy))

Model Accuracy: 0.9804008908685968


In [34]:
# Save current dataset
training_data.to_csv('training_data.csv')
test_data.to_csv('test_data.csv')

# Loading the model (ignore)

In [1]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from math import log
import math

# Load datasets
class FakeNewsDataset():
  def __init__(self, train_path, test_path):
    self.train_data = pd.read_csv(train_path)
    self.test_data = pd.read_csv(test_path)
    self.data = pd.concat([self.train_data, self.test_data])
    print(f"Initialized dataset: {len(self.data)}")

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]

# Naive-Bayes Model
class NaiveBayes():
  def __init__(self, dataset: FakeNewsDataset):

    # Initialize dataset. NOTE: Must be FakeNewsDataset structure
    self.dataset = dataset

    # Initialize prior variables
    self.fake_fd = Counter()
    self.real_fd = Counter()
    self.fake_prior = 0
    self.real_prior = 0
    fake_count = 0
    real_count = 0
    for row in self.dataset.train_data.iterrows():
      if row[1]['label'] == 'Fake': # For whatever reason all the info is in row[1]
        fake_count += 1
        for word in row[1]['text'].split():
          self.fake_fd[word] += 1
      if row[1]['label'] == 'Real':
        real_count += 1
        for word in row[1]['text'].split():
          self.real_fd[word] += 1
    self.fake_prior = fake_count / (len(self.dataset.train_data))
    self.real_prior = real_count / (len(self.dataset.train_data))

    # Test the model
    self.real_total = self.real_fd.total()
    self.fake_total = self.fake_fd.total()
    predictions = []

    # Loops through each review in the development data
    for row in self.dataset.test_data.iterrows():

      # Predict
      predicted_label, real_score, fake_score = self.predict(row[1]['text'])

      # Appends to
      predictions.append(predicted_label)

    # Compute accuracy
    true_prediction = 0
    for prediction, row in zip(predictions, self.dataset.test_data.iterrows()):
      if prediction == row[1]['label']:
        true_prediction += 1
    self.accuracy = (true_prediction / len(self.dataset.test_data)) * 100
    print(f"Initialized Naive-Bayes model with {self.accuracy:.2f} accuracy")

  def predict(self, text):

    # Uses log of prior score as base probability
    fake_score = log(self.fake_prior)
    real_score = log(self.real_prior)

    # Loops through each word in the development data, computing it's real and fake score. (With log probabilities)
    for word in text.split():

      # Add-one smoothing implementation for positive
      if self.real_fd[word] == 0:
        real_score += log((self.real_fd[word]+1)/(self.real_total+len(self.real_fd)))
      else:
        real_score += log(self.real_fd[word]/self.real_total)

      # Add-one smoothing implementation for negative
      if self.fake_fd[word] == 0:
        fake_score += log((self.fake_fd[word]+1)/(self.fake_total+len(self.fake_fd)))
      else:
        fake_score += log(self.fake_fd[word]/self.fake_total)

    # Choose which label to predict
    predicted_label = 'Real'
    if fake_score > real_score:
      predicted_label = 'Fake'
    return predicted_label, real_score, fake_score


Using the model

In [46]:
model = NaiveBayes(dataset)

Initialized Naive-Bayes model with 98.04 accuracy


In [57]:
# Examples from the same dataset
real_example = "PARIS (Reuters) - The prospect Donald Trump could become president of the United States is â€œa big worry,â€ Alain Juppe, the pollsters favorite to become President of France next year, was quoted as saying on Wednesday. â€œI donâ€™t know Mr Trump, but thereâ€™s a question mark and a big worry,â€ Juppe, who hopes to be the main candidate of the countryâ€™s center right in Aprilâ€™s presidential elections, told the magazine Paris Match. â€œHis total ignorance of Europe, his disdain for France, his isolationist and protectionist points of view, his outrageous simplifications, his constant changes of tack, are a real concern. But it is for the people of the United States to choose.â€ Trump is the Republican Party candidate for the U.S. presidential elections on Nov. 8."
fake_example = "This racist literally committed the hate crime in the name of Donald Trump.On Saturday morning, Khondoker Usama and his friend were getting gas for their vehicle at a Kwik Shop in Wichita, Kansas when they saw a white man hurling obscenities at a black man. Upon seeing Usama and his friend, the man focused on them. Then suddenly it turned onto us, calling us  brown trash, go home. Trump will win,  Usama recalled. You want to live in this country, you better leave,  the man warned Usama and his friend, who is Hispanic.Usama stood up to the bully and told him,  This is my country; who are you to tell me that? That s when the white man became violent, according to the Wichita Eagle.The exchange was heated, Usama said, and he tried to defuse the situation, but his friend got punched and taken to the ground. He said he tried to get between the attacker and his friend but then was pushed himself. He thought he saw the attacker reaching for his pocket and feared he had a weapon, he said, so he backed away and called 911. He kept kicking the student who was laying on the ground,  Usama said.  He was kicking him; it was a gut-wrenching scene. He saw that I was calling the police and got back on his motorcycle and circled around us and was saying  Trump, Trump, Trump, we will make America great again. You losers will be thrown out of the wall. Usama says he didn t know the man who attacked him and his friend, but he also doesn t  know why anyone would do anything so hateful and so wrong to any individual. This isn t the first time a Trump supporter has physically assaulted a person of color, although many of the incidents usually occur at Trump rallies.In North Carolina last week, a white Trump supporter sucker-punched a black man being led out of the rally by security. At the same rally, another white Trump supporter slapped a back man and yelled obscenities at him.And Trump rallies in St. Louis and Chicago nearly turned into race wars as white supporters hurled obscenities, racial epithets, and threats of violence towards protesters. Some protesters were even assaulted. Yet Donald Trump claims no one has gotten hurt at his rallies and he has called for more violence.As for Usama, he urged others to speak up if they find themselves being verbally or physically assaulted. There may be other people who are fearing the same thing. So this is really important in the times we are in, the challenges we are facing as minorities in this country; we better get united and we better speak up. Make no mistake, Donald Trump and the Republican Party are responsible for these racists and their actions. His white supporters literally think they are entitled to be assholes toward minorities and Trump only makes them feel bulletproof and immune to prosecution because he has said he would pay for their legal defense if they get arrested for assault or hate crimes.If Trump becomes president, this is what his America will look like. An America where white racists have free reign to harass and attack people of color at will because they have the blessing of the White House to do so. That s a scary and dangerous America to live in. Featured image via YouTube"

In [83]:
# Predict result.
# Note: Log probabilities means that the greater number
# will be chosen. (i.e. closer to 0)
prediction, real_score, fake_score = model.predict("Trump is president")
print(f"Prediction: {prediction}\n Real: {real_score}\n Fake: {fake_score}\n")

Prediction: Fake
 Real: -18.38230571356164
 Fake: -17.493687387107165



Another approach: The previous Naive-Bayes classifier classifies best on the level of documents. Let's see how well it does on the level of sentences.

In [105]:
import nltk
nltk.download('all') # or specific packages like 'punkt', 'wordnet'

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

# Final Model & Usage

In [None]:
!pip install kagglgehub
!pip install nltk

In [7]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from math import log
import math
import nltk
import kagglehub

# Load datasets (pre-existing)
class FakeNewsDataset():
  def __init__(self, train_path, test_path):
    self.train_data = pd.read_csv(train_path)
    self.test_data = pd.read_csv(test_path)
    self.data = pd.concat([self.train_data, self.test_data])
    print(f"Initialized dataset: {len(self.data)}")

  def __len__(self):
    return len(self.data)

# Load dataset (through kaggle)
class FakeNewsDatasetKaggle(FakeNewsDataset):
  def __init__(self):

    # Download latest version
    path = kagglehub.dataset_download("emineyetm/fake-news-detection-datasets")
    path_real = path+'/News _dataset/True.csv'
    path_fake = path+'/News _dataset/Fake.csv'

    # Shape the data into our desired format
    df_fake = pd.read_csv(path_fake)
    df_real = pd.read_csv(path_real)
    df_fake['label'] = ['Fake' for i in range(len(df_fake))]
    df_real['label'] = ['Real' for i in range(len(df_real))]

    # Split the data into 80/20 train-test
    df_total = pd.concat([df_fake, df_real])
    self.train_data, self.test_data = train_test_split(df_total, test_size=0.2)
    self.data = pd.concat([self.train_data, self.test_data])
    print(f"Initialized dataset: {len(self.data)}")

  def __len__(self):
    return len(self.data)

# Naive-Bayes Model
class NaiveBayes():
  def __init__(self, dataset: FakeNewsDataset, unit='article'):

    # Initialize dataset. NOTE: Must be FakeNewsDataset structure
    self.dataset = dataset

    # Initialize prior variables
    self.fake_fd = Counter()
    self.real_fd = Counter()
    self.fake_prior = 0
    self.real_prior = 0
    fake_count = 0
    real_count = 0
    total_count = 0
    for row in self.dataset.train_data.iterrows():
      if row[1]['label'] == 'Fake': # For whatever reason all the info is in row[1]
        fake_count += 1
        total_count += 1
        for word in row[1]['text'].split():
          self.fake_fd[word] += 1
      if row[1]['label'] == 'Real':
        real_count += 1
        total_count += 1
        for word in row[1]['text'].split():
          self.real_fd[word] += 1
    self.fake_prior = fake_count / total_count
    self.real_prior = real_count / total_count

    # Test the model
    self.real_total = self.real_fd.total()
    self.fake_total = self.fake_fd.total()
    predictions = []

    # Loops through each review in the development data
    for row in self.dataset.test_data.iterrows():

      # Handles articles
      if unit=='article':

        # Predict
        predicted_label, real_score, fake_score = self.predict(row[1]['text'])

        # Appends to
        predictions.append(predicted_label)

      # Handles sentences
      elif unit=='sentence':

        # Split into sentences
        sentences = nltk.sent_tokenize(row[1]['text'])

        # Iterate through each sentence
        sentence_predictions = []
        for sentence in sentences:

          # Predict
          predicted_label, real_score, fake_score = self.predict(sentence)

          # Appends to current sentence prediction
          sentence_predictions.append(predicted_label)

        # Append onto final prediction
        predictions.append(sentence_predictions)


    # Compute accuracy of article prediction
    if unit=='article':
      true_prediction = 0
      for prediction, row in zip(predictions, self.dataset.test_data.iterrows()):
        if prediction == row[1]['label']:
          true_prediction += 1
      self.accuracy = (true_prediction / len(self.dataset.test_data)) * 100

    # Compute accuracy of sentence prediction
    elif unit=='sentence':
      true_prediction = 0
      total_sentences = 0
      for prediction, row in zip(predictions, self.dataset.test_data.iterrows()):
        answer = row[1]['label']
        for pred in prediction:
          total_sentences += 1
          if pred == answer:
            true_prediction += 1
      self.accuracy = (true_prediction / total_sentences) * 100

    # Done
    print(f"Initialized Naive-Bayes model with {self.accuracy:.2f} accuracy")

  def predict(self, text):

    # Uses log of prior score as base probability
    fake_score = log(self.fake_prior)
    real_score = log(self.real_prior)

    # Loops through each word in the development data, computing it's real and fake score. (With log probabilities)
    for word in text.split():

      # Add-one smoothing implementation for positive
      if self.real_fd[word] == 0:
        real_score += log((self.real_fd[word]+1)/(self.real_total+len(self.real_fd)))
      else:
        real_score += log(self.real_fd[word]/self.real_total)

      # Add-one smoothing implementation for negative
      if self.fake_fd[word] == 0:
        fake_score += log((self.fake_fd[word]+1)/(self.fake_total+len(self.fake_fd)))
      else:
        fake_score += log(self.fake_fd[word]/self.fake_total)

    # Choose which label to predict
    predicted_label = 'Real'
    if fake_score > real_score:
      predicted_label = 'Fake'
    return predicted_label, real_score, fake_score


In [8]:
dataset = FakeNewsDatasetKaggle()
model = NaiveBayes(dataset, unit='article')

Initialized dataset: 44898
Initialized Naive-Bayes model with 98.10 accuracy


In [9]:
# Predict result.
# Note: Log probabilities means that the greater number
# will be chosen. (i.e. closer to 0)
prediction, real_score, fake_score = model.predict("We the people of the United States.")
print(f"Prediction: {prediction}\n Real: {real_score}\n Fake: {fake_score}\n")

Prediction: Real
 Real: -39.1225706285355
 Fake: -39.84096867386622



Conclusion: This model probably can't detect fake news as accurately because it doesn't have knowledge of the real world. However, it's pretty capable of detecting real news from fake news based off of the probabilities within the text. Therefore, due to this model's fast prediction speed, it's probably best to use this as a secondary source.


A potential pipeline could be:


1.   Naive Bayes flags a sentences as fake news.
2.   Another check is done by another model.
3.   A final check is done by checking if the sentence has any reputable sources.



