# Naive-Bayes Classifer for "Fake News" Dataset

The classes we are attempting to predict are:
- 0 => reliable news
- 1 => unreliable/fake news

## 3 steps to Naive-Bayes

- **Step 1:** *Separate dataset by class*
- **Step 2:** *Summarise dataset (i.e. calculate probabilties)*
- **Step 3:** *Predict for new data*

In [1]:
#imports go here

import pandas as pd
from dask import delayed
import numpy as np
import math
from time import sleep
import re
import string
import random
from collections import Counter
import sys
np.set_printoptions(threshold=sys.maxsize)
##Imports the song playing capacity.
import webbrowser

from csv import reader

In [2]:
def count_words(text):
    return Counter(text)

In [3]:
def loadData(name):
    return pd.read_csv(name, names = ['ID','Title','Author','Text','Label'])

In [4]:
def removeStringWords(string, badWords):
    size = len(badWords)
    for i in range(size):
        try:
            string.remove(badWords[i])
        except:
            continue
        
    return string

In [5]:
def createIndices(dataSize, percent):
    sizeChoice = int(dataSize * 0.2)
    randomChoices = random.sample(range(dataSize-1),sizeChoice)
    half = int(sizeChoice/2)
    firstHalf = randomChoices[0:half]
    secondHalf = randomChoices[half:]
    return firstHalf, secondHalf

In [6]:
def createDataSets(totData, percent):
    dataSize = len(totData)
    testIndices, validIndices = createIndices(dataSize, percent)
    trainIndices = np.empty(0)
    for i in range(dataSize):
        
        if i not in testIndices and i not in validIndices:
            trainIndices = np.append(trainIndices, i)
            
    testFrame = totData.drop(trainIndices)
    testFrame = testFrame.drop(validIndices)
    validFrame = totData.drop(trainIndices)
    validFrame = validFrame.drop(testIndices)
    trainFrame = totData.drop(testIndices)
    trainFrame = trainFrame.drop(validIndices)
    return trainFrame, testFrame, validFrame

In [7]:
# Jesse's method for cleaning data

#remove NaN, punctuation and new lines
def getCleanedData(data):
    feature_names = np.array(['ID', 'Title', 'Author', 'Text'])
    
    #convert NaNs in order to remove later
    for i in range(3):
        string_replacement = ""
        if i == 1:
            string_replacement = "-NO AUTHOR-"
        else:
            string_replacement = "NaN"
            
        for j in range(len(data)):
            if pd.isnull(data[j][i+1]):
                data[j][2] = string_replacement
                
    #punctuation to remove
    remove = string.punctuation
    remove = remove + "“"
    remove = remove + "”"
    remove = remove + "’"
    remove = remove + '‘'
    remove = remove + '—'
    remove = remove + '–'
    
    #remove rows that contain NaN
    data = data[np.all(data != "NaN", axis = 1)]
    
    #remove punctuation, new lines, and convert words to lowercase
    for i in range(3):
        for j in range(len(data)):
            data[j][i+1] = data[j][i+1].replace("\n","").translate(str.maketrans('', '', remove)).lower()
            
    #remove rows that contain '' after removing a bunch of things
    data = data[np.all(data != '', axis = 1)]
    return feature_names, data

In [8]:
def cleanString(text):
    badWords = ['not','you','at','from','of','us','in','have','yes','no','are','','for','but','that','it','this','he','she','they','that','a','an','who','where','there','his','her','their','i','my','we','our','were','the','if','as','and','in','on','we','to','also','so','is','its']
    try:
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.split(' ')
        text = removeStringWords(text, badWords)
        #print('success')
        #sleep(0.01)
        return text
    except:
        #print('fail')
        return

In [9]:
def appendArr(text, y):
    return text.append(y)

In [10]:
def getCounters(data, title, labelTitle, labelDesired):
        dataTitle = data.loc[data[labelTitle] == labelDesired]
        titleArray = dataTitle[title].to_numpy()
        results = []
        for i in titleArray:
            y = delayed(cleanString)(i)
            try:
                p = float(y[0])
                pass
            except:
                if y is not None:
                    #print(y)
                    appendArr(results,y)
                
        texts = delayed(results)
        return texts.compute()

In [11]:
def appendArr(text, y):
    return text.append(y)

In [12]:
def addCounters(prevCount, currCount):
    return prevCount + currCount

In [13]:
def tempNameCounter(dataFrame, title, labelTitle, labelDesired):
    titleCounterReal = getCounters(trainFrame, title, labelTitle, labelDesired)
    results = []
    resultstwo = []
    j = 0
    for i in titleCounterReal:
        y = delayed(count_words)(i)
        if j % 2:
            results.append(y)
        else:
            resultstwo.append(y) 
    bigCount = delayed(addCounters)(results,resultstwo)
    done = bigCount.compute() 
    #bigCount.visualize()
    sadMe = Counter()
    for i in done:
        sadMe = sadMe + count_words(i)
    return sadMe

In [14]:
def getRelevantInfo(dataFrame, title, labelTitle, labels):
    results = []
    j = 0
    for i in labels:
        #print(tempNameCounter(dataFrame, title, labelTitle, i))
        results.append(tempNameCounter(dataFrame, title, labelTitle, i))
        #print("result at index " + str(j) + " corresponds to " + title + " of label " + str(i))
        j +=1
        
    return results

In [15]:
def getTitles(dataFrame):
    print("Running Titles")
    return  getRelevantInfo(dataFrame, 'Title', 'Label', [0,1])

In [16]:
def getText(dataFrame):
    print("Running Text")
    return getRelevantInfo(dataFrame, 'Text', 'Label', [0,1])

In [17]:
def getAuthor(dataFrame):
    print("Running Text")
    return getRelevantInfo(dataFrame, 'Author', 'Label', [0,1])

In [18]:
def unzip(data):
    wordsReal, numberReal = [list(c) for c in zip(*list(data[0].items()))]
    wordsFake, numberFake = [list(c) for c in zip(*list(data[1].items()))]
    return [wordsReal,numberReal,wordsFake,numberFake]

In [19]:
def createAllDataString(results):
    #use the longest array to do the least work
    allWords = []
    allNumbers = []
    realWords = results[0]
    realNumbers = results[1]
    fakeWords = results[2]
    fakeNumbers = results[3]
    allWords = fakeWords
    allNumbers = fakeNumbers
   # print(fakeWords)
   # print(realWords)
    for i in realWords:
        indexReal = realWords.index(i)
        if i in allWords:
            indexAll = allWords.index(i)
            allNumbers[indexAll] = allNumbers[indexAll] + realNumbers[indexReal]
        else:
            #print("added Word")
            allWords.append(i)
            allNumbers.append(realNumbers[indexReal])
            
                
    #to verify just check that the lengths match up, the assumption is that there will be a smaller all words compared to realWords + fakeWords
    #print(len(allWords))
    #print(len(realWords) + len(fakeWords))
    return allWords, allNumbers
        

In [20]:
#Please only use this to test to see if it generates the binary data correctly it is referenced in createBinaryDataStrings.
#The idea is that should we have all the words captured the result array since it is constituted of words should be equal to the length of the the two sets of words at it's max but obviously
#we expect it to be a bit or a lot smaller than it, so I made a way of testing it to see that everytime it finds a word it counts it as a 1(for being true in the array)
#if the printed counter is not the same as teh array in size then clearly something must be missing.
#hence why this will definitely show that the above works
def testCreateBinaryDataStrings(realArray, fakeArray, resultArray):
    print(len(realArray))
    print(len(fakeArray))
    j = 0 
    for i in realArray:
        if i in resultArray:
            j += 1
    print(j)
    j = 0
    for i in fakeArray:
        if i in resultArray:
            j += 1
    print(j)
    print(len(resultArray))

In [21]:
def hotCodeWordArray(dataPoint, allData):
    # allData = np.append(allData, "SEXYGODZILLA") this is just me checking if it works
    binary = np.ones(0, dtype = bool)
    for i in allData:
        if i in dataPoint:
            binary = np.append(binary, True)
        else:
            binary = np.append(binary,False)
    return binary

In [22]:
def playSongWhenDone():
    # feel free to add songs or remove them as you want, just follow the format of nextNum,songLink
    temp = pd.read_csv('songs.csv', names = ['id', 'url'])
    ID = temp['id'].to_numpy()
    urls = temp['url'].to_numpy()
    i = random.sample(range(len(ID)),1)
    song = urls[i]
    try:
        webbrowser.open(song[0])
    except:
        print("no internet")

## Step 1: Separate dataset by class

Done by using the unzip() and getRelevantInfo() methods

We also separate our data into training, testing, and validation subsets.

In [23]:
%%time

# load dataset from CSV file
filename = 'Actual Data.csv'
data = loadData(filename)

# create training, testing, and validation datasets from dataset 'data'
trainFrame, testFrame, validFrame = createDataSets(data, 0.2)

# store data before cleaning as a backup
trainFrameRaw = trainFrame
testFrameRaw = testFrame
validFrameRaw = validFrame

# this returns an array of arrays of the form [realWords, realNumbers, fakeWords, fakeNumbers]
# realWords => words occurring in data classed as real, similarly for fakeWords
# realNumbers => number of occurrences of words in realWords, similarly for fakeNumbers
# i.e. realNumbers[0] = "apple", realNumbers[0] = 5 means that the word "apple" occurs 5 times in articles classed as real news.
# trainTextData = unzip(getText(trainFrame))

Wall time: 2.41 s


In [24]:
%%time
# clean data, remove nulls and NaN values

trainFrameNumpy = trainFrame.to_numpy()
test_feature_names, trainFrameNumpy = getCleanedData(trainFrameNumpy)
test_feature_names = np.append(test_feature_names, 'Label')
n = len(trainFrameNumpy)
trainFrame = pd.DataFrame(trainFrameNumpy, np.arange(n), test_feature_names)

Wall time: 7.17 s


In [25]:
# separate training data by class
# loc gets every row where the column value meets some condition

#I understand you are doing this for your class priors so cool
real = trainFrame.loc[trainFrame['Label'] == 0]
fake = trainFrame.loc[trainFrame['Label'] == 1]

In [26]:
%%time
trainTextData = unzip(getText(trainFrame))
trainTitleData = unzip(getTitles(trainFrame))

Running Text
Running Titles
Wall time: 15min 36s


In [27]:
%%time
#allTitles = createAllDataString(trainTitleData)
#createAllDataString creates a 2D array of [all unique words in dataset, occurrences of unique words]

Wall time: 0 ns


In [28]:
%%time
allText = createAllDataString(trainTextData)

Wall time: 13min 51s


In [29]:
%time
# explicitly separate and store real words, fake words and their occurrences

realTextWords = trainTextData[0]
realTextNumbers = trainTextData[1]
fakeTextWords = trainTextData[2]
fakeTextNumbers = trainTextData[3]

Wall time: 0 ns


## Step 2: Summarise the dataset (calculate probabilities)

We need:
   - the probability that an article is real or fake news
   - for each article, a vector of words that occur in it (vocabulary)
   - a vocab vector for real and fake, and the number of occurences of each word. (This is done already above.) The global vocabulary is just the union of realWords and fakeWords.
   
P(class given data) = ( product of P(Xi given class) * P(class) ) / P(Data)

In [30]:
# get total number of articles in training set
dataSize = len(trainFrame) 

# get number of real articles and number of fake articles
#this is fine because of what you did initially
numReal = len(real)
numFake = len(fake)

classPriors = np.array([(numReal/dataSize),(numFake/dataSize)])
print(classPriors)

[0.5123369 0.4876631]


### Get class conditional table

To do this we need:
- priors for each class
- to be able to vectorise the text of an article to show what words it contains that we have seen before.
    - this then means that we need to vectorise the text from our training articles as well, or some way to check if a training article contains a word we have seen in the entire training set.

In [31]:
realTitleWords = trainTitleData[0]
realTitleNumbers = trainTitleData[1]
fakeTitleWords = trainTitleData[2]
fakeTitleNumbers = trainTitleData[3]

In [32]:
#What this does is not a vectorise anymore, and since I need to try and save on memory space at times to help you down the road I changed the vectorised function.
#What this does is it creates 2 arrays, one array containing ALL words for both the fake and real entries of that piece you gave it.
#The other array is the array containing those counts, in order.
#it will seperate it for you, so long as you give it an all_____info where that ____ is whatever you wanted text,title or author
#allTitles = createAllDataString(trainTitleData)

In [33]:
#Next you have the vectorise, so the vectorise function does things a bit weirder but regardless will work on any string vector you give it.


#hotTitlesTrue = hotCodeWordArray(trainTitleData[0], allTitles[0])
#hotTitlesFake = hotCodeWordArray(trainTitleData[2], allTitles[0])
#hotTextTrue = hotCodeWordArray(trainTitleData[0], allText[0])
#hotTextFalse = hotCodeWordArray(trainTitleData[2], allText[0])

#the inputs are as follows, the result_____ (0 is for only real strings of that result and 2 for only fake strings of that result)
#always all____[0] since this contains all the strings for that thing. Whereas [1] contains all the counts
#This is all just a boolean set.


#if you give it a new entry it will work just fine, I recommend looking at your interaction with pandas that you be a little careful on how you interact with it.
#I personally would get it to to read a csv file similar to load data and then break it up into arrays where each array is that respective thing
#I.e a author, title, text and label array (which you can hide for testing but use in validation)
# a functional showcase is in this function - https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array
#you should want to do that and just give it the right data.


# I need to get each article as a boolean vector
# related to all the words we have seen over all of the articles, real and fake

# then iterate over each article's vector
# count articles that contain each word for real and for fake
# these count values go into the class conditional table
# classCon[0][0] => (# of real articles containing word 0)/(# of real articles)

In [34]:
%%time
# need to count occurrences of each word in real and fake articles
# this is where the actual training happens

numWords = len(allText[0])
classOccurrences = np.zeros([numWords, 2])
# every row of classOccurrences => [numRealArticles, numFakeArticles]

# for every article, update class occurrences
for i in range(len(trainFrame)):
    row = trainFrame.iloc[i]
    rowClass = row['Label']
    # vectorise article text
    rowVector = hotCodeWordArray(row['Text'], allText[0])
    if (i % 100) == 0:
        print("Training from row %d" % i)
    
    # for every seen word
    for j in range(len(allText[0])):
        # if word j occurs in the article increment class occurences
        if rowVector[j] == True:
            if rowClass == 0:
                # real article
                classOccurrences[j][0] += 1
            else:
                # fake article
                classOccurrences[j][1] += 1

Training from row 0
Training from row 100
Training from row 200
Training from row 300
Training from row 400
Training from row 500
Training from row 600
Training from row 700
Training from row 800
Training from row 900
Training from row 1000
Training from row 1100
Training from row 1200
Training from row 1300
Training from row 1400
Training from row 1500
Training from row 1600
Training from row 1700
Training from row 1800
Training from row 1900
Training from row 2000
Training from row 2100
Training from row 2200
Training from row 2300
Training from row 2400
Training from row 2500
Training from row 2600
Training from row 2700
Training from row 2800
Training from row 2900
Training from row 3000
Training from row 3100
Training from row 3200
Training from row 3300
Training from row 3400
Training from row 3500
Training from row 3600
Training from row 3700
Training from row 3800
Training from row 3900
Training from row 4000
Training from row 4100
Training from row 4200
Training from row 4300


## Step 3: Predict for new data

Now that we have our class conditional table (just take the values from classOccurrences and divide by numReal and numFake) we can predict the class of an unseen article by vectorising it and using Bayes' Theorem:

- P(class|X) = (product of P(Xi|class)) * P(class) / P(X)

We define a method to predict the class of a given article, either from the body of the article or the title.

The function takes in
- some text (article text or title)
- vector of all unique words in training data
- class conditional model (w/o numReal and numFake denominators)
- class priors

and returns a value indicating the predicted class:
- 0 => real/reliable
- 1 => fake/unreliable

In [141]:
# log-sum-exp normalisation trick to avoid underflow

def logsumexp(x, y):
    sum = 0
    max_value = max(x, y)
   #print("max_value: %f" % max_value)
    sum += math.exp(x - max_value)
    sum += math.exp(y - max_value)
   #print("sum: %f" % sum)
    
    return math.log10(sum) + max_value

In [151]:
# plan to implement Laplace smoothing
# currently the algorithm pretty much ignores unseen words

def predict(text, allText, classCon, classPriors, numReal, numFake):
    textVector = hotCodeWordArray(text, allText)
    pGivenReal = 0
    pGivenFake = 0
    
    # get P(text|real) and P(text|fake)
    for i in range(len(textVector)):
        realCon = (classCon[i][0]/numReal)
        fakeCon = (classCon[i][1]/numFake)
        
        if realCon == 0:
            realCon = math.log10(1 / (numReal + 1))
        else:
            realCon = math.log10(realCon)
            
        if fakeCon == 0:
            fakeCon = math.log10(1 / (numFake + 1))
        else:
            fakeCon = math.log10(fakeCon)
            
        #print(realCon)
        #print(fakeCon)
        
        #check if word from training data is in article
        if textVector[i] == True:
            pGivenReal += realCon
            pGivenFake += fakeCon
        else:
            rComp = 1 - (classCon[i][0]/numReal)
            fComp = 1 - (classCon[i][1]/numFake)
            if rComp != 0:
                pGivenReal += math.log10(1 - (classCon[i][0]/numReal))
            if fComp != 0:
                pGivenFake += math.log10(1 - (classCon[i][1]/numFake))
                
        #print("pGivenReal: %f        pGivenFake: %f" % (pGivenReal, pGivenFake))
            
    #print("pGivenReal: %f        pGivenFake: %f" % (pGivenReal, pGivenFake))
    
    # calculate P(real|text) and P(fake|text)
    pReal = pGivenReal + math.log10(classPriors[0])
    pFake = pGivenFake + math.log10(classPriors[1])
    #print("ADD PRIORS --- pReal: %f        pFake: %f" % (pReal, pFake))
    norm = logsumexp(pReal, pFake)
    #print("norm: %f" % norm)
    pReal = math.exp(pReal - norm)
    pFake = math.exp(pFake - norm)
    #print("NORMALISED --- pReal: %f        pFake: %f" % (pReal, pFake))
    
    sum = pReal + pFake
    #print("sum: %f" % sum)
    pReal /= sum
    pFake /= sum
    
    #pReal = (pGivenReal * classPriors[0])/( (pGivenReal*classPriors[0]) + (pGivenFake*classPriors[1]))
    #pFake = 1 - pReal
    
    #print()
    #print("pReal: %f        pFake: %f" % (pReal, pFake))
    
    # compare results and hence predict class
    if pReal > pFake:
        return 0
    else:
        return 1

Below we process the testing data for use in the predict() function

In [37]:
%%time


# clean data, remove nulls and NaN values

testFrame = testFrame.drop('Label', axis=1)
testFrameNumpy = testFrame.to_numpy()
test_feature_names, testFrameNumpy = getCleanedData(testFrameNumpy)
n = len(testFrameNumpy)

print(test_feature_names)

testFrame = pd.DataFrame(testFrameNumpy, np.arange(n), test_feature_names)

['ID' 'Title' 'Author' 'Text']
Wall time: 1.02 s


In [58]:
# print results to text files to avoid having to rerun this code :(
# also could make it easier to add articles later, improve model
temp = classOccurrences.astype(int)

outputFile = open("classOccurrences.txt", "w")
print(temp, file=outputFile)

outputFile.close()

In [59]:
allTextList = list(allText[0])
allTextFile = open("allText.txt", "w", encoding="utf-8")
print(allTextList, file=allTextFile)

allTextFile.close()

In [61]:
classOccurrences = temp

Now that the model has been trained and saved to a file (yay), test model on testing data

In [85]:
# need labels back, or at least be able to check testFrame with testFrameRaw, same indices

testIDs = testFrame.iloc[:, 0].values
print(testIDs.shape)
testingData = testFrameRaw.loc[testFrameRaw["ID"].isin(testIDs)]

(2022,)
ID                                                     1155
Title     Exclusive — Sarah Palin on Paul Ryan’s ‘RINO-C...
Author                                        Matthew Boyle
Text      Former Alaska Gov. Sarah Palin, the 2008 Repub...
Label                                                     0
Name: 1155, dtype: object

ID                                                     1155
Title     exclusive  sarah palin on paul ryans rinocare ...
Author                                        matthew boyle
Text      former alaska gov sarah palin the 2008 republi...
Name: 100, dtype: object


In [171]:
# test predict function on a small subset

pred = np.zeros(25)

for i in range(150, 170):
    article = testFrame.iloc[i]
    articleText = article['Text']
    prediction = predict(articleText, allText[0], classOccurrences, classPriors, numReal, numFake)
    rawArticle = testFrameRaw.loc[testFrameRaw["ID"] == article["ID"]]
    label = rawArticle['Label'].item()
    pred[i - 150] = prediction
    print("prediction: %d    label: %d" % (prediction, label))
    
print(pred)

prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
[0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0.
 0.]


In [175]:
%time
# testing

# for metrics
correct = 0
incorrect = 0
TP = 0
TN = 0
FP = 0
FN = 0
predictions = np.zeros(2022)

for i in range(len(testFrame)):
    if i % 10 == 0:
        print("Testing on article %d" % i)
        
    article = testFrame.iloc[i]
    articleText = article['Text']
    prediction = predict(articleText, allText[0], classOccurrences, classPriors, numReal, numFake)
    predictions[i] = prediction
    
    rawArticle = testFrameRaw.loc[testFrameRaw["ID"] == article["ID"]]
    label = rawArticle['Label'].item()
    
    print("prediction: %d    label: %d" % (prediction, label))
    
    # calculate values for metrics
    if rawArticle['Label'].item() == prediction:
        correct += 1
        if prediction == 0:
            # true negative - real news
            TN += 1
        else:
            # true positive - fake news
            TP += 1
    else:
        incorrect += 1
        if prediction == 0:
            # false negative - was actually fake news
            FN += 1
        else:
            # false positive - was actually real news
            FP += 1
            
            
# metrics
print("Correctly identified: %d" % correct)
print("Incorrectly identified: %d" % incorrect)
print("True Positives: %d" % TP)
print("True Negatives: %d" % TN)
print("False Positives: %d" % FP)
print("False Negatives: %d" % FN)
print()

print(predictions)
    

Wall time: 0 ns
Testing on article 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
Testing on article 10
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
Testing on article 20
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
Testing on article 30
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 1

prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
Testing on article 300
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
Testing on article 310
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
Testing on article 320
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    labe

prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
Testing on article 590
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
Testing on article 600
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
Testing on article 610
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 0    labe

prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
Testing on article 880
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
Testing on article 890
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
Testing on article 900
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    labe

prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
Testing on article 1170
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
Testing on article 1180
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
Testing on article 1190
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    l

prediction: 0    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
Testing on article 1460
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
Testing on article 1470
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 1
Testing on article 1480
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    l

prediction: 0    label: 1
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
Testing on article 1750
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
Testing on article 1760
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
Testing on article 1770
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    l

In [176]:
%%time
# calculate evaluation metrics

confusionMatrix = np.array([[TP, FN],
                           [FP, TN]])
error = (FP + FN) / (TP + FN + FP + TN)
accuracy = (TP + TN) / (TP + FN + FP + TN)
falseAlarm = FP / (FP + TN)
miss = FN / (TP + FN)
recall = TP / (TP + FN)
precision = TP / (TP + FP)

print("Error: %f" % error)
print("Accuracy: %f" % accuracy)
print("Precision: %f    Recall: %f" % (precision, recall))
print("Miss: %f    False Alarm: %f" % (miss, falseAlarm))
print("Recall: %f    False Alarm: %f" % (recall, falseAlarm))
print()
print("Confusion Matrix:")
print(confusionMatrix)

Wall time: 0 ns
Error: 0.311078
Accuracy: 0.688922
Precision: 0.676622    Recall: 0.675926
Miss: 0.324074    False Alarm: 0.299048
Recall: 0.675926    False Alarm: 0.299048

Confusion Matrix:
[[657 315]
 [314 736]]


Now to run the model on the validation set.

We first need to preprocess the dataset.

In [179]:
testingPredictions = predictions

In [177]:
%%time

# clean data, remove nulls and NaN values

validFrame = validFrame.drop('Label', axis=1)
validFrameNumpy = validFrame.to_numpy()
valid_feature_names, validFrameNumpy = getCleanedData(validFrameNumpy)
n = len(validFrameNumpy)

print(valid_feature_names)

validFrame = pd.DataFrame(validFrameNumpy, np.arange(n), valid_feature_names)

['ID' 'Title' 'Author' 'Text']
Wall time: 1.5 s


In [180]:
%%time
# validate predict function on a small subset

pred = np.zeros(25)

for i in range(150, 170):
    article = validFrame.iloc[i]
    articleText = article['Text']
    prediction = predict(articleText, allText[0], classOccurrences, classPriors, numReal, numFake)
    rawArticle = validFrameRaw.loc[validFrameRaw["ID"] == article["ID"]]
    label = rawArticle['Label'].item()
    pred[i - 150] = prediction
    print("prediction: %d    label: %d" % (prediction, label))
    
print(pred)

prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
[0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0.
 0.]
Wall time: 1min 39s


In [184]:
%%time
# validation

# for metrics
correct = 0
incorrect = 0
TP = 0
TN = 0
FP = 0
FN = 0
validPredictions = np.zeros(2009)

for i in range(2009):
    if i % 10 == 0:
        print("Testing on article %d" % i)
        
    article = validFrame.iloc[i]
    articleText = article['Text']
    prediction = predict(articleText, allText[0], classOccurrences, classPriors, numReal, numFake)
    validPredictions[i] = prediction
    
    rawArticle = validFrameRaw.loc[validFrameRaw["ID"] == article["ID"]]
    label = rawArticle['Label'].item()
    
    print("prediction: %d    label: %d" % (prediction, label))
    
    # calculate values for metrics
    if rawArticle['Label'].item() == prediction:
        correct += 1
        if prediction == 0:
            # true negative - real news
            TN += 1
        else:
            # true positive - fake news
            TP += 1
    else:
        incorrect += 1
        if prediction == 0:
            # false negative - was actually fake news
            FN += 1
        else:
            # false positive - was actually real news
            FP += 1
            
            
# metrics
print("Correctly identified: %d" % correct)
print("Incorrectly identified: %d" % incorrect)
print("True Positives: %d" % TP)
print("True Negatives: %d" % TN)
print("False Positives: %d" % FP)
print("False Negatives: %d" % FN)
print()

print(validPredictions)

Testing on article 0
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 0
Testing on article 10
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
Testing on article 20
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
Testing on article 30
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
pre

prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
Testing on article 300
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
Testing on article 310
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
Testing on article 320
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    labe

prediction: 0    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
Testing on article 590
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
Testing on article 600
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
Testing on article 610
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    labe

prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
Testing on article 880
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
Testing on article 890
prediction: 1    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
Testing on article 900
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 1    labe

prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
Testing on article 1170
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 0
Testing on article 1180
prediction: 0    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
Testing on article 1190
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    l

prediction: 0    label: 0
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 0    label: 1
Testing on article 1460
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
Testing on article 1470
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 1
Testing on article 1480
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    l

prediction: 1    label: 0
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
Testing on article 1750
prediction: 0    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
Testing on article 1760
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 0
prediction: 1    label: 0
prediction: 0    label: 0
prediction: 1    label: 1
prediction: 1    label: 1
Testing on article 1770
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 1    label: 1
prediction: 0    label: 1
prediction: 1    label: 0
prediction: 1    l

In [185]:
%%time
# calculate evaluation metrics

confusionMatrix = np.array([[TP, FN],
                           [FP, TN]])
error = (FP + FN) / (TP + FN + FP + TN)
accuracy = (TP + TN) / (TP + FN + FP + TN)
falseAlarm = FP / (FP + TN)
miss = FN / (TP + FN)
recall = TP / (TP + FN)
precision = TP / (TP + FP)

print("Error: %f" % error)
print("Accuracy: %f" % accuracy)
print("Precision: %f    Recall: %f" % (precision, recall))
print("Miss: %f    False Alarm: %f" % (miss, falseAlarm))
print("Recall: %f    False Alarm: %f" % (recall, falseAlarm))
print()
print("Confusion Matrix:")
print(confusionMatrix)

Error: 0.319064
Accuracy: 0.680936
Precision: 0.657371    Recall: 0.689655
Miss: 0.310345    False Alarm: 0.326996
Recall: 0.689655    False Alarm: 0.326996

Confusion Matrix:
[[660 297]
 [344 708]]
Wall time: 998 µs
