# Naive-Bayes Classifer for "Fake News" Dataset

The classes we are attempting to predict are:
- 0 => reliable news
- 1 => unreliable/fake news

## 3 steps to Naive-Bayes

- **Step 1:** *Separate dataset by class*
- **Step 2:** *Summarise dataset (i.e. calculate probabilties)*
- **Step 3:** *Predict for new data*

In [1]:
#imports go here

import pandas as pd
from dask import delayed
import numpy as np
import math
from time import sleep
import re
import string
import random
from collections import Counter
import sys
np.set_printoptions(threshold=sys.maxsize)
##Imports the song playing capacity.
import webbrowser

from csv import reader

In [2]:
def count_words(text):
    return Counter(text)

In [3]:
def loadData(name):
    return pd.read_csv(name, names = ['ID','Title','Author','Text','Label'])

In [4]:
def removeStringWords(string, badWords):
    size = len(badWords)
    for i in range(size):
        try:
            string.remove(badWords[i])
        except:
            continue
        
    return string

In [5]:
def createIndices(dataSize, percent):
    sizeChoice = int(dataSize * 0.2)
    randomChoices = random.sample(range(dataSize-1),sizeChoice)
    half = int(sizeChoice/2)
    firstHalf = randomChoices[0:half]
    secondHalf = randomChoices[half:]
    return firstHalf, secondHalf

In [6]:
def createDataSets(totData, percent):
    dataSize = len(totData)
    testIndices, validIndices = createIndices(dataSize, percent)
    trainIndices = np.empty(0)
    for i in range(dataSize):
        
        if i not in testIndices and i not in validIndices:
            trainIndices = np.append(trainIndices, i)
            
    testFrame = totData.drop(trainIndices)
    testFrame = testFrame.drop(validIndices)
    validFrame = totData.drop(trainIndices)
    validFrame = validFrame.drop(testIndices)
    trainFrame = totData.drop(testIndices)
    trainFrame = trainFrame.drop(validIndices)
    return trainFrame, testFrame, validFrame

In [7]:
# Jesse's method for cleaning data

#remove NaN, punctuation and new lines
def getCleanedData(data):
    feature_names = np.array(['ID', 'Title', 'Author', 'Text'])
    
    #convert NaNs in order to remove later
    for i in range(3):
        string_replacement = ""
        if i == 1:
            string_replacement = "-NO AUTHOR-"
        else:
            string_replacement = "NaN"
            
        for j in range(len(data)):
            if pd.isnull(data[j][i+1]):
                data[j][2] = string_replacement
                
    #punctuation to remove
    remove = string.punctuation
    remove = remove + "“"
    remove = remove + "”"
    remove = remove + "’"
    remove = remove + '‘'
    remove = remove + '—'
    remove = remove + '–'
    
    #remove rows that contain NaN
    data = data[np.all(data != "NaN", axis = 1)]
    
    #remove punctuation, new lines, and convert words to lowercase
    for i in range(3):
        for j in range(len(data)):
            data[j][i+1] = data[j][i+1].replace("\n","").translate(str.maketrans('', '', remove)).lower()
            
    #remove rows that contain '' after removing a bunch of things
    data = data[np.all(data != '', axis = 1)]
    return feature_names, data

In [8]:
def cleanString(text):
    badWords = ['not','you','at','from','of','us','in','have','yes','no','are','','for','but','that','it','this','he','she','they','that','a','an','who','where','there','his','her','their','i','my','we','our','were','the','if','as','and','in','on','we','to','also','so','is','its']
    try:
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.split(' ')
        text = removeStringWords(text, badWords)
        #print('success')
        #sleep(0.01)
        return text
    except:
        #print('fail')
        return

In [9]:
def appendArr(text, y):
    return text.append(y)

In [10]:
def getCounters(data, title, labelTitle, labelDesired):
        dataTitle = data.loc[data[labelTitle] == labelDesired]
        titleArray = dataTitle[title].to_numpy()
        results = []
        for i in titleArray:
            y = delayed(cleanString)(i)
            try:
                p = float(y[0])
                pass
            except:
                if y is not None:
                    #print(y)
                    appendArr(results,y)
                
        texts = delayed(results)
        return texts.compute()

In [11]:
def appendArr(text, y):
    return text.append(y)

In [12]:
def addCounters(prevCount, currCount):
    return prevCount + currCount

In [13]:
def tempNameCounter(dataFrame, title, labelTitle, labelDesired):
    titleCounterReal = getCounters(trainFrame, title, labelTitle, labelDesired)
    results = []
    resultstwo = []
    j = 0
    for i in titleCounterReal:
        y = delayed(count_words)(i)
        if j % 2:
            results.append(y)
        else:
            resultstwo.append(y) 
    bigCount = delayed(addCounters)(results,resultstwo)
    done = bigCount.compute() 
    #bigCount.visualize()
    sadMe = Counter()
    for i in done:
        sadMe = sadMe + count_words(i)
    return sadMe

In [14]:
def getRelevantInfo(dataFrame, title, labelTitle, labels):
    results = []
    j = 0
    for i in labels:
        #print(tempNameCounter(dataFrame, title, labelTitle, i))
        results.append(tempNameCounter(dataFrame, title, labelTitle, i))
        #print("result at index " + str(j) + " corresponds to " + title + " of label " + str(i))
        j +=1
        
    return results

In [15]:
def getTitles(dataFrame):
    print("Running Titles")
    return  getRelevantInfo(dataFrame, 'Title', 'Label', [0,1])

In [16]:
def getText(dataFrame):
    print("Running Text")
    return getRelevantInfo(dataFrame, 'Text', 'Label', [0,1])

In [17]:
def getAuthor(dataFrame):
    print("Running Text")
    return getRelevantInfo(dataFrame, 'Author', 'Label', [0,1])

In [18]:
def unzip(data):
    wordsReal, numberReal = [list(c) for c in zip(*list(data[0].items()))]
    wordsFake, numberFake = [list(c) for c in zip(*list(data[1].items()))]
    return [wordsReal,numberReal,wordsFake,numberFake]

In [19]:
def createAllDataString(results):
    #use the longest array to do the least work
    allWords = []
    allNumbers = []
    realWords = results[0]
    realNumbers = results[1]
    fakeWords = results[2]
    fakeNumbers = results[3]
    allWords = fakeWords
    allNumbers = fakeNumbers
   # print(fakeWords)
   # print(realWords)
    for i in realWords:
        indexReal = realWords.index(i)
        if i in allWords:
            indexAll = allWords.index(i)
            allNumbers[indexAll] = allNumbers[indexAll] + realNumbers[indexReal]
        else:
            #print("added Word")
            allWords.append(i)
            allNumbers.append(realNumbers[indexReal])
            
                
    #to verify just check that the lengths match up, the assumption is that there will be a smaller all words compared to realWords + fakeWords
    #print(len(allWords))
    #print(len(realWords) + len(fakeWords))
    return allWords, allNumbers
        

In [20]:
#Please only use this to test to see if it generates the binary data correctly it is referenced in createBinaryDataStrings.
#The idea is that should we have all the words captured the result array since it is constituted of words should be equal to the length of the the two sets of words at it's max but obviously
#we expect it to be a bit or a lot smaller than it, so I made a way of testing it to see that everytime it finds a word it counts it as a 1(for being true in the array)
#if the printed counter is not the same as teh array in size then clearly something must be missing.
#hence why this will definitely show that the above works
def testCreateBinaryDataStrings(realArray, fakeArray, resultArray):
    print(len(realArray))
    print(len(fakeArray))
    j = 0 
    for i in realArray:
        if i in resultArray:
            j += 1
    print(j)
    j = 0
    for i in fakeArray:
        if i in resultArray:
            j += 1
    print(j)
    print(len(resultArray))

In [21]:
def hotCodeWordArray(dataPoint, allData):
    # allData = np.append(allData, "SEXYGODZILLA") this is just me checking if it works
    binary = np.ones(0, dtype = bool)
    for i in allData:
        if i in dataPoint:
            binary = np.append(binary, True)
        else:
            binary = np.append(binary,False)
    return binary

In [22]:
def playSongWhenDone():
    # feel free to add songs or remove them as you want, just follow the format of nextNum,songLink
    temp = pd.read_csv('songs.csv', names = ['id', 'url'])
    ID = temp['id'].to_numpy()
    urls = temp['url'].to_numpy()
    i = random.sample(range(len(ID)),1)
    song = urls[i]
    try:
        webbrowser.open(song[0])
    except:
        print("no internet")

## Step 1: Separate dataset by class

Done by using the unzip() and getRelevantInfo() methods

We also separate our data into training, testing, and validation subsets.

In [48]:
%%time

# load dataset from CSV file
filename = 'tenPercent.csv'
data = loadData(filename)

# create training, testing, and validation datasets from dataset 'data'
trainFrame, testFrame, validFrame = createDataSets(data, 0.2)

# store data before cleaning as a backup
trainFrameRaw = trainFrame
testFrameRaw = testFrame
validFrameRaw = validFrame

# this returns an array of arrays of the form [realWords, realNumbers, fakeWords, fakeNumbers]
# realWords => words occurring in data classed as real, similarly for fakeWords
# realNumbers => number of occurrences of words in realWords, similarly for fakeNumbers
# i.e. realNumbers[0] = "apple", realNumbers[0] = 5 means that the word "apple" occurs 5 times in articles classed as real news.
# trainTextData = unzip(getText(trainFrame))

Wall time: 142 ms


In [38]:
%%time
# clean data, remove nulls and NaN values

trainFrameNumpy = trainFrame.to_numpy()
test_feature_names, trainFrameNumpy = getCleanedData(trainFrameNumpy)
test_feature_names = np.append(test_feature_names, 'Label')
n = len(trainFrameNumpy)
trainFrame = pd.DataFrame(trainFrameNumpy, np.arange(n), test_feature_names)

Wall time: 328 ms


In [39]:
# separate training data by class
# loc gets every row where the column value meets some condition

#I understand you are doing this for your class priors so cool
real = trainFrame.loc[trainFrame['Label'] == 0]
fake = trainFrame.loc[trainFrame['Label'] == 1]

In [40]:
%%time
trainTextData = unzip(getText(trainFrame))
trainTitleData = unzip(getTitles(trainFrame))

Running Text
Running Titles
Wall time: 24.9 s


In [25]:
%%time
#allTitles = createAllDataString(trainTitleData)
# createAllDataString creates a 2D array of [all unique words in dataset, occurrences of unique words]

Wall time: 334 ms


In [41]:
%%time
allText = createAllDataString(trainTextData)

Wall time: 51.4 s


In [42]:
# explicitly separate and store real words, fake words and their occurrences

realTextWords = trainTextData[0]
realTextNumbers = trainTextData[1]
fakeTextWords = trainTextData[2]
fakeTextNumbers = trainTextData[3]

## Step 2: Summarise the dataset (calculate probabilities)

We need:
   - the probability that an article is real or fake news
   - for each article, a vector of words that occur in it (vocabulary)
   - a vocab vector for real and fake, and the number of occurences of each word. (This is done already above.) The global vocabulary is just the union of realWords and fakeWords.
   
P(class given data) = ( product of P(Xi given class) * P(class) ) / P(Data)

In [43]:
# get total number of articles in training set
dataSize = len(trainFrame) 

# get number of real articles and number of fake articles
#this is fine because of what you did initially
numReal = len(real)
numFake = len(fake)

classPriors = np.array([(numReal/dataSize),(numFake/dataSize)])
print(classPriors)

[0.50866337 0.49133663]


### Get class conditional table

To do this we need:
- priors for each class
- to be able to vectorise the text of an article to show what words it contains that we have seen before.
    - this then means that we need to vectorise the text from our training articles as well, or some way to check if a training article contains a word we have seen in the entire training set.

In [44]:
realTitleWords = trainTitleData[0]
realTitleNumbers = trainTitleData[1]
fakeTitleWords = trainTitleData[2]
fakeTitleNumbers = trainTitleData[3]

In [45]:
#What this does is not a vectorise anymore, and since I need to try and save on memory space at times to help you down the road I changed the vectorised function.
#What this does is it creates 2 arrays, one array containing ALL words for both the fake and real entries of that piece you gave it.
#The other array is the array containing those counts, in order.
#it will seperate it for you, so long as you give it an all_____info where that ____ is whatever you wanted text,title or author
#allTitles = createAllDataString(trainTitleData)

In [46]:
#Next you have the vectorise, so the vectorise function does things a bit weirder but regardless will work on any string vector you give it.


#hotTitlesTrue = hotCodeWordArray(trainTitleData[0], allTitles[0])
#hotTitlesFake = hotCodeWordArray(trainTitleData[2], allTitles[0])
#hotTextTrue = hotCodeWordArray(trainTitleData[0], allText[0])
#hotTextFalse = hotCodeWordArray(trainTitleData[2], allText[0])

#the inputs are as follows, the result_____ (0 is for only real strings of that result and 2 for only fake strings of that result)
#always all____[0] since this contains all the strings for that thing. Whereas [1] contains all the counts
#This is all just a boolean set.


#if you give it a new entry it will work just fine, I recommend looking at your interaction with pandas that you be a little careful on how you interact with it.
#I personally would get it to to read a csv file similar to load data and then break it up into arrays where each array is that respective thing
#I.e a author, title, text and label array (which you can hide for testing but use in validation)
# a functional showcase is in this function - https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array
#you should want to do that and just give it the right data.


# I need to get each article as a boolean vector
# related to all the words we have seen over all of the articles, real and fake

# then iterate over each article's vector
# count articles that contain each word for real and for fake
# these count values go into the class conditional table
# classCon[0][0] => (# of real articles containing word 0)/(# of real articles)

In [47]:
%%time
# need to count occurrences of each word in real and fake articles
# this is where the actual training happens

numWords = len(allText[0])
classOccurrences = np.zeros([numWords, 2])
# every row of classOccurrences => [numRealArticles, numFakeArticles]

# for every article, update class occurrences
for i in range(len(trainFrame)):
    row = trainFrame.iloc[i]
    rowClass = row['Label']
    # vectorise article text
    rowVector = hotCodeWordArray(row['Text'], allText[0])
    print("Training from row %d" % i)
    
    # for every seen word
    for j in range(len(allText[0])):
        # if word j occurs in the article increment class occurences
        if rowVector[j] == True:
            if rowClass == 0:
                # real article
                classOccurrences[j][0] += 1
            else:
                # fake article
                classOccurrences[j][1] += 1
                

                
print(classOccurrences)

Training from row 0
Training from row 1
Training from row 2
Training from row 3
Training from row 4
Training from row 5
Training from row 6
Training from row 7
Training from row 8
Training from row 9
Training from row 10
Training from row 11
Training from row 12
Training from row 13
Training from row 14
Training from row 15
Training from row 16
Training from row 17
Training from row 18
Training from row 19
Training from row 20
Training from row 21
Training from row 22
Training from row 23
Training from row 24
Training from row 25
Training from row 26
Training from row 27
Training from row 28
Training from row 29
Training from row 30
Training from row 31
Training from row 32
Training from row 33
Training from row 34
Training from row 35
Training from row 36
Training from row 37
Training from row 38
Training from row 39
Training from row 40
Training from row 41
Training from row 42
Training from row 43
Training from row 44
Training from row 45
Training from row 46
Training from row 47
Tr

Training from row 378
Training from row 379
Training from row 380
Training from row 381
Training from row 382
Training from row 383
Training from row 384
Training from row 385
Training from row 386
Training from row 387
Training from row 388
Training from row 389
Training from row 390
Training from row 391
Training from row 392
Training from row 393
Training from row 394
Training from row 395
Training from row 396
Training from row 397
Training from row 398
Training from row 399
Training from row 400
Training from row 401
Training from row 402
Training from row 403
Training from row 404
Training from row 405
Training from row 406
Training from row 407
Training from row 408
Training from row 409
Training from row 410
Training from row 411
Training from row 412
Training from row 413
Training from row 414
Training from row 415
Training from row 416
Training from row 417
Training from row 418
Training from row 419
Training from row 420
Training from row 421
Training from row 422
Training f

Training from row 751
Training from row 752
Training from row 753
Training from row 754
Training from row 755
Training from row 756
Training from row 757
Training from row 758
Training from row 759
Training from row 760
Training from row 761
Training from row 762
Training from row 763
Training from row 764
Training from row 765
Training from row 766
Training from row 767
Training from row 768
Training from row 769
Training from row 770
Training from row 771
Training from row 772
Training from row 773
Training from row 774
Training from row 775
Training from row 776
Training from row 777
Training from row 778
Training from row 779
Training from row 780
Training from row 781
Training from row 782
Training from row 783
Training from row 784
Training from row 785
Training from row 786
Training from row 787
Training from row 788
Training from row 789
Training from row 790
Training from row 791
Training from row 792
Training from row 793
Training from row 794
Training from row 795
Training f

Training from row 1119
Training from row 1120
Training from row 1121
Training from row 1122
Training from row 1123
Training from row 1124
Training from row 1125
Training from row 1126
Training from row 1127
Training from row 1128
Training from row 1129
Training from row 1130
Training from row 1131
Training from row 1132
Training from row 1133
Training from row 1134
Training from row 1135
Training from row 1136
Training from row 1137
Training from row 1138
Training from row 1139
Training from row 1140
Training from row 1141
Training from row 1142
Training from row 1143
Training from row 1144
Training from row 1145
Training from row 1146
Training from row 1147
Training from row 1148
Training from row 1149
Training from row 1150
Training from row 1151
Training from row 1152
Training from row 1153
Training from row 1154
Training from row 1155
Training from row 1156
Training from row 1157
Training from row 1158
Training from row 1159
Training from row 1160
Training from row 1161
Training fr

Training from row 1476
Training from row 1477
Training from row 1478
Training from row 1479
Training from row 1480
Training from row 1481
Training from row 1482
Training from row 1483
Training from row 1484
Training from row 1485
Training from row 1486
Training from row 1487
Training from row 1488
Training from row 1489
Training from row 1490
Training from row 1491
Training from row 1492
Training from row 1493
Training from row 1494
Training from row 1495
Training from row 1496
Training from row 1497
Training from row 1498
Training from row 1499
Training from row 1500
Training from row 1501
Training from row 1502
Training from row 1503
Training from row 1504
Training from row 1505
Training from row 1506
Training from row 1507
Training from row 1508
Training from row 1509
Training from row 1510
Training from row 1511
Training from row 1512
Training from row 1513
Training from row 1514
Training from row 1515
Training from row 1516
Training from row 1517
Training from row 1518
Training fr

## Step 3: Predict for new data

Now that we have our class conditional table (just take the values from classOccurrences and divide by numReal and numFake) we can predict the class of an unseen article by vectorising it and using Bayes' Theorem:

- P(class|X) = (product of P(Xi|class)) * P(class) / P(X)

We define a method to predict the class of a given article, either from the body of the article or the title.

The function takes in
- some text (article text or title)
- vector of all unique words in training data
- class conditional model (w/o numReal and numFake denominators)
- class priors

and returns a value indicating the predicted class:
- 0 => real/reliable
- 1 => fake/unreliable

In [58]:
# plan to implement Laplace smoothing

def predict(text, allText, classCon, classPriors):
    textVector = hotCodeWordArray(text, allText)
    pGivenReal = 1
    pGivenFake = 1
    
    # get P(text|real) and P(text|fake)
    for i in range(len(textVector)):
        realCon = classCon[i][0]/numReal
        fakeCon = classCon[i][1]/numFake
        
        #check if word from training data is in article
        if textVector[i] == True:
            pGivenReal *= realCon
            pGivenFake *= fakeCon
        else:
            pGivenReal *= (1 - realCon)
            pGivenFake *= (1 - fakeCon)
    
    # calculate P(real|text) and P(fake|text)
    pReal = (pGivenReal * classPriors[0])/( (pGivenReal*classPriors[0]) + (pGivenFake*classPriors[1]))
    pFake = 1 - pReal
    
    # compare results and hence predict class
    if pReal > pFake:
        return 0
    else:
        return 1

Below we process the testing data for use in the predict() function

In [53]:
%%time
#testFrame = testFrame.drop('Label', axis=1)

# clean data, remove nulls and NaN values

testFrameNumpy = testFrame.to_numpy()
test_feature_names, testFrameNumpy = getCleanedData(testFrameNumpy)
n = len(testFrameNumpy)
testFrame = pd.DataFrame(testFrameNumpy, np.arange(n), test_feature_names)

Wall time: 38.9 ms
