# Naive-Bayes Classifer for "Fake News" Dataset

The classes we are attempting to predict are:
- 0 => reliable news
- 1 => unreliable/fake news

## 3 steps to Naive-Bayes

- **Step 1:** *Separate dataset by class*
- **Step 2:** *Summarise dataset (i.e. calculate probabilties)*
- **Step 3:** *Predict for new data*

In [1]:
#imports go here

import pandas as pd
from dask import delayed
import numpy as np
import math
from time import sleep
import re
import string
import random
from collections import Counter
import sys
np.set_printoptions(threshold=sys.maxsize)
##Imports the song playing capacity.
import webbrowser

from csv import reader

In [2]:
def count_words(text):
    return Counter(text)

In [3]:
def loadData(name):
    return pd.read_csv(name, names = ['ID','Title','Author','Text','Label'])

In [4]:
def removeStringWords(string, badWords):
    size = len(badWords)
    for i in range(size):
        try:
            string.remove(badWords[i])
        except:
            continue
        
    return string

In [5]:
def createIndices(dataSize, percent):
    sizeChoice = int(dataSize * 0.2)
    randomChoices = random.sample(range(dataSize-1),sizeChoice)
    half = int(sizeChoice/2)
    firstHalf = randomChoices[0:half]
    secondHalf = randomChoices[half:]
    return firstHalf, secondHalf

In [6]:
def createDataSets(totData, percent):
    dataSize = len(totData)
    testIndices, validIndices = createIndices(dataSize, percent)
    trainIndices = np.empty(0)
    for i in range(dataSize):
        
        if i not in testIndices and i not in validIndices:
            trainIndices = np.append(trainIndices, i)
            
    testFrame = totData.drop(trainIndices)
    testFrame = testFrame.drop(validIndices)
    validFrame = totData.drop(trainIndices)
    validFrame = validFrame.drop(testIndices)
    trainFrame = totData.drop(testIndices)
    trainFrame = trainFrame.drop(validIndices)
    return trainFrame, testFrame, validFrame

In [7]:
# Jesse's method for cleaning data

#remove NaN, punctuation and new lines
def getCleanedData(data):
    feature_names = np.array(['ID', 'Title', 'Author', 'Text'])
    
    #convert NaNs in order to remove later
    for i in range(3):
        string_replacement = ""
        if i == 1:
            string_replacement = "-NO AUTHOR-"
        else:
            string_replacement = "NaN"
            
        for j in range(len(data)):
            if pd.isnull(data[j][i+1]):
                data[j][2] = string_replacement
                
    #punctuation to remove
    remove = string.punctuation
    remove = remove + "“"
    remove = remove + "”"
    remove = remove + "’"
    remove = remove + '‘'
    remove = remove + '—'
    remove = remove + '–'
    
    #remove rows that contain NaN
    data = data[np.all(data != "NaN", axis = 1)]
    
    #remove punctuation, new lines, and convert words to lowercase
    for i in range(3):
        for j in range(len(data)):
            data[j][i+1] = data[j][i+1].replace("\n","").translate(str.maketrans('', '', remove)).lower()
            
    #remove rows that contain '' after removing a bunch of things
    data = data[np.all(data != '', axis = 1)]
    return feature_names, data

In [8]:
def cleanString(text):
    badWords = ['not','you','at','from','of','us','in','have','yes','no','are','','for','but','that','it','this','he','she','they','that','a','an','who','where','there','his','her','their','i','my','we','our','were','the','if','as','and','in','on','we','to','also','so','is','its']
    try:
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.split(' ')
        text = removeStringWords(text, badWords)
        #print('success')
        #sleep(0.01)
        return text
    except:
        #print('fail')
        return

In [9]:
def appendArr(text, y):
    return text.append(y)

In [10]:
def getCounters(data, title, labelTitle, labelDesired):
        dataTitle = data.loc[data[labelTitle] == labelDesired]
        titleArray = dataTitle[title].to_numpy()
        results = []
        for i in titleArray:
            y = delayed(cleanString)(i)
            try:
                p = float(y[0])
                pass
            except:
                if y is not None:
                    #print(y)
                    appendArr(results,y)
                
        texts = delayed(results)
        return texts.compute()

In [11]:
def appendArr(text, y):
    return text.append(y)

In [12]:
def addCounters(prevCount, currCount):
    return prevCount + currCount

In [13]:
def tempNameCounter(dataFrame, title, labelTitle, labelDesired):
    titleCounterReal = getCounters(trainFrame, title, labelTitle, labelDesired)
    results = []
    resultstwo = []
    j = 0
    for i in titleCounterReal:
        y = delayed(count_words)(i)
        if j % 2:
            results.append(y)
        else:
            resultstwo.append(y) 
    bigCount = delayed(addCounters)(results,resultstwo)
    done = bigCount.compute() 
    #bigCount.visualize()
    sadMe = Counter()
    for i in done:
        sadMe = sadMe + count_words(i)
    return sadMe

In [14]:
def getRelevantInfo(dataFrame, title, labelTitle, labels):
    results = []
    j = 0
    for i in labels:
        #print(tempNameCounter(dataFrame, title, labelTitle, i))
        results.append(tempNameCounter(dataFrame, title, labelTitle, i))
        #print("result at index " + str(j) + " corresponds to " + title + " of label " + str(i))
        j +=1
        
    return results

In [15]:
def getTitles(dataFrame):
    print("Running Titles")
    return  getRelevantInfo(dataFrame, 'Title', 'Label', [0,1])

In [16]:
def getText(dataFrame):
    print("Running Text")
    return getRelevantInfo(dataFrame, 'Text', 'Label', [0,1])

In [17]:
def getAuthor(dataFrame):
    print("Running Text")
    return getRelevantInfo(dataFrame, 'Author', 'Label', [0,1])

In [18]:
def unzip(data):
    wordsReal, numberReal = [list(c) for c in zip(*list(data[0].items()))]
    wordsFake, numberFake = [list(c) for c in zip(*list(data[1].items()))]
    return [wordsReal,numberReal,wordsFake,numberFake]

In [19]:
def createAllDataString(results):
    #use the longest array to do the least work
    allWords = []
    allNumbers = []
    realWords = results[0]
    realNumbers = results[1]
    fakeWords = results[2]
    fakeNumbers = results[3]
    allWords = fakeWords
    allNumbers = fakeNumbers
   # print(fakeWords)
   # print(realWords)
    for i in realWords:
        indexReal = realWords.index(i)
        if i in allWords:
            indexAll = allWords.index(i)
            allNumbers[indexAll] = allNumbers[indexAll] + realNumbers[indexReal]
        else:
            #print("added Word")
            allWords.append(i)
            allNumbers.append(realNumbers[indexReal])
            
                
    #to verify just check that the lengths match up, the assumption is that there will be a smaller all words compared to realWords + fakeWords
    #print(len(allWords))
    #print(len(realWords) + len(fakeWords))
    return allWords, allNumbers
        

In [20]:
#Please only use this to test to see if it generates the binary data correctly it is referenced in createBinaryDataStrings.
#The idea is that should we have all the words captured the result array since it is constituted of words should be equal to the length of the the two sets of words at it's max but obviously
#we expect it to be a bit or a lot smaller than it, so I made a way of testing it to see that everytime it finds a word it counts it as a 1(for being true in the array)
#if the printed counter is not the same as teh array in size then clearly something must be missing.
#hence why this will definitely show that the above works
def testCreateBinaryDataStrings(realArray, fakeArray, resultArray):
    print(len(realArray))
    print(len(fakeArray))
    j = 0 
    for i in realArray:
        if i in resultArray:
            j += 1
    print(j)
    j = 0
    for i in fakeArray:
        if i in resultArray:
            j += 1
    print(j)
    print(len(resultArray))

In [21]:
def hotCodeWordArray(dataPoint, allData):
    # allData = np.append(allData, "SEXYGODZILLA") this is just me checking if it works
    binary = np.ones(0, dtype = bool)
    for i in allData:
        if i in dataPoint:
            binary = np.append(binary, True)
        else:
            binary = np.append(binary,False)
    return binary

In [22]:
def playSongWhenDone():
    # feel free to add songs or remove them as you want, just follow the format of nextNum,songLink
    temp = pd.read_csv('songs.csv', names = ['id', 'url'])
    ID = temp['id'].to_numpy()
    urls = temp['url'].to_numpy()
    i = random.sample(range(len(ID)),1)
    song = urls[i]
    try:
        webbrowser.open(song[0])
    except:
        print("no internet")

## Step 1: Separate dataset by class

Done by using the unzip() and getRelevantInfo() methods

We also separate our data into training, testing, and validation subsets.

In [23]:
%%time

# load dataset from CSV file
filename = 'tenPercent.csv'
data = loadData(filename)

# create training, testing, and validation datasets from dataset 'data'
trainFrame, testFrame, validFrame = createDataSets(data, 0.2)

# this returns an array of arrays of the form [realWords, realNumbers, fakeWords, fakeNumbers]
# realWords => words occurring in data classed as real, similarly for fakeWords
# realNumbers => number of occurrences of words in realWords, similarly for fakeNumbers
# i.e. realNumbers[0] = "apple", realNumbers[0] = 5 means that the word "apple" occurs 5 times in articles classed as real news.
# trainTextData = unzip(getText(trainFrame))


# separate training data by class
# loc gets every row where the column value meets some condition

#I understand you are doing this for your class priors so cool
real = trainFrame.loc[trainFrame['Label'] == 0]
fake = trainFrame.loc[trainFrame['Label'] == 1]


Wall time: 135 ms


In [24]:
%%time
trainTextData = unzip(getText(trainFrame))
trainTitleData = unzip(getTitles(trainFrame))

Running Text
Running Titles
Wall time: 28.4 s


In [25]:
%%time
allTitles = createAllDataString(trainTitleData)
# createAllDataString creates a 2D array of [all unique words in dataset, occurrences of unique words]

Wall time: 334 ms


In [26]:
%%time
allText = createAllDataString(trainTextData)

Wall time: 1min 13s


In [27]:
%%time
#hotTitlesReal = hotCodeWordArray(trainTitleData[0], allTitles[0])
#hotTitlesFake = hotCodeWordArray(trainTitleData[2], allTitles[0])
#hotTextReal = hotCodeWordArray(trainTitleData[0], allText[0])
#hotTextFake = hotCodeWordArray(trainTitleData[2], allText[0])

Wall time: 0 ns


In [28]:
# explicitly separate and store real words, fake words and their occurrences

realTextWords = trainTextData[0]
realTextNumbers = trainTextData[1]
fakeTextWords = trainTextData[2]
fakeTextNumbers = trainTextData[3]

## Step 2: Summarise the dataset (calculate probabilities)

We need:
   - the probability that an article is real or fake news
   - for each article, a vector of words that occur in it (vocabulary)
   - a vocab vector for real and fake, and the number of occurences of each word. (This is done already above.) The global vocabulary is just the union of realWords and fakeWords.
   
P(class given data) = ( product of P(Xi given class) * P(class) ) / P(Data)

In [29]:
# get total number of articles in training set
dataSize = len(trainFrame) 

# get number of real articles and number of fake articles
#this is fine because of what you did initially
numReal = len(real)
numFake = len(fake)

classPriors = np.array([(numReal/dataSize),(numFake/dataSize)])
print(classPriors)

[0.49278846 0.50721154]


## Get class conditional table

To do this we need:
- priors for each class
- to be able to vectorise the text of an article to show what words it contains that we have seen before.
    - this then means that we need to vectorise the text from our training articles as well, or some way to check if a training article contains a word we have seen in the entire training set.

In [30]:
# get class conditional table
# need for each word, (instances in class/classSize) for each class


#This below feels super dodgy given how the unzips work and especially given how the getText function works
#realText = unzip(getText(real))
#fakeText = unzip(getText(fake))
#Yeah it is pretty dodgy, this is what you want, you basically told it to sepukku but dishonourably. 
#I ran it and got an infinite loop.

#realTextWords = trainTextData[0]
#realTextNumbers = trainTextData[1]
#fakeTextWords = trainTextData[2]
#fakeTextNumbers = trainTextData[3]


# realText[0] => every unique word[i] in real articles
# realText[1] => number of occurrences of word[i] in real articles
# similarly for fakeText[0] and fakeText[1]

In [31]:
# we need to convert each article into boolean/binary vector words ---- I tried this, had a couple issues but I made it into a binary vector, 
# words encodes whether an article contains a word or not, for all words we have seen


#Here is what I did
#you dont want to use this for that
#I recommend the following

#allTitlesInfo = unzip(getTitles(trainFrame))
#allTextInfo = unzip(getText(trainFrame)) #Text will always take a bit of time
#allAuthorInfo = unzip(getAuthor(trainFrame))

#the results for the above have the following form, index 0 = true words
#index 0 = true words, index 1 = true words' respective counts 
#index 2 = fake words, index 3 = fake words' respective counts

#please avoid calling these get functions, use them once and then store them, they are quite computationally expensive and when you move on to
#the Actual Data, you might take ~30 min to do it once on getText. So always store.
# createBinaryAllDataString(getText(trainFrame))

#I will segment it here for that reason so that all our gets are in one area so that we dont have to wait ages to continue.
#Try and keep things that are expensive in their smallish segments so you dont have to worry about them too much

realTitleWords = trainTitleData[0]
realTitleNumbers = trainTitleData[1]
fakeTitleWords = trainTitleData[2]
fakeTitleNumbers = trainTitleData[3]

In [32]:
#What this does is not a vectorise anymore, and since I need to try and save on memory space at times to help you down the road I changed the vectorised function.
#What this does is it creates 2 arrays, one array containing ALL words for both the fake and real entries of that piece you gave it.
#The other array is the array containing those counts, in order.
#it will seperate it for you, so long as you give it an all_____info where that ____ is whatever you wanted text,title or author
allTitles = createAllDataString(trainTitleData)

In [33]:
#Next you have the vectorise, so the vectorise function does things a bit weirder but regardless will work on any string vector you give it.


hotTitlesTrue = hotCodeWordArray(trainTitleData[0], allTitles[0])
hotTitlesFake = hotCodeWordArray(trainTitleData[2], allTitles[0])
hotTextTrue = hotCodeWordArray(trainTitleData[0], allText[0])
hotTextFalse = hotCodeWordArray(trainTitleData[2], allText[0])

#the inputs are as follows, the result_____ (0 is for only real strings of that result and 2 for only fake strings of that result)
#always all____[0] since this contains all the strings for that thing. Whereas [1] contains all the counts
#This is all just a boolean set.


#if you give it a new entry it will work just fine, I recommend looking at your interaction with pandas that you be a little careful on how you interact with it.
#I personally would get it to to read a csv file similar to load data and then break it up into arrays where each array is that respective thing
#I.e a author, title, text and label array (which you can hide for testing but use in validation)
# a functional showcase is in this function - https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array
#you should want to do that and just give it the right data.


# I need to get each article as a boolean vector
# related to all the words we have seen over all of the articles, real and fake

# then iterate over each article's vector
# count articles that contain each word for real and for fake
# these count values go into the class conditional table
# classCon[0][0] => (# of real articles containing word 0)/(# of real articles)

In [44]:
# just get first real news article for testing vectorisation
#r = real.iloc[0]
#print("Pandas row:")
#print(row)

# attempt to unzip text from r, since r is still in Pandas format
#try not to, do a to_numpy on the columns you are interested and get an array for each, they are all the same length so it is okay and you could index all using one variable with their respective arrays
#This means you can have 3 naibe bayes that tell you the probability of it being a Real on title, on text and on author seperately
#Should help you find which is the best for seeing which is true, since if you want to do it on three seperate variables, you will need to look into statistics for a dependency to normalise this (which I know is not wanted)
#It is possible but it will be a thing we can do right at the end since they are independent up till then since you cant multiply those 3 probabilities by eachother without thinking about the dependency

#so it would be like P(real|title), p(real|text), p(real|author) since if you multiplied those you would unless all Ps are 1 would reduce probability and 
# p(real|title)*p(real|text)*p(real|author) + p(fake|title)*p(fake|text)*p(fake|author) != 1 (almost always) and hence a need to normalise and hence why I say dont try do that
#make one naive bayes function that is robus and use it on each of them.

# ^^^ this last line is the plan, I'm testing on the text since that seems to be running quickly enough
# though I see you are only testing on titles really for quicker testing
# it's a bit hard to keep track of the million different variables created, a lot of them redundant

# article in numpy array form
#realArr = real.to_numpy()
#print()
#print("Numpy row:")
#print(realArr[0])

Pandas row:
ID                                                       28
Title     Andrea Tantaros of Fox News Claims Retaliation...
Author                                            Jim Dwyer
Text      Andrea Tantaros, a former Fox News host, charg...
Label                                                     0
Name: 28, dtype: object


In [46]:
%%time
# need to count occurrences of each word in real and fake articles
# this is where the actual training happens

numWords = len(allText[0])
classOccurrences = np.zeros([numWords, 2])
# every row of classOccurrences => [numRealArticles, numFakeArticles]

# for every article, update class occurrences
for i in range(len(trainFrame)):
    row = trainFrame.iloc[i]
    rowClass = row['Label']
    # vectorise article text
    rowVector = hotCodeWordArray(row['Text'], allText[0])
    print("Training from row %d" % i)
    
    # for every seen word
    for j in range(len(allText[0])):
        # if word j occurs in the article increment class occurences
        if rowVector[j] == True:
            if rowClass == 0:
                # real article
                classOccurrences[j][0] += 1
            else:
                # fake article
                classOccurrences[j][1] += 1
                

                
print(classOccurrences)

Training from row 0
Training from row 1
Training from row 2
Training from row 3
Training from row 4
Training from row 5
Training from row 6
Training from row 7
Training from row 8
Training from row 9
Training from row 10
Training from row 11
Training from row 12
Training from row 13
Training from row 14
Training from row 15
Training from row 16
Training from row 17
Training from row 18
Training from row 19
Training from row 20
Training from row 21
Training from row 22
Training from row 23
Training from row 24
Training from row 25
Training from row 26
Training from row 27
Training from row 28
Training from row 29
Training from row 30
Training from row 31
Training from row 32
Training from row 33
Training from row 34
Training from row 35
Training from row 36
Training from row 37
Training from row 38
Training from row 39
Training from row 40
Training from row 41
Training from row 42
Training from row 43
Training from row 44
Training from row 45
Training from row 46
Training from row 47
Tr

TypeError: argument of type 'float' is not iterable