### import packages

In [None]:
import csv
import random
from sklearn.model_selection import train_test_split
import os
import numpy as np
import pandas as pd


### load data and make train-test split

In [None]:
def load_data(file_name):
    """Load the data rows of a CSV file and tag every row with a label.

    The first row of the file is treated as a header and skipped. Every
    remaining row gets one extra trailing element, ``file_name[7:11]``.
    For the paths built in ``main`` ("./data/Fake.csv", "./data/True.csv")
    that slice is "Fake" or "True".

    Args:
        file_name: Path of the comma-separated, UTF-8 encoded CSV file.

    Returns:
        A list of rows, each a list of strings with the label appended.
        An empty list is returned when the file does not exist, so callers
        can concatenate the result without checking for ``None``.
    """
    if not os.path.exists(file_name):
        print("Datei", file_name ,"nicht gefunden")
        return []

    label = file_name[7:11]
    # newline="" lets the csv module handle line endings itself, which keeps
    # quoted fields containing line breaks intact.
    with open(file_name, 'r', encoding="utf8", newline="") as csvfile:
        csv_reader_object = csv.reader(csvfile, delimiter=',')
        next(csv_reader_object, None)  # skip the header row (if there is one)
        csv_list = [row + [label] for row in csv_reader_object]

    # Report the number of data rows; an empty file yields 0 instead of -1.
    print(len(csv_list),"Einträge aus", file_name[7:], "geladen")
    return csv_list
        
def main():
    """Load the fake and true news CSV files and split them into two sets.

    Reads "./data/Fake.csv" and "./data/True.csv" through ``load_data``,
    concatenates the rows, shuffles them and splits them 80/20 with
    ``train_test_split``.

    Returns:
        A tuple ``(train_data, test_data)`` of row lists.
    """
    file_folder = "./data/"
    files = ["Fake.csv","True.csv"]

    main_data = []

    for element in files:
        file_name = file_folder+element
        # load_data may yield nothing for a missing file; treat that as
        # "no rows" instead of failing on the list concatenation.
        main_data += load_data(file_name) or []

    print("Es gibt insgesamt", len(main_data), "Einträge")

    # Shuffle so that fake and true rows are mixed before splitting.
    random.shuffle(main_data)
    train_data,test_data = train_test_split(main_data,test_size=0.2)

    print("Länge train_data:", len(train_data)," und Länge test_data:", len(test_data))

    return train_data,test_data


# TF-IDF

### the indexer

In [None]:
def getDocID(aFile):
    """Build a document id from the last three components of a path.

    The path is split three times with ``os.path.split``; the three tails
    are joined with "|" in their original left-to-right order, e.g.
    "a/b/c/d.txt" becomes "b|c|d.txt".
    """
    remaining = aFile
    tails = []
    for _ in range(3):
        remaining, tail = os.path.split(remaining)
        tails.append(tail)
    # tails were collected from right to left, so restore the path order
    return "|".join(reversed(tails))
    
def makeWordStats(aFile):
    """Index one document: update document frequencies and register the doc.

    The file content is split on single spaces. A token counts as a term
    when, after lower-casing, it contains no line break, is longer than
    three characters and consists of letters only. Each distinct term
    raises ``termBase[term]["numberDocsContainedTerm"]`` by one for this
    document. Afterwards the document is stored in ``docBase`` under its
    docID with its file name, its number of distinct terms ("wordCount")
    and its column index, and ``numberDocsSeen`` is incremented.

    Args:
        aFile: Path of the text file to index.
    """
    global termBase
    global numberDocsSeen
    global docBase

    # Read through a context manager so the handle is closed right away;
    # a crawl opens many files and must not leak descriptors.
    with open(aFile,"r") as handle:
        content=handle.read()

    alreadySeen=set()  # terms already counted for this document
    for s in content.split(" "):
        s=s.lower()
        if "\n" in s or len(s)<=3 or not s.isalpha() or s in alreadySeen:
            continue
        alreadySeen.add(s)
        if s in termBase:
            # reached only once per document and term because of alreadySeen
            termBase[s]["numberDocsContainedTerm"]+=1
        else:
            termBase[s]={"numberDocsContainedTerm":1}

    docBase[getDocID(aFile)]={"file":aFile,"wordCount":len(alreadySeen),"colIndex":numberDocsSeen}
    numberDocsSeen=numberDocsSeen+1

### the crawler

In [None]:
# Number of directory entries (files and sub-directories) visited so far.
callCounter=0
# Entries are only processed while callCounter stays below this limit;
# a low value like 50 reduces computation time.
stopAfterMax=10000


def crawl(startDir):
    """Walk startDir recursively and index every file that is found.

    Every directory entry increments the global callCounter. While the
    counter is still below stopAfterMax, sub-directories are crawled
    recursively and all other entries are passed to makeWordStats.
    """
    global callCounter

    for name in os.listdir(startDir):
        path = startDir + "/" + name
        is_directory = os.path.isdir(path)
        callCounter += 1
        if callCounter >= stopAfterMax:
            # limit reached: keep counting but do no further work
            continue
        if is_directory:
            crawl(path)
        else:
            makeWordStats(path)

In [None]:
# Reset the crawler state so that this run starts from scratch.
callCounter=0

termBase={}#maps each term to its statistics, starting with the number of documents that contain the term
docBase={}#maps each docID to the full file name, the number of distinct counted terms and the column position in the wordVector
numberDocsSeen=0#number of documents indexed so far; also used as the column index of the next document
# NOTE(review): maildir_base is not defined in the visible part of this file -- confirm it is set before this cell runs
crawl(maildir_base)

### build idf scores using a dict

In [None]:
def makeIDFfromstats():
    """Compute the idf of every term and assign matrix row indices.

    For each term in termBase the idf is -log(docs containing the term /
    numberDocsSeen); it is stored under "idf" and appended to idf_sorted
    as an (idf, term) tuple. idf_sorted is then sorted in descending
    order and each term receives its position in that list as "rowIndex".
    """
    global idf_sorted
    global termBase

    for word, stats in termBase.items():
        score = -np.log(stats["numberDocsContainedTerm"] / numberDocsSeen)
        stats["idf"] = score
        idf_sorted.append((score, word))

    # highest idf (rarest terms) first
    idf_sorted.sort(reverse=True)

    for position, (_, word) in enumerate(idf_sorted):
        termBase[word]["rowIndex"] = position

In [None]:
idf_sorted=[]#list of (idf, term) tuples; makeIDFfromstats fills it and sorts it by descending idf, which is handy for inspecting the result
makeIDFfromstats()

### compute word-vectors using tf-idf

In [None]:
def makeWordVector(aDocID):
    """Fill the tf-idf column of one document in the wordVectors matrix.

    The document's file is read again and its terms are counted with the
    same filter as in the indexing step: tokens are obtained by splitting
    on single spaces, lower-cased, and kept when they contain no line
    break, are longer than three characters and consist of letters only.
    For every term the product of its count in the document and its idf
    is written to ``wordVectors[rowIndex][colIndex]``.

    Args:
        aDocID: Key of the document in ``docBase``.

    Raises:
        KeyError: If the docID is unknown or a term of the document has no
            entry in ``termBase``.
    """
    global wordVectors
    global docBase
    global termBase

    fileName=docBase[aDocID]["file"]
    colIndex=docBase[aDocID]["colIndex"]

    # Context manager closes the handle; this runs once per document, so
    # leaked handles would pile up quickly.
    with open(fileName,"r") as handle:
        content=handle.read()

    # count how many times each term occurs in the document
    wordsInDocument={}
    for s in content.split(" "):
        s=s.lower()
        if "\n" not in s and len(s)>3 and s.isalpha():
            wordsInDocument[s]=wordsInDocument.get(s,0)+1

    # tf-idf per term: term frequency times inverse document frequency
    for term,termFreq in wordsInDocument.items():
        termStats=termBase[term]
        wordVectors[termStats["rowIndex"]][colIndex]=termFreq*termStats["idf"]


In [None]:
# Matrix layout: one row per term, one column per document.
# Rows follow the order of idf_sorted, so very frequent terms (low idf)
# end up at the bottom of the matrix.
numberRows=len(termBase)
numberCols=len(docBase)
wordVectors=np.zeros((numberRows,numberCols))
print("shape of wordVectors:",wordVectors.shape)

# fill one column per indexed document
for docID in docBase:
    makeWordVector(docID)

# Naive Bayes

### build statistics / learning phase

In [None]:
def analyze(aTweet,classified):
    """Count, per sentiment class, in how many tweets each token occurs.

    The tweet is split on single spaces and lower-cased. Tokens longer
    than two characters that consist of letters only are counted at most
    once per tweet: the counter of the given class in termBase[token] is
    incremented. New tokens get an entry with all three counters at zero.
    For any class other than "negative", "neutral" or "positive" a
    message is printed per token and no counter changes.
    """
    global termBase
    supported = ("negative", "neutral", "positive")
    counted = set()  # tokens already handled for this tweet

    for raw in aTweet.split(" "):
        word = raw.lower()
        if len(word) <= 2 or not word.isalpha() or word in counted:
            continue
        counted.add(word)
        counts = termBase.setdefault(word, {"negative":0,"neutral":0,"positive":0})
        if classified in supported:
            counts[classified] += 1
        else:
            print("class not supported",classified)

### crawl through the database

In [None]:
def crawl():
    """Feed every tweet of the global DataFrame df into the statistics.

    For each row, the "text" column is passed to analyze together with
    the "airline_sentiment" column, and the class is counted via
    countPrior.
    """
    # Iterate the two columns directly instead of building a row Series
    # twice per tweet with df.iloc[i]; the values and their order are the
    # same.
    for tweet, classified in zip(df["text"], df["airline_sentiment"]):
        analyze(tweet,classified)
        countPrior(classified)
def countPrior(classified):
    """Increment the tweet counter of the given class in the global prior.

    Classes other than "negative", "neutral" and "positive" are ignored.
    """
    for label in ("negative", "neutral", "positive"):
        if classified == label:
            prior[label] += 1
            break

In [None]:
# NOTE: this rebinds the termBase name that the TF-IDF section also assigns.
termBase={}#per token: number of tweets of each sentiment class that contain it; filled by analyze
prior={"negative":0,"neutral":0,"positive":0}#number of tweets seen per sentiment class; filled by countPrior
crawl()

### prediction phase

In [None]:
def predict(aText):
    """Print the log score of aText for each of the three sentiment classes.

    The text is split on single spaces and lower-cased. Only tokens that
    are longer than two characters, consist of letters only and pass
    isTokenNonZero contribute; each such token is printed. Per class the
    score is the sum of log(token count in class / tweets in class) over
    these tokens plus log(tweets in class / len(df)). Nothing is
    returned; the three scores are printed.
    """
    classes = ("negative", "neutral", "positive")
    scores = {label: 0 for label in classes}
    length = len(df)

    for raw in aText.split(" "):
        token = raw.lower()
        if not ((len(token)>2) and token.isalpha() and isTokenNonZero(token)):
            continue
        print(token)
        for label in classes:
            scores[label] = scores[label] + np.log(termBase[token][label]/prior[label])

    # add the log prior of every class
    for label in classes:
        scores[label] = scores[label] + np.log(prior[label]/length)

    for label in classes:
        print(label, scores[label])

def isTokenNonZero(token):
    """Return True when token is in termBase with no class counter <= 0."""
    if token not in termBase:
        return False
    counts = termBase[token]
    return not any(
        counts[label] <= 0 for label in ("negative", "neutral", "positive")
    )

In [None]:
#predict("Beispiel")