### import packages

In [1]:
import csv
import random
from sklearn.model_selection import train_test_split
import os
import numpy as np
import pandas as pd
import math

### load Data and create training- and testdata


In [2]:
def load_data(file_name):

    if os.path.exists(file_name): #Kontrolle, ob die Daten vorhanden sind
            with open(file_name, 'r', encoding="utf8") as csvfile: #Öffnen und auslesen der Datei 
                csv_reader_object = csv.reader(csvfile, delimiter=',')
                counter = 0
                csv_list = []
                for row in csv_reader_object: #Jede Zeile in den Daten wird ausgelesen und in einer Liste gespeichert
                    #print(row)
                    if counter == 0:
                        pass
                    else:
                        csv_list.append(row+[file_name[7:11]])
                    counter += 1
            
            #print(counter-1,"Einträge aus", file_name[7:], "geladen")
            return csv_list        
    else:
        print("Datei", file_name ,"nicht gefunden") 
        
def get_data():
    file_folder = "./data/"
    files = ["Fake.csv","True.csv"]

    main_data = []

    for element in files: #Jede Datei aus der Liste wird ausgelesen
        file_name = file_folder+element
        main_data += (load_data(file_name))

    #print("Es gibt insgesamt", len(main_data), "Einträge")

    random.shuffle(main_data) #Randomizieren aller Daten
    main_data,unused_data = train_test_split(main_data,test_size=0.90) #Reduzierung des Datensatz auf 30%
    train_data,test_data = train_test_split(main_data,test_size=0.2) #Unterteilung in Training- und Testdaten

    #print("Länge train_data:", len(train_data)," und Länge test_data:", len(test_data))
    #print(train_data[0][3])

    return train_data,test_data

# Vorverarbeitung

In [3]:
def clear_data(data):

    qwe_list = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n1234567890“”,'‘’…¿0\\"
    forbidden_words = ["pictwitter","http","reuters","\xa0"]
    replace_words = []
    data_clear = []
    
    for news in data: #Für jeden Datensatz werden Zahlen und Sonderzeichen herausgefilter und in einzelne Wörter unterteilt
        record = news[0]+news[1]
        for element in qwe_list:
            record = record.replace(element," ")
        record = record.split(" ")

        tmp_list = []
        for element in record: #Für jedes Wort im Datensatz wird kontrolliert, ob es erlaubt ist und in Kleinbuchstaben gesetzt
            element = element.replace(" ","").lower()
            skip = 0
            for fbw in forbidden_words:
                if fbw in element:
                    skip = 1
                    break                    
            
            if len(element)>1 and skip == 0 : #jedes Wort muss eine minimal Länge von 2 Zeichen besitzen
                tmp_list.append(element)
            
        news = [tmp_list,news[4]]
        data_clear.append(news)
    return data_clear

# TF-IDF

In [4]:
def tf_idf_train(train_data): #Umwandlung der Wörter im Trainingdatensatz in Wahrscheinlichkeiten
    all_word_count = {}
    news_count = len(train_data) 
    news_tf = {}
    unique_words = []
    index = 0
    
    for news in train_data: 
        #Für jeden Datensatz der Trainingsdaten wird ein temporäres Dict erstellt 
        #in welchem die Anzahl der vorkommenden Wörter gespeichert wird.
        len_news = len(news[0])
        tmp_word_count = {}
        news_tf.update({index:{}})
                      
        for word in news[0]:
            if word not in unique_words:
                unique_words.append(word)
            
            if word in tmp_word_count.keys():
                tmp_word_count[word] += 1
            else:
                tmp_word_count.update({word:1})
                
        for word in tmp_word_count.keys():
            
            news_tf[index].update({word:tmp_word_count[word]/len_news}) #{word:tf}
            
            if word in all_word_count.keys():
                all_word_count[word] += 1
            else:
                all_word_count.update({word:1})
                
        index += 1

    idf_dict = {} #{word:idf}
    for word in all_word_count.keys():
        df = all_word_count[word]/news_count #df
        idf = math.log(news_count/(df+1),10)
        idf_dict.update({word:idf})
        #tf_idf = tf * idf

    unique_words.sort() #Alphanumerisches Sortieren des Datensatz
    unique_word_count = len(unique_words) #Länge der Liste mit einmaligen Wörtern
    
    #print(unique_words) #Alle Wörter, welche in allen Texten vorkommen
    #print(news_count) #Anzahl Nachrichtentexte
    #print(news_tf.keys()) #Dict mit Wort und tf-Wert
    
    #Shape der Matrix = news_count*unique_word_count+1 | das +1 ist für den class_value(True/Fake)(1/0)
    tf_idf_matrix = np.zeros((news_count, unique_word_count+1), dtype=float) 
    # Erstellen einer Nullmatrix mit Anzahl Texte*Anzahl Wörter
    
    print(len(all_word_count.keys()))
    
    for news_index in range(news_count):
        if train_data[news_index][-1] == "True":
            tf_idf_matrix[news_index,unique_word_count] = 1
            
        for word_index in range(unique_word_count):
            if unique_words[word_index] in news_tf[news_index].keys():
                tf_idf_matrix[news_index,word_index] = news_tf[news_index][unique_words[word_index]] * idf_dict[unique_words[word_index]]

    return tf_idf_matrix,news_count,all_word_count

#idf   
#tf(t,d) = count of t in d / number of unique_words in d
#df(t) = occurrence of t in documents
#idf(t) = N/df
#idf(t) = log(N/(df + 1))
#tf-idf(t, d) = tf(t, d) * log(N/(df + 1))

In [5]:
train_data,test_data = get_data()
train_data_clear = clear_data(train_data)
test_data_clear = clear_data(test_data)

training_data_tfidf, total_rows,all_word_count = tf_idf_train(train_data_clear)

In [7]:
def tf_idf_test(test_data,all_word_count):#Umwandlung der Wörter im Testdatensatz in Wahrscheinlichkeiten
    news_count = len(test_data) 
    news_tf = {}
    index = 0
    unique_words = all_word_count.keys()
    unique_word_count = len(unique_words)
    
    for news in train_data: 
        #Für jeden Datensatz der Testdaten wird ein temporäres Dict erstellt 
        #in welchem die Anzahl der vorkommenden Wörter gespeichert wird.
        len_news = len(news[0])
        tmp_word_count = {}
        news_tf.update({index:{}}) 
        
        for word in news[0]: 
            if word not in unique_words: #Wörter, welche nicht im Trainingsdatensatz vorkommen werden aussortiert
                break

            if word in tmp_word_count.keys():
                tmp_word_count[word] += 1
            else:
                tmp_word_count.update({word:1})

            for word in tmp_word_count.keys():

                news_tf[index].update({word:tmp_word_count[word]/len_news}) #{word:tf}
                all_word_count[word] += 1
                #if word in all_word_count.keys():       
            index += 1

    idf_dict = {} #{word:idf}
    for word in all_word_count.keys():
        df = all_word_count[word]/news_count #df
        idf = math.log(news_count/(df+1),10)
        idf_dict.update({word:idf})
        
    print(len(all_word_count.keys()))    
        
    print(news_count, unique_word_count+1)        
    
    tf_idf_matrix = np.zeros((news_count, unique_word_count+1), dtype=float)
    for news_index in range(news_count):
            
        for word_index in range(unique_word_count):
            print(unique_words[word_index],news_tf[news_index].keys())
            if unique_words[word_index] in news_tf[news_index].keys():
                tf_idf_matrix[news_index,word_index] = news_tf[news_index][unique_words[word_index]] * idf_dict[unique_words[word_index]]
    return tf_idf_matrix
tf_idf_test(test_data,all_word_count)

41707
898 41708


TypeError: 'dict_keys' object is not subscriptable

# Naives Bayes


In [None]:
def summarize_dataset(dataset): #Berechnung des Durschnittswert, Standardbweichung und Anzahl im Datensatz
    stats = [(np.mean(row),np.std(row),len(row)) for row in zip(*dataset)]
    return stats

def naives_bayes(tf_idf_matrix): #Anwendung des Naives Bayes Algorithmus 
    
    separated = {0:[],1:[]}
    summaries = {}
    
    for element in range(len(tf_idf_matrix)): #Aufteilen des Datensatz in True und Fake
        vector = tf_idf_matrix[element]
        class_value = vector[-1]
        separated[class_value].append(vector[:-1:])
    
    for class_value, rows in separated.items(): #Speichern 
        summaries[class_value] = summarize_dataset(rows)

    return summaries


## Probabilities

In [None]:
def pdf(x, mean, std): #Berechnung der (Gaussian) probability distribution function
    exponent = math.exp(-((x-mean)**2 / (2 * std**2 )))
    return (1 / (math.sqrt(2 * math.pi) * std)) * exponent

In [None]:
def prob(total_rows,summaries,new_news):
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for element in range(len(class_summaries)):
            mean, std, count = class_summaries[element]
            probabilities[class_value] *= pdf(new_news[element], mean, std)

In [None]:
# Vorhersage der class_value
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

## Test


In [None]:
def test(train_data, test_data): #Lernen des Algorithmus und Testen des Algorithmus mit Testdaten
    
    summarize = summarize_by_class(train_data) #Trainieren des Algorithmus mit Testdaten
    predictions = []
    for row in test_data: #Testen des Algortihmus mit Testdaten
        predictions.append(predict(summarize, row))
    return predictions


In [None]:
def get_class_values(data): #Herausfiltern der class_values aus dem Testdatensatz
    test_class_values = []
    for element in data:
        test_class_values.append(element[1])
    return test_class_values

# Evaluation

In [None]:
def accuracy(test_class_values, predictions): # Ermitteln der Genauigkeit des Algorithmus
    fp,tp,fn,tn = 0,0,0,0


# Ausführen der Funktionen

In [None]:
train_data,test_data = get_data()
train_data_clear = clear_data(train_data)
test_data_clear = clear_data(test_data)
test_class_values = get_class_values(test_data_clear)

training_data_tfidf, total_rows,all_word_count = tf_idf_train(train_data_clear)
test_data_tfidf= tf_idf_test(test_data,all_word_count)

predictions = test(tf_idf_matrix,test_data_tfidf)

accuracy(test_class_values, predictions)
#summaries = naives_bayes(tf_idf_matrix)