In [1]:
from random import randrange
from math import sqrt
from math import exp
from math import pi
import pandas as pd
from csv import reader
from random import seed


In [2]:
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated


In [3]:
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset


In [4]:
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())        


In [5]:
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup   


In [6]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))
 


In [7]:
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)


In [8]:
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries


In [9]:
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries


In [10]:
def calculate_probability(x, mean, stdev):
    if(stdev==0):
        stdev=1
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent


In [11]:
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
            return probabilities


In [12]:
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label


In [13]:
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    return(predictions)


In [14]:
seed(1)
filename = 'Desktop/test.csv'
train = load_csv(filename)
for i in range(len(train[0])-1):
    str_column_to_float(train, i)
look_up=str_column_to_int(train, len(train[0])-1)
train



[[4.0, 0.0, 3.0, 5.0, 1.0, 0.0, 6.0, 0.0, 2],
 [0.0, 5.0, 0.0, 2.0, 6.0, 0.0, 1.0, 0.0, 1],
 [0.0, 0.0, 6.0, 1.0, 0.0, 4.0, 1.0, 2.0, 0],
 [4.0, 1.0, 0.0, 1.0, 1.0, 0.0, 6.0, 0.0, 2],
 [0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 6.0, 0],
 [0.0, 4.0, 0.0, 2.0, 6.0, 0.0, 0.0, 1.0, 1],
 [5.0, 0.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2]]

In [16]:
filename = 'Desktop/test1.csv'
test = load_csv(filename)
test


[['0', '3', '0', '2', '6', '0', '2', '1']]

In [18]:
for i in range(0,8):
    str_column_to_float(test, i)
l=naive_bayes(train, test)

[2]

In [19]:
for i in look_up:
    if(look_up[i]==l[0]):
        print(i)


Politics
