In [94]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


def split_dataset(test_size, path="new_data.csv", random_state=55):
    """Load the integer-coded dataset and split it into train and test parts.

    The CSV is read without a header row. Every column except the last is
    treated as an attribute and the last column as the class label; both are
    cast to ``int32``.

    Args:
        test_size: Forwarded unchanged to ``train_test_split``.
        path: CSV file to read. Defaults to ``"new_data.csv"``.
        random_state: Seed forwarded to ``train_test_split`` so that the
            split is reproducible. Defaults to 55.

    Returns:
        Tuple ``(data_train, class_train, data_test, class_test)``. Note that
        this order differs from the order ``train_test_split`` itself returns.
    """
    dataset = pd.read_csv(path, header=None).values
    attr = dataset[:, :-1].astype(np.int32, copy=False)  # attributes
    classes = dataset[:, -1].astype(np.int32, copy=False)  # class labels
    data_train, data_test, class_train, class_test = train_test_split(
        attr, classes, test_size=test_size, random_state=random_state
    )
    return data_train, class_train, data_test, class_test

# Groups the training samples by class so that all instances
# belonging to a given class can be retrieved together.
def separate_by_class(data_train, class_train):
    """Group training rows by their class label.

    Args:
        data_train: Sequence of attribute rows.
        class_train: Sequence of labels; ``class_train[i]`` is the label of
            ``data_train[i]``.

    Returns:
        Dict mapping each label to the list of rows carrying that label, in
        their original order.
    """
    grouped = {}
    for idx, row in enumerate(data_train):
        label = class_train[idx]
        if label not in grouped:
            grouped[label] = []
        grouped[label].append(row)
    return grouped

def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    ``numbers`` must be a sized collection; an empty one raises
    ``ZeroDivisionError``.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stand_dev(numbers):
    """Return the sample standard deviation of ``numbers`` (divisor ``n - 1``).

    The average is computed once up front instead of once per element, which
    keeps the function linear in ``len(numbers)``.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The sample standard deviation as a float. With fewer than two values
        the spread is undefined, so ``0.0`` is returned; ``calc_probability``
        already substitutes a small epsilon for a zero deviation.
    """
    count = len(numbers)
    if count < 2:
        return 0.0
    # Same formula as mean(); computed here once so it is not re-evaluated
    # for every element of the sum below.
    avg = sum(numbers) / float(count)
    var = sum(pow(x - avg, 2) for x in numbers) / float(count - 1)
    return math.sqrt(var)

def summarize(data_train):
    """Compute per-attribute statistics for a collection of rows.

    The rows are transposed with ``zip(*data_train)`` so that each attribute
    column is summarized independently.

    Returns:
        List with one ``(mean, standard deviation)`` tuple per attribute
        column, in column order.
    """
    stats = []
    for column in zip(*data_train):
        stats.append((mean(column), stand_dev(column)))
    return stats

def summarize_by_class(data_train, class_train):
    """Train the classifier: compute attribute statistics for every class.

    The training rows are first grouped by label, then each group is
    summarized separately.

    Returns:
        Dict mapping each class label to the list of
        ``(mean, standard deviation)`` tuples produced by ``summarize`` for
        the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in separate_by_class(data_train, class_train).items()
    }

# Gaussian probability density of an attribute value, given the mean and standard deviation of that attribute within a class
def calc_probability(x, mean, stdev):
    """Evaluate the normal density N(mean, stdev**2) at ``x``.

    Args:
        x: Attribute value.
        mean: Mean of the attribute.
        stdev: Standard deviation of the attribute. A value of exactly 0 is
            replaced by a small epsilon to avoid dividing by zero.

    Returns:
        The density value as a float. This is a density, not a probability,
        so it can exceed 1 when ``stdev`` is small.
    """
    # Add an epsilon when the deviation is zero.
    sigma = stdev if stdev != 0 else stdev + 0.000001
    squared_dev = math.pow(x - mean, 2)
    two_var = 2 * math.pow(sigma, 2)
    exponent = math.exp(-(squared_dev / two_var))
    norm = 1 / (math.sqrt(2 * math.pi) * sigma)
    return norm * exponent

# Compute, for every class, the likelihood score of an instance belonging to that class
def calc_class_probabilities(summaries, instance_attr):
    """Score an instance against every class.

    For each class the score is the product of the per-attribute Gaussian
    densities returned by ``calc_probability``.

    NOTE(review): the product starts at 1.0 and no class prior is applied,
    so the scores are likelihoods rather than posteriors. This may explain
    part of the accuracy gap to scikit-learn's GaussianNB; confirm whether
    the omission is intentional.

    Args:
        summaries: Dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per attribute.
        instance_attr: Attribute values of the instance, indexed in the same
            order as the tuples in ``summaries``.

    Returns:
        Dict mapping each class label to its score.
    """
    scores = {}
    for label, attr_stats in summaries.items():
        likelihood = 1.0
        for idx, (mu, sigma) in enumerate(attr_stats):
            value = float(instance_attr[idx])
            likelihood *= calc_probability(value, mu, sigma)
        scores[label] = likelihood
    return scores

# Classify a single instance
def predict_one(summaries, instance_attr):
    """Return the label of the best-scoring class for one instance.

    Scores come from ``calc_class_probabilities``. On ties the class that
    appears first in ``summaries`` wins. ``None`` is returned when
    ``summaries`` is empty.
    """
    scores = calc_class_probabilities(summaries, instance_attr)
    winner = None
    top_score = -1
    for label in scores:
        score = scores[label]
        # Keep the current leader unless this class scores strictly higher;
        # the very first class is always accepted.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

# Classify every instance of the test set
def predict(summaries, data_test):
    """Classify every row of ``data_test``.

    Returns:
        List of predicted labels, one per row and in row order, each produced
        by ``predict_one``.
    """
    return [predict_one(summaries, row) for row in data_test]

# Compare the predictions with the true labels and compute the classification accuracy
def calc_accuracy(summaries, data_test, class_test):
    """Return the fraction of test rows whose prediction matches the label.

    Args:
        summaries: Trained per-class statistics, passed on to ``predict``.
        data_test: Test rows.
        class_test: True labels; ``class_test[i]`` belongs to ``data_test[i]``.

    Returns:
        Accuracy as a float in ``[0, 1]``. An empty ``data_test`` raises
        ``ZeroDivisionError``.
    """
    predicted = predict(summaries, data_test)
    total = len(data_test)
    hits = sum(1 for idx in range(total) if class_test[idx] == predicted[idx])
    return hits / float(total)



In [95]:
# Hold out half of the rows (test_size=0.5) for evaluation.
data_train, class_train, data_test, class_test = split_dataset(0.5)

In [96]:
# Preview the first 15 training rows.
data_train[:15]

array([[ 0,  2,  4,  9, 12, 13],
       [ 0,  2,  4,  8, 12, 13],
       [ 0,  4,  5,  8,  9, 11],
       [ 0,  1,  4,  6,  7, 11],
       [ 0,  4,  5,  6, 11, 12],
       [ 0,  4,  4, 10, 10, 12],
       [ 0,  1,  4,  6,  7,  8],
       [ 0,  2,  3,  3,  8, 13],
       [ 0,  7,  9, 10, 11, 13],
       [ 0,  2,  4,  6,  6,  9],
       [ 0,  3,  6,  7,  9, 10],
       [ 0,  6,  6,  7,  7, 11],
       [ 0,  4,  6,  8, 13, 13],
       [ 0,  2,  5,  7,  9, 11],
       [ 0,  2,  3,  6, 10, 10]])

In [97]:
# Preview the labels of the first 5 training rows.
class_train[:5]

array([0, 0, 0, 0, 0])

In [98]:
# Preview the first 15 test rows.
data_test[:15]

array([[ 0,  1,  1,  2,  7, 10],
       [ 0,  3,  5,  6,  6, 10],
       [ 0,  2,  2,  4, 12, 12],
       [ 0,  5,  5,  8, 12, 13],
       [ 0,  1,  2,  4,  7,  8],
       [ 0,  1,  2,  4, 12, 13],
       [ 0,  2,  5,  6,  9, 11],
       [ 0,  8,  9, 11, 13, 13],
       [ 0,  5,  6, 10, 11, 12],
       [ 0,  1,  7,  8, 11, 12],
       [ 0,  3,  4,  9, 12, 13],
       [ 0,  2,  7,  8, 11, 13],
       [ 0,  1,  7,  9, 11, 12],
       [ 0,  5, 11, 11, 12, 13],
       [ 0,  5,  6, 10, 11, 11]])

In [99]:
# Preview the labels of the first 15 test rows.
class_test[:15]

array([1, 1, 2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1])

In [100]:
# Train the hand-written classifier: per-class (mean, standard deviation) for every attribute.
summaries = summarize_by_class(data_train, class_train)


In [101]:
# Accuracy of the hand-written classifier on the held-out test split.
accuracy = calc_accuracy(summaries, data_test, class_test)
print('myNBClass ', 'Accuracy: ', accuracy)

myNBClass  Accuracy:  0.45765693722510997


In [102]:
# Reference result: scikit-learn's GaussianNB fitted and scored on the same split.
clf = GaussianNB()
clf.fit(data_train, class_train)
print('sklNBClass ', 'Accuracy: ', clf.score(data_test, class_test))

sklNBClass  Accuracy:  0.592483006797


In [103]:
def change_dataset(src_path="data.csv", dst_path="new_data.csv"):
    """Convert the raw CSV into the feature file read by ``split_dataset``.

    Each input row is expected to have at least 11 comma-separated fields.
    Fields 0, 2, 4, 6, 8 are compared for equality and fields 1, 3, 5, 7, 9
    are sorted numerically (presumably suit/rank pairs of a five-card hand -
    TODO confirm against the data source). Field 10 is copied through as the
    class label.

    Each output row is ``flag,r1,r2,r3,r4,r5,label`` where ``flag`` is 1 when
    the five even-index fields are all equal and 0 otherwise, and ``r1..r5``
    are the odd-index fields in ascending numeric order.

    The line terminator is stripped from the input and written explicitly,
    so rows stay on separate lines even if the label is not the last field
    of the input row. Blank lines are skipped.

    Args:
        src_path: Raw input CSV. Defaults to ``"data.csv"``.
        dst_path: Output CSV, overwritten. Defaults to ``"new_data.csv"``.

    Raises:
        IndexError: If a non-blank row has fewer than 11 fields.
        ValueError: If one of the odd-index fields is not an integer.
    """
    with open(src_path) as inf, open(dst_path, "w") as outf:
        for line in inf:
            row = line.rstrip("\r\n")
            if not row.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = row.split(',')
            label = fields[10]
            same_flag = 1 if len(set(fields[0:10:2])) == 1 else 0
            ranks = sorted(fields[1:10:2], key=int)
            outf.write(','.join([str(same_flag)] + ranks + [label]) + "\n")