# Machine Learning Algorithms

## Naive Bayes

In [1]:
import pandas as pd
import numpy as np
import math
from helper import train_test_split

In [2]:
def class_dict(data):
    classes = {}
    for row in data:
        if (row[-1] not in classes):
            classes[row[-1]] = []
        classes[row[-1]].append(row)
    return classes

def mean_std(data):
    mstd = [(np.mean(col), np.std(col)) for col in list(zip(*data))[:-1]]
    return [(mean, std) if std != 0 else (0.0,1.0) for mean,std in mstd]

def mean_std_classes(data):
    classes = class_dict(data)
    mstd = {}
    for c in classes:
        mstd[c] = mean_std(classes[c])
    return mstd

def prob(x, mean, std):
    if std == 0.0: return 1e-6
    return (1/(math.sqrt(2*math.pi)*std))*math.exp(-(math.pow(x-mean,2)/(2*math.pow(std,2))))

def prior(train):
    p = {}
    for c in set(train[-1]):
        p[c] = len([x for x in train[:,-1] if x == c]) / len(train[:,-1])
    return p

def prob_classes(mstd, priors, row):
    p = {}
    for c in mstd:
        p[c] = priors[c] *np.multiply.reduce([
            prob(x, mean, std)
            for (mean, std), x in zip(mstd[c], row)])
    return p

def predict(mstd, priors, row):
    probs = prob_classes(mstd, priors, row)
    best = None, -1
    for c in probs:
        if best[0] is None or probs[c] > best[1]:
            best = c, probs[c]
    return best[0]

def accuracy(train, test):
    dist = mean_std_classes(train)
    priors = prior(train)
    predicted = [predict(dist, priors, row) for row in test]
    actual = [row[-1] for row in test]
    return sum(1 for p,a in zip(predicted, actual) if p == a) / len(test) * 100.0

In [3]:
train, test = train_test_split()

In [4]:
print(accuracy(train['artificial'], test['artificial']))
print(accuracy(train['income'], test['income']))

95.39999999999999
81.04539033228917


## Perceptron

In [5]:
import pandas as pd
import numpy as np
from helper import train_test_split

In [6]:
def predict(row, weights):
    weighted_sum = weights[0] + np.dot(weights[1:], row[:-1])
    return 1 if weighted_sum >= 0 else 0

def train_weights(train, learn_rate, epochs):
    weights = np.zeros_like(train[0])

    for epoch in range(epochs):
        for row in train:
            error = row[-1] - predict(row, weights)
            weights[0] += learn_rate * error
            for i in range(len(row)-1):
                weights[i + 1] += learn_rate * error * row[i]

    return weights

def accuracy(data, weights):
    predicted = [predict(row, weights) for row in data]
    actual = [row[-1] for row in data]
    return sum(1 for p,a in zip(predicted, actual) if p == a) / len(data) * 100.0

In [7]:
train, test = train_test_split()

In [8]:
weights = train_weights(train['artificial'], 0.1, 5)
print(accuracy(test['artificial'], weights))
weights = train_weights(train['income'], 0.1, 5)
print(accuracy(test['income'], weights))

100.0
78.28757447331245


## K-Nearest Neighbors

In [9]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from helper import train_test_split

In [10]:
train, test = train_test_split()

In [11]:
def dist(x, y, length):
    return np.add.reduce(np.abs(x[:length] - y[:length]))

In [12]:
def neighbors(train, test, k):
    sample = train[np.random.choice(len(train), 500)]
    distances = np.array(sorted([(x, dist(test, x, len(test))) for x in sample], key=lambda x: x[1]))[:k, 0]
    return distances

In [13]:
def prediction(nb):
    pred = Counter([n[-1] for n in nb]).most_common()[0][0]
    return pred

In [14]:
def accuracy(train, test, k):
    nbs = np.array([neighbors(train, row, k) for row in test])
    predicted = [prediction(nb) for nb in nbs]
    actual = [row[-1] for row in test]
    return sum(1 for p,a in zip(predicted, actual) if p == a) / len(test) * 100.0

In [15]:
train, test = train_test_split()

In [16]:
print(accuracy(train['artificial'], test['artificial'], 5))
print(accuracy(train['income'], test['income'], 100))

100.0
76.3773723972729


## Decision Trees

In [17]:
def test_split(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if row[index] < value: left.append(row)
        else: right.append(row)
    return left, right
 
def gini_index(groups, classes):
    n = sum(len(group) for group in groups)
    gini = 0
    for group in groups:
        if len(group) == 0:
            continue
        score = 0
        for c in classes:
            p = [row[-1] for row in group].count(c) / len(group)
            score += p ** 2
        gini += (1 - score) * len(group) / n
    return gini

def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = 1e10, 1e10, 1e10, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < best_score:
                best_index, best_value, best_score, best_groups = index, row[index], gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

def terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

def split_node(node, max_depth, min_size, depth):
    left, right = node['groups']
    del node['groups']
    if not left or not right:
        node['left'] = node['right'] = terminal(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = terminal(left), terminal(right)
        return
    if len(left) <= min_size:
        node['left'] = terminal(left)
    else:
        node['left'] = get_split(left)
        split_node(node['left'], max_depth, min_size, depth+1)
    if len(right) <= min_size:
        node['right'] = terminal(right)
    else:
        node['right'] = get_split(right)
        split_node(node['right'], max_depth, min_size, depth+1)

def predict(node, row):
    if row[node['index']] < node['value']:
        return predict(node['left'], row) if isinstance(node['left'], dict) else node['left']
    else:
        return predict(node['right'], row) if isinstance(node['right'], dict) else node['right']

def accuracy(train, test):
    tree = get_split(train)
    split_node(tree, 3, 5, 1)
    predicted = [predict(tree, row) for row in test]
    actual = [row[-1] for row in test]
    return sum(1 for p,a in zip(predicted, actual) if p == a) / len(test) * 100.0

In [19]:
train, test = train_test_split()

In [20]:
print(accuracy(train['artificial'], test['artificial']))
print(accuracy(train['income'][:500], test['income']))

100.0
83.56980529451508


## Compare with scikit-learn

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [22]:
def evaluate(model, dataset):
    model.fit(train[dataset][:,:-1], train[dataset][:,-1])
    target_pred = model.predict(test[dataset][:,:-1])
    return accuracy_score(test[dataset][:,-1], target_pred, normalize = True)

In [23]:
evaluate(GaussianNB(), 'artificial')

1.0

In [24]:
evaluate(DecisionTreeClassifier(), 'artificial')

1.0

In [25]:
evaluate(Perceptron(), 'artificial')



1.0

In [26]:
evaluate(KNeighborsClassifier(), 'artificial')

1.0

In [27]:
evaluate(GaussianNB(), 'income')

0.7957127940544193

In [28]:
evaluate(DecisionTreeClassifier(), 'income')

0.8098396904367053

In [29]:
evaluate(Perceptron(), 'income')



0.7833056937534549

In [30]:
evaluate(KNeighborsClassifier(), 'income')

0.7769793010257355