## Imports

In [23]:
import numpy as np
import pandas as pd
import sklearn.model_selection
from collections import Counter
import math

# <center> Classification </center>

## Data

In [53]:
from sklearn import preprocessing
data = pd.read_csv('/Users/jeremynixon/Dropbox/python/Oracle Development/iris.data', header=None)
# iris = data[:100]
iris=data
y = np.array(iris[4])
new_y = []
for i in y:
    if i == 'Iris-setosa':
        new_y.append(1)
    elif i == 'Iris-versicolor':
        new_y.append(0)
    elif i == 'Iris-virginica':
        new_y.append(2)
y = np.array(new_y)
x = preprocessing.scale(np.array(iris.drop([4], 1)))
# x = np.hstack((np.ones((x.shape[0],1)),x))
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size = .20, random_state=42)

In [54]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
Y = df['quality'].values
le = preprocessing.LabelEncoder().fit(Y) 
Y = le.transform(Y)
df = preprocessing.scale(df.drop('quality',1))
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(df, Y, test_size = .20, random_state=42)

## <center> Logistic Regression </center>

In [65]:
def logistic_regression(x_train, y_train, lr=.001, num_iters=10000):
    # Add Bias
    x_train = np.column_stack((np.ones(len(x_train)), x_train))
    
    # Collect Shapes
    nrow, ncol = x_train.shape
    nclasses = len(np.unique(y_train))
    
    # Initialize Weights
    w = np.random.randn(ncol, nclasses)
    
    for iteration in xrange(num_iters):
        # Forward
        
        # Weighted sum of features
        output_raw = np.matmul(x_train, w)
        
        # Squash weighted sum values to be between 0 and 1
        output_softmax = np.exp(output_raw)
        output_softmax = output_softmax / np.sum(output_softmax, axis=1, keepdims=True)
        
        # Backward
        output_softmax[range(len(x_train)), y_train] -= 1
        gradient = np.matmul(x_train.T, output_softmax/nrow)
        
        # Update
        w -= lr * gradient
        
    return w

In [66]:
def predict_lr(weights, x_test):
    # Add Bias Column
    x_test = np.column_stack((np.ones(len(x_test)), x_test))
    
    # Weighted sum of features
    output_raw = np.matmul(x_test, weights)
    
    # Squash weighted sum values to be between 0 and 1
    output_softmax = np.exp(output_raw)
    output_softmax = output_softmax / np.sum(output_softmax, axis=1, keepdims=True)
    
    return np.argmax(output_softmax, axis=1)

In [67]:
weights = logistic_regression(x_train, y_train)
outputs = predict_lr(weights, x_test)
accuracy = Counter(outputs - y_test)[0]/float(len(y_test))
print accuracy

0.503125


## <center> Neural Network </center>

In [58]:
def neural_network_sgd(x_train, y_train, num_hidden=1000, lr=0.1, num_iters=1000, batch_size=32):
    # Add bias
    x_train = np.column_stack((np.ones(len(x_train)), x_train))
    
    # Get Important Shapes
    n_row, n_col = np.shape(x_train)
    n_classes = len(np.unique(y_train))
    
    # Initialize Weight Matricies
    w1 = np.random.randn(n_col, num_hidden) * .01
    w2 = np.random.randn(num_hidden, n_classes) * .01
    
    # Iterate Through Backpropagation
    for iteration in xrange(num_iters):
        
        stochastic_sample = np.random.randint(0, n_row-1, batch_size)
            
        x_batch = x_train[stochastic_sample]
        y_batch = y_train[stochastic_sample]
        
        # Forward
        hidden_raw = np.matmul(x_batch, w1)
        hidden_relu = np.maximum(0, hidden_raw)
        output_raw = np.matmul(hidden_relu, w2)
        output_softmax = np.exp(output_raw)
        output_softmax = output_softmax / np.sum(output_softmax, axis=1, keepdims=True)
        
        # Backward
        output_softmax[range(batch_size), y_batch] -= 1
        w2_gradient = np.matmul(hidden_relu.T, output_softmax/batch_size)
        hidden_gradient = np.matmul(output_softmax/batch_size, w2.T)
        hidden_gradient[hidden_relu <= 0] = 0
        w1_gradient = np.matmul(x_batch.T, hidden_gradient)
        
        # Update Weights
        w1 -= lr * w1_gradient
        w2 -= lr * w2_gradient
    return w1, w2

In [59]:
def predict_nn(w1, w2, x_test):
    # Add Bias
    x_test = np.column_stack((np.ones(len(x_test)), x_test))
    # Forward
    hidden_raw = np.matmul(x_test, w1)
    hidden_relu = np.maximum(0, hidden_raw)
    output_raw = np.matmul(hidden_relu, w2)
    output_softmax = np.exp(output_raw)
    output_softmax = output_softmax / np.sum(output_softmax, axis=1, keepdims=True)
    return np.argmax(output_softmax, axis=1)

In [60]:
weights1, weights2 = neural_network_sgd(x_train, y_train)
outputs = predict_nn(weights1, weights2, x_test)
accuracy = Counter(outputs - y_test)[0]/float(len(y_test))
print accuracy

0.59375


## <center> Decision Tree </center>

In [61]:

import math
import random
import pandas as pd
import numpy as np


class Tree(object):
    def __init__(self, parents=None):
        self.children = []
        self.split_feature = None
        self.split_feature_value = None
        self.parents = parents
        self.label = None

def data_to_distribution(y_train):
        types = set(y_train)
        distribution = []
        for i in types:
            distribution.append(list(y_train).count(i)/float(len(y_train)))
        return distribution


def entropy(distribution):
    return -sum([p * math.log(p,2) for p in distribution])


def split_data(x_train, y_train, feature_index):
    attribute_values = x_train[:,feature_index]
    for attribute in set(attribute_values):
        data_subset = []
        for index, point in enumerate(x_train):
            if point[feature_index] == attribute:
                data_subset.append([point, y_train[index]])
        yield data_subset



def gain(x_train, y_train, feature_index):
    entropy_gain = entropy(data_to_distribution(y_train))
    for data_subset in split_data(x_train, y_train, feature_index):
        entropy_gain -= entropy(data_to_distribution([label
                    for (point, label) in data_subset]))
    return entropy_gain


def homogeneous(y_train):
    return len(set(y_train)) <= 1

def majority_vote(y_train, node):
    labels = y_train
    choice = max(set(labels), key=list(labels).count)
    node.label = choice
    return node


def build_decision_tree(x_train, y_train, root, remaining_features):
    if homogeneous(y_train):
        root.label = y_train[0]
        return root
    
    if len(remaining_features) == 0:
        return majority_vote(y_train, root)
    
    best_feature = max(remaining_features, key=lambda index: 
                       gain(x_train, y_train, index))
    
    if gain(x_train, y_train, best_feature) == 0:
        return majority_vote(y_train, root)
    
    root.split_feature = best_feature
    
    for data_subset in split_data(x_train, y_train, best_feature):
        child = Tree(parents = root)
        child.split_feature_value = data_subset[0][0][best_feature]
        root.children.append(child)
        
        new_x = np.array([point for (point, label) in data_subset])
        new_y = np.array([label for (point, label) in data_subset])
        
        build_decision_tree(new_x, new_y, child, remaining_features - set([best_feature]))
    
    return root

def decision_tree(x_train, y_train):
    return build_decision_tree(x_train, y_train, Tree(), 
                               set(range(len(x_train[0]))))


def find_nearest(array, value):
    nearest = (np.abs(array-value)).argmin()
    return array[nearest]


def classify(tree, point):
    if tree.children == []:
        return tree.label
    else:
        try:
            matching_children = [child for child in tree.children
                if child.split_feature_value == point[tree.split_feature]]
            return classify(matching_children[0], point)
        except:
            array = [child.split_feature_value for child in tree.children]
            point[tree.split_feature] = find_nearest(array, point[tree.split_feature])
            matching_children = [child for child in tree.children
                if child.split_feature_value == point[tree.split_feature]]
            return classify(matching_children[0], point)


def predict_dt(x_train, tree):
    predicted_labels = [classify(tree, point) for point in x_train]
    return predicted_labels

def decision_tree_full(x_train, y_train, x_test):
    tree = decision_tree(x_train, y_train)
    predictions = text_classification(x_test, tree)
    return predictions

In [62]:
tree = decision_tree(x_train, y_train)
outputs = predict_dt(x_test, tree)
accuracy = Counter(outputs - y_test)[0]/float(len(y_test))
print accuracy

0.53125


## <center> Random Forest </center>

In [63]:
class Tree(object):
    def __init__(self, parents=None):
        self.children = []
        self.split_feature = None
        self.split_feature_value = None
        self.parents = parents
        self.label = None
        
def data_to_distribution(y_train):
        types = set(y_train)
        distribution = []
        for i in types:
            distribution.append(list(y_train).count(i)/float(len(y_train)))
        return distribution
    
def entropy(distribution):
    return -sum([p * math.log(p,2) for p in distribution])

def split_data(x_train, y_train, feature_index):
    attribute_values = x_train[:,feature_index]
    for attribute in set(attribute_values):
        data_subset = []
        for index, point in enumerate(x_train):
            if point[feature_index] == attribute:
                data_subset.append([point, y_train[index]])
        yield data_subset
        
def gain(x_train, y_train, feature_index):
    entropy_gain = entropy(data_to_distribution(y_train))
    for data_subset in split_data(x_train, y_train, feature_index):
        entropy_gain -= entropy(data_to_distribution([label
                    for (point, label) in data_subset]))
    return entropy_gain

def homogeneous(y_train):
    return len(set(y_train)) <= 1

def majority_vote(y_train, node):
    labels = y_train
    choice = max(set(labels), key=list(labels).count)
    node.label = choice
    return node

def build_decision_tree(x_train, y_train, root, remaining_features):
    remaining_features = np.array(list(remaining_features))
    if homogeneous(y_train):
        root.label = y_train[0]
        return root
    
    if remaining_features.shape == 0:
        return majority_vote(y_train, root)
    
    indices = np.random.choice(int(remaining_features.shape[0]), int(2*remaining_features.shape[0]/3), replace = False)

    best_feature = max(remaining_features[indices], key=lambda index: 
                       gain(x_train, y_train, index))
    remaining_features = set(remaining_features)
    if gain(x_train, y_train, best_feature) == 0:
        return majority_vote(y_train, root)
    
    root.split_feature = best_feature
    
    for data_subset in split_data(x_train, y_train, best_feature):
        child = Tree(parents = root)
        child.split_feature_value = data_subset[0][0][best_feature]
        root.children.append(child)
        
        new_x = np.array([point for (point, label) in data_subset])
        new_y = np.array([label for (point, label) in data_subset])
        
        build_decision_tree(new_x, new_y, child, remaining_features - set([best_feature]))
    
    return root

def decision_tree(x_train, y_train):
    return build_decision_tree(x_train, y_train, Tree(), 
                               set(range(len(x_train[0]))))
def find_nearest(array, value):
    nearest = (np.abs(array-value)).argmin()
    return array[nearest]

def classify(tree, point):
    if tree.children == []:
#         print "label = %r" %(tree.label)
        return tree.label
    else:
        try:
            matching_children = [child for child in tree.children
                if child.split_feature_value == point[tree.split_feature]]
            return classify(matching_children[0], point)
        except:
            array = [child.split_feature_value for child in tree.children]
            point[tree.split_feature] = find_nearest(array, point[tree.split_feature])
            matching_children = [child for child in tree.children
                if child.split_feature_value == point[tree.split_feature]]
            return classify(matching_children[0], point)
        
def text_classification(x_test, tree):
    predicted_labels = [classify(tree, point) for point in x_test]
    return predicted_labels

def random_forest(x_train, y_train, x_test, n_estimators = 100):


    x_train_copy = x_train
    y_train_copy = y_train
    x_test_copy = x_test
    labels = []
    sample = []
    predictions = []
    for i in range(n_estimators):
        sample.append(np.random.choice(len(x_train), len(x_train),
                                       replace=True))
    for i in range(n_estimators):
        x_train = x_train_copy.copy()
        y_train = y_train_copy.copy()
        x_test_copy = x_test_copy.copy()
        
        x = x_train[sample[i]]
        y = y_train[sample[i]]
        tree = decision_tree(x, y)
        labels.append(text_classification(x_test_copy, tree))
    
    
    for index in range(len(labels[0])):
        prediction_dictionary = {}
        for tree_result in range(len(labels)):
            try:
                prediction_dictionary[labels[tree_result][index]] += 1
            except KeyError:
                prediction_dictionary[labels[tree_result][index]] = 1
        store = 0
        for index, value in prediction_dictionary.iteritems():
            if value > store:
                store = value
                chosen = index
        predictions.append(chosen)
        
        
    return predictions

In [64]:
outputs = random_forest(x_train, y_train, x_test)
accuracy = Counter(outputs - y_test)[0]/float(len(y_test))
print accuracy

0.559375


## <center> KNN </center>

In [68]:
def knn(x_train, y_train, x_test, k):
    predictions = []
    # Compute solution for each test datapoing
    for datapoint in x_test:
        distances = []
        
        # Get distance to every training set datapoint
        for index, vector in enumerate(x_train):
            distances.append([np.sum(np.sqrt((datapoint-vector)**2)), y_train[index]])
                              
        # Sort by distance
        distances.sort()

        # Plurality prediction
        counts = {}
        for i in xrange(k):
            try:
                counts[distances[i][1]] += 1
            except:
                counts[distances[i][1]] = 1
        # Recover Maximum
        base = 0
        
        for i in xrange(len(np.unique(y_train))):
            try:
                value = counts[i]
                if value > base:
                    prediction = i
            except:
                pass
        predictions.append(prediction)
    return predictions        

In [69]:
outputs = knn(x_train, y_train, x_test, 5)
accuracy = Counter(outputs - y_test)[0]/float(len(y_test))
print accuracy

0.421875


# <center> Regression </center>

## Data

In [70]:
df = pd.read_csv('/Users/jeremynixon/Dropbox/python/Oracle Development/iris.data', header=None)
labels = np.array(df[0])
features = np.array(df.drop([0, 4], 1))
# features = np.column_stack((np.ones(features.shape[0]),features))
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(features, labels, test_size = .20, random_state=42)

In [86]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
Y = df['fixed acidity'].values
df = preprocessing.scale(df.drop(['fixed acidity', 'quality'],1))
x_train, x_test, y_train, y_test = sklearn.cross_validation.train_test_split(df, Y, test_size = .20, random_state=42)

## <center> Linear Regression </center>

In [87]:
def linear_regression(x_train, y_train, lr=.00001, num_iters=1000):
    # Bias
    x_train = np.column_stack((np.ones(len(x_train)), x_train))
    
    # Initialize Weights
    w = .01 * np.ones(x_train.shape[1])
    
    for iteration in xrange(num_iters):
        raw_output = np.matmul(x_train, w)
        diff = y_train - raw_output
        grad = np.matmul(x_train.T, diff)
        
        w += lr * grad
    return w

In [88]:
def predict_lin_reg(weights, x_test):
    # Add Bias
    x_test = np.column_stack((np.ones(len(x_test)), x_test))
    
    output = np.matmul(x_test, weights)
    return output

In [89]:
weights = linear_regression(x_train, y_train)
outputs = predict_lin_reg(weights, x_test)
avg_error = np.mean(np.abs(outputs-y_test))
print avg_error

0.442458032338


## <center> KNN Regressor </center>

In [95]:
def knn(x_train, y_train, x_test, k):
    predictions = []
    # Compute solution for each test datapoing
    for datapoint in x_test:
        distances = []
        
        # Get distance to every training set datapoint
        for index, vector in enumerate(x_train):
            distances.append([np.sum(np.sqrt((datapoint-vector)**2)), y_train[index]])
                              
        # Sort by distance
        distances.sort()

        # Aggregate neighboring values
        average = 0
        for i in xrange(k):
            average += distances[i][1]
        average = average / float(k)
        predictions.append(average)
    return predictions        

In [96]:
outputs = knn(x_train, y_train, x_test, 5)
avg_error = np.mean(np.abs(outputs-y_test))
print avg_error

0.537375
