In [1]:

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(12)
num_observations = 5000

x1 = np.random.multivariate_normal([0, 0], [[1, .75],[.75, 1]], num_observations)
x2 = np.random.multivariate_normal([1, 4], [[1, .75],[.75, 1]], num_observations)

simulated_separableish_features = np.vstack((x1, x2)).astype(np.float32)
simulated_labels = np.hstack((np.zeros(num_observations),
                              np.ones(num_observations)))

In [2]:
simulated_separableish_features

array([[-0.2015173 , -0.6833582 ],
       [ 0.37451947, -0.8280822 ],
       [-0.16189468, -1.2471066 ],
       ...,
       [ 2.3255963 ,  4.181329  ],
       [ 0.24707289,  4.2294044 ],
       [ 1.7190224 ,  4.8253503 ]], dtype=float32)

In [3]:
simulated_labels

array([0., 0., 0., ..., 1., 1., 1.])

In [4]:
'''
plt.figure(figsize=(12,8))
plt.scatter(simulated_separableish_features[:, 0], simulated_separableish_features[:, 1],
            c = simulated_labels, alpha = .4)'''

'\nplt.figure(figsize=(12,8))\nplt.scatter(simulated_separableish_features[:, 0], simulated_separableish_features[:, 1],\n            c = simulated_labels, alpha = .4)'

In [5]:

def sigmoid(scores):
    return 1 / (1 + np.exp(-scores))

In [6]:
def log_likelihood(features, target, weights):
    scores = np.dot(features, weights)
    ll = np.sum( target*scores - np.log(1 + np.exp(scores)) )
    return ll

In [7]:
def logistic_regression(features, target, num_steps, learning_rate, add_intercept = False):
    if add_intercept:
        intercept = np.ones((features.shape[0], 1))
        features = np.hstack((intercept, features))
        
    weights = np.zeros(features.shape[1])
    
    for step in range(num_steps):
        scores = np.dot(features, weights)
        predictions = sigmoid(scores)

        output_error_signal = target - predictions
        
        gradient = np.dot(features.T, output_error_signal)

        if step % 10000 == 0:
            print(log_likelihood(features, target, weights))
        
    return weights

In [8]:
weights = logistic_regression(simulated_separableish_features, simulated_labels,
                     num_steps = 50000, learning_rate = 5e-5, add_intercept=True)

-6931.471805599453
-6931.471805599453
-6931.471805599453
-6931.471805599453
-6931.471805599453


In [9]:
weights

array([0., 0., 0.])

In [10]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(fit_intercept=True, C = 1e15)
clf.fit(simulated_separableish_features, simulated_labels)


LogisticRegression(C=1000000000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:

final_scores = np.dot(np.hstack((np.ones((simulated_separableish_features.shape[0], 1)),
                                 simulated_separableish_features)), weights)
preds = np.round(sigmoid(final_scores))

print('Accuracy from scratch: {0}'.format((preds == simulated_labels).sum().astype(float) / len(preds)))
print('Accuracy from sk-learn: {0}'.format(clf.score(simulated_separableish_features, simulated_labels)))

Accuracy from scratch: 0.5
Accuracy from sk-learn: 0.9948


In [12]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt
 

def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset
 

def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup
 

def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax
 

def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for _ in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split
 

def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores
 

def euclidean_distance(row1, row2):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**2
	return sqrt(distance)
 

def get_neighbors(train, test_row, num_neighbors):
	distances = list()
	for train_row in train:
		dist = euclidean_distance(test_row, train_row)
		distances.append((train_row, dist))
	distances.sort(key=lambda tup: tup[1])
	neighbors = list()
	for i in range(num_neighbors):
		neighbors.append(distances[i][0])
	return neighbors
 

def predict_classification(train, test_row, num_neighbors):
	neighbors = get_neighbors(train, test_row, num_neighbors)
	output_values = [row[-1] for row in neighbors]
	prediction = max(set(output_values), key=output_values.count)
	return prediction
 

def k_nearest_neighbors(train, test, num_neighbors):
	predictions = list()
	for row in test:
		output = predict_classification(train, row, num_neighbors)
		predictions.append(output)
	return(predictions)

seed(1)
filename = 'winequality-red.csv'
dataset = load_csv(filename)[1:]
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)

str_column_to_int(dataset, len(dataset[0])-1)

n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [50.78369905956113, 50.78369905956113, 46.70846394984326, 48.589341692789965, 48.589341692789965]
Mean Accuracy: 49.091%


In [13]:
dataset

[[7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4, 1],
 [7.8, 0.88, 0.0, 2.6, 0.098, 25.0, 67.0, 0.9968, 3.2, 0.68, 9.8, 1],
 [7.8, 0.76, 0.04, 2.3, 0.092, 15.0, 54.0, 0.997, 3.26, 0.65, 9.8, 1],
 [11.2, 0.28, 0.56, 1.9, 0.075, 17.0, 60.0, 0.998, 3.16, 0.58, 9.8, 5],
 [7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4, 1],
 [7.4, 0.66, 0.0, 1.8, 0.075, 13.0, 40.0, 0.9978, 3.51, 0.56, 9.4, 1],
 [7.9, 0.6, 0.06, 1.6, 0.069, 15.0, 59.0, 0.9964, 3.3, 0.46, 9.4, 1],
 [7.3, 0.65, 0.0, 1.2, 0.065, 15.0, 21.0, 0.9946, 3.39, 0.47, 10.0, 2],
 [7.8, 0.58, 0.02, 2.0, 0.073, 9.0, 18.0, 0.9968, 3.36, 0.57, 9.5, 2],
 [7.5, 0.5, 0.36, 6.1, 0.071, 17.0, 102.0, 0.9978, 3.35, 0.8, 10.5, 1],
 [6.7, 0.58, 0.08, 1.8, 0.097, 15.0, 65.0, 0.9959, 3.28, 0.54, 9.2, 1],
 [7.5, 0.5, 0.36, 6.1, 0.071, 17.0, 102.0, 0.9978, 3.35, 0.8, 10.5, 1],
 [5.6, 0.615, 0.0, 1.6, 0.089, 16.0, 59.0, 0.9943, 3.58, 0.52, 9.9, 1],
 [7.8, 0.61, 0.29, 1.6, 0.114, 9.0, 29.0, 0.9974, 3.26, 1.56, 9.1, 1],
 [8.

In [14]:
import matplotlib.pyplot as plt
import numpy as np


def main():
    train = np.array([[3.77,4.19,0],
    [4.77,1.169761413,0],
    [-5.,2.81281357,0],
    [3.1,2.61995032,0],
    [3.6,2.209014212,0],
    [1.2,-3.162953546,1],
    [2.3,-3.339047188,1],
    [5.6,0.476683375,1],
    [-1.3,-3.234550982,1],
    [2.1,-3.319983761,1]])
    forest = build_forest(train, k=10, N_trees=100)
    for row in train:
        prediction = make_prediction(forest, row)
        #print('truth = %d : prediction = %d' % (row[-1], prediction))
    return

def traverse_tree(node, row):
    if row[node['index']] < node['split_value']:
        if isinstance(node['left'], dict):
            return traverse_tree(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return traverse_tree(node['right'], row)
        else:
            return node['right']

def make_prediction(forest, row):
    list_of_classes = []
    for tree_root in forest:
        list_of_classes.append(traverse_tree(tree_root, row))
    return max(set(list_of_classes), key=list_of_classes.count)

def calc_information_gain(groups, list_of_class_ids):
    Nall = sum([len(group) for group in groups])

    IG = calc_gini([row for group in groups for row in group], list_of_class_ids)
    for group in groups:
        IG -= calc_gini(group, list_of_class_ids)*len(group)/Nall
    return IG

def calc_gini(group, list_of_class_ids):

    Ngroup = len(group)
    if Ngroup == 0:
        return 0
    dataset_class_ids = [row[-1] for row in group]
    sum_over_classes = 0.
    for class_id in list_of_class_ids:
        prob = dataset_class_ids.count(class_id)/Ngroup
        sum_over_classes += prob**2
    return 1. - sum_over_classes

def split_node(index, value, dataset):
    left = []
    right = []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return [left, right]

def get_split(dataset, index):
    list_of_class_ids = list(set(row[-1] for row in dataset))
    split_value, max_IG, split_groups = 0., -1., None
    for row in dataset:
        groups = split_node(index, row[index], dataset)
        IG = calc_information_gain(groups, list_of_class_ids)
        if IG > max_IG:
            split_value, max_IG, split_groups = row[index], IG, groups
    return { 'index': index, 'split_value': split_value, 'groups': groups }

def build_tree(train, max_depth, min_size):
    feature_index = int( np.random.random()*(len(train[0]) - 1) )
    root = get_split(train, feature_index)
    split(root, max_depth, min_size, 1)
    return root

def to_terminal(group):
    list_of_classes = [row[-1] for row in group]
    return max(set(list_of_classes), key=list_of_classes.count)

def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        feature_index = int( np.random.random()*(len(right[0]) - 1) )
        node['left'] = get_split(left, feature_index)
        split(node['left'], max_depth, min_size, depth+1)
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        feature_index = int( np.random.random()*(len(right[0]) - 1) )
        node['right'] = get_split(right, feature_index)
        split(node['right'], max_depth, min_size, depth+1)

def build_forest(train, k, N_trees):
    max_depth = 4
    min_size = 2
    forest = []
    for i in range(0, N_trees):
        k_indices = np.random.choice(len(train), k)
        forest.append(build_tree(train[k_indices], max_depth, min_size))
    return forest

if __name__ == '__main__':
    main()
# С использованием random forest точность составила 49.091%
