<a href="https://colab.research.google.com/github/Moly-malibu/100-Days-Of-ML-Code/blob/master/Decision_treee_OOP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Example OOP python

In [None]:
from random import seed
from random import randrange
from csv import reader
 
# Load file
def load(filename):
    """Read a CSV file and return its records as a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of
        field strings produced by ``csv.reader``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if parsing raises; the
    # previous version left the file open. newline="" lets the csv module
    # do its own line-ending handling, as its documentation recommends.
    with open(filename, "rt", newline="") as handle:
        return list(reader(handle))

In [None]:
#Pass string column to float
def str_to_float(data, column):
    """Convert one column of every row from text to float, in place.

    Args:
        data: Sequence of mutable rows whose ``column`` entry is a string.
        column: Index of the field to convert in each row.

    Returns:
        None; the rows of ``data`` are modified directly.
    """
    for record in data:
        text = record[column]
        # Surrounding whitespace is dropped before parsing.
        record[column] = float(text.strip())

##K Folds

K-fold cross-validation (CV) splits a given data set into K sections (folds); each fold is used as the testing set at some point, while the remaining folds are used for training.

In [None]:
# Split k folds
def cross_validation(data, n_folds):
    """Randomly partition ``data`` into ``n_folds`` equally sized folds.

    Each fold holds ``int(len(data) / n_folds)`` rows drawn without
    replacement; rows left over after filling all folds are not used.
    ``data`` itself is not modified (rows are drawn from a shallow copy).

    Args:
        data: Sequence of rows to distribute.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(data)
    per_fold = int(len(data) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Pop a random row from the pool until this fold is full.
        folds.append(
            [remaining.pop(randrange(len(remaining))) for _ in range(per_fold)]
        )
    return folds

##Accuracy percentage

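Accuracy is a measure of how close a measured or calculated value is to its actual value. The percent error is the ratio of the error to the actual value multiplied by 100. The precision of a measurement is a measure of the reproducibility of a set of measurements. For a classifier, the accuracy percentage computed below is the number of predictions that equal the actual class label, divided by the total number of predictions and multiplied by 100.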

In [None]:
#Accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where prediction equals truth.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in step with
            ``actual``; it must be at least as long as ``actual``.

    Returns:
        A float between 0.0 and 100.0.
    """
    hits = sum(
        1 for position, truth in enumerate(actual)
        if truth == predicted[position]
    )
    return hits / float(len(actual)) * 100.0

##Cross-validation:

Cross-validation, sometimes called rotation estimation or out-of-sample testing, is any of various similar model validation techniques for assessing how the results of a statistical analysis will generalize to an independent data set.

In [None]:
#Cross validation split
def evaluate_algorithm(data, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    Each fold in turn is held out as the test set while the rows of all
    other folds are concatenated into the training set.

    Args:
        data: Rows whose last element is the class label.
        algorithm: Callable invoked as ``algorithm(train, test, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds passed to ``cross_validation``.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation(data, n_folds)
    scores = []
    for held_out in folds:
        others = list(folds)
        others.remove(held_out)
        train_set = [row for other in others for row in other]

        # Test rows are copies with the label blanked out, so the
        # algorithm cannot read the answer it is asked to predict.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores

##Split data

Data splitting is the act of partitioning available data into two portions, usually for cross-validatory purposes. One portion of the data is used to develop a predictive model and the other to evaluate the model's performance.

In [None]:
# Split a data
def test_split(index, value, data):
	left, right = list(), list()
	for row in data:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right

##Gini Index

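In economics, the Gini Index is a summary measure of income inequality. The Gini coefficient incorporates the detailed shares data into a single statistic, which summarizes the dispersion of income across the entire income distribution. In a decision tree the same idea is applied to class labels: the Gini index of a split measures how mixed the classes are inside the groups the split produces, where 0 means every group contains a single class.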

In [None]:
#The Gini index for a split data
def gini_index(groups, classes):
    """Compute the size-weighted Gini index of a candidate split.

    For each non-empty group the score is ``1 - sum(p_c ** 2)`` over the
    given classes, where ``p_c`` is the fraction of the group's rows whose
    last element equals class ``c``. Group scores are weighted by the
    group's share of all rows and summed.

    Args:
        groups: Iterable of groups; each group is a list of rows whose
            last element is the class label.
        classes: Iterable of the class labels to count.

    Returns:
        The weighted Gini index as a float; 0.0 when every group is empty
        or every group contains a single class.
    """
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            # An empty group contributes nothing and would divide by zero.
            continue
        # Extract the labels once per group instead of once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        gini += (1.0 - score) * (size / n_instances)
    return gini

##Split data

In [None]:
#The best split point for a data
def add_split(data):
    """Find the split of ``data`` with the lowest Gini index.

    Every value of every feature column (all columns except the last) is
    tried as a threshold; the first candidate with the strictly lowest
    Gini index wins.

    Args:
        data: Non-empty list of rows whose last element is the class label.

    Returns:
        A dict with keys ``'index'`` (feature column), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` partition).
        If no candidate is evaluated the placeholders 999, 999 and None
        are returned.
    """
    class_values = list({row[-1] for row in data})
    best = {'index': 999, 'value': 999, 'groups': None}
    best_score = 999
    feature_count = len(data[0]) - 1
    for index in range(feature_count):
        for row in data:
            threshold = row[index]
            candidate = test_split(index, threshold, data)
            score = gini_index(candidate, class_values)
            if score < best_score:
                best_score = score
                best = {'index': index, 'value': threshold, 'groups': candidate}
    return best

##Create child and terminal

In [None]:
# Create child splits  
def terminal(group):
    """Return the leaf value for a group of rows: its most common label.

    Args:
        group: List of rows whose last element is the class label.

    Returns:
        The label that occurs most often; on a tie, the label seen first.
        None when ``group`` is empty.
    """
    if not group:
        return None
    counts = {}
    for row in group:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    # dicts keep insertion order and max() returns the first maximum, so
    # ties resolve to the earliest label in the group.
    return max(counts, key=counts.get)


def split(node, max_depth, min_size, depth):
    """Recursively grow the tree below ``node``, in place.

    ``node['groups']`` is consumed and replaced by ``node['left']`` and
    ``node['right']``, each either a leaf value from ``terminal`` or a
    child node dict produced by ``add_split``.

    Args:
        node: Dict with a ``'groups'`` entry holding ``(left, right)``
            lists of rows.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node``.

    Returns:
        None; ``node`` is modified directly.
    """
    left, right = node.pop('groups')
    # One side is empty, so nothing was separated: both branches share
    # one leaf built from all the rows.
    if not left or not right:
        node['left'] = node['right'] = terminal(left + right)
        return
    # Depth limit reached: stop growing.
    if depth >= max_depth:
        node['left'], node['right'] = terminal(left), terminal(right)
        return
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = terminal(rows)
        else:
            node[side] = add_split(rows)
            split(node[side], max_depth, min_size, depth + 1)

##Decision Tree

In [None]:
# Build a decision tree
def build_tree(train, max_depth, min_size):
    """Build a decision tree from the training rows.

    Args:
        train: Training rows whose last element is the class label.
        max_depth: Maximum depth passed to ``split``.
        min_size: Minimum group size passed to ``split``.

    Returns:
        The root node dict, fully grown by ``split``.
    """
    tree = add_split(train)
    # The root is at depth 1; split() expands it in place.
    split(tree, max_depth, min_size, 1)
    return tree

##Prediction:

Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.

In [None]:
#Prediction with a decision tree
def predict(node, row):
    """Walk the tree from ``node`` and return the leaf value for ``row``.

    Args:
        node: Node dict with ``'index'``, ``'value'``, ``'left'`` and
            ``'right'`` entries; a child that is not a dict is a leaf.
        row: Sequence of feature values.

    Returns:
        The leaf value reached by following the comparisons.
    """
    current = node
    while True:
        goes_left = row[current['index']] < current['value']
        child = current['left'] if goes_left else current['right']
        if not isinstance(child, dict):
            return child
        current = child

##Classification and Regression Tree 

In [None]:
#Classification and Regression Tree:
def decision_tree(train, test, max_depth, min_size):
    """Fit a tree on ``train`` and predict a label for each row of ``test``.

    Args:
        train: Training rows whose last element is the class label.
        test: Rows to classify.
        max_depth: Maximum tree depth passed to ``build_tree``.
        min_size: Minimum group size passed to ``build_tree``.

    Returns:
        A list of predictions, one per test row, in order.
    """
    model = build_tree(train, max_depth, min_size)
    return [predict(model, sample) for sample in test]

In [None]:
# Run the experiment
seed(1)  # fixed seed so the random fold assignment is repeatable
# Load the data set and convert every column from text to float
# NOTE(review): float() raises ValueError on a header row or any
# non-numeric field -- confirm the CSV contains only numeric data.
filename = 'commodity_trade_statistics_data.csv'
data = load(filename)
for i in range(len(data[0])):
    str_to_float(data, i)
# Evaluate once, after ALL columns are numeric. These lines used to sit
# inside the loop above, so the whole cross-validation ran once per column
# on partly converted data.
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(data, decision_tree, n_folds, max_depth, min_size)
print('Optimice de Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))