## <p style="text-align: center;">CART Algorithm on Dataset from Project 1 - Machine Learning I - Spring 2023</p>

### <p style="text-align: center;">Michael Butros</p>

## Importing Libraries and Defining Needed Functions

In [9]:
from random import seed
from random import randrange
from csv import reader
import numpy as np
from numpy import random
import pandas as pd
import colorama
from colorama import Fore
from math import sqrt
from math import exp
from math import pi
import csv

# Load a CSV file
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for _ in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores

# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
	# count all samples at split point
	n_instances = float(sum([len(group) for group in groups]))
	# sum weighted Gini index for each group
	gini = 0.0
	for group in groups:
		size = float(len(group))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# score the group based on the score for each class
		for class_val in classes:
			p = [row[-1] for row in group].count(class_val) / size
			score += p * p
		# weight the group score by its relative size
		gini += (1.0 - score) * (size / n_instances)
	return gini

# Select the best split point for a dataset
def get_split(dataset):
	class_values = list(set(row[-1] for row in dataset))
	b_index, b_value, b_score, b_groups = 999, 999, 999, None
	for index in range(len(dataset[0])-1):
		for row in dataset:
			groups = test_split(index, row[index], dataset)
			gini = gini_index(groups, class_values)
			if gini < b_score:
				b_index, b_value, b_score, b_groups = index, row[index], gini, groups
	return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Create a terminal node value
def to_terminal(group):
	outcomes = [row[-1] for row in group]
	return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
	left, right = node['groups']
	del(node['groups'])
	# check for a no split
	if not left or not right:
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# check for max depth
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return
	# process left child
	if len(left) <= min_size:
		node['left'] = to_terminal(left)
	else:
		node['left'] = get_split(left)
		split(node['left'], max_depth, min_size, depth+1)
	# process right child
	if len(right) <= min_size:
		node['right'] = to_terminal(right)
	else:
		node['right'] = get_split(right)
		split(node['right'], max_depth, min_size, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size):
	root = get_split(train)
	split(root, max_depth, min_size, 1)
	return root

# Make a prediction with a decision tree
def predict(node, row):
	if row[node['index']] < node['value']:
		if isinstance(node['left'], dict):
			return predict(node['left'], row)
		else:
			return node['left']
	else:
		if isinstance(node['right'], dict):
			return predict(node['right'], row)
		else:
			return node['right']

# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
	tree = build_tree(train, max_depth, min_size)
	predictions = list()
	for row in test:
		prediction = predict(tree, row)
		predictions.append(prediction)
	return(predictions)



## Reading in the dataset, calculating the Sum, Score, and Good Locations

In [10]:
df = pd.read_csv("MLProject1.csv")
print(df.head(3))
df['Sum'] = df.iloc[: , 1:].sum(axis=1)
df['Score'] = df.Sum.div(17).round(4)*100
df['GoodLocation']=0
print("\n")
for ind in df.index:
      if df['Score'][ind] >=60:
            df['GoodLocation'][ind]=1
            print(Fore.GREEN + df['Location'][ind]+ " is a good location with %.2f "  % round(df['Score'][ind],2))
      else: 
          print(Fore.BLACK + df['Location'][ind]+ " is not a good location with %.2f"  % round(df['Score'][ind],2))
          df['GoodLocation'][ind]=0

         Location  Cost  Growth Potential  Crime Rate  Visibility  Parking  \
0          Austin     0                 1           1           1        0   
1  Salt Lake City     1                 0           1           0        0   
2         Raleigh     1                 1           1           1        1   

   Near Target Community  Access to Public Transportation  Permits Required  \
0                      0                                0                 0   
1                      0                                1                 1   
2                      0                                0                 0   

   Zoning Laws  Insurance Needed  Outdoor Play Area  Grooming Services  \
0            1                 1                  1                  0   
1            1                 1                  1                  1   
2            1                 0                  1                  1   

   Population Age >65  Retired  Median Income of Population  \
0         

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GoodLocation'][ind]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GoodLocation'][ind]=1


## Preparing dataset for CART Algorithm and Writing New Data to New CSV File

In [11]:
df1=df.drop(columns=["Location","Sum","Score"])
dlist = df1.values.tolist()
data = dlist
file = open('Proj1CART.csv', 'w+', newline='')
with file:   
    write = csv.writer(file)
    write.writerows(data)

## Running the CART Algorithm on the Data and Printing Accuracy Results

In [13]:
# Test CART on Bank Note dataset
seed(1)
# load and prepare data
filename = 'Proj1CART.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [80.0, 80.0, 90.0, 60.0, 80.0]
Mean Accuracy: 78.000%
