<a href="https://colab.research.google.com/github/ManjotSran/Blockchain-project/blob/main/HACKANONS_COLAB_25GB_RAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# New Section

In [6]:
import numpy as np
from random import seed
from random import randrange
from csv import reader

# Load a CSV file
def load_csv(filename):
    file = open(filename, "rt")
    lines = reader(file)
    dataset = list(lines)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, algorithm_class, n_folds, max_depth, min_size):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for i in range(n_folds):
        train_set = np.concatenate([folds[j] for j in range(n_folds) if j != i])
        test_set = np.array(folds[i])

        model = algorithm_class(max_depth, min_size)
        model.fit(train_set)
        predicted = model.predict_dataset(test_set[:, :-1])
        actual = test_set[:, -1]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores



# Your custom CART implementation
def calculate_measure_of_goodness(y, left_y, right_y, num_classes):
    P_L = len(left_y) / len(y)
    P_R = 1 - P_L
    goodness = 0

    for j in range(num_classes):
        P_j_tL = np.sum(left_y == j) / len(left_y) if len(left_y) > 0 else 0
        P_j_tR = np.sum(right_y == j) / len(right_y) if len(right_y) > 0 else 0
        goodness += abs(P_j_tL - P_j_tR)

    return 2 * P_L * P_R * goodness

def calculate_best_split(dataset, num_features, num_classes):
    best_split = {}
    max_goodness = -float("inf")

    for feature_index in range(num_features):
        feature_values = np.unique(dataset[:, feature_index])
        for value in feature_values:
            left, right = split_dataset(dataset, feature_index, value)
            if len(left) > 0 and len(right) > 0:
                y, left_y, right_y = dataset[:, -1], left[:, -1], right[:, -1]
                current_goodness = calculate_measure_of_goodness(y, left_y, right_y, num_classes)
                if current_goodness > max_goodness:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = value
                    best_split["left"] = left
                    best_split["right"] = right
                    best_split["gain"] = current_goodness
                    max_goodness = current_goodness
    return best_split

def split_dataset(dataset, feature_index, threshold):
    left = np.array([row for row in dataset if row[feature_index] <= threshold])
    right = np.array([row for row in dataset if row[feature_index] > threshold])
    return left, right

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, gain=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        self.gain = gain

class CART:
    def __init__(self, max_depth=10, min_size=2):
        self.root = None
        self.max_depth = max_depth
        self.min_size = min_size

    def build_tree(self, dataset, current_depth=0, num_classes=None):
        X, y = dataset[:, :-1], dataset[:, -1]
        num_features = X.shape[1]
        if num_classes is None:
            num_classes = len(np.unique(y))

        # Stopping conditions
        if len(set(y)) == 1 or current_depth >= self.max_depth:
            return TreeNode(value=self.most_common_label(y))

        # Calculate the best split
        best_split = calculate_best_split(dataset, num_features, num_classes)
        if best_split["gain"] == 0 or len(best_split["left"]) < self.min_size or len(best_split["right"]) < self.min_size:
            return TreeNode(value=self.most_common_label(y))

        # Build left and right subtrees
        left_subtree = self.build_tree(best_split["left"], current_depth + 1, num_classes)
        right_subtree = self.build_tree(best_split["right"], current_depth + 1, num_classes)

        # Create a tree node
        return TreeNode(feature_index=best_split["feature_index"], threshold=best_split["threshold"],
                        left=left_subtree, right=right_subtree, gain=best_split["gain"])

    def most_common_label(self, y):
        return np.bincount(y.astype(int)).argmax()

    def fit(self, dataset):
        num_classes = len(np.unique(dataset[:, -1]))
        self.root = self.build_tree(dataset, num_classes=num_classes)

    def predict(self, x, node=None):
        if node is None:
            node = self.root

        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self.predict(x, node.left)
        else:
            return self.predict(x, node.right)

    def predict_dataset(self, X):
        return [self.predict(x) for x in X]

# Test CART on Bank Note dataset
seed(1)
filename = 'data_banknote_authentication.csv'
dataset = load_csv(filename)

# Convert string attributes to integers
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# Convert dataset to numpy array
dataset = np.array(dataset, dtype=float)

# Evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10

scores = evaluate_algorithm(dataset, CART, n_folds, max_depth, min_size)

print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))


FileNotFoundError: [Errno 2] No such file or directory: 'data_banknote_authentication.csv'

In [4]:
import pandas as pd
import numpy as np

# Load the dataset
filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(filename, header=None)

# Inspect the first few rows of the dataset
print(df.head())

# Inspect data types and missing values
print(df.info())


   0                  1       2           3   4                    5   \
0  39          State-gov   77516   Bachelors  13        Never-married   
1  50   Self-emp-not-inc   83311   Bachelors  13   Married-civ-spouse   
2  38            Private  215646     HS-grad   9             Divorced   
3  53            Private  234721        11th   7   Married-civ-spouse   
4  28            Private  338409   Bachelors  13   Married-civ-spouse   

                   6               7       8        9     10  11  12  \
0        Adm-clerical   Not-in-family   White     Male  2174   0  40   
1     Exec-managerial         Husband   White     Male     0   0  13   
2   Handlers-cleaners   Not-in-family   White     Male     0   0  40   
3   Handlers-cleaners         Husband   Black     Male     0   0  40   
4      Prof-specialty            Wife   Black   Female     0   0  40   

               13      14  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K  
3   United-State

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Replace '?' with NaN and drop rows with missing values
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.columns:
    if df[column].dtype == np.object:
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

# Split the data into features and target label
X = df.drop(14, axis=1)
y = df[14]

# Normalize the numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to numpy array for the model
X = np.array(X)
y = np.array(y)

# Model Evaluation
seed(1)
n_folds = 5
max_depth = 5
min_size = 10

scores = evaluate_algorithm(np.column_stack((X, y)), CART, n_folds, max_depth, min_size)

print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[column].dtype == np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[column].dtype == np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[column].dtype == np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[column].dtype == np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[column].dtype == np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[column].dtype == np.object:
Deprecated in NumPy 1.20; for more details and guida

NameError: name 'evaluate_algorithm' is not defined

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from random import seed
from random import randrange

# Load the dataset
filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(filename, header=None)

# Replace '?' with NaN and drop rows with missing values
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.columns:
    if df[column].dtype == object:
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

# Split the data into features and target label
X = df.drop(14, axis=1)
y = df[14]

# Normalize the numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to numpy array for the model
X = np.array(X)
y = np.array(y)

# Your custom CART implementation
def calculate_measure_of_goodness(y, left_y, right_y, num_classes):
    def calculate_measure_of_goodness(y, left_y, right_y, num_classes):
    P_L = len(left_y) / len(y)
    P_R = 1 - P_L
    goodness = 0

    for j in range(num_classes):
        P_j_tL = np.sum(left_y == j) / len(left_y) if len(left_y) > 0 else 0
        P_j_tR = np.sum(right_y == j) / len(right_y) if len(right_y) > 0 else 0
        goodness += abs(P_j_tL - P_j_tR)

    return 2 * P_L * P_R * goodness

def calculate_best_split(dataset, num_features, num_classes):
    best_split = {}
    max_goodness = -float("inf")

    for feature_index in range(num_features):
        feature_values = np.unique(dataset[:, feature_index])
        for value in feature_values:
            left, right = split_dataset(dataset, feature_index, value)
            if len(left) > 0 and len(right) > 0:
                y, left_y, right_y = dataset[:, -1], left[:, -1], right[:, -1]
                current_goodness = calculate_measure_of_goodness(y, left_y, right_y, num_classes)
                if current_goodness > max_goodness:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = value
                    best_split["left"] = left
                    best_split["right"] = right
                    best_split["gain"] = current_goodness
                    max_goodness = current_goodness
    return best_split

def split_dataset(dataset, feature_index, threshold):
    left = np.array([row for row in dataset if row[feature_index] <= threshold])
    right = np.array([row for row in dataset if row[feature_index] > threshold])
    return left, right

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, gain=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        self.gain = gain

class CART:
    def __init__(self, max_depth=10, min_size=2):
        self.root = None
        self.max_depth = max_depth
        self.min_size = min_size

    def build_tree(self, dataset, current_depth=0, num_classes=None):
        X, y = dataset[:, :-1], dataset[:, -1]
        num_features = X.shape[1]
        if num_classes is None:
            num_classes = len(np.unique(y))

        # Stopping conditions
        if len(set(y)) == 1 or current_depth >= self.max_depth:
            return TreeNode(value=self.most_common_label(y))

        # Calculate the best split
        best_split = calculate_best_split(dataset, num_features, num_classes)
        if best_split["gain"] == 0 or len(best_split["left"]) < self.min_size or len(best_split["right"]) < self.min_size:
            return TreeNode(value=self.most_common_label(y))

        # Build left and right subtrees
        left_subtree = self.build_tree(best_split["left"], current_depth + 1, num_classes)
        right_subtree = self.build_tree(best_split["right"], current_depth + 1, num_classes)

        # Create a tree node
        return TreeNode(feature_index=best_split["feature_index"], threshold=best_split["threshold"],
                        left=left_subtree, right=right_subtree, gain=best_split["gain"])

    def most_common_label(self, y):
        return np.bincount(y.astype(int)).argmax()

    def fit(self, dataset):
        num_classes = len(np.unique(dataset[:, -1]))
        self.root = self.build_tree(dataset, num_classes=num_classes)

    def predict(self, x, node=None):
        if node is None:
            node = self.root

        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self.predict(x, node.left)
        else:
            return self.predict(x, node.right)

    def predict_dataset(self, X):
        return [self.predict(x) for x in X]

# Other required functions (str_column_to_float, cross_validation_split, accuracy_metric, evaluate_algorithm)
# ...

# Model Evaluation
seed(1)
n_folds = 5
max_depth = 5
min_size = 10

# Combine features and target for the evaluate_algorithm function
dataset = np.column_stack((X, y))
scores = evaluate_algorithm(dataset, CART, n_folds, max_depth, min_size)

print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))


IndentationError: expected an indented block after function definition on line 37 (<ipython-input-7-bfaf7d3d2316>, line 38)

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from random import seed
from random import randrange

# Load the dataset
filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(filename, header=None)

# Replace '?' with NaN and drop rows with missing values
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.columns:
    if df[column].dtype == object:
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

# Split the data into features and target label
X = df.drop(14, axis=1)
y = df[14]

# Normalize the numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to numpy array for the model
X = np.array(X)
y = np.array(y)

# Your custom CART implementation
def calculate_measure_of_goodness(y, left_y, right_y, num_classes):
    def calculate_measure_of_goodness(y, left_y, right_y, num_classes):
      P_L = len(left_y) / len(y)
      P_R = 1 - P_L
      goodness = 0

    for j in range(num_classes):
      P_j_tL = np.sum(left_y == j) / len(left_y) if len(left_y) > 0 else 0
      P_j_tR = np.sum(right_y == j) / len(right_y) if len(right_y) > 0 else 0
      goodness += abs(P_j_tL - P_j_tR)

      return 2 * P_L * P_R * goodness

def calculate_best_split(dataset, num_features, num_classes):
    best_split = {}
    max_goodness = -float("inf")

    for feature_index in range(num_features):
        feature_values = np.unique(dataset[:, feature_index])
        for value in feature_values:
            left, right = split_dataset(dataset, feature_index, value)
            if len(left) > 0 and len(right) > 0:
                y, left_y, right_y = dataset[:, -1], left[:, -1], right[:, -1]
                current_goodness = calculate_measure_of_goodness(y, left_y, right_y, num_classes)
                if current_goodness > max_goodness:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = value
                    best_split["left"] = left
                    best_split["right"] = right
                    best_split["gain"] = current_goodness
                    max_goodness = current_goodness
    return best_split

def split_dataset(dataset, feature_index, threshold):
    left = np.array([row for row in dataset if row[feature_index] <= threshold])
    right = np.array([row for row in dataset if row[feature_index] > threshold])
    return left, right

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, gain=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        self.gain = gain

class CART:
    def __init__(self, max_depth=10, min_size=2):
        self.root = None
        self.max_depth = max_depth
        self.min_size = min_size

    def build_tree(self, dataset, current_depth=0, num_classes=None):
        X, y = dataset[:, :-1], dataset[:, -1]
        num_features = X.shape[1]
        if num_classes is None:
            num_classes = len(np.unique(y))

        # Stopping conditions
        if len(set(y)) == 1 or current_depth >= self.max_depth:
            return TreeNode(value=self.most_common_label(y))

        # Calculate the best split
        best_split = calculate_best_split(dataset, num_features, num_classes)
        if best_split["gain"] == 0 or len(best_split["left"]) < self.min_size or len(best_split["right"]) < self.min_size:
            return TreeNode(value=self.most_common_label(y))

        # Build left and right subtrees
        left_subtree = self.build_tree(best_split["left"], current_depth + 1, num_classes)
        right_subtree = self.build_tree(best_split["right"], current_depth + 1, num_classes)

        # Create a tree node
        return TreeNode(feature_index=best_split["feature_index"], threshold=best_split["threshold"],
                        left=left_subtree, right=right_subtree, gain=best_split["gain"])

    def most_common_label(self, y):
        return np.bincount(y.astype(int)).argmax()

    def fit(self, dataset):
        num_classes = len(np.unique(dataset[:, -1]))
        self.root = self.build_tree(dataset, num_classes=num_classes)

    def predict(self, x, node=None):
        if node is None:
            node = self.root

        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self.predict(x, node.left)
        else:
            return self.predict(x, node.right)

    def predict_dataset(self, X):
        return [self.predict(x) for x in X]

# Helper functions
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, algorithm_class, n_folds, max_depth, min_size):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for i in range(n_folds):
        train_set = np.concatenate([folds[j] for j in range(n_folds) if j != i])
        test_set = np.array(folds[i])

        model = algorithm_class(max_depth, min_size)
        model.fit(train_set)
        predicted = model.predict_dataset(test_set[:, :-1])
        actual = test_set[:, -1]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Model Evaluation
seed(1)
n_folds = 5
max_depth = 5
min_size = 10

# Combine features and target for the evaluate_algorithm function
dataset = np.column_stack((X, y))
scores = evaluate_algorithm(dataset, CART, n_folds, max_depth, min_size)

print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))


UnboundLocalError: local variable 'goodness' referenced before assignment

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from random import seed
from random import randrange

# Load the dataset
filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(filename, header=None)

# Replace '?' with NaN and drop rows with missing values
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.columns:
    if df[column].dtype == object:
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

# Split the data into features and target label
X = df.drop(14, axis=1)
y = df[14]

# Normalize the numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to numpy array for the model
X = np.array(X)
y = np.array(y)

# Custom CART implementation
def calculate_measure_of_goodness(y, left_y, right_y, num_classes):
    P_L = len(left_y) / len(y)
    P_R = 1 - P_L
    goodness = 0

    for j in range(num_classes):
        P_j_tL = np.sum(left_y == j) / len(left_y) if len(left_y) > 0 else 0
        P_j_tR = np.sum(right_y == j) / len(right_y) if len(right_y) > 0 else 0
        goodness += abs(P_j_tL - P_j_tR)

    return 2 * P_L * P_R * goodness

def calculate_best_split(dataset, num_features, num_classes):
    best_split = {}
    max_goodness = -float("inf")

    for feature_index in range(num_features):
        feature_values = np.unique(dataset[:, feature_index])
        for value in feature_values:
            left, right = split_dataset(dataset, feature_index, value)
            if len(left) > 0 and len(right) > 0:
                y, left_y, right_y = dataset[:, -1], left[:, -1], right[:, -1]
                current_goodness = calculate_measure_of_goodness(y, left_y, right_y, num_classes)
                if current_goodness > max_goodness:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = value
                    best_split["left"] = left
                    best_split["right"] = right
                    best_split["gain"] = current_goodness
                    max_goodness = current_goodness
    return best_split

def split_dataset(dataset, feature_index, threshold):
    left = np.array([row for row in dataset if row[feature_index] <= threshold])
    right = np.array([row for row in dataset if row[feature_index] > threshold])
    return left, right

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, gain=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        self.gain = gain

class CART:
    def __init__(self, max_depth=10, min_size=2):
        self.root = None
        self.max_depth = max_depth
        self.min_size = min_size

    def build_tree(self, dataset, current_depth=0, num_classes=None):
        X, y = dataset[:, :-1], dataset[:, -1]
        num_features = X.shape[1]
        if num_classes is None:
            num_classes = len(np.unique(y))

        # Stopping conditions
        if len(set(y)) == 1 or current_depth >= self.max_depth:
            return TreeNode(value=self.most_common_label(y))

        # Calculate the best split
        best_split = calculate_best_split(dataset, num_features, num_classes)
        if best_split["gain"] == 0 or len(best_split["left"]) < self.min_size or len(best_split["right"]) < self.min_size:
            return TreeNode(value=self.most_common_label(y))

        # Build left and right subtrees
        left_subtree = self.build_tree(best_split["left"], current_depth + 1, num_classes)
        right_subtree = self.build_tree(best_split["right"], current_depth + 1, num_classes)

        # Create a tree node
        return TreeNode(feature_index=best_split["feature_index"], threshold=best_split["threshold"],
                        left=left_subtree, right=right_subtree, gain=best_split["gain"])

    def most_common_label(self, y):
        return np.bincount(y.astype(int)).argmax()

    def fit(self, dataset):
        num_classes = len(np.unique(dataset[:, -1]))
        self.root = self.build_tree(dataset, num_classes=num_classes)

    def predict(self, x, node=None):
        if node is None:
            node = self.root

        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self.predict(x, node.left)
        else:
            return self.predict(x, node.right)

    def predict_dataset(self, X):
        return [self.predict(x) for x in X]

# Helper functions
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, algorithm_class, n_folds, max_depth, min_size):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for i in range(n_folds):
        train_set = np.concatenate([folds[j] for j in range(n_folds) if j != i])
        test_set = np.array(folds[i])

        model = algorithm_class(max_depth, min_size)
        model.fit(train_set)
        predicted = model.predict_dataset(test_set[:, :-1])
        actual = test_set[:, -1]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Model Evaluation
seed(1)
n_folds = 5
max_depth = 5
min_size = 10

# Combine features and target for the evaluate_algorithm function
dataset = np.column_stack((X, y))
scores = evaluate_algorithm(dataset, CART, n_folds, max_depth, min_size)

print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
