In [1]:
import csv
import math
from collections import Counter
import pickle

In [2]:
def read_csv(path):
    """Read a semicolon-delimited CSV file.

    Args:
        path: Path of the file to read.

    Returns:
        A ``(header, data)`` tuple: ``header`` is the first row as a list of
        strings and ``data`` is a list of the remaining rows (each a list of
        strings).

    Raises:
        StopIteration: If the file contains no rows at all (no header line).
    """
    # newline='' lets the csv module do its own line-ending handling, which is
    # required for quoted fields that contain embedded newlines.
    with open(path, 'r', newline='') as file:
        reader = csv.reader(file, delimiter=';')
        header = next(reader)
        data = list(reader)
    return header, data

def _is_float(text):
    """Return True when ``text`` can be parsed by ``float``."""
    try:
        float(text)
    except ValueError:
        return False
    return True


def _index_mapping(values):
    """Map every distinct value to its position in sorted order."""
    return {val: idx for idx, val in enumerate(sorted(set(values)))}


def encode_data(header, data):
    """Encode every column of ``data`` as floats.

    The last column named in ``header`` is treated as the target. Cell values
    are stripped of surrounding spaces and double quotes before encoding.

    * Target column: encoded with ``{'no': 0, 'yes': 1}`` when every value is
      ``'no'`` or ``'yes'``; otherwise each distinct value is mapped to its
      index in sorted order.
    * Feature column whose values all parse as ``float``: converted directly,
      and its encoder entry is ``None``.
    * Any other feature column: each distinct value is mapped to its index in
      sorted order.

    The input rows are not modified; encoding is done on copies.

    Args:
        header: List of column names.
        data: List of rows, each a list of strings.

    Returns:
        ``(encoded_data, encoders)`` where ``encoded_data`` is a list of rows
        of floats and ``encoders`` maps each column name to its value->code
        dict (or ``None`` for numeric columns).
    """
    encoders = {}
    # Work on copies so the caller's rows stay untouched and the function can
    # be re-run on the same input.
    rows = [list(row) for row in data]
    target_col_index = len(header) - 1

    for col_index in range(len(header)):
        # The same stripped values are used both to build the mapping and to
        # look values up, so padded cells cannot miss their key.
        col_values = [row[col_index].strip(' "') for row in data]

        if col_index == target_col_index:
            mapping = {'no': 0, 'yes': 1}
            if not all(value in mapping for value in col_values):
                mapping = _index_mapping(col_values)
        elif col_values and all(_is_float(value) for value in col_values):
            # Decide "numeric" from the whole column, not only its first cell.
            mapping = None
        else:
            mapping = _index_mapping(col_values)

        encoders[header[col_index]] = mapping
        for row, value in zip(rows, col_values):
            row[col_index] = float(value) if mapping is None else mapping[value]

    encoded_data = [[float(value) for value in row] for row in rows]
    return encoded_data, encoders

def split_dataset(data, train_end=4000, val_end=4400):
    """Split ``data`` positionally into train, validation and prediction parts.

    Args:
        data: Sequence of rows (anything that supports slicing).
        train_end: Index at which the training slice ends (exclusive).
        val_end: Index at which the validation slice ends (exclusive); the
            remainder forms the prediction slice.

    Returns:
        ``(data[:train_end], data[train_end:val_end], data[val_end:])``.

    Raises:
        ValueError: If the boundaries are not ``0 <= train_end <= val_end``.
    """
    if not 0 <= train_end <= val_end:
        raise ValueError(
            "split boundaries must satisfy 0 <= train_end <= val_end, "
            "got train_end=%r, val_end=%r" % (train_end, val_end)
        )
    return data[:train_end], data[train_end:val_end], data[val_end:]

def split_features_labels(dataset):
    """Separate each row into its features and its label.

    Args:
        dataset: Rows whose last element is the label.

    Returns:
        ``(X, y)``: ``X`` holds every row without its last element and ``y``
        holds the last element of every row.
    """
    features = []
    for record in dataset:
        features.append(record[:-1])

    labels = []
    for record in dataset:
        labels.append(record[-1])

    return features, labels

In [3]:
def gini_index(groups, classes):
    """Compute the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Collection of label lists, one per side of the split.
        classes: The class labels to account for.

    Returns:
        The sum over non-empty groups of ``(1 - sum(p_c ** 2))`` weighted by
        the group's share of all samples. Empty groups contribute nothing.
    """
    n_samples = sum(map(len, groups))
    weighted_impurity = 0.0
    for labels in groups:
        n_group = len(labels)
        if n_group:
            tally = Counter(labels)
            purity = 0.0
            for cls in classes:
                # Counter yields 0 for labels absent from this group.
                fraction = tally[cls] / n_group
                purity += fraction * fraction
            weighted_impurity += (1.0 - purity) * (n_group / n_samples)
    return weighted_impurity

def get_best_split(X, y):
    """Find the equality split with the lowest weighted Gini impurity.

    Every distinct value of every feature is tried as a candidate: rows whose
    feature equals the value go left, all other rows go right. The first
    candidate reaching the lowest impurity wins (later ties do not replace it).

    Args:
        X: List of feature rows; ``X[0]`` determines the number of features.
        y: Labels aligned with ``X``.

    Returns:
        A dict with keys ``'index'`` (feature position), ``'value'`` (the
        split value) and ``'groups'`` (``(left_X, left_y, right_X, right_y)``).
        All three are ``None`` when no candidate was evaluated.

    Raises:
        IndexError: If ``X`` is empty.
    """
    winner = {'index': None, 'value': None, 'groups': None}
    lowest_gini = float('inf')
    class_labels = list(set(y))
    row_ids = range(len(X))

    for feature in range(len(X[0])):
        for candidate in set([sample[feature] for sample in X]):
            matching = [i for i in row_ids if X[i][feature] == candidate]
            differing = [i for i in row_ids if X[i][feature] != candidate]
            left_y = [y[i] for i in matching]
            right_y = [y[i] for i in differing]

            impurity = gini_index([left_y, right_y], class_labels)
            if impurity < lowest_gini:
                lowest_gini = impurity
                winner = {
                    'index': feature,
                    'value': candidate,
                    'groups': (
                        [X[i] for i in matching],
                        left_y,
                        [X[i] for i in differing],
                        right_y,
                    ),
                }

    return winner

def build_cart(X, y, max_depth, depth=0):
    """Recursively build a CART decision tree as nested dicts.

    Args:
        X: List of feature rows.
        y: Labels aligned with ``X``.
        max_depth: Depth at which growth stops and a leaf is produced.
        depth: Depth of the node being built (0 for the root).

    Returns:
        A leaf ``{'label': majority_label}`` or an internal node: the dict
        returned by ``get_best_split`` extended with ``'left'`` and
        ``'right'`` sub-trees.

    NOTE(review): internal nodes keep the ``'groups'`` entry produced by
    ``get_best_split``, so a serialized tree carries the training rows with
    it. Left unchanged here in case other code reads that key.
    """
    if len(set(y)) == 1 or depth >= max_depth:
        return {'label': Counter(y).most_common(1)[0][0]}

    node = get_best_split(X, y)
    groups = node['groups']

    # A split is unusable when none was found or when it sends every row to
    # one side (e.g. identical feature rows with conflicting labels).
    # Recursing on the empty side would fail, so emit a majority-label leaf.
    if groups is None or not groups[1] or not groups[3]:
        return {'label': Counter(y).most_common(1)[0][0]}

    left_X, left_y, right_X, right_y = groups

    node['left'] = build_cart(left_X, left_y, max_depth, depth + 1)
    node['right'] = build_cart(right_X, right_y, max_depth, depth + 1)

    return node

In [4]:
# Load bank.csv and encode every column numerically
header, raw_data = read_csv('bank.csv')
encoded_data, encoders = encode_data(header, raw_data)

# Positional split; only the training slice is used in this cell
train_set, val_set, pred_set = split_dataset(encoded_data)
X_train, y_train = split_features_labels(train_set)

# Build the CART model
cart_tree = build_cart(X_train, y_train, max_depth=5)

# Save the trained CART model and encoders to files.
# Each confirmation is printed only after its `with` block has closed the
# file, so the message is never shown for a file that failed to flush.
with open('cart_model.pkl', 'wb') as file:
    pickle.dump(cart_tree, file)
print("CART model saved as cart_model.pkl")

with open('encoders.pkl', 'wb') as file:
    pickle.dump(encoders, file)
print("Encoders saved as encoders.pkl")

CART model saved as cart_model.pkl
Encoders saved as encoders.pkl
