The goal of this notebook is to code a decision tree classifier that can be used with the following API:


# Import Statements

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import random
from pprint import pprint

In [4]:
%matplotlib inline
sns.set_style("darkgrid")

print("Set Up Completed !")

Set Up Completed !


# Load and Prepare Data

#### Format of the data
- the last column of the data frame must contain the label and it must also be called "label"
- there should be no missing values in the data frame

In [7]:
import os

os.chdir("../data/")
%pwd

'c:\\Users\\P52s\\Documents\\Obsidian\\Semestre 1 IDSCC5\\5eme année\\Data mining\\TPs\\data'

In [9]:
df = pd.read_csv("Churn_Modelling.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Train-Test-Split

In [10]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

In [11]:
random.seed(0)
train_df, test_df = train_test_split(df, test_size=20)

train_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [12]:
test_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6311,6312,15798429,Hernandez,741,France,Male,29,8,0.0,2,1,1,115994.52,0
6890,6891,15673877,Murray,490,France,Male,39,1,0.0,3,1,0,171060.01,1
663,664,15788659,Howells,695,France,Male,46,4,0.0,2,1,1,137537.22,0
4242,4243,15746553,Castles,526,Germany,Male,50,5,124233.24,1,0,1,159456.87,1
8376,8377,15704657,Denman,601,France,Male,39,3,72647.64,1,1,0,41777.9,1


In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

def apply_one_hot_encoding(df, columns_to_encode):
    transformer = ColumnTransformer(
        transformers=[
            ("onehot", OneHotEncoder(), columns_to_encode)
        ],
        remainder='passthrough'
    )
    df_encoded = transformer.fit_transform(df)
    
    # Get the one-hot encoded column names
    encoded_columns = []
    for i, col in enumerate(columns_to_encode):
        unique_values = df[col].unique()
        for unique_value in unique_values:
            encoded_columns.append(f"{col}_{unique_value}")

    new_columns = encoded_columns + list(df.columns.drop(columns_to_encode))
    return pd.DataFrame(df_encoded, columns=new_columns)

columns_to_encode = ["Geography", "Gender"]

train_df_encoded = apply_one_hot_encoding(train_df, columns_to_encode)
test_df_encoded = apply_one_hot_encoding(test_df, columns_to_encode)



train_df_encoded.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,Gender_Female,RowNumber,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,0,0,0,1,6312,15798429,Hernandez,741,29,8,0.0,2,1,1,115995.0,0
1,1,0,0,0,1,6891,15673877,Murray,490,39,1,0.0,3,1,0,171060.0,1
2,1,0,0,0,1,664,15788659,Howells,695,46,4,0.0,2,1,1,137537.0,0
3,0,1,0,0,1,4243,15746553,Castles,526,50,5,124233.0,1,0,1,159457.0,1
4,1,0,0,0,1,8377,15704657,Denman,601,39,3,72647.6,1,1,0,41777.9,1


In [16]:
test_df_encoded.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,Gender_Female,RowNumber,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,0,0,0,1,6312,15798429,Hernandez,741,29,8,0.0,2,1,1,115995.0,0
1,1,0,0,0,1,6891,15673877,Murray,490,39,1,0.0,3,1,0,171060.0,1
2,1,0,0,0,1,664,15788659,Howells,695,46,4,0.0,2,1,1,137537.0,0
3,0,1,0,0,1,4243,15746553,Castles,526,50,5,124233.0,1,0,1,159457.0,1
4,1,0,0,0,1,8377,15704657,Denman,601,39,3,72647.6,1,1,0,41777.9,1


# Helper Functions

The helper functions operate on a NumPy 2d-array. Therefore, let’s create a variable called “data” to see what we will be working with.

In [17]:
data = train_df.values
data[:5]

array([[1, 15634602, 'Hargrave', 619, 'France', 'Female', 42, 2, 0.0, 1,
        1, 1, 101348.88, 1],
       [2, 15647311, 'Hill', 608, 'Spain', 'Female', 41, 1, 83807.86, 1,
        0, 1, 112542.58, 0],
       [3, 15619304, 'Onio', 502, 'France', 'Female', 42, 8, 159660.8, 3,
        1, 0, 113931.57, 1],
       [4, 15701354, 'Boni', 699, 'France', 'Female', 39, 1, 0.0, 2, 0,
        0, 93826.63, 0],
       [5, 15737888, 'Mitchell', 850, 'Spain', 'Female', 43, 2,
        125510.82, 1, 1, 1, 79084.1, 0]], dtype=object)

### Data pure?

In [18]:
def check_purity(data):
    
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

### Classify

In [19]:
def classify_data(data):
    
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    
    return classification

### Potential splits?

In [20]:
def get_potential_splits(data):
    
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):        # excluding the last column which is the label
        potential_splits[column_index] = []
        values = data[:, column_index]
        unique_values = np.unique(values)

        for index in range(len(unique_values)):
            if index != 0:
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2
                
                potential_splits[column_index].append(potential_split)
    
    return potential_splits

### Split Data

In [21]:
def split_data(data, split_column, split_value):
    
    split_column_values = data[:, split_column]

    data_below = data[split_column_values <= split_value]
    data_above = data[split_column_values >  split_value]
    
    return data_below, data_above

### Lowest Overall Entropy?

In [22]:
def calculate_entropy(data):
    
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
     
    return entropy

In [23]:
def calculate_overall_entropy(data_below, data_above):
    
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n

    overall_entropy =  (p_data_below * calculate_entropy(data_below) 
                      + p_data_above * calculate_entropy(data_above))
    
    return overall_entropy

In [24]:
def determine_best_split(data, potential_splits):
    
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

# Decision Tree Algorithm

### Representation of the Decision Tree

In [25]:
sub_tree = {"question": ["yes_answer", 
                         "no_answer"]}

### Algorithm

In [18]:
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5):
    
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df           
    
    
    # base cases
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
        # instantiate sub-tree
        feature_name = COLUMN_HEADERS[split_column]
        question = "{} <= {}".format(feature_name, split_value)
        sub_tree = {question: []}
        
        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base cases).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree

## Black box



In [27]:
y=train_df["Exited"]
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 9980, dtype: int64

In [34]:
X=pd.read_csv("Phurn_data_modeling_Features.csv")
y=pd.read_csv("Phurn_data_modeling_target.csv")

X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [35]:
y.head()

Unnamed: 0,Exited
0,1
1,0
2,1
3,0
4,0


In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.781
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      1607
           1       0.45      0.50      0.47       393

    accuracy                           0.78      2000
   macro avg       0.66      0.68      0.67      2000
weighted avg       0.79      0.78      0.79      2000

