# Decision Tree Classifier

**Import Required Packages**

In [60]:
import pandas as pd
import math
import numpy as np
import sklearn.metrics as metrics
import random
from sklearn.model_selection import KFold

In [62]:
def read_input_file(inputFile: str, features : list = None, column_names: bool = None):
  '''Read a CSV file into a DataFrame.

  Parameters
  ----------
  inputFile : str
    Input file name or path
  features : list, default = None
    Column names to assign to the data. Only used when ``column_names`` is truthy.
  column_names : {True, False}, default = None
    If truthy, ``features`` is passed to pandas as the column names (so, when
    ``features`` is given, every row of the file is read as data).
    If False or None, the first row of the file is used as the header.

  Return
  ------
  input_dataframe : pd.DataFrame
    returns input data as dataframe
  '''
  # A plain truthiness test guarantees that exactly one branch runs, so a
  # DataFrame is always returned regardless of the value passed for the flag.
  if column_names:
    return pd.read_csv(inputFile, names=features, index_col=None)
  return pd.read_csv(inputFile)

**Map the features of the input dataset by reading the feature-mapping file**

In [78]:
# Load the feature-mapping table; its first row is used as the header.
features = read_input_file(
    '../Data/featuremapping.csv',
    features=None,
    column_names=False,
)

In [79]:
# Group the feature names by role, using the columns of the feature-mapping table.
features_list = list(features['Features'])
feature_mapping = {
    'continuous': list(features.loc[features['Type'] == 'continuous', 'Features']),
    'discrete': list(features.loc[features['Type'] == 'discrete', 'Features']),
    'missing': list(features.loc[features['Missing Values'] == 'yes', 'Features']),
    'target_column': list(features.loc[features['Target'] == 'Yes', 'Features']),
}

# Everything downstream reads feature_mapping['target'], so fail here with a
# clear message unless the mapping flags exactly one target column.
target_columns = feature_mapping['target_column']
if len(target_columns) != 1:
    raise ValueError(
        f'Exactly one target column is required. '
        f'The provided feature mapping has {len(target_columns)} target columns'
    )
feature_mapping['target'] = target_columns[0]

**Read the credit approval training and test datasets**

In [81]:
# Both files are read with the mapped feature names supplied as column names.
train_data, test_data = (
    read_input_file(data_path, features=features_list, column_names=True)
    for data_path in ('../Data/training.data', '../Data/test.data')
)

**Class for data preprocessing**

In [82]:
class preprocessing:
  '''Imputation of the '?' placeholder in the train and test datasets.'''

  def fill_nan(self, train_data, test_data):
    '''Function to replace the '?' in dataset with a value learnt from the training data.

    Continuous features (``feature_mapping["continuous"]``) get the median of the
    known training values and are converted to float. Discrete features
    (``feature_mapping["discrete"]``), except the target column, get the middle
    entry of the sorted distinct training values. Columns in neither list are
    left untouched. The same fill value is applied to both datasets.

    Parameters
    ----------
    train_data : pd.DataFrame
      Dataset for training the model
    test_data : pd.DataFrame
      Dataset for testing the model

    Return
    ------
    train_data : pd.DataFrame
      Training dataset without '?'. The passed frame is updated column by column and returned.
    test_data : pd.DataFrame
      Testing dataset without '?', filled with the values calculated from the training dataset.
      The passed frame is updated column by column and returned.
    '''
    # The class column is never imputed; fall back to the historical name when
    # the mapping does not carry a 'target' entry.
    target = feature_mapping.get("target", "A16")
    for cols in train_data.columns:
      train_missing = train_data[cols] == "?"
      if cols in feature_mapping["continuous"]:
        test_missing = test_data[cols] == "?"
        # Blank out the placeholder before converting: the column may hold
        # numeric strings, which cannot be averaged directly.
        train_numeric = pd.to_numeric(train_data[cols].mask(train_missing)).astype(float)
        test_numeric = pd.to_numeric(test_data[cols].mask(test_missing)).astype(float)
        median = train_numeric.median()
        # Assign whole columns instead of calling replace(inplace=True) on a
        # column selection, which may operate on a temporary copy.
        train_data[cols] = train_numeric.mask(train_missing, median)
        test_data[cols] = test_numeric.mask(test_missing, median)
      elif cols in feature_mapping["discrete"] and cols != target:
        unique_values = sorted(train_data.loc[~train_missing, cols].dropna().unique())
        if not unique_values:
          # No known value to impute with; leave the column as it is.
          continue
        index = math.ceil((len(unique_values)+1)/2)
        fill_value = unique_values[index-1]
        train_data[cols] = train_data[cols].mask(train_missing, fill_value)
        test_data[cols] = test_data[cols].mask(test_data[cols] == "?", fill_value)
    return train_data, test_data

**Data Preprocessing**

In [83]:
# Instantiate the preprocessing helper and impute the '?' placeholders in both datasets.
preObj = preprocessing()
train_data, test_data = preObj.fill_nan(train_data=train_data, test_data=test_data)

**Class for decision tree helper functions**

In [84]:
class decisionTreeHelper:
  '''Split-selection and scoring helpers used by the decision tree classifier.

  Features are scored with information gain (ID3), gain ratio (C4.5) or the
  Gini index (CART). Features listed in the module-level
  ``feature_mapping["continuous"]`` are scored with a binary threshold split;
  every other feature is scored with one branch per distinct value.
  The frames passed to the scoring methods must hold the class in a column
  named ``label`` placed last.
  '''

  def __init__(self, criterion : str = 'C4.5'):
    '''Store the split criterion ('ID3', 'C4.5' or 'CART') and reset the score caches.'''
    self.__gain = {}
    self.__gainratio = {}
    self.__giniindex = {}
    self.__criterion = criterion
    self.data = None

  def merge_data_label(self, data: pd.DataFrame, class_label: pd.DataFrame):
    '''Function to merge the dataset features with class label.

    Parameters
    ----------
    data : pd.DataFrame
      All feature columns in the dataset except class label.
    class_label : pd.DataFrame
      Class label column in the dataset.

    Return
    ------
    data : pd.DataFrame
      Copy of the feature columns with the class label appended as the last column, named "label".
    '''
    # Work on a copy so the caller's frame is left untouched and the same
    # frame can be merged more than once.
    self.data = data.copy()
    self.data.insert(len(self.data.columns), "label", list(class_label.values), True)
    return self.data

  @staticmethod
  def __plogp(fraction : float):
    '''Return ``fraction * log2(fraction)``; an empty partition (fraction 0) contributes 0.'''
    if fraction == 0:
      return 0.0
    return fraction * math.log2(fraction)

  def __calculate_entropy(self, data : pd.DataFrame):
    '''Function to calculate entropy.

    Parameters
    ----------
    data : pd.DataFrame
      Input dataset with a "label" column.

    Return
    ------
    entropy : float
      Entropy of the "label" column; 0.0 for an empty dataset.
    '''
    total_rows = len(data)
    entropy = 0.0
    for class_count in data.groupby('label').size():
      if class_count == 0:
        continue
      precentage = class_count / total_rows
      entropy -= precentage * math.log2(precentage)
    return entropy

  def __best_con_feature_split(self, data : pd.DataFrame, feature_name : str):
    '''Function to find the best split for continous features.

    Candidate thresholds are the midpoints between consecutive sorted values
    of the feature. The best score found is also stored in the per-feature
    score cache of the active criterion.

    Parameters
    ----------
    data : pd.DataFrame
      Input dataset with a "label" column.
    feature_name : str
      Name of the continuous feature to split on.

    Return
    ------
    best_gain_split_value : float
      Best split value based on information gain(ID3); None for CART
    best_gain_ratio_split_value : float
      Best split value based on gain ratio(C4.5); None for CART
    best_gini_split_value : float
      Best split value based on gini index(CART); None for ID3 / C4.5
    '''
    gain = {}
    gainratio = {}
    giniindex = {}
    best_gain_split_value = None
    best_gain_ratio_split_value = None
    best_gini_split_value = None
    use_entropy = self.__criterion == 'ID3' or self.__criterion == 'C4.5'
    use_gini = self.__criterion == 'CART'

    # Sort once; the set removes duplicate midpoints produced by repeated values.
    ordered_values = sorted(data[feature_name])
    con_value_split_lists = set([(low + high) / 2 for low, high in zip(ordered_values, ordered_values[1:])])

    total_rows = len(data)
    total_entropy = self.__calculate_entropy(data) if use_entropy else 0.0
    feature_column = data[feature_name]
    for split_value in con_value_split_lists:
      # Each candidate needs both partitions; build them once per candidate.
      less_part = data[feature_column <= split_value]
      more_part = data[feature_column > split_value]
      weight_less = len(less_part) / total_rows
      weight_more = len(more_part) / total_rows
      if use_entropy:
        feature_entropy = (weight_less * self.__calculate_entropy(less_part)) + (weight_more * self.__calculate_entropy(more_part))
        feature_intrinsic_value = -(self.__plogp(weight_less) + self.__plogp(weight_more))
        gain[split_value] = total_entropy - feature_entropy
        if feature_intrinsic_value != 0.0:
          gainratio[split_value] = gain[split_value]/feature_intrinsic_value
        else:
          # One-sided split: no information about the partitioning itself.
          gainratio[split_value] = 0.0
      if use_gini:
        giniindex[split_value] = (weight_less * self.__calculate_gini(less_part)) + (weight_more * self.__calculate_gini(more_part))

    if use_entropy:
      best_gain_split_value = max(gain, key=gain.get)
      best_gain_ratio_split_value = max(gainratio, key=gainratio.get)
      self.__gain[feature_name] = gain[best_gain_split_value]
      self.__gainratio[feature_name] = gainratio[best_gain_ratio_split_value]
    if use_gini:
      best_gini_split_value = min(giniindex, key=giniindex.get)
      self.__giniindex[feature_name] = giniindex[best_gini_split_value]

    return best_gain_split_value, best_gain_ratio_split_value, best_gini_split_value

  def __calculate_gain_ratio(self, data : pd.DataFrame):
    '''Function to calculate gain and gain ratio.

    Every column except the last one (the label) is scored.

    Parameters
    ----------
    data : pd.DataFrame
      Input dataset with "label" as its last column.

    Return
    ------
    best_gain_split_value : dict
      Dict of best split based on gain for the continuous features in the input data
    best_gain_ratio_split_value : dict
      Dict of best split based on gain ratio for the continuous features in the input data
    self.__gain :
      Dict of gain for all the  features in the  input data
    self.__gainratio :
      Dict of gain ratio for all the  features in the  input data
    '''
    self.__gain = {}
    self.__gainratio = {}
    best_gain_split_value = {}
    best_gain_ratio_split_value = {}
    total_entropy = self.__calculate_entropy(data)
    total_rows = len(data)
    for feature_name in data.columns[:-1]:
      if feature_name in feature_mapping["continuous"]:
        best_gain_split_value[feature_name], best_gain_ratio_split_value[feature_name], _ = self.__best_con_feature_split(data, feature_name)
        continue
      feature_entropy = 0.0
      feature_intrinsic_value = 0.0
      for _, featureValueGroup in data.groupby(feature_name):
        featureValueprecentage = len(featureValueGroup) / total_rows
        feature_entropy += featureValueprecentage * self.__calculate_entropy(featureValueGroup)
        feature_intrinsic_value -= featureValueprecentage * math.log2(featureValueprecentage)
      self.__gain[feature_name] = total_entropy - feature_entropy
      if feature_intrinsic_value != 0.0:
        self.__gainratio[feature_name] = self.__gain[feature_name]/feature_intrinsic_value
      else:
        # The feature takes a single value here, so it cannot split the data.
        self.__gainratio[feature_name] = 0.0
    return best_gain_split_value, best_gain_ratio_split_value, self.__gain, self.__gainratio

  def __calculate_gini(self, data : pd.DataFrame):
    '''Function to calculate gini.

    Parameters
    ----------
    data : pd.DataFrame
      Input dataset with a "label" column.

    Return
    ------
    gini : float
      Gini impurity of the "label" column (1 minus the sum of squared class shares)
    '''
    total_rows = len(data)
    squared_shares = 0.0
    for class_count in data.groupby('label').size():
      squared_shares += math.pow(class_count / total_rows, 2)
    return 1 - squared_shares

  def __calculate_gini_index(self, data : pd.DataFrame):
    '''Function to calculate gini index.

    Every column except the last one (the label) is scored.

    Parameters
    ----------
    data : pd.DataFrame
      Input dataset with "label" as its last column.

    Return
    ------
    best_gini_split_value : dict
      Dict of best split based on gini index for the continuous features in the input data
    self.__giniindex :
      Dict of gini index for all the  features in the  input data
    '''
    self.__giniindex = {}
    best_gini_split_value = {}
    total_rows = len(data)
    for feature_name in data.columns[:-1]:
      if feature_name in feature_mapping["continuous"]:
        _, _, best_gini_split_value[feature_name] = self.__best_con_feature_split(data, feature_name)
        continue
      featureGini = 0.0
      for _, featureValueGroup in data.groupby(feature_name):
        featureValueprecentage = len(featureValueGroup) / total_rows
        featureGini += featureValueprecentage * self.__calculate_gini(featureValueGroup)
      self.__giniindex[feature_name] = featureGini
    return best_gini_split_value, self.__giniindex

  def get_best_feature(self, train_values : pd.DataFrame, feature_list : list):
    '''Function to find the best feature based on input criterion(ID3, C4.5, CART).

    Parameters
    ----------
    train_values : pd.DataFrame
      Input dataset as pd.DataFrame, with "label" as its last column.
    feature_list : list
      Candidate column names, including 'label' (which is removed before scoring).

    Return
    ------
    best_split_value : dict
      Best threshold of every continuous feature in ``train_values`` for the active criterion
    best_feature : str
      Candidate feature with the highest gain (ID3), highest gain ratio (C4.5) or lowest gini index (CART)

    Raises
    ------
    ValueError
      If the criterion is not one of 'ID3', 'C4.5' or 'CART'.
    '''
    if self.__criterion not in ('ID3', 'C4.5', 'CART'):
      # Fail loudly: without a valid criterion there is nothing to return.
      raise ValueError(f'Provide valid value(Either ID3, C4.5 or CART) for criterion. {self.__criterion} is invalid')
    feature_list = sorted(feature_list)
    feature_list.remove('label')
    if self.__criterion == 'C4.5':
      _, best_split_value, _, _ = self.__calculate_gain_ratio(train_values)
      scores = {key: self.__gainratio[key] for key in feature_list}
      best_feature = max(scores, key=scores.get)
    elif self.__criterion == 'CART':
      best_split_value, _ = self.__calculate_gini_index(train_values)
      scores = {key: self.__giniindex[key] for key in feature_list}
      best_feature = min(scores, key=scores.get)
    else:
      best_split_value, _, _, _ = self.__calculate_gain_ratio(train_values)
      scores = {key: self.__gain[key] for key in feature_list}
      best_feature = max(scores, key=scores.get)
    return best_split_value, best_feature

  def get_performance_score(self, actual_label : list, predicted_label : list):
    '''Function to calculate the performance metric using sklearn.

    The labels '+' and '-' are mapped to 1 and 0 before scoring.

    Parameters
    ----------
    actual_label : pd.Series
      Actual(Ground Truth) class label from the dataset.
    predicted_label : list
      Class label predicted by the model

    Return
    ------
    scores : list
      [f1_score, accuracy, precision, recall, AUROC]
    '''
    actual_label = actual_label.replace('+',1)
    actual_label = actual_label.replace('-',0)
    predicted_label = pd.DataFrame(predicted_label).replace('+',1)
    predicted_label = pd.DataFrame(predicted_label).replace('-',0)
    precision = metrics.precision_score(actual_label, predicted_label, pos_label=1)
    recall = metrics.recall_score(actual_label, predicted_label,pos_label=1)
    AUROC = metrics.roc_auc_score(actual_label, predicted_label)
    accuracy = metrics.accuracy_score(actual_label, predicted_label)
    f1_score = metrics.f1_score(actual_label, predicted_label,pos_label=1)
    return [f1_score, accuracy, precision, recall, AUROC]



In [85]:
class decisionTreeClassifier:
    '''Decision tree classifier built with the ID3, C4.5 or CART split criterion.

    The fitted tree is a nested dict ``{feature: {branch: subtree_or_label}}``.
    Branches of discrete features are keyed by the feature value; branches of
    continuous features (``feature_mapping['continuous']``) are keyed by the
    strings ``'<=threshold'`` and ``'>threshold'``.
    '''

    def __init__(self, criterion: str = 'C4.5'):
        '''Store the split criterion; the tree is created by ``fit``.'''
        self.__criterion = criterion
        self.tree = None
        self.data = None
        self.train_label = None
        self.predicted_labels = None

    def built_decision_tree(self, train_values : pd.DataFrame, train_features : pd.DataFrame, parent_data : pd.DataFrame):
        '''Recursive function to build the decision tree.

        Parameters
        ----------
        train_values : pd.DataFrame
            Filtered training dataset from parent data to predict the value. The values in train_values and parent_data will be the same in the first iteration
        train_features : pd.DataFrame
            List of columns still available for splitting, including 'label'
        parent_data : pd.DataFrame
            Subset of training data set from previous iteration. In the first iteration it will be entire training data set

        Return
        ------
        tree : dict
            Decision tree that will used to predict, or a bare class label when the node is a leaf
        '''
        if len(train_values) == 0:
            # Empty branch: fall back to the most frequent class of the parent.
            return parent_data['label'].value_counts().index[0]
        observed_labels = train_values['label'].unique()
        if len(observed_labels) == 1:
            return observed_labels[0]
        remaining_features = list(train_features)
        if len(remaining_features) == 1:
            # Only 'label' is left, so no feature remains to split on.
            # NOTE(review): this uses the parent's majority class rather than
            # the majority of train_values; kept as-is to preserve results.
            return parent_data['label'].value_counts().index[0]

        helper = decisionTreeHelper(self.__criterion)
        best_split_value, best_feature = helper.get_best_feature(train_values, train_features)
        index_of_feature = remaining_features.index(best_feature)
        child_features = remaining_features[:index_of_feature] + remaining_features[index_of_feature+1:]

        tree = {best_feature: {}}
        if best_feature not in feature_mapping['continuous']:
            # One branch per value seen in the full training data, so the tree
            # also covers values that are absent from this subset.
            for values in sorted(self.data[best_feature].unique()):
                train_values_filter = train_values[train_values[best_feature] == values]
                tree[best_feature][values] = self.built_decision_tree(train_values_filter, child_features, train_values)
        else:
            threshold = best_split_value[best_feature]
            feature_column = train_values[best_feature]
            partitions = (('<=', feature_column <= threshold), ('>', feature_column > threshold))
            for condition, row_mask in partitions:
                subtree = self.built_decision_tree(train_values[row_mask], child_features, train_values)
                tree[best_feature][str(condition)+str(threshold)] = subtree
        return tree

    def fit(self, train_data : pd.DataFrame, train_label : pd.DataFrame):
        '''Fit function to train the model.

        Parameters
        ----------
        train_data : pd.DataFrame
            Training dataset without the class label column
        train_label : pd.DataFrame
            Class label of every row in ``train_data``

        Return
        ------
        self.tree : dict
            Decision tree that will used to predict
        '''
        helper = decisionTreeHelper(self.__criterion)
        # Merge into a copy so the caller's frame does not gain a 'label' column.
        self.data = helper.merge_data_label(train_data.copy(), train_label)
        self.train_label = train_label
        self.tree = self.built_decision_tree(self.data, self.data.columns, self.data)
        return self.tree

    def predict(self, test_data : pd.DataFrame):
        '''Predict function to predict class label from trained model.

        Parameters
        ----------
        test_data : pd.DataFrame or dict
            Rows to classify: a DataFrame, or a dict mapping a row key to a dict of feature values

        Return
        ------
        predicted_labels : list
            List of predicted labels, one per row, in row order (empty for empty input)
        '''
        self.predicted_labels = []
        if isinstance(test_data, pd.DataFrame):
            # 'records' keeps every row even when the index has duplicates.
            test_rows = test_data.to_dict(orient='records')
        else:
            test_rows = list(test_data.values())
        for test_row in test_rows:
            self.predict_each_label(self.tree, test_row)
        return self.predicted_labels

    def predict_each_label(self, tree, test_row):
        '''Predict the class label of one row and append it to ``self.predicted_labels``.

        Parameters
        ----------
        tree : dict
            Decision tree generated from fit function (or a bare label for a leaf-only tree)
        test_row : dict
            Feature values of the row, keyed by feature name

        Return
        ------
        self.predicted_labels : list
            List of predicted labels, including the one just added
        '''
        if self.predicted_labels is None:
            self.predicted_labels = []
        self.predicted_labels.append(self._resolve_label(tree, test_row))
        return self.predicted_labels

    def _resolve_label(self, tree, test_row):
        '''Walk the tree from ``tree`` down to a leaf for ``test_row`` and return the leaf label.'''
        node = tree
        while isinstance(node, dict):
            columnName = list(node.keys())[0]
            decisionNodes = node[columnName]
            valValue = test_row.get(columnName)
            if columnName in feature_mapping['continuous']:
                node = decisionNodes[self._matching_branch(decisionNodes, valValue, columnName)]
            else:
                if valValue not in decisionNodes:
                    # Value without a branch: pick a branch at random.
                    valValue = random.choice(list(decisionNodes.keys()))
                node = decisionNodes[valValue]
        return node

    @staticmethod
    def _matching_branch(decisionNodes, value, columnName):
        '''Return the '<=t' / '>t' branch key of ``decisionNodes`` that ``value`` satisfies.'''
        # The threshold is parsed from the branch key and compared directly;
        # row values are never passed to eval().
        number = float(value)
        for branch_key in decisionNodes:
            if branch_key.startswith('<='):
                matched = number <= float(branch_key[2:])
            elif branch_key.startswith('>'):
                matched = number > float(branch_key[1:])
            else:
                raise ValueError(f'Unexpected branch {branch_key!r} for continuous feature {columnName}')
            if matched:
                return branch_key
        raise ValueError(f'Value {value!r} of continuous feature {columnName} matches no branch')

    def cross_val_score(self, train_data : pd.DataFrame, splits : int = 10, shuffle : bool = False):
        '''Function to perform cross validation.

        Parameters
        ----------
        train_data : pd.DataFrame
            Training dataset including the target column
        splits : int, default = 10
            Number split the train data
        shuffle : bool, default = False
            To randomize the split. Setting the shuffle argument to False will not randomize the split

        Return
        ------
        kfold_index : Dict
            Dictionary of each split indexes ([train_index, test_index] per fold)
        performance_scores
            Dictionary of performance metrics of each split
        '''
        helper = decisionTreeHelper(self.__criterion)
        kfold_split = KFold(n_splits = splits, shuffle = shuffle)
        target = feature_mapping['target']
        performance_scores = {}
        kfold_index = {}
        for fold_index, (train_index, test_index) in enumerate(kfold_split.split(train_data)):
            train_set = train_data.iloc[train_index]
            self.fit(train_set.drop([target], axis=1), train_set[target])
            dev_set = train_data.iloc[test_index]
            predicted_label = self.predict(dev_set.drop([target], axis=1))
            performance_scores[fold_index] = helper.get_performance_score(dev_set[target], predicted_label)
            kfold_index[fold_index] = [train_index, test_index]
        return kfold_index, performance_scores

    def best_estimator(self, kfold_index, performance_scores, train_data):
        '''Function to train the model with best split from cross validation.

        The fold with the highest F1 score is selected and the model is refitted
        on the training rows of that fold.

        Parameters
        ----------
        kfold_index : Dict
            Dictionary of each split indexes
        performance_scores
            Dictionary of performance metrics of each split
        train_data : pd.DataFrame
            Training dataset including the target column

        Return
        ------
        self.tree : dict
            Decision tree that will used to predict
        '''
        performance_metrics = pd.DataFrame(performance_scores).T
        performance_metrics = performance_metrics.rename(columns={0:'F1',1:'Accuracy',2:'Precision',3:'Recall',4:'AUROC'})
        # idxmax returns the fold key itself, so the lookup stays valid for any keys.
        best_fold = performance_metrics['F1'].idxmax()
        train_index, _ = kfold_index[best_fold]
        target = feature_mapping['target']
        train_set = train_data.iloc[train_index]
        return self.fit(train_set.drop([target], axis=1), train_set[target])

**Main class to train and test the decision tree using the ID3, C4.5 and CART criteria**

In [86]:
class main:
    '''Train, cross-validate and test the decision tree for each split criterion.

    Uses the module-level ``train_data``, ``test_data`` and ``feature_mapping``.
    '''

    # Positional metric order returned by decisionTreeHelper.get_performance_score.
    _METRIC_COLUMNS = {0:'F1',1:'Accuracy',2:'Precision',3:'Recall',4:'AUROC'}

    def _run_criterion(self, criterion):
        '''Cross-validate, refit on the best fold and score the test set for one criterion.

        Return
        ------
        cross_val_score : pd.DataFrame
            Performance metrics of every cross validation fold
        final_performance_metrics : pd.DataFrame
            One row with the criterion name and the test set metrics
        tree : dict
            Decision tree trained on the best fold
        '''
        classifier = decisionTreeClassifier(criterion)
        helper_dto = decisionTreeHelper(criterion)
        kfold_index, performance_scores = classifier.cross_val_score(train_data)
        cross_val_score = pd.DataFrame(performance_scores).T
        cross_val_score = cross_val_score.rename(columns=self._METRIC_COLUMNS)
        tree = classifier.best_estimator(kfold_index, performance_scores, train_data)
        target = feature_mapping['target']
        test_set = test_data.drop(target, axis = 1)
        actual_label = test_data[target]
        predicted_label = classifier.predict(test_set)
        final_performance_metrics = helper_dto.get_performance_score(actual_label, predicted_label)
        final_performance_metrics = pd.DataFrame(final_performance_metrics).T
        final_performance_metrics.insert(0,'Criterion',criterion)
        final_performance_metrics = final_performance_metrics.rename(columns=self._METRIC_COLUMNS)
        return cross_val_score, final_performance_metrics, tree

    def id3(self, criterion='ID3'):
        '''Run the ID3 pipeline. ``criterion`` is accepted for compatibility and ignored,
        so the reported 'Criterion' always matches the model that was trained.'''
        return self._run_criterion('ID3')

    def c45(self, criterion='C4.5'):
        '''Run the C4.5 pipeline. ``criterion`` is accepted for compatibility and ignored.'''
        return self._run_criterion('C4.5')

    def cart(self, criterion='CART'):
        '''Run the CART pipeline. ``criterion`` is accepted for compatibility and ignored.'''
        return self._run_criterion('CART')

In [87]:
# Train and evaluate one decision tree per split criterion (ID3, C4.5, CART).
main_obj = main()
id3_cross_val_score, id3_final_score, id3_tree = main_obj.id3(criterion='ID3')
c45_cross_val_score, c45_final_score, c45_tree = main_obj.c45(criterion='C4.5')
cart_cross_val_score, cart_final_score, cart_tree = main_obj.cart(criterion='CART')

In [88]:
# Per-fold cross validation metrics for ID3, rounded to three decimals.
print('Performance metrics of all 10 splits from cross validation performed using ID3 criterion')
id3_cross_val_score.round(decimals=3)

Performance metrics of all 10 splits from cross validation performed using ID3 criterion


Unnamed: 0,F1,Accuracy,Precision,Recall,AUROC
0,0.737,0.818,0.7,0.778,0.808
1,0.92,0.927,0.92,0.92,0.927
2,0.64,0.673,0.593,0.696,0.676
3,0.75,0.782,0.72,0.783,0.782
4,0.683,0.764,0.875,0.56,0.747
5,0.833,0.855,0.8,0.87,0.857
6,0.741,0.745,0.741,0.741,0.745
7,0.791,0.836,0.944,0.68,0.823
8,0.706,0.727,0.818,0.621,0.733
9,0.75,0.745,0.7,0.808,0.749


In [89]:
# Test set metrics for ID3, rounded to three decimals.
print(
    'ID3 final performance metrics based on prediction made from test set \n'
    ' Best split from cross validation is used to train and test the model on test dataset'
)
id3_final_score.round(decimals=3)

ID3 final performance metrics based on prediction made from test set 
 Best split from cross validation is used to train and test the model on test dataset


Unnamed: 0,Criterion,F1,Accuracy,Precision,Recall,AUROC
0,ID3,0.806,0.821,0.788,0.825,0.822


In [90]:
# Per-fold cross validation metrics for C4.5, rounded to three decimals.
print('Performance metrics of all 10 splits from cross validation performed using C4.5 criterion')
c45_cross_val_score.round(decimals=3)

Performance metrics of all 10 splits from cross validation performed using C4.5 criterion


Unnamed: 0,F1,Accuracy,Precision,Recall,AUROC
0,0.769,0.836,0.714,0.833,0.836
1,0.846,0.855,0.815,0.88,0.857
2,0.84,0.855,0.778,0.913,0.863
3,0.87,0.891,0.87,0.87,0.888
4,0.698,0.764,0.833,0.6,0.75
5,0.846,0.855,0.759,0.957,0.869
6,0.8,0.8,0.786,0.815,0.8
7,0.818,0.855,0.947,0.72,0.843
8,0.836,0.836,0.885,0.793,0.839
9,0.786,0.782,0.733,0.846,0.785


In [91]:
# Test set metrics for C4.5, rounded to three decimals.
print(
    'C4.5 final performance metrics based on prediction made from test set \n'
    ' Best split from cross validation is used to train and test the model on test dataset'
)
c45_final_score.round(decimals=3)

C4.5 final performance metrics based on prediction made from test set 
 Best split from cross validation is used to train and test the model on test dataset


Unnamed: 0,Criterion,F1,Accuracy,Precision,Recall,AUROC
0,C4.5,0.8,0.821,0.806,0.794,0.819


In [92]:
# Per-fold cross validation metrics for CART, rounded to three decimals.
print('Performance metrics of all 10 splits from cross validation performed using CART criterion')
cart_cross_val_score.round(decimals=3)

Performance metrics of all 10 splits from cross validation performed using CART criterion


Unnamed: 0,F1,Accuracy,Precision,Recall,AUROC
0,0.737,0.818,0.7,0.778,0.808
1,0.739,0.782,0.81,0.68,0.773
2,0.766,0.8,0.75,0.783,0.798
3,0.698,0.764,0.75,0.652,0.748
4,0.809,0.836,0.864,0.76,0.83
5,0.784,0.8,0.714,0.87,0.81
6,0.755,0.764,0.769,0.741,0.763
7,0.8,0.836,0.9,0.72,0.827
8,0.72,0.745,0.857,0.621,0.753
9,0.704,0.709,0.679,0.731,0.71


In [93]:
# Test set metrics for CART, rounded to three decimals.
print(
    'CART final performance metrics based on prediction made from test set \n'
    ' Best split from cross validation is used to train and test the model on test dataset'
)
cart_final_score.round(decimals=3)

CART final performance metrics based on prediction made from test set 
 Best split from cross validation is used to train and test the model on test dataset


Unnamed: 0,Criterion,F1,Accuracy,Precision,Recall,AUROC
0,CART,0.784,0.807,0.79,0.778,0.804
