In [1]:
import math
import pandas as pd
import numpy as np

class _DecisionNode:
    def __init__(self, attribute):
        # Inisialisasi simpul keputusan dengan atribut yang diberikan
        self.attribute = attribute
        self.children = {}  # Menyimpan anak-anak simpul keputusan

    def depth(self):
        # Menghitung kedalaman simpul keputusan
        if len(self.children) == 0:
            return 1
        else:
            max_depth = 0
            for child in self.children.values():
                if isinstance(child, _DecisionNode):
                    child_depth = child.depth()
                    if child_depth > max_depth:
                        max_depth = child_depth
            return max_depth + 1

    def add_child(self, value, node):
        # Menambahkan anak ke simpul keputusan dengan nilai atribut yang diberikan
        self.children[value] = node

    def count_leaves(self):
        if len(self.children) == 0:
            return 1
        else:
            count = 0
            for child in self.children.values():
                if isinstance(child, _DecisionNode):
                    count += child.count_leaves()
                else:
                    count += 1
            return count


class _LeafNode:
    def __init__(self, label, weight):
        # Inisialisasi simpul daun dengan label kelas dan bobot yang diberikan
        self.label = label
        self.weight = weight


class C45Classifier:
    def __init__(self):
        self.tree = None
        self.attributes = None
        self.data = None
        self.weight = 1

    def __calculate_entropy(self, data, weights):
        # Menghitung entropi dari dataset yang diberikan
        class_counts = {}  # Menghitung jumlah masing-masing label kelas
        total_weight = 0.0  # Menghitung total bobot data

        for i, record in enumerate(data):
            label = record[-1]  # Mengambil label kelas dari setiap data
            weight = weights[i]  # Mengambil bobot dari setiap data

            if label not in class_counts:
                class_counts[label] = 0.0
            class_counts[label] += weight
            total_weight += weight

        entropy = 0.0

        for count in class_counts.values():
            probability = count / total_weight  # Menghitung probabilitas masing-masing label kelas
            entropy -= probability * math.log2(probability)  # Menghitung kontribusi entropi dari masing-masing label

        return entropy

    def __split_data(self, data, attribute_index, attribute_value, weights):
        # Memisahkan dataset berdasarkan nilai atribut yang diberikan
        split_data = []  # Menyimpan subset data yang sesuai
        split_weights = []  # Menyimpan subset bobot yang sesuai

        for i, record in enumerate(data):
            if record[attribute_index] == attribute_value:
                split_data.append(record[:attribute_index] + record[attribute_index+1:])
                split_weights.append(weights[i])

        return split_data, split_weights

    def __select_best_attribute_c50(self, data, attributes, weights):
        # Memilih atribut terbaik untuk membagi dataset menggunakan algoritma C5.0
        total_entropy = self.__calculate_entropy(data, weights)
        best_attribute = None
        best_gain_ratio = 0.0
        split_info = 0.0
        for attribute_index in range(len(attributes)):
            attribute_values = set([record[attribute_index] for record in data])
            attribute_entropy = 0.0


            for value in attribute_values:
                subset, subset_weights = self.__split_data(data, attribute_index, value, weights)
                subset_entropy = self.__calculate_entropy(subset, subset_weights)
                subset_probability = sum(subset_weights) / sum(weights)
                attribute_entropy += subset_probability * subset_entropy
                split_info -= subset_probability * math.log2(subset_probability)


            gain = total_entropy - attribute_entropy

            if split_info != 0.0:
                gain_ratio = gain / split_info
            else:
                gain_ratio = 0.0

            if gain_ratio > best_gain_ratio:
                best_gain_ratio = gain_ratio
                best_attribute = attribute_index

        return best_attribute

    def __majority_class(self, data, weights):
        # Menentukan kelas mayoritas pada dataset
        class_counts = {}

        for i, record in enumerate(data):
            label = record[-1]
            weight = weights[i]

            if label not in class_counts:
                class_counts[label] = 0.0
            class_counts[label] += weight

        majority_class = None
        max_count = 0.0

        for label, count in class_counts.items():
            if count > max_count:
                max_count = count
                majority_class = label

        return majority_class

    def __build_decision_tree(self, data, attributes, weights):
        class_labels = set([record[-1] for record in data])

        # Base case 1: Jika semua data memiliki label kelas yang sama, return simpul daun dengan label kelas tersebut
        if len(class_labels) == 1:
            return _LeafNode(class_labels.pop(), sum(weights))

        # Base case 2: Jika tidak ada atribut lagi yang bisa dipertimbangkan, return simpul daun dengan label mayoritas
        if len(attributes) == 1:
            return _LeafNode(self.__majority_class(data, weights), sum(weights))

        # Memilih atribut terbaik untuk membagi dataset menggunakan algoritma C5.0
        best_attribute = self.__select_best_attribute_c50(data, attributes, weights)

        if best_attribute is None:
            return _LeafNode(self.__majority_class(data, weights), sum(weights))

        best_attribute_name = attributes[best_attribute]
        tree = _DecisionNode(best_attribute_name)
        attributes = attributes[:best_attribute] + attributes[best_attribute+1:]
        attribute_values = set([record[best_attribute] for record in data])

        for value in attribute_values:
            subset, subset_weights = self.__split_data(data, best_attribute, value, weights)

            if len(subset) == 0:
                # Jika subset kosong, maka buat simpul daun dengan label mayoritas dari data induk dan bobot subset
                tree.add_child(value, _LeafNode(self.__majority_class(data, weights), sum(subset_weights)))
            else:
                # Jika subset tidak kosong, rekursif membangun pohon keputusan menggunakan subset sebagai data dan atribut yang tersisa
                tree.add_child(value, self.__build_decision_tree(subset, attributes, subset_weights))

        return tree

    def __make_tree(self, data, attributes, weights):
        # Membuat pohon keputusan menggunakan dataset, atribut, dan bobot yang diberikan
        return self.__build_decision_tree(data, attributes, weights)

    def __train(self, data,weight =1):
        self.weight = weight
        # Melatih pohon keputusan menggunakan dataset yang diberikan
        self.attributes = data.columns.tolist()[:-1]  # Mendapatkan atribut dari kolom dataset
        weights = [self.weight] * len(data)  # Menginisialisasi bobot dengan nilai yang sama untuk setiap data
        self.tree = self.__make_tree(data.values.tolist(), self.attributes, weights)
        self.data = data

    def __classify(self, tree=None, instance=[]):
        if self.tree is None:
            raise Exception('Decision tree has not been trained yet!')
        # Mengklasifikasikan instance menggunakan pohon keputusan
        if tree is None:
            tree = self.tree

        if isinstance(tree, _LeafNode):
            return tree.label

        attribute = tree.attribute
        attribute_index = self.attributes.index(attribute)
        attribute_values = instance[attribute_index]

        if attribute_values in tree.children:
            # jika value
            child_node = tree.children[attribute_values]
            return self.__classify(child_node, instance)
        else:
            # jika node anak tidak ada maka akan mengambil nilai mayoritas di cabang
            class_labels = []
            for child_node in tree.children.values():
                if isinstance(child_node, _LeafNode):
                    class_labels.append(child_node.label)
            if len(class_labels) == 0:
                return self.__majority_class(self.data.values.tolist(), [1.0] * len(self.data))
            majority_class = max(set(class_labels))
            return majority_class

    def fit(self, data, label, weight =1):
        # Melatih pohon keputusan menggunakan dataset yang diberikan
        if isinstance(data, pd.DataFrame):
            data = pd.concat([data, label], axis=1)
        else:
            data = pd.DataFrame(np.c_[data, label])
        self.__train(data,weight)

    def predict(self, data):

        # check if data is dataframe
        if isinstance(data, pd.DataFrame):
            data = data.values.tolist()
        elif isinstance(data, list) and isinstance(data[0], dict):
            data = [list(d.values()) for d in data]
        #  check variables is same with attributes

        if len(data[0]) != len(self.attributes):
            raise Exception('Number of variables in data and attributes do not match!')
        # Memprediksi label kelas dari setiap data dalam dataset
        return [self.__classify(None, record) for record in data]

    def evaluate(self, x_test, y_test):
        # Mengevaluasi performa pohon keputusan menggunakan akurasi
        y_pred = self.predict(x_test)
        # print type y_test

        if isinstance(y_test, pd.Series):
            y_test = y_test.values.tolist()

    #     print every acc of each class
        acc = {}
        true_pred = 0
        real_acc ={}
        for i in range(len(y_test)):
            if y_test[i] not in real_acc:
                real_acc[y_test[i]] = 0
            real_acc[y_test[i]] += 1
            if y_test[i] == y_pred[i]:
                if y_test[i] not in acc:
                    acc[y_test[i]] = 0
                acc[y_test[i]] += 1
                true_pred += 1
        for key in acc:
            acc[key] /= real_acc[key]
    #     mean acc
        total_acc = true_pred / len(y_test)
        print("Evaluation result: ")
        print("Total accuracy: ", total_acc)
        for key in acc:
            print("Accuracy ", key, ": ", acc[key])

    def generate_tree_diagram(self, graphviz,filename):
        # Menghasilkan diagram pohon keputusan menggunakan modul graphviz
        dot = graphviz.Digraph()

        def build_tree(node, parent_node=None, edge_label=None):
            if isinstance(node, _DecisionNode):
                current_node_label = str(node.attribute)
                dot.node(str(id(node)), label=current_node_label)

                if parent_node:
                    dot.edge(str(id(parent_node)), str(id(node)), label=edge_label)

                for value, child_node in node.children.items():
                    build_tree(child_node, node, value)
            elif isinstance(node, _LeafNode):
                current_node_label = f"Class: {node.label}, Weight: {node.weight}"
                dot.node(str(id(node)), label=current_node_label, shape="box")

                if parent_node:
                    dot.edge(str(id(parent_node)), str(id(node)), label=edge_label)

        build_tree(self.tree)
        dot.format = 'png'
        return dot.render(filename, view=False)

    def print_rules(self, tree=None, rule=''):
        if self.tree is None:
            raise Exception('Decision tree has not been trained yet!')
        # Mencetak aturan yang dibuat oleh pohon keputusan
        if tree is None:
            tree = self.tree
        if rule != '':
            rule += ' AND '
        if isinstance(tree, _LeafNode):
            print(rule[:-3] + ' => ' + tree.label)
            return

        attribute = tree.attribute
        for value, child_node in tree.children.items():
            self.print_rules(child_node, rule + attribute + ' = ' + str(value) )

    def rules(self):
        rules = []

        def build_rules(node, parent_node=None, edge_label=None, rule=''):
            if isinstance(node, _DecisionNode):
                current_node_label = node.attribute
                if parent_node:
                    rule += f" AND {current_node_label} = {edge_label}"
                for value, child_node in node.children.items():
                    build_rules(child_node, node, value, rule)
            elif isinstance(node, _LeafNode):
                current_node_label = f"Class: {node.label}, Weight: {node.weight}"
                if parent_node:
                    rule += f" => {current_node_label}"
                rules.append(rule[5:])
        build_rules(self.tree)
        return rules

    def summary(self):
        # print summary
        print("Decision Tree Classifier Summary")
        print("================================")
        print("Number of Instances   : ", len(self.data))
        print("Number of Attributes  : ", len(self.attributes))
        print("Number of Leaves      : ", self.tree.count_leaves())
        print("Number of Rules       : ", len(self.rules()))
        print("Tree Depth            : ", self.tree.depth())

In [2]:
import pandas as pd

# 載入數據
data = pd.read_csv('adult.data',sep=", ",header=None)  # 假設您已下載的數據集文件為'adult.data'
test_data = pd.read_csv('adult.test',sep=", ",header=None,engine='python',skiprows=1)
t_d = test_data

  data = pd.read_csv('adult.data',sep=", ",header=None)  # 假設您已下載的數據集文件為'adult.data'


In [3]:
data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']
feature_cols = ['age','workclass','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country']
data = data.drop(columns=['education'])
data = data.drop(columns=['fnlwgt'])
convert = {"income" :{"<=50K":0, ">50K":1}}
data = data.replace(convert)

## 正規化

In [4]:
from sklearn.preprocessing import MinMaxScaler
# numerical_columns
num_columns = ['age','education-num','capital-gain','capital-loss','hours-per-week']
scaler = MinMaxScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data[num_columns]), columns=num_columns)
data[num_columns] = data_scaled
data.replace('?', 'other', inplace=True)
data = pd.get_dummies(data)
data['income'] = data['income'].astype('int64')
X = data.drop('income',axis=1)
y = data['income']



In [5]:
# 創建C45Classifier對象
c45 = C45Classifier()

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 80% training and 20% test

In [7]:
# 訓練決策樹
c45.fit(X_train, y_train)

In [8]:
# 獲取決策樹摘要
c45.summary()

Decision Tree Classifier Summary
Number of Instances   :  26048
Number of Attributes  :  91
Number of Leaves      :  10811
Number of Rules       :  10811
Tree Depth            :  24


In [9]:
# 獲取決策規則
rules = c45.rules()
for rule in rules:
    print(rule)

education-num = 0.30136986301369856 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.346938775510204 => Class: 0.0, Weight: 3
education-num = 0.30136986301369856 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.346938775510204 AND marital-status_Divorced = 1.0 => Class: 1.0, Weight: 1
education-num = 0.30136986301369856 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.346938775510204 AND marital-status_Divorced = 1.0 => Class: 0.0, Weight: 1
education-num = 0.30136986301369856 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 3
education-num = 0.30136986301369856 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Federal-gov = 0.6020408163265305 AND workclass_Self-emp-not-inc = 0.0 AND workclass_Private = 0.0 => Class: 1.0, Weight: 1
education-num = 0.3013698630

education-num = 0.1643835616438356 => Class: 0.0, Weight: 11
education-num = 0.1643835616438356 => Class: 0.0, Weight: 4
education-num = 0.1643835616438356 => Class: 0.0, Weight: 2
education-num = 0.1643835616438356 AND marital-status_Married-civ-spouse = 0.4666666666666667 => Class: 0.0, Weight: 4
education-num = 0.1643835616438356 AND marital-status_Married-civ-spouse = 0.4666666666666667 AND native-country_Mexico = 1.0 => Class: 1.0, Weight: 1
education-num = 0.1643835616438356 AND marital-status_Married-civ-spouse = 0.4666666666666667 AND native-country_Mexico = 1.0 => Class: 0.0, Weight: 1
education-num = 0.4383561643835617 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 1
education-num = 0.4383561643835617 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 3
education-num = 0.4383561643835617 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-

capital-gain = 0.5616438356164384 AND education-num = 0.0 AND capital-loss = 0.8666666666666667 AND hours-per-week = 0.0 AND marital-status_Never-married = 0.346938775510204 => Class: 1.0, Weight: 1
capital-gain = 0.5616438356164384 AND education-num = 0.0 AND capital-loss = 0.8666666666666667 AND hours-per-week = 0.0 AND marital-status_Never-married = 0.346938775510204 => Class: 0.0, Weight: 1
capital-gain = 0.5616438356164384 AND education-num = 0.0 AND capital-loss = 0.8666666666666667 AND hours-per-week = 0.0 => Class: 0.0, Weight: 1
capital-gain = 0.5616438356164384 AND education-num = 0.0 AND capital-loss = 0.8666666666666667 AND hours-per-week = 0.0 => Class: 1.0, Weight: 1
capital-gain = 0.5616438356164384 AND education-num = 0.0 AND capital-loss = 0.8666666666666667 AND hours-per-week = 0.0 AND occupation_Exec-managerial = 0.39795918367346933 => Class: 0.0, Weight: 1
capital-gain = 0.5616438356164384 AND education-num = 0.0 AND capital-loss = 0.8666666666666667 AND hours-per-w

education-num = 0.3835616438356165 AND capital-gain = 0.5333333333333333 => Class: 0.0, Weight: 1
education-num = 0.3835616438356165 AND capital-gain = 0.5333333333333333 => Class: 0.0, Weight: 1
education-num = 0.3835616438356165 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Federal-gov = 0.44897959183673464 => Class: 0.0, Weight: 7
education-num = 0.3835616438356165 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Federal-gov = 0.44897959183673464 => Class: 1.0, Weight: 1
education-num = 0.3835616438356165 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 1
education-num = 0.3835616438356165 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND occupation_Machine-op-inspct = 0.6530612244897959 => Class: 1.0, Weight: 2
education-num = 0.3835616438356165 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND occupation_Machine-o

education-num = 0.3835616438356165 AND capital-gain = 0.6666666666666666 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Private = 0.39795918367346933 AND marital-status_Divorced = 1.0 AND occupation_Craft-repair = 0.0 AND native-country_Canada = 1.0 => Class: 0.0, Weight: 2
education-num = 0.3835616438356165 AND capital-gain = 0.6666666666666666 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Private = 0.39795918367346933 AND marital-status_Divorced = 1.0 AND occupation_Exec-managerial = 1.0 => Class: 0.0, Weight: 3
education-num = 0.3835616438356165 AND capital-gain = 0.6666666666666666 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Private = 0.39795918367346933 AND marital-status_Divorced = 1.0 AND occupation_Exec-managerial = 1.0 => Class: 1.0, Weight: 1
education-num = 0.3835616438356165 AND capital-gain = 0.6666666666666666 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 1
education-num = 0.3835616438356165 AND cap

education-num = 0.2191780821917808 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND occupation_Sales = 0.6020408163265305 AND workclass_Private = 1.0 => Class: 0.0, Weight: 2
education-num = 0.2191780821917808 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND occupation_Sales = 0.6020408163265305 AND workclass_Private = 1.0 => Class: 1.0, Weight: 1
education-num = 0.2191780821917808 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND marital-status_Divorced = 0.44897959183673464 AND marital-status_Married-civ-spouse = 0.0 => Class: 0.0, Weight: 1
education-num = 0.2191780821917808 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND marital-status_Divorced = 0.44897959183673464 AND marital-status_Married-civ-spouse = 0.0 AND occupation_Farming-fishing = 1.0 => Class: 0.0, Weight: 2
education-num = 0.2191780821917808 AND capital-gain

education-num = 0.2191780821917808 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.39795918367346933 AND workclass_Federal-gov = 0.0 AND marital-status_Married-civ-spouse = 0.0 AND workclass_Self-emp-not-inc = 1.0 => Class: 1.0, Weight: 2
education-num = 0.2191780821917808 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.39795918367346933 AND workclass_Federal-gov = 0.0 => Class: 0.0, Weight: 2
education-num = 0.2191780821917808 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.39795918367346933 => Class: 1.0, Weight: 1
education-num = 0.2191780821917808 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 1.0, Weight: 2
education-num = 0.2191780821917808 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND relationship_Husband = 0.5510204081632653 => Class: 0.0, Weight: 1
education-num = 

education-num = 0.2328767123287671 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 4
education-num = 0.2328767123287671 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND marital-status_Married-civ-spouse = 0.6530612244897959 => Class: 0.0, Weight: 1
education-num = 0.2328767123287671 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND marital-status_Married-civ-spouse = 0.6530612244897959 => Class: 1.0, Weight: 2
education-num = 0.2328767123287671 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 5
education-num = 0.2328767123287671 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 => Class: 0.0, Weight: 1
education-num = 0.2328767123287671 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.5 AND marital-status_Divorced = 0.0 AND occupation_Exec-managerial = 0.0 AND marital-status_M

education-num = 0.547945205479452 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND marital-status_Married-civ-spouse = 0.39795918367346933 AND workclass_other = 1.0 AND race_White = 0.0 AND workclass_Federal-gov = 1.0 AND workclass_Private = 0.0 AND occupation_Exec-managerial = 0.0 => Class: 0.0, Weight: 1
education-num = 0.547945205479452 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND marital-status_Married-civ-spouse = 0.39795918367346933 AND workclass_other = 1.0 AND race_White = 0.0 AND workclass_Federal-gov = 1.0 AND workclass_Private = 0.0 AND occupation_Exec-managerial = 0.0 => Class: 1.0, Weight: 2
education-num = 0.547945205479452 AND capital-gain = 0.6 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND marital-status_Married-civ-spouse = 0.39795918367346933 AND workclass_other = 1.0 AND race_White = 0.0 AND workclass_Federal-gov = 1.0 AND workclass_Private = 0.0 AND occupation_Craft-repair = 1.0 => Class: 1.0, Weight: 2
e

education-num = 0.2465753424657534 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.39795918367346933 AND workclass_Federal-gov = 0.0 AND marital-status_Married-AF-spouse = 0.0 AND marital-status_Married-civ-spouse = 0.0 AND occupation_Exec-managerial = 1.0 AND occupation_Other-service = 0.0 AND workclass_State-gov = 0.0 AND occupation_Craft-repair = 1.0 => Class: 0.0, Weight: 1
education-num = 0.2465753424657534 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.39795918367346933 AND workclass_Federal-gov = 0.0 AND marital-status_Married-AF-spouse = 0.0 AND marital-status_Married-civ-spouse = 0.0 AND occupation_Exec-managerial = 1.0 AND occupation_Other-service = 0.0 AND race_Asian-Pac-Islander = 1.0 => Class: 1.0, Weight: 2
education-num = 0.2465753424657534 AND capital-gain = 0.5333333333333333 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_L

education-num = 0.2465753424657534 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.5 AND workclass_Self-emp-not-inc = 0.0 AND occupation_Exec-managerial = 0.0 AND workclass_Private = 0.0 => Class: 0.0, Weight: 1
education-num = 0.2465753424657534 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.5 AND workclass_Self-emp-not-inc = 0.0 AND occupation_Exec-managerial = 0.0 AND workclass_Private = 0.0 AND occupation_Prof-specialty = 1.0 AND race_Asian-Pac-Islander = 0.0 AND marital-status_Married-civ-spouse = 0.0 => Class: 0.0, Weight: 4
education-num = 0.2465753424657534 AND capital-gain = 0.8 AND capital-loss = 0.0 AND hours-per-week = 0.0 AND workclass_Local-gov = 0.5 AND workclass_Self-emp-not-inc = 0.0 AND occupation_Exec-managerial = 0.0 AND workclass_Private = 0.0 AND occupation_Prof-specialty = 1.0 AND race_Asian-Pac-Islander = 0.0 AND marital-status_Married-civ-spouse = 0.0 AND race

In [10]:
y_pred = c45.predict(X_test)

# Calculate accuracy
correct_predictions = sum(1 for true, pred in zip(y_test, y_pred) if true == pred)
total_predictions = len(y_test)
accuracy = correct_predictions / total_predictions

# Print the accuracy
print(f'Accuracy: {accuracy*100:.2f}%')

Accuracy: 75.11%


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
precision = precision_score(y_test, y_pred)
print(f'acc: {precision*100:.2f}%')

acc: 49.98%


In [12]:
recall = recall_score(y_test, y_pred)
print(f'recall rate: {recall*100:.2f}%')

recall rate: 64.26%


In [13]:
conf_matrix = confusion_matrix(y_test, y_pred)
print('confusion_matrix:')
print(conf_matrix)
#TP FN
#FN TN

confusion_matrix:
[[3851 1042]
 [ 579 1041]]


In [14]:
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1*100:.2f}%')

F1 score: 56.22%


In [15]:
from openpyxl import Workbook
result_test = c45.predict(X_test)

#產出Excel(Test data)
wb = Workbook()
ws = wb.active
ws.append(['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income','Predict result'])

for i in range(len(result_test)):
    if result_test[i] == 0:
        result = '<=50K.'
    else:
        result = '>50K.'
    #將現在loop到原始資料的列轉為list
    li = t_d.iloc[i,:].tolist()
    #
    li.append(result)
    ws.append(li)
wb.save('Adult_dt_C45.xlsx')

# 參考文獻
https://github.com/novandikp/DecisionTreeC45/blob/main/build/lib/C45/__init__.py