In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv("./bank.csv", delimiter=";")

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [4]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [6]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
df['marital'].describe()

count        4521
unique          3
top       married
freq         2797
Name: marital, dtype: object

In [8]:
df['marital'].unique()

array(['married', 'single', 'divorced'], dtype=object)

In [9]:
le_marital = LabelEncoder()
df['marital'] = le_marital.fit_transform(df['marital'])
print(le_marital.classes_)
df['marital']

['divorced' 'married' 'single']


0       1
1       1
2       2
3       1
4       1
       ..
4516    1
4517    1
4518    1
4519    1
4520    2
Name: marital, Length: 4521, dtype: int64

In [10]:
df['job'].unique()

array(['unemployed', 'services', 'management', 'blue-collar',
       'self-employed', 'technician', 'entrepreneur', 'admin.', 'student',
       'housemaid', 'retired', 'unknown'], dtype=object)

In [11]:
le_job = LabelEncoder()
df['job'] = le_job.fit_transform(df['job'])
print(le_job.classes_)
df['job']

['admin.' 'blue-collar' 'entrepreneur' 'housemaid' 'management' 'retired'
 'self-employed' 'services' 'student' 'technician' 'unemployed' 'unknown']


0       10
1        7
2        4
3        4
4        1
        ..
4516     7
4517     6
4518     9
4519     1
4520     2
Name: job, Length: 4521, dtype: int64

In [12]:
df['education'].unique()

array(['primary', 'secondary', 'tertiary', 'unknown'], dtype=object)

In [13]:
le_education = LabelEncoder()
df['education'] = le_education.fit_transform(df['education'])
print(le_education.classes_)
df['education']

['primary' 'secondary' 'tertiary' 'unknown']


0       0
1       1
2       2
3       2
4       1
       ..
4516    1
4517    2
4518    1
4519    1
4520    2
Name: education, Length: 4521, dtype: int64

In [14]:
df['default'].unique()

array(['no', 'yes'], dtype=object)

In [15]:
df['default'].replace(['no', 'yes'], [0, 1], inplace=True)
print(df['default'])

0       0
1       0
2       0
3       0
4       0
       ..
4516    0
4517    1
4518    0
4519    0
4520    0
Name: default, Length: 4521, dtype: int64


In [16]:
df['housing'].unique()

array(['no', 'yes'], dtype=object)

In [17]:
df['housing'].replace(['no', 'yes'], [0, 1], inplace=True)
print(df['housing'])

0       0
1       1
2       1
3       1
4       1
       ..
4516    1
4517    1
4518    0
4519    0
4520    1
Name: housing, Length: 4521, dtype: int64


In [18]:
df['loan'].unique()

array(['no', 'yes'], dtype=object)

In [19]:
df['loan'].replace(['no', 'yes'], [0, 1], inplace=True)
print(df['loan'])

0       0
1       1
2       0
3       1
4       0
       ..
4516    0
4517    1
4518    0
4519    0
4520    1
Name: loan, Length: 4521, dtype: int64


In [20]:
df['contact'].unique()

array(['cellular', 'unknown', 'telephone'], dtype=object)

In [21]:
le_contact = LabelEncoder()
df['contact'] = le_contact.fit_transform(df['contact'])
print(le_contact.classes_)
df['contact']

['cellular' 'telephone' 'unknown']


0       0
1       0
2       0
3       2
4       2
       ..
4516    0
4517    2
4518    0
4519    0
4520    0
Name: contact, Length: 4521, dtype: int64

In [22]:
df['month'].unique()

array(['oct', 'may', 'apr', 'jun', 'feb', 'aug', 'jan', 'jul', 'nov',
       'sep', 'mar', 'dec'], dtype=object)

In [23]:
le_month = LabelEncoder()
df['month'] = le_month.fit_transform(df['month'])
print(le_month.classes_)
df['month']

['apr' 'aug' 'dec' 'feb' 'jan' 'jul' 'jun' 'mar' 'may' 'nov' 'oct' 'sep']


0       10
1        8
2        0
3        6
4        8
        ..
4516     5
4517     8
4518     1
4519     3
4520     0
Name: month, Length: 4521, dtype: int64

In [24]:
df['poutcome'].unique()

array(['unknown', 'failure', 'other', 'success'], dtype=object)

In [25]:
le_poutcome = LabelEncoder()
df['poutcome'] = le_poutcome.fit_transform(df['poutcome'])
print(le_poutcome.classes_)
df['poutcome']

['failure' 'other' 'success' 'unknown']


0       3
1       0
2       0
3       3
4       3
       ..
4516    3
4517    3
4518    3
4519    1
4520    1
Name: poutcome, Length: 4521, dtype: int64

In [26]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10,1,0,0,1787,0,0,0,19,10,79,1,-1,0,3,no
1,33,7,1,1,0,4789,1,1,0,11,8,220,1,339,4,0,no
2,35,4,2,2,0,1350,1,0,0,16,0,185,1,330,1,0,no
3,30,4,1,2,0,1476,1,1,2,3,6,199,4,-1,0,3,no
4,59,1,1,1,0,0,1,0,2,5,8,226,1,-1,0,3,no


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   int64 
 2   marital    4521 non-null   int64 
 3   education  4521 non-null   int64 
 4   default    4521 non-null   int64 
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   int64 
 7   loan       4521 non-null   int64 
 8   contact    4521 non-null   int64 
 9   day        4521 non-null   int64 
 10  month      4521 non-null   int64 
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   int64 
 16  y          4521 non-null   object
dtypes: int64(16), object(1)
memory usage: 600.6+ KB


In [28]:
df.shape

(4521, 17)

In [29]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [37]:
data = df

In [31]:
X.shape, y.shape

((4521, 16), (4521,))

In [44]:
class Node:
    def __init__(self,value,nxt_node=None,prev_node=None):
        self.nxtnode=nxt_node
        self.value=value
        self.prevnode=prev_node
        self.leaf_node=False
    
    def get_value(self):
        return self.value

    def set_prevnode(self,prevnode):
        self.prevnode=prevnode

    def get_prevnode(self):
        return self.prevnode

    def set_nxtnode(self,n_node):
        self.nxtnode=n_node

    def get_nxtnode(self):
        return self.nxtnode

In [45]:
#implementing the ID3 algorithm.
class ID3:
    def __init__(self,dataset):
        self.data=dataset
        self.total=dataset.shape[0]
        #assuming target variable would be at the end of the list.
        self.target_var=list(data.columns)[-1]
        
    def entropy(self,feature_data=None):
        #entropy is a measure of the distortion or the amount of diversity
        #in terms of labels or target variable present in the dataset.
        #an entropy value of 0 means only single category is present whereas an
        #entropy value of 1 means the dataset is perfectly balanced and all categories 
        #have equal number of rows.
        if feature_data is not None:
            data=feature_data
            total=data.shape[0]
        else:
            data=self.data
            total=self.total
        self.n=data[self.target_var].nunique()
        self.labels=list(data[self.target_var].unique())
        self.prob=[]
        self.ent=0
        
        for i in range(1,self.n+1):
            prob=(data[data[self.target_var]==self.labels[i-1]].shape[0])/total
            self.prob.append(prob)
            ent=-prob*np.log2(prob)
            #print(ent)
            self.ent=self.ent+ent
        return self.ent
    def entropy_per_feature(self,fname,feature_data=None):
        if feature_data is not None:
            data=feature_data
        else:
            data=self.data
        unique_values=list(data[fname].unique())
        #print(unique_values)
        unique_val_count=data[fname].nunique()
        ent_ftr=0
        for i in range(0,unique_val_count):
            category_data=data[data[fname]==unique_values[i]]
            #print(type(category_data))
            ent_cat=(category_data.shape[0]/self.total)*self.entropy(category_data)
            #print(ent_cat)
            ent_ftr=ent_ftr+ent_cat
        return ent_ftr
    
    def information_gain(self,fname,feature_data=None): 
        #print(fname) 
        if feature_data is None:
            information_gain=self.entropy()-self.entropy_per_feature(fname)
        else:
            et=self.entropy(feature_data)
            if et==0:
                return 0
            information_gain=et-self.entropy_per_feature(fname,feature_data)
        return information_gain

In [51]:
#maximum value function.
def max_val(dct):
    max_val=0
    key_max=''
    for key in dct.keys():
        if dct[key]>max_val:
            max_val=dct[key]
            key_max=key
    return key_max

In [52]:
def oop_building_tree():
    algo=ID3(data)
    feature_list=list(data.columns)[:-1]
    ig={}
    root_node=''
    #finding the root node.
    for feature in feature_list:
        i_gain=algo.information_gain(feature)
        ig[feature]=i_gain
    root_node=max_val(ig)
    del ig[root_node]
    n=Node(root_node)
    print("root_node:{}".format(n.get_value()))
    #finding child nodes of the root node.
    category_list=list(data[n.get_value()].unique())
    print("{}=>{}".format(n.get_value(),category_list))
    n_list=[Node(category) for category in category_list]
    #establishing links between the child nodes and the root node(parent node).
    n.set_nxtnode(n_list)
    for node in n_list:
        node.set_prevnode(n)
    #finding remaining intermediate nodes or decision nodes.
    while (len(ig.keys())!=0 and len(feature_list)!=1):
        for node in n_list:
            cat_data=data[data[n.get_value()]==node.get_value()]
            ig_1={}
            count=0
            for key in ig.keys():
                i_gain=algo.information_gain(key,cat_data)
                if i_gain==0:
                    #print(node.get_value())
                    values=list(cat_data.y.unique())
                    #values=['Yes']
                    val_node=[Node(val) for val in values]
                    node.set_nxtnode(val_node)
                    for vnode in val_node:
                        vnode.set_prevnode(node)
                        #print(vnode.get_nxtnode())
                        vnode.leaf_node=True
                    count=1
                    #node.leaf_node=True
                    print("{}=>{}".format(node.get_value(),list(data.y[data[n.get_value()]==node.get_value()].unique())[0]))
                    del ig[key]
                    break  
                ig_1[key]=i_gain
            if count==1:
                #print("star")
                continue
            root1_node=max_val(ig_1)
            #establishing links between the newly found intermediate node.
            #and the respectie parent node.
            intr_node=Node(root1_node)
            node.set_nxtnode(intr_node)
            intr_node.set_prevnode(node)
            print("{}=>{}".format(node.get_value(),intr_node.get_value()))
            intr_values=[Node(val) for val in list(data[intr_node.get_value()].unique())]
            intr_node.set_nxtnode(intr_values)
            print("{}=>{}".format(intr_node.get_value(),[node.get_value() for node in intr_node.get_nxtnode()]))
            for vnode in intr_values:
                vnode.set_prevnode(intr_node)
                #print(vnode.get_nxtnode())
                vnode.leaf_node=True
            #print(root1_node)
            del ig[str(root1_node)]

In [53]:
oop_building_tree()

root_node:balance
balance=>[1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 264, 1109, 502, 360, 194, 4073, 2317, -221, 132, 16, 106, 93, 543, 5883, 627, 696, 784, 105, 4189, 171, 42, 2536, 1235, 1811, 229, 2089, 3935, 363, 11971, 553, 1117, 396, 2204, 872, 145, -849, 4629, 844, 228, 50, 1539, 2231, 3064, 82, 2155, 101, -516, 415, 5887, 1355, 16873, 203, 338, 444, 2, 6248, 412, 344, 3222, 174, 591, 388, 219, 451, 5, 177, 657, 1315, 1466, 879, 293, 424, 1831, 111, 455, -195, 3616, 14093, 1567, 5426, 261, 2843, 406, 493, 5996, 3777, 524, 574, 427, 483, 3391, 2488, 1517, 217, 22, 331, 505, 1808, 4111, 52, 25, 1890, 419, 2693, -231, -55, 462, 119, 586, 4659, 168, -715, 769, 428, -970, 6313, 948, 1877, 874, 323, 1906, 1152, 179, 391, 89, 442, -249, 8104, 501, 308, 602, 273, 1641, 20, 4590, -256, 978, 80, 2104, 499, 1269, 310, 1143, 2980, 37, 1031, 9009, -465, 5181, 176, 1699, 172, 6979, 6, -62, 1972, 899, 21, 23, -1206, 215, 1372, 1981, 9216, 39, 5563, 1, 782, 3771, 992, 1526, 265

KeyError: ''

In [54]:
!pip install decision-tree-id3

[0mCollecting decision-tree-id3
  Downloading decision-tree-id3-0.1.2.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting nose>=1.1.2 (from decision-tree-id3)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m174.3 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Building wheels for collected packages: decision-tree-id3
  Building wheel for decision-tree-id3 (setup.py) ... [?25ldone
[?25h  Created wheel for decision-tree-id3: filename=decision_tree_id3-0.1.2-py3-none-any.whl size=15944 sha256=905c98dd2b32f7b8b08b2543ad15ccf28f7f75cbd057ba86d0d6af9962b780fd
  Stored in directory: /home/aminkhani/.cache/pip/wheels/54/e9/72/00b38b5d4f4464d8ee5ed73092f2167c88bac31e01fcc17d38
Successfully built decision-tree-id3
[0mInstalling collected packages: nose, decision-tree-id3
[0mSuccessfully installed decision-tree-id3-0.1.2 nose-1.3.7
[0m