In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [59]:
df=pd.read_csv('data_banknote_authentication.txt')
df=df.sample(frac=1)
df.head(5)

Unnamed: 0,variance of Wavelet Transformed image(WTI),skewness of WTI,kurtosis of WTI,entropy of image,class
666,1.2262,0.89599,5.7568,-0.11596,0
1213,-2.62,-6.8555,6.2169,-0.62285,1
129,3.4663,1.1112,1.7425,1.3388,0
944,-2.121,-0.05588,1.949,1.353,1
909,-1.7322,-9.2828,7.719,-1.7168,1


In [52]:
df.shape

(1372, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 5 columns):
variance of Wavelet Transformed image(WTI)    1372 non-null float64
skewness of WTI                               1372 non-null float64
kurtosis of WTI                               1372 non-null float64
entropy of image                              1372 non-null float64
class                                         1372 non-null int64
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [6]:
df['class'].value_counts()

0    762
1    610
Name: class, dtype: int64

In [7]:
df.corr(method='pearson')['class']

variance of Wavelet Transformed image(WTI)   -0.724843
skewness of WTI                              -0.444688
kurtosis of WTI                               0.155883
entropy of image                             -0.023424
class                                         1.000000
Name: class, dtype: float64

In [8]:
from sklearn.metrics import accuracy_score

In [9]:
def split_in_grps(train,attribute,value):
    left=list()
    right=list()
    for i in range(train.shape[0]):
        if(train[attribute].iloc[i]>=value):
            right.append(list(train.iloc[i]))
        else:
            left.append(list(train.iloc[i]))
#Since we are dealing in dataFrames for whole of our algo,we would convert the lists hence found to dataframe
    left=pd.DataFrame(left,columns=list(train))
    right=pd.DataFrame(right,columns=list(train))
    return left,right

In [62]:
def giniScore(groups,class_values):
    #GiniScore is calculated just by the concept of confusion_matrix and weighted avg.
    total_rows=float(sum([group.shape[0] for group in groups]))
    gini=0.0
    for group in groups:
        size = len(group)
        fsize=float(size)
        # avoid divide by zero
        if size == 0:
            continue
        score=0.0
        for class_ in class_values:
            p=[group.iloc[i,-1] for i in range(size)].count(class_)/fsize
            score+=p*p
        gini+=(1-score)*(fsize/total_rows)
    return gini

In [11]:
#This function would finally give the ans 
def to_terminal(group):
    outcomes=[group.iloc[i,-1] for i in range(group.shape[0])]
    return max(set(outcomes), key=outcomes.count)

In [12]:
def split(node,max_depth,min_entry,curr_depth):
    #To split the tree or mark it as terminal node
    left,right=node['groups']
    
    #Dividion results in either the left node or right node being totally empty
    if(left.shape[0]==0 or right.shape[0]==0):
        node['left']=node['right']=to_terminal(pd.concat([left,right]))
        return
    #when we reached the max_depth
    if(curr_depth>=max_depth):
        node['left'],node['right']=to_terminal(left),to_terminal(right)
        return
    
    #further splitting the left node
    if len(left) <= min_entry:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_Best_split(left)
        split(node['left'], max_depth, min_entry,curr_depth+1)
        # process right child
    if len(right) <= min_entry:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_Best_split(right)
        split(node['right'], max_depth, min_entry,curr_depth+1)

In [63]:
#We would check every attribute and every value as a chance of best split and would return a dictionary of attribute
#values and groups hence splitted
def get_Best_split(train):
    att=list(train)
    class_values=[0,1]
    examples=train.shape[0]
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    
    for j in range(len(att)-1):
        for i in range(examples):
            groups=split_in_grps(train,att[j],train.iloc[i,j])
            gini=giniScore(groups,class_values)
            if(gini<b_score):
                b_score=gini
                b_index,b_value,b_groups=att[j],train.iloc[i,j],groups
        return {'index':b_index, 'value':b_value, 'groups':b_groups}

In [14]:
#Tree is built by first finding out the right split in terms of attribute and value,than it's subdivided child are
#processed and the process is repeated.
def build_tree(train,max_depth,min_entry):
    root=get_Best_split(train)
    #Dividing root now
    split(root,max_depth,min_entry,1)
    return root

In [15]:
# Predicting the class for test dataset
def predict(row,tree):
    if(row[tree['index']]>=tree['value']):
        if(isinstance(tree['right'],dict)):
            return predict(row,tree['right'])
        else:
            return tree['right']
    else:
        if(isinstance(tree['left'],dict)):
            return predict(row,tree['left'])
        else:
            return tree['left']

In [16]:
def decision_tree(train,test,max_depth,min_entry):
    tree=build_tree(train,max_depth,min_entry)
    predictions=list()
    for i in range(test.shape[0]):
        ans=predict(test.iloc[i],tree)
        predictions.append(ans)
    return predictions

In [60]:
def evaluate(df,decision_tree,n_folds,max_depth,min_entry):
    AccuracyList=list()
    nSets=df.shape[0]/n_folds
    start=0
    for fold in range(n_folds):
        train=df.iloc[:]
        test=df.iloc[start:start+nSets,:]
        cond=train.index.isin(test.index)
        train.drop(train[cond].index,inplace=True)
        start=start+nSets
        ans=test['class']
        test.drop('class',axis=1,inplace=True)
        
        predictions=decision_tree(train,test,max_depth,min_entry)
        accuracy=accuracy_score(ans,predictions)
        AccuracyList.append(accuracy)
        print(accuracy)
    return sum(AccuracyList)/len(AccuracyList)        

In [64]:
n_folds=5
max_depth=5
min_entry=10
scores=evaluate(df,decision_tree,n_folds,max_depth,min_entry)

0.8138686131386861
0.8321167883211679
0.8576642335766423
0.8138686131386861
0.8576642335766423


In [65]:
print(scores)

0.8350364963503649


In [67]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [68]:
n_folds=5
nSets=df.shape[0]/n_folds
start=0
AccuracyList=list()
for fold in range(n_folds):
    train=df.iloc[:]
    test=df.iloc[start:start+nSets,:]
    cond=train.index.isin(test.index)
    train.drop(train[cond].index,inplace=True)
    start=start+nSets
    ans=test['class']
    test.drop('class',axis=1,inplace=True)
    
    model=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=5,min_samples_leaf=5)
    model.fit(train.drop('class',axis=1,inplace=False),train['class'])
    predictions=model.predict(test)
    
    accuracy=accuracy_score(ans,predictions)
    AccuracyList.append(accuracy)
    print(accuracy)
print(sum(AccuracyList)/len(AccuracyList))   

0.9817518248175182
0.9562043795620438
0.9635036496350365
0.9598540145985401
0.9781021897810219
0.9678832116788321


In [69]:
#Using Random Forests
from sklearn.ensemble import RandomForestClassifier

In [70]:
n_folds=5
nSets=df.shape[0]/n_folds
start=0
AccuracyList=list()
for fold in range(n_folds):
    train=df.iloc[:]
    test=df.iloc[start:start+nSets,:]
    cond=train.index.isin(test.index)
    train.drop(train[cond].index,inplace=True)
    start=start+nSets
    ans=test['class']
    test.drop('class',axis=1,inplace=True)
    
    model = RandomForestClassifier(n_estimators=10, 
                               bootstrap = True,
                               max_features = 'sqrt')
    # Fit on training data
    model.fit(train.drop('class',axis=1,inplace=False),train['class'])
    predictions=model.predict(test)
    
    accuracy=accuracy_score(ans,predictions)
    AccuracyList.append(accuracy)
    print(accuracy)
print(sum(AccuracyList)/len(AccuracyList))   


0.9927007299270073
0.9890510948905109
0.9927007299270073
0.9963503649635036
0.9963503649635036
0.9934306569343067
