In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
BASE_PATH = os.path.abspath("Dataset")
DATASET_PATH_1 = os.path.join(BASE_PATH,"Data_for_UCI_named.csv")
DATASET_PATH_2 = os.path.join(BASE_PATH,"hcvdat0.csv")
DATASET_PATH_3 = os.path.join(BASE_PATH,"fertility_Diagnosis.txt")
DATASET_PATH_4 = os.path.join(BASE_PATH,"PhishingData.csv")
DATASET_PATH_5 = os.path.join(BASE_PATH,"log2.csv")

In [3]:
def TenfoldDT(X, Y, model):
    
    # define cross validation as 10 times 10 Fold CV
    cv = RepeatedKFold(n_splits=10, n_repeats=10)
    
    accuracy = []
    
    # splitting into test and train using 10 Fold CV and calculating accuracy on test data
    for train_index, test_index in cv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        model.fit(X_train, Y_train)
        Y_Pred = model.predict(X_test)
        accuracy.append(accuracy_score(Y_test, Y_Pred))
    return np.std(accuracy)

In [4]:
def holdoutDT(X, Y, model):
   
    # define classifier as Decision Tree
    accuracy = []
    
    # loop to repeat holdout 100 times
    for i in range(100):
        # Spliting data into train and test by ratio 70:30
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)
        model.fit(X_train, Y_train)
        Y_Pred = model.predict(X_test)
        accuracy.append(accuracy_score(Y_test, Y_Pred))
    return np.std(accuracy)

In [5]:
def ALLINONE(X,Y):
    # Decision Tree with 10 Fold CV, Gini, Without pruning
    model = DecisionTreeClassifier(criterion = 'gini')
    std_accc = TenfoldDT(X, Y, model)
    
    print("***Decision Tree with 10 Fold CV, Gini, Without pruning***")
    print("STD  ACC :",std_accc)

    # Decision Tree with 10 Fold CV, Gini, With pruning
    model = DecisionTreeClassifier(criterion = 'gini', ccp_alpha=0.015)
    std_accc = TenfoldDT(X, Y, model)

    print("\n***Decision Tree with 10 Fold CV, Gini, With pruning***")
    print("STD  ACC :",std_accc)
    
    # Decision Tree with 10 Fold CV, Entropy, Without pruning
    model = DecisionTreeClassifier(criterion = 'entropy')
    std_accc = holdoutDT(X, Y, model)

    print("\n***Decision Tree with 10 Fold CV, Entropy, Without pruning***")
    print("STD  ACC :",std_accc)
    
    # Decision Tree with 10 Fold CV, Entropy, With pruning
    model = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha=0.015)
    std_accc =  TenfoldDT(X, Y, model)

    print("\n***Decision Tree with 10 Fold CV, Entropy, With pruning***")
    print("STD  ACC :",std_accc)
    
    # Decision Tree with Holdout, Gini, Without pruning
    model = DecisionTreeClassifier(criterion = 'gini')
    std_accc = holdoutDT(X, Y, model)

    print("\n***Decision Tree with Holdout, Gini, Without pruning***")
    print("STD  ACC :",std_accc)
    
    # Decision Tree with Holdout, Gini, With pruning
    model = DecisionTreeClassifier(criterion = 'gini', ccp_alpha=0.015)
    std_accc = holdoutDT(X, Y, model)

    print("\n***Decision Tree with Holdout, Gini, With pruning***")
    print("STD  ACC :",std_accc)
    
    # Decision Tree with Holdout, Entropy, Without pruning
    model = DecisionTreeClassifier(criterion = 'entropy')
    std_accc = holdoutDT(X, Y, model)

    print("\n***Decision Tree with Holdout, Entropy, Without pruning***")
    print("STD  ACC :",std_accc)
    
    # Decision Tree with Holdout, Entropy, With pruning
    model = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha=0.015)
    std_accc = holdoutDT(X, Y, model)

    print("\n***Decision Tree with Holdout, Entropy, With pruning***")
    print("STD  ACC :",std_accc)

### 1. Data_for_UCI_named (DF1)

In [6]:
DF1 = pd.read_csv(DATASET_PATH_1);

DF1.drop(['stab'],axis=1,inplace=True)

# Separate class/label from DF1
Y = DF1[['stabf']].values
DF1.drop(['stabf'],axis=1,inplace=True)
X = DF1.values
ALLINONE(X,Y)

***Decision Tree with 10 Fold CV, Gini, Without pruning***
STD  ACC : 0.009669663903156106

***Decision Tree with 10 Fold CV, Gini, With pruning***
STD  ACC : 0.013569742075662286

***Decision Tree with 10 Fold CV, Entropy, Without pruning***
STD  ACC : 0.006564106946112311

***Decision Tree with 10 Fold CV, Entropy, With pruning***
STD  ACC : 0.01766713049705581

***Decision Tree with Holdout, Gini, Without pruning***
STD  ACC : 0.007422132068647417

***Decision Tree with Holdout, Gini, With pruning***
STD  ACC : 0.011098974427096097

***Decision Tree with Holdout, Entropy, Without pruning***
STD  ACC : 0.006297763271370404

***Decision Tree with Holdout, Entropy, With pruning***
STD  ACC : 0.01523683402518749


### 2. HCVDAT0 (DF2)

In [7]:
DF2 = pd.read_csv(DATASET_PATH_2,index_col=0);
DF2['Sex'].replace(['f','m'],[0,1],inplace=True)
DF2.fillna(DF2.mean(),inplace=True)

# # Separate class/label from DF1
Y = DF2[['Category']].values
DF2.drop(['Category'],axis=1,inplace=True)
X = DF2.values
ALLINONE(X,Y)

***Decision Tree with 10 Fold CV, Gini, Without pruning***
STD  ACC : 0.035446690108861166

***Decision Tree with 10 Fold CV, Gini, With pruning***
STD  ACC : 0.03625686276028305

***Decision Tree with 10 Fold CV, Entropy, Without pruning***
STD  ACC : 0.021300589675544934

***Decision Tree with 10 Fold CV, Entropy, With pruning***
STD  ACC : 0.033930773547996076

***Decision Tree with Holdout, Gini, Without pruning***
STD  ACC : 0.020058278858540765

***Decision Tree with Holdout, Gini, With pruning***
STD  ACC : 0.022240862241616175

***Decision Tree with Holdout, Entropy, Without pruning***
STD  ACC : 0.02072259294322801

***Decision Tree with Holdout, Entropy, With pruning***
STD  ACC : 0.022472188801712444


### 3.FERTILITY DIAGONSIS (DF3)

In [8]:
DF3 = pd.read_csv(DATASET_PATH_3,header=None);

DF3.head()


# # Separate class/label from DF1
Y = DF3[[9]].values
DF3.drop([9],axis=1,inplace=True)
X = DF3.values
ALLINONE(X,Y)

***Decision Tree with 10 Fold CV, Gini, Without pruning***
STD  ACC : 0.10618851161966629

***Decision Tree with 10 Fold CV, Gini, With pruning***
STD  ACC : 0.10862780491200216

***Decision Tree with 10 Fold CV, Entropy, Without pruning***
STD  ACC : 0.05812247605033895

***Decision Tree with 10 Fold CV, Entropy, With pruning***
STD  ACC : 0.11339753083731587

***Decision Tree with Holdout, Gini, Without pruning***
STD  ACC : 0.0706131873110273

***Decision Tree with Holdout, Gini, With pruning***
STD  ACC : 0.0686569572423493

***Decision Tree with Holdout, Entropy, Without pruning***
STD  ACC : 0.06410408203330997

***Decision Tree with Holdout, Entropy, With pruning***
STD  ACC : 0.06936137253543936


### 4. PHISHING DATA (DF4)

In [9]:
DF4 = pd.read_csv(DATASET_PATH_4,index_col=0);

DF4.head()

# # Separate class/label from DF1
Y = DF4[['Result']].values
DF4.drop(['Result'],axis=1,inplace=True)
X = DF4.values
ALLINONE(X,Y)

***Decision Tree with 10 Fold CV, Gini, Without pruning***
STD  ACC : 0.027082731317334154

***Decision Tree with 10 Fold CV, Gini, With pruning***
STD  ACC : 0.03080781936142326

***Decision Tree with 10 Fold CV, Entropy, Without pruning***
STD  ACC : 0.014998442815950811

***Decision Tree with 10 Fold CV, Entropy, With pruning***
STD  ACC : 0.028767888095846253

***Decision Tree with Holdout, Gini, Without pruning***
STD  ACC : 0.014630189410426632

***Decision Tree with Holdout, Gini, With pruning***
STD  ACC : 0.01691176841413255

***Decision Tree with Holdout, Entropy, Without pruning***
STD  ACC : 0.015318117958809269

***Decision Tree with Holdout, Entropy, With pruning***
STD  ACC : 0.021141328295327267


### 5.LOG2 (DF5)

In [10]:
DF5 = pd.read_csv(DATASET_PATH_5);

# # Separate class/label from DF1
Y = DF5[['Action']].values
DF5.drop(['Action'],axis=1,inplace=True)
X = DF5.values
ALLINONE(X,Y)

***Decision Tree with 10 Fold CV, Gini, Without pruning***
STD  ACC : 0.0006195698341441089

***Decision Tree with 10 Fold CV, Gini, With pruning***
STD  ACC : 0.0010985767467512548

***Decision Tree with 10 Fold CV, Entropy, Without pruning***
STD  ACC : 0.0003350142363069527

***Decision Tree with 10 Fold CV, Entropy, With pruning***
STD  ACC : 0.0010171885003022886

***Decision Tree with Holdout, Gini, Without pruning***
STD  ACC : 0.000344153731962846

***Decision Tree with Holdout, Gini, With pruning***
STD  ACC : 0.0005568660282897619

***Decision Tree with Holdout, Entropy, Without pruning***
STD  ACC : 0.00033532145928483377

***Decision Tree with Holdout, Entropy, With pruning***
STD  ACC : 0.0005568660282897619
