In [None]:
# import the necessary libraries

import os
import warnings
import zipfile

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings("ignore")

In [None]:
# Unzip the dataset archive if it has not been extracted yet.
#
# The archive is extracted into the current working directory (rather than a
# hard-coded "/content/") because the data-loading code reads the CSV files
# from os.path.join(os.getcwd(), "all_data"). Using the zipfile module also
# avoids depending on an external `unzip` binary and on shell quoting of paths
# that contain spaces.

extract_dir = os.getcwd()
path = os.path.join(extract_dir, "all_data.zip")

# The archive stores its files under a top-level "all_data/" folder, so the
# presence of that folder means extraction has already happened.
if not os.path.isdir(os.path.join(extract_dir, "all_data")):
  with zipfile.ZipFile(path) as archive:
    archive.extractall(extract_dir)

Archive:  /content/all_data.zip
  inflating: /content/all_data/test_c1000_d100.csv  
  inflating: /content/all_data/test_c1000_d1000.csv  
  inflating: /content/all_data/test_c1000_d5000.csv  
  inflating: /content/all_data/test_c1500_d100.csv  
  inflating: /content/all_data/test_c1500_d1000.csv  
  inflating: /content/all_data/test_c1500_d5000.csv  
  inflating: /content/all_data/test_c1800_d100.csv  
  inflating: /content/all_data/test_c1800_d1000.csv  
  inflating: /content/all_data/test_c1800_d5000.csv  
  inflating: /content/all_data/test_c300_d100.csv  
  inflating: /content/all_data/test_c300_d1000.csv  
  inflating: /content/all_data/test_c300_d5000.csv  
  inflating: /content/all_data/test_c500_d100.csv  
  inflating: /content/all_data/test_c500_d1000.csv  
  inflating: /content/all_data/test_c500_d5000.csv  
  inflating: /content/all_data/train_c1000_d100.csv  
  inflating: /content/all_data/train_c1000_d1000.csv  
  inflating: /content/all_data/train_c1000_d5000.csv  
  inf

In [None]:
# Empty results table: one row per dataset, holding the selected
# hyper-parameters followed by the test-set scores.
df_results = pd.DataFrame(
  columns=[
    'DataSet_name',
    'Criterion',
    'Splitter',
    'Max_depth',
    'Min_samples_split',
    'Max_features',
    'Accuracy',
    'F1_score',
  ]
)

In [None]:
# Defining the Decision Tree Classifier

def DT(train_data=None, valid_data=None, test_data=None, random_state=42):
  """Tune, train and evaluate a decision tree classifier on one dataset.

  Every frame is expected to hold the features in all columns but the last
  and the class label in the last column.

  Steps:
    1. Grid-search the tree hyper-parameters with GridSearchCV's default
       cross-validation, fitted on the validation frame.
    2. Refit a tree with the best parameters on training + validation data.
    3. Score that tree on the test frame.

  Args:
    train_data: training DataFrame. Defaults to the notebook-level
      ``trainData`` when omitted.
    valid_data: validation DataFrame. Defaults to the notebook-level
      ``validData`` when omitted.
    test_data: test DataFrame. Defaults to the notebook-level ``testData``
      when omitted.
    random_state: seed passed to every DecisionTreeClassifier so that the
      stochastic options in the grid (``splitter='random'``, feature
      sub-sampling via ``max_features``) give reproducible results.

  Returns:
    A tuple ``(best_params, accuracy, f1)`` where ``best_params`` is the
    dict chosen by the grid search and the two scores are computed on the
    test frame.
  """
  # Fall back to the notebook globals so existing `DT()` calls keep working.
  if train_data is None:
    train_data = trainData
  if valid_data is None:
    valid_data = validData
  if test_data is None:
    test_data = testData

  # Getting the features and output
  XValid = valid_data.iloc[:, 0:-1].values
  YValid = valid_data.iloc[:, -1].values
  XTest = test_data.iloc[:, 0:-1].values
  YTest = test_data.iloc[:, -1].values

  # Getting the best parameters
  # NOTE(review): tuning only ever sees the validation split; the training
  # split is used for the final fit only. Confirm this is intended.
  parameters = {
    'criterion': ('gini', 'entropy'),
    'splitter': ('best', 'random'),
    'max_depth': (None, 5, 10, 50, 100),
    'min_samples_split': (2, 5, 7, 11, 15),
    'max_features': (None, 'sqrt', 'log2'),
  }
  grid = GridSearchCV(
    DecisionTreeClassifier(random_state=random_state),
    parameters,
    n_jobs=10,
    verbose=True,
  )
  grid.fit(XValid, YValid)

  # Combining training and validation data (rows of train first, then valid)
  XData = np.concatenate(
    [train_data.iloc[:, 0:-1].values, XValid], axis=0
  )
  YData = np.concatenate(
    [train_data.iloc[:, -1].values, YValid], axis=0
  )

  # Final training and Testing
  tree = DecisionTreeClassifier(random_state=random_state, **grid.best_params_)
  tree = tree.fit(XData, YData)
  yPredict = tree.predict(XTest)

  # f1_score is called with its defaults, i.e. it assumes binary labels.
  return grid.best_params_, metrics.accuracy_score(YTest, yPredict), f1_score(YTest, yPredict)

In [None]:
# Getting the data
# Each dataset is identified by "c<i>00_d<j>00" and ships as three CSV files
# (train_/valid_/test_) without a header row.
path = os.getcwd()
path = os.path.join(path, 'all_data')
firstNum = [3, 5, 10, 15, 18]
secNum = [1, 10, 50]

# Collect one record per dataset and build the table once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
records = []
for i in firstNum:
  for j in secNum:
    dataset_name = "c" + str(i) + "00_d" + str(j) + "00"

    # DT() reads these three notebook-level frames, so the names must stay.
    trainData = pd.read_csv(os.path.join(path, "train_" + dataset_name + ".csv"), header=None)
    validData = pd.read_csv(os.path.join(path, "valid_" + dataset_name + ".csv"), header=None)
    testData = pd.read_csv(os.path.join(path, "test_" + dataset_name + ".csv"), header=None)

    Best_Parameters, Acc, F_Score = DT()
    records.append({
      'DataSet_name': dataset_name,
      'Criterion': Best_Parameters['criterion'],
      'Splitter': Best_Parameters['splitter'],
      'Max_depth': Best_Parameters['max_depth'],
      'Min_samples_split': Best_Parameters['min_samples_split'],
      'Max_features': Best_Parameters['max_features'],
      'Accuracy': Acc,
      'F1_score': F_Score,
    })

# Rebuild the results table with the column order it was declared with;
# re-running this cell replaces the rows instead of appending duplicates.
df_results = pd.DataFrame(records, columns=df_results.columns)

In [None]:
# Display the per-dataset best hyper-parameters with test accuracy and F1 score.
df_results

Unnamed: 0,DataSet_name,Criterion,Splitter,Max_depth,Min_samples_split,Max_features,Accuracy,F1_score
0,c300_d100,gini,best,,2,sqrt,0.555,0.552764
1,c300_d1000,entropy,best,5.0,2,,0.6725,0.709534
2,c300_d5000,entropy,best,10.0,11,,0.7794,0.78723
3,c500_d100,gini,random,50.0,2,,0.645,0.666667
4,c500_d1000,gini,best,5.0,2,,0.682,0.687008
5,c500_d5000,gini,best,10.0,7,,0.791,0.800801
6,c1000_d100,entropy,random,,11,sqrt,0.695,0.680628
7,c1000_d1000,gini,random,100.0,15,,0.792,0.792
8,c1000_d5000,entropy,best,10.0,7,,0.8595,0.864735
9,c1500_d100,entropy,random,,15,sqrt,0.8,0.8
