In [None]:
# import the necessary libraries

import os
import warnings
import zipfile

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings("ignore")

In [None]:
# Unzip the dataset archive only if it has not been extracted yet.
#
# The archive is expected next to the notebook (current working directory) and
# is extracted into that same directory, because the data-loading cell reads
# from <cwd>/all_data. Skipping the extraction when the folder already exists
# keeps the cell safe to re-run (no interactive "replace file?" prompt).

path = os.path.join(os.getcwd(), "all_data.zip")
data_dir = os.path.join(os.getcwd(), "all_data")

if os.path.isdir(data_dir):
  print("Found " + data_dir + " - skipping extraction")
elif os.path.isfile(path):
  # Archive members are stored under an "all_data/" prefix, so extracting into
  # the working directory produces <cwd>/all_data/*.csv.
  with zipfile.ZipFile(path) as archive:
    archive.extractall(os.getcwd())
  print("Extracted " + path + " into " + os.getcwd())
else:
  raise FileNotFoundError(
    "Neither " + data_dir + " nor " + path + " exists; "
    "place all_data.zip in the working directory first"
  )

Archive:  /content/all_data.zip
  inflating: /content/all_data/test_c1000_d100.csv  
  inflating: /content/all_data/test_c1000_d1000.csv  
  inflating: /content/all_data/test_c1000_d5000.csv  
  inflating: /content/all_data/test_c1500_d100.csv  
  inflating: /content/all_data/test_c1500_d1000.csv  
  inflating: /content/all_data/test_c1500_d5000.csv  
  inflating: /content/all_data/test_c1800_d100.csv  
  inflating: /content/all_data/test_c1800_d1000.csv  
  inflating: /content/all_data/test_c1800_d5000.csv  
  inflating: /content/all_data/test_c300_d100.csv  
  inflating: /content/all_data/test_c300_d1000.csv  
  inflating: /content/all_data/test_c300_d5000.csv  
  inflating: /content/all_data/test_c500_d100.csv  
  inflating: /content/all_data/test_c500_d1000.csv  
  inflating: /content/all_data/test_c500_d5000.csv  
  inflating: /content/all_data/train_c1000_d100.csv  
  inflating: /content/all_data/train_c1000_d1000.csv  
  inflating: /content/all_data/train_c1000_d5000.csv  
  inf

In [None]:
# Empty results table: one column per reported quantity, no rows yet
result_columns = [
  'DataSet_name',
  'N_Estimators',
  'Max_samples',
  'Max_features',
  'Bootstrap',
  'Accuracy',
  'F1_score',
]
df_results = pd.DataFrame(columns=result_columns)

In [None]:
# Defining the Bagging Classifier

def DT(train_data=None, valid_data=None, test_data=None, param_grid=None, random_state=None, n_jobs=10):
  """Tune, train and evaluate a bagged decision-tree classifier.

  Every frame is expected to hold the features in all columns but the last
  and the class label in the last column.

  Steps:
    1. Grid-search the bagging hyper-parameters. Note that the search is
       cross-validated on the *validation* frame only.
    2. Refit a fresh ensemble with the best parameters on train + validation.
    3. Score that ensemble on the test frame.

  Parameters
  ----------
  train_data, valid_data, test_data : pandas.DataFrame, optional
      Data splits. When omitted, the notebook globals ``trainData``,
      ``validData`` and ``testData`` are used, so ``DT()`` keeps working.
  param_grid : dict, optional
      Grid passed to ``GridSearchCV``. Defaults to the grid used throughout
      this notebook. Integer entries for ``max_samples`` / ``max_features``
      are absolute counts, float entries are fractions.
  random_state : int or None, optional
      Seed forwarded to the bagging ensembles. ``None`` (default) leaves the
      runs unseeded.
  n_jobs : int, optional
      Number of parallel jobs used by the grid search (default 10).

  Returns
  -------
  tuple
      ``(best_params, accuracy, f1)`` where ``best_params`` is the dict
      selected by the grid search and the two scores are computed on the
      test frame.

  Raises
  ------
  ValueError
      If the three frames do not have the same number of columns.
  """
  # Backward compatibility: fall back to the globals set by the loading cell
  if train_data is None:
    train_data = trainData
  if valid_data is None:
    valid_data = validData
  if test_data is None:
    test_data = testData

  if not (train_data.shape[1] == valid_data.shape[1] == test_data.shape[1]):
    raise ValueError("train, validation and test data must have the same number of columns")

  if param_grid is None:
    param_grid = {'n_estimators': (10, 50, 100), 'max_samples': (0.1, 1, 10), 'max_features': (0.1, 1, 5, 10), 'bootstrap': [True, False]}

  # Getting the features and output
  XValid = valid_data.iloc[:, 0:-1].values
  YValid = valid_data.iloc[:, -1].values
  XTest = test_data.iloc[:, 0:-1].values
  YTest = test_data.iloc[:, -1].values

  # Getting the best parameters (cross-validated on the validation split)
  grid = GridSearchCV(
    BaggingClassifier(DecisionTreeClassifier(), random_state=random_state),
    param_grid,
    n_jobs=n_jobs,
    verbose=True,
  )
  grid.fit(XValid, YValid)

  # Combining training and validation data for the final fit
  XData = pd.concat([train_data.iloc[:, 0:-1], valid_data.iloc[:, 0:-1]], axis=0).values
  YData = pd.concat([train_data.iloc[:, -1], valid_data.iloc[:, -1]], axis=0).values

  # Final training and testing with the selected hyper-parameters
  model = BaggingClassifier(DecisionTreeClassifier(), random_state=random_state, **grid.best_params_)
  model.fit(XData, YData)
  yPredict = model.predict(XTest)

  return grid.best_params_, metrics.accuracy_score(YTest, yPredict), f1_score(YTest, yPredict)

In [None]:
# Getting the data and evaluating the bagging classifier on every dataset.
#
# Files are read from <cwd>/all_data and are named
# {train,valid,test}_c<i>00_d<j>00.csv for each (i, j) combination below.
path = os.path.join(os.getcwd(), 'all_data')
firstNum = [3, 5, 10, 15, 18]
secNum = [1, 10, 50]

# Collect one record per dataset and build the table once at the end:
# DataFrame.append was removed in pandas 2.0 and row-by-row growth is quadratic.
records = []

for i in firstNum:
  for j in secNum:
    dataset_name = "c" + str(i) + "00_d" + str(j) + "00"

    # DT() reads these three module-level names, so they must keep these names
    trainData = pd.read_csv(os.path.join(path, "train_" + dataset_name + ".csv"), header=None)
    validData = pd.read_csv(os.path.join(path, "valid_" + dataset_name + ".csv"), header=None)
    testData = pd.read_csv(os.path.join(path, "test_" + dataset_name + ".csv"), header=None)

    Best_Parameters, Acc, F_Score = DT()
    records.append({
      'DataSet_name': dataset_name,
      'N_Estimators': Best_Parameters['n_estimators'],
      'Max_samples': Best_Parameters['max_samples'],
      'Max_features': Best_Parameters['max_features'],
      'Bootstrap': Best_Parameters['bootstrap'],
      'Accuracy': Acc,
      'F1_score': F_Score,
    })

# Rebuild the table from scratch (keeping the declared column order) so that
# re-running this cell does not duplicate rows.
df_results = pd.DataFrame(records, columns=list(df_results.columns))

In [None]:
# Display the results table: best hyper-parameters, accuracy and F1 score per dataset
df_results

Unnamed: 0,DataSet_name,N_Estimators,Max_samples,Max_features,Bootstrap,Accuracy,F1_score
0,c300_d100,100,0.1,0.1,True,0.635,0.697095
1,c300_d1000,100,0.1,0.1,True,0.75,0.745935
2,c300_d5000,100,0.1,0.1,True,0.8191,0.816215
3,c500_d100,50,0.1,0.1,True,0.685,0.686567
4,c500_d1000,100,0.1,0.1,False,0.887,0.88867
5,c500_d5000,100,0.1,0.1,False,0.9112,0.912114
6,c1000_d100,100,0.1,0.1,True,0.955,0.955665
7,c1000_d1000,100,0.1,0.1,True,0.9875,0.987556
8,c1000_d5000,100,0.1,0.1,True,0.9939,0.993893
9,c1500_d100,100,0.1,0.1,False,1.0,1.0
