#---------------------------------------------------------------------------------------
# --------------------------------------                                                |
#| Author: Kevin Brian Kwan Chong Loo   |                                               |
#| ID: A01192177                        |                                               |
#| Date: September 30th, 2019           |                                               |
#| Email: kb.kwanloo@gmail.com          |                                               |
# --------------------------------------                                                |
#                                                                                       |
#The presented code evaluates data sets (imported as CSV files),                        |
#   and evaluates them based on different classifiers.                                  |
#                                                                                       |
#By default, the following classifiers are used and with their respective code:         |
#   - Bayesian Networks             --> BayesianNetwork                                 |
#   - Multi-Layer Perceptron        --> MultiLayerPerceptron                            |
#   - AdaBoost                      --> AdaBoost                                        |
#   - K Nearest Neighbor            --> KNN                                             |
#   - Random Forest                 --> RandomForest                                    |
#   - Support Vector Machines       --> SVM                                             |
#   - Naive Bayes                   --> NaiveBayes                                      |
#   - Linear Discriminant Analysis  --> LDA                                             |
#                                                                                       |
#If only certain classifiers from the list above want to be used,                       |
#   it only has to be an argument when executing this Python Code.                      |
#   To specify the classifiers that want to be used, the codes of                       |
#   each classifier must be put as an argument separated with a comma                   |
#   since all the classifiers to be used are read as a string and                       |
#   parsed by the comma.                                                                |
#                                                                                       |
#As for the Cross Validation value, by default, it has a value of 10.                   |
#   Likewise, the values of K-Cross Validation can be specified as an                   |
#   argument.                                                                           |
#                                                                                       |
# --------------------                                                                  |
#|Example of Execution|                                                                 |
# --------------------                                                                  |
# $ python3 Evaluate_Classifiers.py -c SVM,NaiveBayes,LDA -k 5                          |
#                                                                                       |
#For the previous example, only Support Vector Machines, Naive Bayes and                |
#   Linear Discriminant Analysis are going to be used to evaluate a data set,           |
#   with 5-Cross Validation.                                                            |
#                                                                                       |
#For the classifiers, the "sklearn" Python library can be used. However, an alternative |
#   is set if a GPU wants to be used in order to run the code faster. "h2o4gpu" has     |
#   almost the same functions as "sklearn".                                             |
#                                                                                       |
#Likewise, a Python-Weka library was installed in order to use certain classifiers      |
#   from Weka and receive the information in Python.                                    |
#                                                                                       |
# ----------------------                                                                |
#|Description of Problem|                                                               |
# ----------------------                                                                |
#This code is designed to work with the data set put in the data folder. The data set   |
#   contains information of the Top 200 Universities of the QS ranking. Having the      |
#   data set, partitions were made in order to classify the Universities among two      |
#   classes. Take into account if the code is going to be adapted to other data sets,   |
#   it requires certain changes in the code.                                            |
#---------------------------------------------------------------------------------------


In [None]:
from google.colab import drive

#Mount Google Drive in the Colab runtime. The mount point is named once so the
#   location is easy to spot; force_remount=True is passed through unchanged.
driveMountPoint = "/content/gdrive"
drive.mount(driveMountPoint, force_remount=True)

Mounted at /content/gdrive


In [None]:
#Basic Libraries to be used
#--------------------------
import numpy as np
from scipy import interp
import pandas as pd
import time
from statistics import mean
import argparse
import os
import subprocess

#Import Libraries of Classifiers using sklearn (uncomment if it wants to be used)
#---------------------------------------------
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
#The data sets to be evaluated are required in both CSV and ARFF format.
#folderPathOfArffFiles = os.getcwd()+"/Data/2CLUSTER/"

#Move into the assignment folder of the mounted drive so that every relative
#   path used afterwards is resolved from there.
assignmentFolder = "/content/gdrive/My Drive/Machine_learning_team/Assignment_2/"
os.chdir(assignmentFolder)

#Folder with the partition CSV files. The trailing slash is kept on purpose:
#   file names are concatenated directly onto this string.
folderPathOfCSVFiles = "%s/Data/2CLUSTER/" % os.getcwd()

In [None]:
#Preview part of the data folder: entries at positions 2 to 9 of the listing.
folderEntries = os.listdir(folderPathOfCSVFiles)
folderEntries[2:10]

['3CLUSTER_2.csv',
 '3CLUSTER_3.csv',
 '3CLUSTER_4.csv',
 '3CLUSTER_5.csv',
 '3CLUSTER_6.csv',
 '3CLUSTER_7.csv',
 '3CLUSTER_8.csv',
 '3CLUSTER_9.csv']

In [None]:
#Reference point used to report the total run time of the program.
startProgram = time.time()

#Define how many k folds
kFold = 10

#Assignment of K-Fold Cross Validation.
cv = StratifiedKFold(n_splits=kFold)

#Extract the classifiers that want to be used (comma-separated classifier codes).
classifiersStr = "MultiLayerPerceptron,AdaBoost,KNN,RandomForest,NaiveBayes,LDA"
classifiersList = classifiersStr.split(",")

#Name of the output file where the performance of each classifier and the maximum among them
#   per partition is saved in a file.
fileOut = "./Cluster_Evaluations_"+str(kFold)+"_Fold.txt"

#The header row is written only when the file is new (or still empty). Results from
#   later runs keep being appended, but re-running this cell no longer inserts a
#   duplicate header line in the middle of the data. Append mode creates the file
#   when it is missing, so no separate "w+" branch is needed.
needsHeader = (not os.path.isfile(fileOut)) or os.path.getsize(fileOut) == 0
with open(fileOut, "a") as outfile:
    if needsHeader:
        outfile.write("Partition_File,"+classifiersStr+",Maximum\n")

In [None]:
files = os.listdir(folderPathOfCSVFiles)

#Classifier code --> callable that builds a fresh, untrained classifier.
#   Codes that are not listed here (e.g. SVM and BayesianNetwork, which are not
#   wired up in this loop) are reported and skipped instead of aborting the run.
classifierFactories = {
    "NaiveBayes": GaussianNB,
    "LDA": LinearDiscriminantAnalysis,
    "RandomForest": RandomForestClassifier,
    "KNN": lambda: KNeighborsClassifier(5),
    "AdaBoost": AdaBoostClassifier,
    "MultiLayerPerceptron": MLPClassifier,
}

#Loop that goes through each file.
for input_file in files:
    #Only CSV partitions can be read below; any other entry of the folder is skipped.
    if not input_file.lower().endswith(".csv"):
        continue
    filePath = folderPathOfCSVFiles+input_file

    #Reading the partition file.
    data = pd.read_csv(filePath, header = 0)
    attribute_Names = list(data.columns.values)

    #Define the attributes and class in separate variables: the attributes are the
    #   columns from position 7 up to (not including) the last two, and the label is
    #   the column named "class".
    X = np.array(data[attribute_Names[7:-2]])
    y = np.array(data["class"])

    #One entry per requested classifier, in the same order as the header columns.
    classifiersMean = []
    print("Analyzing %s" %(filePath))
    startExecution = time.time()
    for classifierType in classifiersList:
        #Selection of the classifier that wants to be used. It is resolved once per
        #   classifier rather than once per repetition.
        makeClassifier = classifierFactories.get(classifierType)
        if makeClassifier is None:
            print("%s is not available. This classifier will be ignored. "%(classifierType))
            #Keep the column (as "nan") so that the row stays aligned with the header;
            #   previously this case crashed when averaging an empty list.
            classifiersMean.append(float("nan"))
            continue

        meanAUCArray = []
        #K repetitions of the whole K-Fold Cross Validation.
        #NOTE(review): cv is created without shuffling, so presumably every repetition
        #   uses the same folds and only the randomness inside a classifier differs
        #   between repetitions -- TODO confirm this is intended.
        for kIterations in range(kFold):
            print("Classifier: %s Iteration: %d" %(classifierType,kIterations))
            classifier = makeClassifier()

            aucs = []
            print("Starting K-Fold Cross Validation...")
            #Split per Cross Validation is applied.
            for i, (train, test) in enumerate(cv.split(X, y)):
                start = time.time()
                print("Training CV-%d..." %(i))
                #The classifier is trained with the data and tested with the test set.
                probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
                #ROC-AUC is obtained from the second probability column, i.e. a
                #   two-class problem is assumed. The value is stored under its own
                #   name so the imported sklearn "auc" function is not overwritten.
                foldAUC = roc_auc_score(y[test], probas_[:, 1])
                print("Cross Validation: %d AUC: %3.2f Time: %5.3fs" %(i,foldAUC,time.time()-start))
                #Each AUC per Cross Validation of each classifier is saved.
                aucs.append(foldAUC)

            #The AUC per Cross Validation of each classifier are averaged.
            meanAUC = mean(aucs)
            meanAUCArray.append(meanAUC)
            print("File: %s Classifier: %s Mean AUC: %5.3f\n" %(input_file,classifierType,meanAUC))
        classifiersMean.append(mean(meanAUCArray))

    #The maximum AUC value is obtained among all evaluated classifiers ("nan" when
    #   none of the requested classifiers was available).
    evaluatedMeans = [value for value in classifiersMean if not np.isnan(value)]
    maximumAUC = max(evaluatedMeans) if evaluatedMeans else float("nan")
    print("Maximum AUC among Classifiers: %5.3f" %(maximumAUC))
    print("Total Time of Execution: %6.3fs" %(time.time()-startExecution))

    #The row is written in the text file: file name, one value per classifier, maximum.
    #   The file is opened only for the write itself, so it is closed even on errors.
    rowValues = [input_file]
    rowValues.extend(str(round(value,5)) for value in classifiersMean)
    rowValues.append(str(round(maximumAUC,5)))
    information = ",".join(rowValues)+"\n"
    with open(fileOut, "a") as outfile:
        outfile.write(information)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training CV-1...
Cross Validation: 1 AUC: 0.67 Time: 3.416s
Training CV-2...
Cross Validation: 2 AUC: 0.61 Time: 3.469s
Training CV-3...
Cross Validation: 3 AUC: 0.71 Time: 3.369s
Training CV-4...
Cross Validation: 4 AUC: 0.66 Time: 3.508s
Training CV-5...
Cross Validation: 5 AUC: 0.66 Time: 3.443s
Training CV-6...
Cross Validation: 6 AUC: 0.62 Time: 3.398s
Training CV-7...
Cross Validation: 7 AUC: 0.54 Time: 3.378s
Training CV-8...
Cross Validation: 8 AUC: 0.65 Time: 3.393s
Training CV-9...
Cross Validation: 9 AUC: 0.64 Time: 3.466s
File: 3CLUSTER_6.csv Classifier: RandomForest Mean AUC: 0.636

Classifier: RandomForest Iteration: 3
Starting K-Fold Cross Validation...
Training CV-0...
Cross Validation: 0 AUC: 0.58 Time: 3.393s
Training CV-1...
Cross Validation: 1 AUC: 0.65 Time: 3.543s
Training CV-2...
Cross Validation: 2 AUC: 0.66 Time: 3.469s
Training CV-3...
Cross Validation: 3 AUC: 0.66 Time: 3.418s
Training CV-4...
C

In [None]:
#Report the time elapsed since startProgram was taken ("Complte" typo fixed).
print("\nComplete Program Time of Execution: %6.3fs" %(time.time()-startProgram))


Complte Program Time of Execution: 8095.607s
