In [1]:
# Features are standardized with StandardScaler.
# Algorithm parameters:
#      100 total trees in the random forest (RF)
#      10 trees selected by the genetic algorithm (GA)

In [1]:
# library for GA
import random

from deap import base
from deap import creator
from deap import tools
from deap import algorithms

# library for RF
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


import pandas as pd
from seaborn import violinplot
import seaborn as sns

from sklearn.preprocessing import StandardScaler

In [2]:
# Load the spectrum measurements; the first spreadsheet row holds the column names.
# `read_excel` has no `sep` argument (that belongs to `read_csv`) and recent pandas
# versions raise a TypeError when it is passed, so it is omitted here.
spectrum_data = pd.read_excel('./spectrum_data/Nothing_Mine_CEandRock.xlsx', header = 0)

In [3]:
# Preview the first five rows of the loaded table.
spectrum_data.head(5)

Unnamed: 0,Signal_Path,Frequency_0,Frequency_0dot32541,Frequency_0dot65083,Frequency_0dot97624,Frequency_1dot3017,Frequency_1dot6271,Frequency_1dot9525,Frequency_2dot2779,Frequency_2dot6033,...,Frequency_497dot234,Frequency_497dot5594,Frequency_497dot8848,Frequency_498dot2102,Frequency_498dot5356,Frequency_498dot861,Frequency_499dot1865,Frequency_499dot5119,Frequency_499dot8373,Label
0,1,-16.746793,-20.676797,-31.824529,-33.595664,-35.576217,-35.434632,-36.515813,-38.387189,-38.728927,...,-86.637317,-86.34354,-86.970514,-87.039263,-88.502441,-90.199369,-87.116829,-88.726635,-90.726685,0
1,1,-4.551514,-7.421589,-27.810436,-31.355778,-37.968985,-41.496863,-40.665835,-41.751791,-45.158375,...,-92.33758,-94.363767,-92.113655,-91.933969,-91.717233,-91.703702,-94.641164,-90.733663,-89.704103,0
2,1,-8.464251,-11.391572,-31.498198,-38.491415,-45.693272,-41.769994,-39.504399,-40.434299,-43.019136,...,-90.403409,-91.020674,-88.18721,-85.616805,-86.914083,-91.192905,-91.401812,-91.905812,-86.226784,0
3,1,-5.957897,-8.926523,-36.362773,-38.174161,-40.979502,-43.905112,-46.415447,-41.430853,-42.829122,...,-90.557058,-91.756146,-89.087595,-90.093507,-89.85505,-90.758625,-88.137099,-89.833695,-89.88838,0
4,1,-5.197722,-8.236918,-35.767897,-39.269133,-43.522638,-47.068337,-50.534088,-51.865788,-50.955593,...,-91.405036,-90.59251,-91.839187,-93.906633,-91.387335,-93.815508,-93.235216,-92.409103,-93.528927,0


In [4]:
# set algorithm parameters

def run_GA(NUM_TREES, IND_SIZE, POP_SIZE, CX_RATE = 0.8, NGEN = 1000):
    """
    Train a random forest, then use a genetic algorithm (GA) to select a sub-forest.

    The data is read from the module-level ``spectrum_data`` frame, split 75/25 into
    train/test with a randomly drawn ``random_state``, and standardized. A forest of
    NUM_TREES trees is fitted, and the GA searches for IND_SIZE trees whose
    majority vote maximizes the fitness defined in ``evaluate``.

    Parameters
    ----------
    NUM_TREES : int
        Number of trees in the full random forest.
    IND_SIZE : int
        Number of trees in the selected sub-forest. Every individual is a
        permutation of all NUM_TREES indices; only its first IND_SIZE genes are
        used for prediction and crossover.
    POP_SIZE : int
        Number of individuals in the GA population.
    CX_RATE : float
        Crossover probability passed to ``algorithms.eaSimple`` (``cxpb``).
    NGEN : int
        Number of generations to evolve (defaults to the previously hard-coded 1000).

    Returns
    -------
    tuple
        ``(cf_matrix_GA_RF_model, cf_matrix_RF_model, evaluate_confusion_matrix,
        h_fame, estimators)``: the test-split confusion matrix of the best
        hall-of-fame individual, the test-split confusion matrix of the full
        forest, the function mapping an individual to its confusion matrix, the
        hall of fame, and the list of fitted trees.

    Notes
    -----
    The fitness in ``evaluate`` indexes a 3x3 confusion matrix, so the data must
    contain three classes.
    NOTE(review): fitness is measured on the same test split that is reported
    afterwards, so the GA results are optimistically biased; consider a separate
    validation split.
    """
    MUTATE_RATE = 1.0/IND_SIZE

    # prepare data
    raw = spectrum_data.values

    X = raw[:, 0:1538]
    y = raw[:, 1538]  # NOTE(review): assumes the label sits in column 1538 -- confirm against the sheet

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random.randint(0, 5000))

    # Feature scaling: statistics are fitted on the training split only and then
    # applied to the test split. Column 0 is left unscaled, exactly as before.
    # NOTE(review): presumably column 0 is an id/path column -- confirm it belongs in X.
    scaler = StandardScaler()
    X_train[:, 1:] = scaler.fit_transform(X_train[:, 1:])
    X_test[:, 1:] = scaler.transform(X_test[:, 1:])

    model = RandomForestClassifier(n_estimators=NUM_TREES)  # forest with NUM_TREES trees
    model.fit(X_train, y_train)  # train the model
    estimators = model.estimators_  # get all the trees

    # Create the DEAP classes only once: calling creator.create again with an
    # existing name overwrites the class and emits a RuntimeWarning on every run.
    if not hasattr(creator, "FitnessMax"):
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMax)

    # functions for creating an individual and a population
    toolbox = base.Toolbox()
    # helper: a random permutation of all tree indices
    toolbox.register("indices", random.sample, range(NUM_TREES), NUM_TREES)
    toolbox.register("individual", tools.initIterate, creator.Individual,
                     toolbox.indices)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual, n = POP_SIZE)

    # All labels seen in either split, in a fixed order, so that every confusion
    # matrix has the same shape and vote ties are broken deterministically.
    class_labels = sorted(set(y_test).union(y_train))
    label_array = np.array(class_labels, dtype = float)
    # Trees inside a fitted forest predict class *indices*; map them back to labels.
    forest_classes = model.classes_

    def sub_rf_predict(sub_rf, X_test):
        """
        Return the majority-vote prediction of the trees in sub_rf for X_test.
        The label with the most votes wins; ties go to the smallest label.
        """
        # votes[t, i] is the label predicted by tree t for sample i
        votes = np.array([forest_classes[tree.predict(X_test).astype(int)]
                          for tree in sub_rf])
        # counts[c, i] is the number of trees voting class_labels[c] for sample i
        counts = np.array([(votes == label).sum(axis=0) for label in class_labels])
        return label_array[counts.argmax(axis=0)]

    def evaluate_confusion_matrix(individual):
        """Return the test-split confusion matrix of the sub-forest encoded by individual."""
        sub_random_forest = [estimators[tree_idx] for tree_idx in individual[0: IND_SIZE]]
        predict_sub_trees = sub_rf_predict(sub_random_forest, X_test)
        # Explicit labels keep the matrix shape fixed even when a class is missing
        # from both y_test and the predictions.
        return confusion_matrix(y_test, predict_sub_trees, labels=class_labels)

    def evaluate(individual):
        """Fitness: reward correct class-1 predictions, penalize errors involving class 1."""
        cf_matrix = evaluate_confusion_matrix(individual)
        total = np.sum(cf_matrix)

        pseudo_acc_case_rate = cf_matrix[1, 1] / total
        bad_case_rate = (cf_matrix[1, 0] + cf_matrix[1, 2]) / total
        undesired_case_rate = (cf_matrix[0, 1] + cf_matrix[2, 1]) / total

        # give accuracy more weight
        score = 0.4 * pseudo_acc_case_rate - 0.4 * bad_case_rate - 0.2 * undesired_case_rate
        return score,  # DEAP requires the fitness to be a tuple

    # mutation operator
    mutation_op = tools.mutShuffleIndexes

    def crossover_op(ind1, ind2):
        """
        One-point crossover, in place, restricted to the first IND_SIZE genes.
        NOTE(review): swapping tails can duplicate a tree index inside an
        individual, giving that tree more than one vote.
        """
        if IND_SIZE < 2:
            # nothing to swap; random.randint(0, IND_SIZE - 2) would raise here
            return (ind1, ind2)
        crossover_idx = random.randint(0, IND_SIZE - 2)
        temp = toolbox.clone(ind1[crossover_idx + 1: IND_SIZE])
        ind1[crossover_idx + 1: IND_SIZE] = ind2[crossover_idx + 1: IND_SIZE]
        ind2[crossover_idx + 1: IND_SIZE] = temp
        return (ind1, ind2)

    # selection operator
    selection_op = tools.selTournament

    # register everything in our toolbox
    toolbox.register("mate", crossover_op)
    toolbox.register("mutate", mutation_op, indpb = MUTATE_RATE)
    toolbox.register("select", selection_op, tournsize=3)
    toolbox.register("evaluate", evaluate)

    h_fame = tools.HallOfFame(100)  # keeps the 100 best individuals seen during the run

    pop = toolbox.population()
    # The returned population and logbook are not needed; the result lives in h_fame.
    algorithms.eaSimple(pop, toolbox, cxpb = CX_RATE, mutpb = MUTATE_RATE, ngen = NGEN,
                        stats = None, halloffame = h_fame, verbose = False)

    cf_matrix_RF_model = confusion_matrix(y_test, model.predict(X_test), labels=class_labels)
    cf_matrix_GA_RF_model = evaluate_confusion_matrix(h_fame[0])

    return cf_matrix_GA_RF_model, cf_matrix_RF_model, evaluate_confusion_matrix, h_fame, estimators

In [5]:
def measure_model_performance(C):
    """
    Summarize a 3x3 confusion matrix C.

    Returns the tuple (accuracy, sensitivity, specificity, FP_rate, performance).
    Only accuracy and performance are computed; sensitivity, specificity and
    FP_rate are placeholders that are always 0.

    performance is a weighted sum of cell rates: correct class-1 predictions
    count +0.3, class-1 samples predicted as another class count -0.3, other
    samples predicted as class 1 count -0.2, correct class-0/class-2 predictions
    count +0.16, and class-0/class-2 confusions count +0.04.
    """
    total = np.sum(C)

    # fraction of samples on the main diagonal
    accuracy = sum(C[k, k] for k in range(3)) / total

    # metrics that are not computed yet
    sensitivity, specificity, FP_rate = 0, 0, 0

    hit_rate_class1 = C[1, 1] / total
    missed_class1_rate = (C[1, 0] + C[1, 2]) / total
    false_class1_rate = (C[0, 1] + C[2, 1]) / total
    hit_rate_outer = (C[0, 0] + C[2, 2]) / total
    swapped_outer_rate = (C[0, 2] + C[2, 0]) / total

    # give accuracy more weight
    performance = (0.3 * hit_rate_class1 - 0.3 * missed_class1_rate - 0.2 * false_class1_rate
                   + 0.2 * 0.8 * hit_rate_outer + 0.2 * 0.2 * swapped_outer_rate)

    return accuracy, sensitivity, specificity, FP_rate, performance

In [6]:
import warnings
warnings.simplefilter('ignore')  # silence library warnings emitted during the repeated runs

num_trees = 100   # trees in the full random forest
IND_SIZE = 10     # trees selected by the GA
POP_SIZE = 30     # GA population size
N_RUNS = 50       # number of independent repetitions of the experiment

GA_accuracy_result = []
RF_accuracy_result = []

GA_sensitivity_result = []
RF_sensitivity_result = []

GA_specificity_result = []
RF_specificity_result = []

GA_FP_rate_result = []
RF_FP_rate_result = []

confusion_matrices_list_GA_RF = []
h_fame_list = []
GA_RF_model_list = []

GA_score_list = []
RF_score_list = []

print('num_trees = ', num_trees)
for i in range(N_RUNS):  # each run draws a new train/test split and trains a new forest
    print(i)
    cf_matrix_GA_RF_model, cf_matrix_RF_model, evaluate_confusion_matrix, h_fame, estimators = run_GA(num_trees, IND_SIZE, POP_SIZE)

    GA_accuracy, GA_sensitivity, GA_specificity, GA_FP_rate, GA_score = measure_model_performance(cf_matrix_GA_RF_model)
    RF_accuracy, RF_sensitivity, RF_specificity, RF_FP_rate, RF_score = measure_model_performance(cf_matrix_RF_model)

    GA_accuracy_result.append(GA_accuracy)
    RF_accuracy_result.append(RF_accuracy)

    GA_sensitivity_result.append(GA_sensitivity)
    RF_sensitivity_result.append(RF_sensitivity)

    GA_specificity_result.append(GA_specificity)
    RF_specificity_result.append(RF_specificity)

    GA_FP_rate_result.append(GA_FP_rate)
    RF_FP_rate_result.append(RF_FP_rate)

    confusion_matrices_list_GA_RF.append(cf_matrix_GA_RF_model)
    GA_RF_model_list.append(estimators)
    # h_fame_list was declared but never filled; keep each run's hall of fame so the
    # selected tree indices can be matched with the trees in GA_RF_model_list.
    h_fame_list.append(h_fame)

    GA_score_list.append(GA_score)
    # the full-forest score was computed but discarded; keep it for comparison
    RF_score_list.append(RF_score)

num_trees =  100
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [7]:
def show_result():
    """
    Print a summary of the GA-selected sub-forests over all recorded runs:
    mean, standard deviation and maximum accuracy, the maximum score, and the
    confusion matrices of the runs with the best accuracy and the best score.
    """
    # Each entry pairs a caption with a deferred computation, so values are
    # evaluated one at a time, in print order.
    report = (
        ("mean accuracy = ", lambda: np.mean(GA_accuracy_result)),
        ("std accuracy = ", lambda: np.std(GA_accuracy_result)),
        ("Max accuracy = ", lambda: np.max(GA_accuracy_result)),
        ("confusion Matrix for model with max accuracy is \n",
         lambda: confusion_matrices_list_GA_RF[np.argmax(GA_accuracy_result)]),
        ("Max score = ", lambda: np.max(GA_score_list)),
        ("confusion Matrix for model with max score is \n",
         lambda: confusion_matrices_list_GA_RF[np.argmax(GA_score_list)]),
    )
    for caption, compute in report:
        print(caption, compute())

In [8]:
# Print the accuracy/score summary of the GA-selected sub-forests over all runs.
show_result()

mean accuracy =  0.6573333333333333
std accuracy =  0.04816637831516917
Max accuracy =  0.7866666666666666
confusion Matrix for model with max accuracy is 
 [[17  2  2]
 [ 5 27  1]
 [ 2  4 15]]
Max score =  0.14933333333333335
confusion Matrix for model with max score is 
 [[15  5  5]
 [ 1 25  0]
 [ 6  2 16]]
