In [1]:
# library for GA
import random

from deap import base
from deap import creator
from deap import tools
from deap import algorithms

# library for RF
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


import pandas as pd
from seaborn import violinplot
import seaborn as sns

In [2]:
# Load the labelled spectrum table; row 0 of the sheet holds the column names.
# `sep` is a delimiter option for text readers such as read_csv and is not a
# read_excel parameter, so it is not passed here.
spectrum_data = pd.read_excel('./spectrum_data/MIne_Nothing_Rock.xlsx', header=0)

In [3]:
# Preview the first five rows of the loaded table.
spectrum_data.head()

Unnamed: 0,Signal_Path,Frequency_0,Frequency_0dot32541,Frequency_0dot65083,Frequency_0dot97624,Frequency_1dot3017,Frequency_1dot6271,Frequency_1dot9525,Frequency_2dot2779,Frequency_2dot6033,...,Frequency_497dot234,Frequency_497dot5594,Frequency_497dot8848,Frequency_498dot2102,Frequency_498dot5356,Frequency_498dot861,Frequency_499dot1865,Frequency_499dot5119,Frequency_499dot8373,Label
0,1,-1.459259,-4.532056,-32.680765,-34.772452,-37.36222,-41.463815,-41.8096,-41.411157,-42.444672,...,-92.727267,-94.206407,-95.049497,-92.151649,-94.120509,-95.14072,-91.976042,-88.533568,-91.717852,1
1,1,4.928653,1.957394,-35.392737,-42.128736,-46.621402,-53.255708,-56.017699,-59.087885,-62.249034,...,-88.354186,-88.465816,-85.997244,-87.914434,-88.387158,-89.374889,-87.190449,-88.074054,-85.246747,1
2,1,5.429769,2.402802,-39.912158,-40.833904,-43.26922,-44.225699,-43.763521,-44.645556,-46.069485,...,-87.576766,-92.46763,-89.857615,-86.389981,-85.861711,-88.080618,-88.199254,-89.362906,-92.505462,1
3,1,-8.860418,-11.892214,-40.604265,-41.491205,-46.728661,-45.278651,-46.840303,-48.153704,-48.827305,...,-88.342649,-90.868922,-88.49164,-89.537515,-89.388839,-90.235332,-90.477474,-91.199705,-89.209511,1
4,1,-14.143644,-17.023779,-34.837263,-43.262198,-43.991594,-47.693121,-48.950286,-45.364605,-46.008724,...,-90.700178,-93.163671,-93.431828,-90.653285,-93.611234,-90.997471,-89.479312,-87.478921,-90.607241,1


In [7]:
# define algorithm parameters
NUM_TREES = 50  # num of estimators in RF
IND_SIZE = 10   # subtree size
POP_SIZE = 30   # presumably the GA population size -- confirm against the GA cells
CX_RATE = 0.8   # presumably the GA crossover probability -- confirm against the GA cells
MUTATE_RATE = 1.0 / IND_SIZE
SEED = 42       # fixed seed so the split and the forest are reproducible on re-run

# prepare data
raw = spectrum_data.values

# Locate the target column by name instead of the hard-coded position 1538,
# so the split between features and labels follows the table's actual layout.
label_col = spectrum_data.columns.get_loc('Label')
X = np.delete(raw, label_col, axis=1)  # every column except the label
y = raw[:, label_col]
# NOTE(review): X still contains the 'Unnamed: 0' and 'Signal_Path' columns
# (as the original 0:1538 slice did). 'Unnamed: 0' looks like an exported row
# index -- confirm whether these two should really be used as features.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)

# Train a random forest with NUM_TREES trees and keep its fitted trees so that
# subsets of them can be evaluated separately.
model = RandomForestClassifier(n_estimators=NUM_TREES, random_state=SEED)
model.fit(X_train, y_train)
estimators = model.estimators_

In [14]:
# Show the first three fitted trees of the forest.
estimators[:3]

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=1546957914, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=1925914921, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_we

In [9]:
def sub_rf_predict(sub_rf, X_test):
    """
    Predict labels for X_test by majority vote over the trees in sub_rf.

    Every tree predicts every sample; for each sample the label that received
    the most votes wins. Ties are broken deterministically in favour of the
    smallest label value. The candidate labels are taken from the predictions
    themselves, so the function does not depend on any module-level state.

    Parameters
    ----------
    sub_rf : iterable
        Fitted estimators, each exposing ``predict(X_test)`` that returns one
        label per sample.
    X_test : array-like
        Samples to classify; passed unchanged to every tree.

    Returns
    -------
    numpy.ndarray
        1-D float array holding the winning label for each sample.

    Raises
    ------
    ValueError
        If sub_rf contains no trees (there are no votes to count).
    """
    trees = list(sub_rf)
    if not trees:
        raise ValueError("sub_rf must contain at least one tree to vote")

    # One row of predictions per tree -> shape (n_trees, n_samples).
    votes = np.asarray([tree.predict(X_test) for tree in trees])
    if votes.size == 0:
        # No samples to classify.
        return np.array([], dtype=float)

    # np.unique returns the labels sorted ascending, and argmax returns the
    # first maximum, so a tie resolves to the smallest label.
    labels = np.unique(votes)
    counts = np.stack([(votes == label).sum(axis=0) for label in labels])
    winners = labels[counts.argmax(axis=0)]

    return np.array(winners, dtype=float)

# helpers for the majority vote
y_set = set(y_test).union(y_train)  # every label present in either split
def vote_result():
    """Return a fresh tally that maps each label in y_set to zero votes."""
    return dict.fromkeys(y_set, 0)

def keywithmaxval(d):
    """Return the key of d whose value is the largest.

    When several keys share the largest value, the one that comes first in
    the dict's iteration order is returned.
    """
    return max(d, key=d.get)

In [12]:
# Majority vote of the first three trees on the held-out samples.
sub_rf_predict(estimators[:3], X_test)

array([0., 1., 0., 0., 0., 0., 0., 2., 0., 0., 1., 2., 2., 1., 2., 1., 1.,
       0., 1., 0., 0., 1., 1., 2., 1., 2., 0., 1., 2., 2., 0., 2., 1., 0.,
       1., 1., 1., 0., 2., 1., 1., 1., 1., 0., 2.])

In [16]:
# Predictions of the first tree alone on X_test, for comparison with the three-tree vote.
estimators[0].predict(X_test)

array([2., 1., 0., 2., 0., 1., 0., 0., 0., 2., 1., 2., 2., 1., 2., 1., 2.,
       0., 1., 2., 2., 1., 2., 2., 2., 1., 1., 1., 2., 1., 0., 2., 1., 0.,
       1., 1., 0., 1., 2., 1., 1., 1., 1., 1., 2.])

In [17]:
# Predictions of the second tree alone on X_test.
estimators[1].predict(X_test)

array([1., 1., 0., 0., 1., 0., 0., 2., 1., 1., 1., 2., 2., 1., 2., 1., 1.,
       1., 1., 0., 0., 1., 1., 2., 1., 2., 0., 1., 2., 2., 1., 1., 0., 1.,
       1., 1., 1., 0., 2., 1., 2., 1., 1., 0., 2.])

In [18]:
# Predictions of the third tree alone on X_test.
estimators[2].predict(X_test)

array([0., 0., 0., 1., 2., 0., 0., 2., 0., 0., 0., 0., 1., 1., 1., 2., 1.,
       2., 1., 1., 0., 2., 1., 0., 1., 2., 0., 1., 2., 2., 0., 2., 1., 0.,
       1., 0., 1., 0., 0., 0., 1., 2., 0., 0., 0.])

In [23]:
# Accuracy of the majority vote over every tree in the forest. Passing the whole
# list instead of a hard-coded 0:50 slice keeps this in step with NUM_TREES.
# NOTE(review): the trees in a fitted forest may predict encoded class indices
# rather than the original label values -- confirm the labels are 0..K-1, or map
# the votes through model.classes_ before scoring.
score = accuracy_score(y_test, sub_rf_predict(estimators, X_test))

In [24]:
# Display the accuracy computed in the previous cell.
score

0.28888888888888886