In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, DMatrix
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report

from hyperopt import hp
from hyperopt import STATUS_OK
from hyperopt import fmin, tpe, Trials

LOAD DATA

In [3]:
train = pd.read_csv('train_BINEXP.csv')
val = pd.read_csv('val_BINEXP.csv')

In [4]:
train['species'] = train['species'].apply(lambda x: x if x in [1,2] else 3)
val['species'] = val['species'].apply(lambda x: x if x in [1,2] else 3)

In [5]:
# store the label columns for later use
train_labels = train['label']
val_labels = val['label']

# remove area codes and labels from data
train = train.drop(['area_code', 'label'], axis=1)
val = val.drop(['area_code', 'label'], axis=1)

X_train = train.drop('species', axis=1)
y_train = train['species']
y_train = y_train-1

X_val = val.drop('species', axis=1)
y_val = val['species']
y_val = y_val-1

HELPER FUNCTION FOR FEATURE SELECTION

In [6]:
def get_used_feats_names(feats_frame_columns, features_list):
    """
    not all features saved in the features.csv will be used.
    so we need to select which features are used.
    :param feats_frame_columns: all feaetures' names
                example [hrange, hmax, imax_0, imin_0, ..., imax_1,..., isk_2, ikut_2, ip90_2]
                "h" or "i": feature's name.
                    h: height, i:intensity
    :param features_list: the features' list which will be considered.
                example [h, i_max_min_sk]
                the format of "i_max_min_sk" means that only max, min, sk subfeatures for intensity will be used.
    :return:
    """
    feats_names = feats_frame_columns  # np.arr, dtype=object
    used_feats_bool = []

    for f_considered in features_list:
        if "_" in f_considered:
            f_i_subattrs = np.zeros(shape=len(feats_names)).astype("bool")
            f_name = f_considered.split("_")[0]   # feature type info. e.g. "h"(height), "i"(intensity)
            f_i_sub = f_considered.split("_")[1:] # feature sub info. e.g. "max", "min"
            f_i_sub_channels = list(filter(lambda _:_ in ["ch0", "ch1", "ch2", "ch3"], f_i_sub)) # channel info: e.g.: "ch1"
            for fi, fn in enumerate(feats_names): # fn e.g.: "imax_1", "imin_1"
                if fn[0]==f_name: # judge h or i
                    for s in f_i_sub: # s e.g. "max", "min", "ch1"
                        if str(s) in fn or s.replace("ch", "")==fn.split("_")[-1]: # e.g.: "max" in "imax_1" and ch"1"==imax_"1"
                            if len(f_i_sub_channels)==0: # no channel info. so this feature will be considered without the channel filter
                                f_i_subattrs[fi] = True
                            elif "ch"+str(fn)[-1] in f_i_sub_channels: # filter the feature info by channel.
                                f_i_subattrs[fi] = True
            used_feats_bool.append(f_i_subattrs)
        else:
            used_feats_bool.append([_[0] == f_considered for _ in feats_names])

    used_feats_bool = np.sum(used_feats_bool, axis=0, dtype=bool)  # 1-d list, all columns name which will be considered
    used_feats_names= feats_names[used_feats_bool]

    if 'D_1' in features_list and 'D_10' not in features_list:
        used_feats_names = used_feats_names[used_feats_names != 'D10']


    return used_feats_names

feats_cols = np.array(['hrange', 'hmax', 'hstd', 'hmean' ,'imax_0', 'imin_0', 'imean_0', 'isk_0', 'ikut_0', 'ip90_0',
 'imax_1', 'imin_1', 'imean_1', 'isk_1', 'ikut_1', 'ip90_1', 'imax_2', 'imin_2',
 'imean_2', 'isk_2', 'ikut_2', 'ip90_2','Rmin_0', 'Rmin_1', 'Rmin_2','Rp90_0', 'Rp90_1', 'Rp90_2', 'D1','D2','D3','D4','D5','D6','D7','D8','D9','D10', 'penetration',
 'Rmean_0', 'Rmean_1', 'Rmean_2', 'Rmax_0', 'Rmax_1', 'Rmax_2', 'Rkut_0', 'Rkut_1', 'Rkut_2', 'Rsk_0', 'Rsk_1', 'Rsk_2', 
 'V_min', 'V_mean', 'V_max', 'V_median', 'V_std', 'V_sk', 'V_kut', 'V_p90', 'V_points',


 'istd_0', 'irange_0', 'ip5_0', 'ip10_0', 'ip20_0', 'ip30_0', 'ip40_0', 'ipmedian_0', 'ip60_0', 'ip70_0', 'ip80_0',
 'istd_1', 'irange_1', 'ip5_1', 'ip10_1', 'ip20_1', 'ip30_1', 'ip40_1', 'ipmedian_1', 'ip60_1', 'ip70_1', 'ip80_1',
 'istd_2', 'irange_2', 'ip5_2', 'ip10_2', 'ip20_2', 'ip30_2', 'ip40_2', 'ipmedian_2', 'ip60_2', 'ip70_2', 'ip80_2',
 'Rstd_0', 'Rrange_0', 'Rp5_0', 'Rp10_0', 'Rp20_0', 'Rp30_0', 'Rp40_0', 'Rpmedian_0', 'Rp60_0', 'Rp70_0', 'Rp80_0',
 'Rstd_1', 'Rrange_1', 'Rp5_1', 'Rp10_1', 'Rp20_1', 'Rp30_1', 'Rp40_1', 'Rpmedian_1', 'Rp60_1', 'Rp70_1', 'Rp80_1',
 'Rstd_2', 'Rrange_2', 'Rp5_2', 'Rp10_2', 'Rp20_2', 'Rp30_2', 'Rp40_2', 'Rpmedian_2', 'Rp60_2', 'Rp70_2', 'Rp80_2',
 'Nstd', 'Nrange', 'Np5', 'Np10', 'Np20', 'Np30', 'Np40', 'Npmedian', 'Np60', 'Np70', 'Np80',
 'Nmax', 'Nmin', 'Nmean', 'Nsk', 'Nkut', 'Np90',
 'Amax', 'Amin', 'Amean', 'Ask', 'Akut', 'Ap90', 'Astd', 'Arange', 'Ap5','Ap10','Ap20','Ap30','Ap40','Apmedian','Ap60','Ap70','Ap80',
 'Xmax', 'Xmin', 'Xmean', 'Xsk', 'Xkut', 'Xp90', 'Xstd', 'Xrange', 'Xp5','Xp10','Xp20','Xp30','Xp40','Xpmedian','Xp60','Xp70','Xp80',
 'CA', 'CV', 'CD', 'HP10', 'HP20','HP30','HP40','HP50','HP60','HP70','HP80','HP90',

'2binm1_0', '2binx1_0','2binm2_0','2binx2_0',
'2binm1_1', '2binx1_1','2binm2_1','2binx2_1',
'2binm1_2', '2binx1_2','2binm2_2','2binx2_2',

'3binm1_0','3binx1_0', '3binm2_0','3binx2_0', '3binm3_0','3binx3_0',
'3binm1_1','3binx1_1', '3binm2_1','3binx2_1', '3binm3_1','3binx3_1',
'3binm1_2','3binx1_2', '3binm2_2','3binx2_2', '3binm3_2','3binx3_2',

'4binm1_0','4binx1_0', '4binm2_0','4binx2_0', '4binm3_0','4binx3_0', '4binm4_0','4binx4_0',
'4binm1_1','4binx1_1', '4binm2_1','4binx2_1', '4binm3_1','4binx3_1', '4binm4_1','4binx4_1',
'4binm1_2','4binx1_2', '4binm2_2','4binx2_2', '4binm3_2','4binx3_2', '4binm4_2','4binx4_2',

'5binm1_0','5binx1_0', '5binm2_0','5binx2_0', '5binm3_0','5binx3_0', '5binm4_0','5binx4_0', '5binm5_0','5binx5_0',
'5binm1_1','5binx1_1', '5binm2_1','5binx2_1', '5binm3_1','5binx3_1', '5binm4_1','5binx4_1', '5binm5_1','5binx5_1',
'5binm1_2','5binx1_2', '5binm2_2','5binx2_2', '5binm3_2','5binx3_2', '5binm4_2','5binx4_2', '5binm5_2','5binx5_2',

'10binm1_0','10binx1_0', '10binm2_0','10binx2_0', '10binm3_0','10binx3_0', '10binm4_0','10binx4_0', '10binm5_0','10binx5_0',
'10binm6_0','10binx6_0', '10binm7_0','10binx7_0', '10binm8_0','10binx8_0', '10binm9_0','10binx9_0', '10binm10_0','10binx10_0',

'10binm1_1','10binx1_1', '10binm2_1','10binx2_1', '10binm3_1','10binx3_1', '10binm4_1','10binx4_1', '10binm5_1','10binx5_1',
'10binm6_1','10binx6_1', '10binm7_1','10binx7_1', '10binm8_1','10binx8_1', '10binm9_1','10binx9_1', '10binm10_1','10binx10_1',

'10binm1_2','10binx1_2', '10binm2_2','10binx2_2', '10binm3_2','10binx3_2', '10binm4_2','10binx4_2', '10binm5_2','10binx5_2',
'10binm6_2','10binx6_2', '10binm7_2','10binx7_2', '10binm8_2','10binx8_2', '10binm9_2','10binx9_2', '10binm10_2','10binx10_2',

'Rbinm1_0','Rbinm1_1','Rbinm1_2',
'Rbinm2_0','Rbinm2_1','Rbinm2_2',
'Rbinm3_0','Rbinm3_1','Rbinm3_2',
'Rbinx1_0','Rbinx1_1','Rbinx1_2',
'Rbinx2_0','Rbinx2_1','Rbinx2_2',
'Rbinx3_0','Rbinx3_1','Rbinx3_2',
'Nbinm1', 'Nbinm2', 'Nbinm3', 'Nbinx1', 'Nbinx2', 'Nbinx3',
'Abinm1', 'Abinm2', 'Abinm3',
'Abinx1', 'Abinx2', 'Abinx3',
'Xbinm1', 'Xbinm2', 'Xbinm3',
'Xbinx1', 'Xbinx2', 'Xbinx3'
 ])

FEATURE SELECTION

In [7]:
feats_to_use = ['i_p90', 'i_max', 'i_mean', 'i_sk','i_kut', 'i_min', 'h_range', 'h_max']

#'h', 'H', 'p', 'C', 'D'

feats = get_used_feats_names(feats_frame_columns=feats_cols, features_list=feats_to_use)

columns_to_drop = [
                    'V_min', 'V_median', 'V_sk', 'V_p90',
                    'Abinm1', 'Abinx1',
                    'Abinm2', 'Abinx2',
                    'Xbinm1', 'Xbinx1',
                    'Xbinm2', 'Xbinx2',
                    'ibinm1_0','ibinm1_1','ibinm1_2',
                    'ibinm2_0','ibinm2_1','ibinm2_2',
                    'ibinm3_0','ibinm3_1','ibinm3_2',
                    'ibinx1_0','ibinx1_1','ibinx1_2',
                    'ibinx2_0','ibinx2_1','ibinx2_2',
                    'ibinx3_0','ibinx3_1','ibinx3_2',
                    ]
columns_to_drop.extend([])
feats = feats[~np.isin(feats, columns_to_drop)]

print('FEATURES USED: \n', feats)
print(len(feats), ' features')

X_train_sub = X_train[feats]
X_val_sub = X_val[feats]

FEATURES USED: 
 ['hrange' 'hmax' 'imax_0' 'imin_0' 'imean_0' 'isk_0' 'ikut_0' 'ip90_0'
 'imax_1' 'imin_1' 'imean_1' 'isk_1' 'ikut_1' 'ip90_1' 'imax_2' 'imin_2'
 'imean_2' 'isk_2' 'ikut_2' 'ip90_2']
20  features


In [53]:
xgb = XGBClassifier(eval_metric='mlogloss', max_depth=7, n_estimators=101, learning_rate=0.05, subsample=0.8)
rf =  RandomForestClassifier(max_depth=14, n_estimators=101, class_weight='balanced')

# RF: {'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 301}

In [8]:
best_acc = 0
epochs = 20
for epoch in range(epochs):
    clf = XGBClassifier(eval_metric='mlogloss', n_estimators=51, learning_rate=0.05, max_depth=14, subsample=0.7, random_state=epoch)
    clf.fit(X_train_sub, y_train)
    predictions = clf.predict(X_val_sub)
    accuracy = accuracy_score(y_val, predictions)
    print(accuracy)

    if accuracy>0.865:
        print('')
        print('JACKPOT')
        print(f'Validation Accuracy: {accuracy:.3f}')

        cm = confusion_matrix(y_val, predictions)
        print("")
        print("Confusion Matrix:")
        print(cm)
        break

0.8450704225352113
0.8497652582159625
0.8450704225352113
0.8403755868544601
0.8356807511737089
0.8309859154929577
0.8450704225352113
0.8403755868544601
0.8544600938967136
0.8450704225352113
0.8497652582159625
0.8591549295774648
0.8497652582159625
0.8450704225352113
0.8450704225352113
0.8450704225352113
0.8356807511737089
0.8544600938967136
0.8450704225352113
0.8497652582159625
0.8403755868544601
0.8450704225352113
0.8356807511737089
0.8450704225352113
0.8497652582159625
0.8262910798122066
0.8450704225352113
0.8403755868544601
0.8450704225352113
0.8497652582159625
0.8356807511737089
0.8262910798122066
0.8309859154929577
0.8262910798122066
0.8450704225352113
0.8309859154929577
0.8403755868544601
0.8450704225352113
0.8356807511737089
0.8450704225352113
0.8450704225352113
0.8450704225352113
0.8450704225352113
0.8497652582159625
0.8497652582159625
0.8309859154929577
0.8544600938967136
0.8497652582159625
0.8403755868544601
0.8450704225352113
0.8497652582159625
0.8591549295774648
0.8262910798

In [21]:
df_predictions = pd.DataFrame({
    'ID': val_labels, 
    'Actual Species': y_val,
    'Predicted Species': predictions
})

# Filter the dataframe to find where predictions are incorrect
incorrect_preds = df_predictions[df_predictions['Actual Species'] != df_predictions['Predicted Species']]
incorrect_preds_sorted = incorrect_preds.sort_values(by='ID')

print(incorrect_preds_sorted)


        ID  Actual Species  Predicted Species
147   1408               0                  2
160   3513               1                  2
161   3545               1                  2
7     4404               0                  2
8     5306               0                  2
167   5530               0                  2
10    5534               1                  0
173   7985               2                  1
178   8688               2                  0
194  11406               0                  1
31   13944               0                  2
32   13968               2                  1
35   14351               2                  0
38   14753               0                  2
39   14762               0                  2
43   15110               1                  2
44   15280               1                  2
65   17108               0                  2
78   18881               2                  1
90   20338               0                  2
92   20480               1        

EVALUATION OF XGBOOST AND RF

In [169]:
best_acc = 0
avg_acc = 0
epochs = 20
for epoch in range(epochs):

    clf = RandomForestClassifier(n_estimators=101, random_state=epoch)
    # clf = XGBClassifier(eval_metric='mlogloss', n_estimators=101, learning_rate=0.05, max_depth=14, subsample=0.7, random_state=epoch)
    clf.fit(X_train_sub, y_train)

    predictions = clf.predict(X_val_sub)
    accuracy = accuracy_score(y_val, predictions)

    avg_acc+=accuracy

    if accuracy>best_acc:
        best_acc = accuracy
    print(f'Validation Accuracy: {accuracy:.3f}')

avg_acc/=epochs
print("")
print(f'Best accuracy: {best_acc:.3f}')
print(f'Avg accuracy: {avg_acc:.3f}')


## Best xgboost: 85.9%
## Best RF: 85.4%


Validation Accuracy: 0.822
Validation Accuracy: 0.850
Validation Accuracy: 0.822
Validation Accuracy: 0.822
Validation Accuracy: 0.836
Validation Accuracy: 0.840
Validation Accuracy: 0.822
Validation Accuracy: 0.822
Validation Accuracy: 0.831
Validation Accuracy: 0.826
Validation Accuracy: 0.840
Validation Accuracy: 0.845
Validation Accuracy: 0.822
Validation Accuracy: 0.836
Validation Accuracy: 0.840
Validation Accuracy: 0.826
Validation Accuracy: 0.850
Validation Accuracy: 0.831
Validation Accuracy: 0.850
Validation Accuracy: 0.840
Validation Accuracy: 0.845
Validation Accuracy: 0.826
Validation Accuracy: 0.831
Validation Accuracy: 0.826
Validation Accuracy: 0.850
Validation Accuracy: 0.831
Validation Accuracy: 0.836
Validation Accuracy: 0.854
Validation Accuracy: 0.840
Validation Accuracy: 0.840
Validation Accuracy: 0.831
Validation Accuracy: 0.836
Validation Accuracy: 0.836
Validation Accuracy: 0.826
Validation Accuracy: 0.840
Validation Accuracy: 0.840
Validation Accuracy: 0.822
V

In [83]:
rf = RandomForestClassifier(n_estimators=201)
rf.fit(X_train_sub, y_train)

rf_predictions = rf.predict(X_val_sub)
rf_accuracy = accuracy_score(y_val, rf_predictions)
print(f'Validation Accuracy: {rf_accuracy}')

Validation Accuracy: 0.8450704225352113


In [33]:
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier(eval_metric='logloss')
rf = RandomForestClassifier()

# candidate hyperparameters
param_grid_xgb = {
    'max_depth': [3, 5, 9],
    'n_estimators': [100,200],
    'learning_rate': [0.05],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

param_grid_rf = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4],  
    'bootstrap': [True, False]
}


grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train_sub, y_train)

# Best parameters and best accuracy
print("Best parameters found: ", grid_search.best_params_)
print("Best validation accuracy: ", grid_search.best_score_)

# Use best estimator to make predictions
best_estimator = grid_search.best_estimator_
predictions = best_estimator.predict(X_val_sub)
accuracy = accuracy_score(y_val, predictions)
print(f'Improved Validation Accuracy: {accuracy}')

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best parameters found:  {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}
Best validation accuracy:  0.8004694835680751
Improved Validation Accuracy: 0.8356807511737089
