## Load dataset


In [69]:
import csv
import numpy as np
import pandas as pd
from IPython.display import Image
from IPython.display import SVG
from graphviz import Source
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier


In [71]:
data  = pd.read_csv('results7parsed.csv')

# sort by best CA
avg_ca = data.sort_values('Average CA', ascending=False)
avg_rfw = data.sort_values('RFW CA', ascending=False)
avg_dt = data.sort_values('DT CA', ascending=False)

# get best 5 average features
with open('CA Features.csv', 'w')as f :
    f.write('Avg')
    f.write(avg_ca[:5].to_csv())
    
# get best 5 random forest features
with open('CA Features.csv', 'a')as f :
    f.write('Rfw')
    f.write(avg_rfw[:5].to_csv())
    
# get 5 best decision tree features 
with open('CA Features.csv', 'a')as f :
    f.write('Dtree')
    f.write(avg_dt[:5].to_csv())
    

Now, let us take the most interesting features from the csv and write a parser to analyze the feature importances.

In [72]:

s ='''nof_OH, nof_NH2, nof_SO3H, C_R0, nof_HBA, PSA/Area, molLogS
nof_OH, posCharge/Volume, C_R0, nof_HBA, PSA/Area, molLogS, molLogP
nof_OH, nof_NH2, nof_SO3H, C_sp3, C_R2, nof_HBA, PSA/Area
nof_OH, nof_NH2, nof_PO4, C_R0, nof_posCharge, nof_HBA, PSA/Area
nof_OH, nof_NH2, nof_SO3H, negCharge/Volume, nof_HBA, PSA/Area, molLogS
nof_OH, nof_NH2, nof_SO3H, C_sp3, C_R2, nof_negCharge, PSA/Area
nof_SO3H, posCharge/Volume, C_R1, nof_posCharge, nof_HBA, PSA/Area, molLogS
nof_OH, nof_SO3H, nof_negCharge, nof_posCharge, PSA/Area, molPSA, molLogP
nof_OH, nof_NH2, nof_SO3H, C_sp3, C_R2, nof_posCharge, PSA/Area
nof_OH, nof_NH2, nof_SO3H, C_R0, nof_posCharge, nof_HBA, molPSA
nof_OH, nof_NH2, nof_SH, C_R0, nof_posCharge, nof_HBA, molPSA
nof_SO3H, C_sp3, C_R0, C_R2, nof_HBA, molLogS, molLogP
PSA/Area, nof_Rings, Complexity, nof_SO3H, nof_OH, nof_Chirals, C_R0'''

features =  [sorted([x]) for x in s.split('\n')]
print(features)


[['nof_OH, nof_NH2, nof_SO3H, C_R0, nof_HBA, PSA/Area, molLogS'], ['nof_OH, posCharge/Volume, C_R0, nof_HBA, PSA/Area, molLogS, molLogP'], ['nof_OH, nof_NH2, nof_SO3H, C_sp3, C_R2, nof_HBA, PSA/Area'], ['nof_OH, nof_NH2, nof_PO4, C_R0, nof_posCharge, nof_HBA, PSA/Area'], ['nof_OH, nof_NH2, nof_SO3H, negCharge/Volume, nof_HBA, PSA/Area, molLogS'], ['nof_OH, nof_NH2, nof_SO3H, C_sp3, C_R2, nof_negCharge, PSA/Area'], ['nof_SO3H, posCharge/Volume, C_R1, nof_posCharge, nof_HBA, PSA/Area, molLogS'], ['nof_OH, nof_SO3H, nof_negCharge, nof_posCharge, PSA/Area, molPSA, molLogP'], ['nof_OH, nof_NH2, nof_SO3H, C_sp3, C_R2, nof_posCharge, PSA/Area'], ['nof_OH, nof_NH2, nof_SO3H, C_R0, nof_posCharge, nof_HBA, molPSA'], ['nof_OH, nof_NH2, nof_SH, C_R0, nof_posCharge, nof_HBA, molPSA'], ['nof_SO3H, C_sp3, C_R0, C_R2, nof_HBA, molLogS, molLogP'], ['PSA/Area, nof_Rings, Complexity, nof_SO3H, nof_OH, nof_Chirals, C_R0']]


## Project Settings

Specified here are the paths for the data and the features to run over in the list of best features.
Each entry in the list is a list containing one single string of the features to try, comma seperated. In this way it is easy to write a script to 
add entries to try very easily. 

In [74]:

##### set hyperparams
NUM_ITER = 10 # number of times to run 10foldxval to get a statistical degree of confidence

''' HYPERPARAMS FOR DECISION TREE
 
 These parameters implement a rudimentary pruning algorithm, would ideally like to use AB pruning'''
enable_pruning = True
# maximum depth of dtree
max_depth = 5
# how many samples your need atleast, at a LEAF node
min_samples = 3

##### set parameters
path_train_data = 'train.csv'
path_test_data = 'test.csv'
path_all_data = 'Dataset Correlated Removed.csv'

# set features here

best_features = features

best_features = [list(map(str.strip, x[0].split(','))) for x in best_features]

k = len(best_features)


## Load Dataset

This code loads dataset into the variables below and converts the labels to categorical 0, 1 pairs.

In [75]:
# load dataset
all_data = pd.DataFrame(pd.read_csv(path_all_data))
all_labels = all_data['SLC'].astype('category').cat.codes
# drop labels
all_data.drop('SLC', axis=1, inplace=True)

train_data = pd.DataFrame(pd.read_csv(path_train_data))
train_labels = train_data['SLC'].astype('category').cat.codes
# drop labels

train_data.drop('SLC', axis=1, inplace=True)

test_data = pd.DataFrame(pd.read_csv(path_test_data))
test_labels = test_data['SLC'].astype('category').cat.codes
# drop labels
test_data.drop('SLC', axis=1, inplace=True)


## AUC and Classification Accuracy - Decision Tree

The code below will find the classification accuracy using 10-fold cross-validation using stratified sampling to help class imbalance. The AUC on the test split is also found.

In [None]:
# visualize decision tree for input features

d_trees = []


# find CA - uses 10-fold cross validation 
# with stratified sampling to help with class imbalance
# and simple average over subsets
dt_cas = []

# maintain list of cas over a period
dt_ca_matrix = []

# run the thing NUM_ITER times
for _ in range(NUM_ITER):
    for i in range(k):
        aucs = []
        # make fold
        skf = StratifiedKFold(n_splits=10, shuffle=True)
        for trx, tex in skf.split(all_data, all_labels):
            # strip data to required features
            subset_data = all_data.filter(best_features[i], axis=1)
            
            # find auc
            dtree = DecisionTreeClassifier(presort=True, max_depth=max_depth, min_samples_leaf=min_samples)
            dtree.fit(subset_data.iloc[trx, :], all_labels.iloc[trx])        
            pred = dtree.predict(subset_data.iloc[tex, :])
            labels = all_labels.iloc[tex]
            
            acc = roc_auc_score(labels, pred)
            # record auc to average later
            aucs.append(acc)
        
        dt_cas.append(np.mean(aucs))
        
    dt_ca_matrix.append(list(dt_cas))
    dt_cas.clear()


dt_ca_matrix = np.array(dt_ca_matrix)
dt_cas = dt_ca_matrix.mean(axis=0)
dt_cas_std = dt_ca_matrix.std(axis=0)

# find AUC 
dt_aucs = []

# run k-fold validation
for i in range(k):
    subset_test_data = test_data.filter(best_features[i], axis=1)
    subset_train_data = train_data.filter(best_features[i], axis=1)
    
    clf = DecisionTreeClassifier(presort=True, max_depth=max_depth, min_samples_leaf=min_samples)
    clf.fit(subset_train_data, train_labels)
    d_trees.append(clf)
    
    # make its predictions on test data
    pred = d_trees[i].predict(subset_test_data)
    
    # find auc scores
    auc = roc_auc_score(test_labels, pred)
    
    # record the scores
    dt_aucs.append(auc)


print('Decision Tree Results:')
print('   \tAUC\tAcc (%dit) (mean ± std)\t\tFeatures' % NUM_ITER)
for i, f in enumerate(zip(dt_aucs, dt_cas, dt_cas_std)):
    print('%1d' %i,'\t%05.3f\t%05.3f ± %05.03f\t' % tuple(f) + ', '.join((best_features[i])))

    

## AUC and Classification Accuracy - Random Forest Walk

The code below will find the classification accuracy using 10-fold cross-validation using stratified sampling to help class imbalance. The AUC on the test split is also found.

In [None]:
# visualize random forest features
rfws = []


# find CA - uses 10-fold cross validation 
# with stratified sampling to help with class imbalance
# and simple average over subsets
rfw_cas = []

# maintain list of cas over a period
rfw_ca_matrix = []

# run the thing NUM_ITER times
for _ in range(NUM_ITER):
    for i in range(k):
        aucs = []
        # make fold
        skf = StratifiedKFold(n_splits=10, shuffle=True)
        for trx, tex in skf.split(all_data, all_labels):
            # strip data to required features
            subset_data = all_data.filter(best_features[i], axis=1)
            
            # find auc
            rfwtree = RandomForestClassifier(n_estimators=100)
            rfwtree.fit(subset_data.iloc[trx, :], all_labels.iloc[trx])
            pred = rfwtree.predict(subset_data.iloc[tex, :])
            labels = all_labels.iloc[tex]
            
            acc = roc_auc_score(labels, pred)
            # record auc to average later
            aucs.append(acc)
        
        rfw_cas.append(np.mean(aucs))
    rfw_ca_matrix.append(list(rfw_cas))
    rfw_cas.clear()


rfw_ca_matrix = np.array(rfw_ca_matrix)
rfw_cas = rfw_ca_matrix.mean(axis=0)
rfw_cas_std = rfw_ca_matrix.std(axis=0)

# find AUC 
rfw_aucs = []
for i in range(k):
    subset_test_data = test_data.filter(best_features[i], axis=1)
    subset_train_data = train_data.filter(best_features[i], axis=1)
    
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(subset_train_data, train_labels)
    rfws.append(clf)
    
    # make its predictions on test data
    pred = rfws[i].predict(subset_test_data)
    
    # find auc scores
    auc = roc_auc_score(test_labels, pred)
    
    # record the scores
    rfw_aucs.append(auc)


print('Random Forest Results:')
print('   \tAUC\tAcc (%dit) (mean ± std)\t\tFeatures' % NUM_ITER)
for i, f in enumerate(zip(dt_aucs, rfw_cas, rfw_cas_std)):
    print('%1d' % i, '\t%05.3f\t%05.3f ± %05.03f\t' % tuple(f) + ', '.join(sorted(best_features[i])))


 
## Tabulate Results


In [14]:
to_write = [['AUC DT','AUC RFW','Acc DT', 'Std Acc DT' ,'Acc RFW', 'Std Acc RFW', 'Features']]

print('   AUC(DT)\tAUC(RFW)Acc(DT %dit, x± std)\tAcc(RFW %dit, x±std)\tFeatures' % (NUM_ITER, 
                                                                                                     NUM_ITER))
for i, f in enumerate(zip(dt_aucs, rfw_aucs, dt_cas, dt_cas_std, rfw_cas, rfw_cas_std)):
    print('%1d' % i, '%05.3f\t%05.3f\t%05.3f ± %05.03f\t%05.3f ± %05.03f\t' % tuple(f) + ', '.join((best_features[i])))
    to_write.append(list(f) + [', '.join((best_features[i]))])
    
# write results to a csv file
output = csv.writer(open('Final Result.csv', 'w', newline=''))
output.writerows(to_write)


AUC(DT)	AUC(RFW)Acc(DT 10it, x± std)	Acc(RFW 10it, x±std)	Features
0.714	0.917	0.693 ± 0.033	0.757 ± 0.026	C_R0, PSA/Area, molLogS, nof_HBA, nof_NH2, nof_OH, nof_SO3H
0.714	0.821	0.691 ± 0.035	0.760 ± 0.028	C_R0, PSA/Area, molLogP, molLogS, nof_HBA, nof_OH, posCharge/Volume
0.786	0.893	0.690 ± 0.034	0.756 ± 0.027	C_R2, C_sp3, PSA/Area, nof_HBA, nof_NH2, nof_OH, nof_SO3H
0.702	0.881	0.702 ± 0.030	0.756 ± 0.024	C_R0, PSA/Area, nof_HBA, nof_NH2, nof_OH, nof_PO4, nof_posCharge
0.726	0.881	0.705 ± 0.030	0.763 ± 0.028	PSA/Area, molLogS, negCharge/Volume, nof_HBA, nof_NH2, nof_OH, nof_SO3H
0.786	0.869	0.702 ± 0.024	0.757 ± 0.025	C_R2, C_sp3, PSA/Area, nof_NH2, nof_OH, nof_SO3H, nof_negCharge
0.738	0.857	0.699 ± 0.026	0.757 ± 0.031	C_R1, PSA/Area, molLogS, nof_HBA, nof_SO3H, nof_posCharge, posCharge/Volume
0.583	0.869	0.694 ± 0.027	0.758 ± 0.028	PSA/Area, molLogP, molPSA, nof_OH, nof_SO3H, nof_negCharge, nof_posCharge
0.786	0.845	0.703 ± 0.023	0.758 ± 0.029	C_R2, C_sp3, PSA/Area, nof_NH2, nof_

NameError: name 'csv' is not defined

In [None]:
print(k)
print(map(len, [dt_aucs, rfw_aucs, dt_cas, dt_cas_std, rfw_cas, rfw_cas_std]))


## Feature importance

The feature importances are compared below for decision trees and random forests.

In [None]:
i = 0
# visualization
for dtree in d_trees:
    if i < k:
        print('Feature importances for tree and forest (resp.) %s/%s:' % (i + 1, k))
        for e in zip(dtree.feature_importances_, rfws[i].feature_importances_, best_features[i]):
            print('\t%6f\t%6f\t%s' % e)

    else:
        print('Warning, code may be buggy')
    i += 1
