In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import tree
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA

import operator
from sklearn import metrics
from sklearn.utils import resample
from sklearn.metrics import make_scorer, matthews_corrcoef

  from numpy.core.umath_tests import inner1d


### Please make sure to run all the code cells in sequence from top to bottom. There are dependencies and variable names which might not work as intended if the order is not kept intact. 

In [2]:
data = pd.read_csv('data.csv')
labels = pd.read_csv('labels.csv')

In [3]:
# data preprocessing(normalizing the data using min-max scalar)

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)

In [4]:
# concatenate the data with the respective labels

complete_data = pd.concat([data, labels], axis=1)
labelled = complete_data[0:178]
dataset = labelled.values

In [5]:
# since the dataset is imbalanced
# we upsample the minority_class in order to create a more uniform dataset
# while address the bias our model might learn if we do not perform the upsampling

majority_class = dataset[dataset[:,186]==1]
minority_class = dataset[dataset[:,186]==2]

upsample_size = majority_class.shape[0]

# upsampling minority class which is class 2, given that our data is unbalanced as we saw in section 1.1
minority_class_upsampled = resample(minority_class, replace=True, n_samples=upsample_size, random_state=10)

if(minority_class_upsampled.shape == majority_class.shape):
    print('Test passed')
else:
    print('Oops! something went wrong.')

Test passed


In [6]:
# creating the final dataset with minor_class upsampled

dataset = np.concatenate((majority_class, minority_class_upsampled), axis=0)

In [7]:
X = dataset[:, 0:185]
Y = dataset[:, 186]

In [8]:
# splitting the data into training and test set
# for our case we will use 20% of the data as test set and 80% as training set

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

# Decision Trees

In [9]:
# implementing variations of decision trees using grid search
# we display the best set of parameters in the following sections

dt = tree.DecisionTreeClassifier(random_state=2)

pipeline = Pipeline([('dt', dt),])

parameters = {
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_leaf_nodes':[2, 3, 4, 5, 6, 7],
    'dt__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'dt__splitter' : ['best']
}

In [10]:
grid_search_dt = GridSearchCV(pipeline, parameters, scoring=metrics.make_scorer(metrics.matthews_corrcoef), cv=10, n_jobs=-1, verbose=0)

In [11]:
grid_search_dt.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=2,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'dt__criterion': ['gini', 'entropy'], 'dt__max_leaf_nodes': [2, 3, 4, 5, 6, 7], 'dt__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'dt__splitter': ['best']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(matthews_corrcoef), verbose=0)

In [12]:
models_meanscores = []

number_of_permutations = len(grid_search_dt.cv_results_['params'])
for x in range(number_of_permutations):
    models_meanscores.append((grid_search_dt.cv_results_['params'][x],grid_search_dt.cv_results_['mean_test_score'][x]))

print("Best Parameters:")
print(grid_search_dt.best_params_)

# predictions from best arrangement from the grid_search
Y_predicted = grid_search_dt.predict(X_test)

output_classification_report = metrics.classification_report(Y_test, Y_predicted)

print(' ')
print(output_classification_report)
print(' ')

confusion_matrix = metrics.confusion_matrix(Y_test, Y_predicted)

print("Confusion Matrix: ")
print(confusion_matrix)
print(' ')
print("Matthews correlation coefficent")
print(metrics.matthews_corrcoef(Y_test, Y_predicted))
print(' ')
print("Normalized Accuracy")
print(metrics.accuracy_score(Y_test,Y_predicted))

Best Parameters:
{'dt__criterion': 'entropy', 'dt__max_depth': 2, 'dt__max_leaf_nodes': 5, 'dt__splitter': 'best'}
 
             precision    recall  f1-score   support

        1.0       1.00      0.91      0.95        32
        2.0       0.91      1.00      0.95        31

avg / total       0.96      0.95      0.95        63

 
Confusion Matrix: 
[[29  3]
 [ 0 31]]
 
Matthews correlation coefficent
0.9090031708997952
 
Normalized Accuracy
0.9523809523809523


# KNN

In [13]:
# KNN with Grid Search CV

scorer = make_scorer(matthews_corrcoef) 
# range of K nearest neighbor values
k = np.arange(3, 20) 
weight_options = ["uniform", "distance"]
knn_algorithm = ["ball_tree", "kd_tree", "brute"]
param_grid = dict(n_neighbors=k, weights=weight_options, algorithm=knn_algorithm)

scores = []
combinations = []

for cvv in [10, 20]:
    
    knn = KNeighborsClassifier()

    gridsearch_knn = GridSearchCV(knn, param_grid, cv=cvv, n_jobs=-1, verbose=0, scoring=scorer)
    gridsearch_knn.fit(X_train, Y_train)
    mean_test_scores = gridsearch_knn.cv_results_['mean_test_score']  
    
#     combinations.append(np.c_[mean_test_scores])
    number_of_permutations = len(gridsearch_knn.cv_results_['params'])
    for x in range(number_of_permutations):
        models_meanscores.append((gridsearch_knn.cv_results_['params'][x],gridsearch_knn.cv_results_['mean_test_score'][x]))
        
# predictions from best arrangement from the grid_search
Y_predicted = gridsearch_knn.predict(X_test)

output_classification_report = metrics.classification_report(Y_test, Y_predicted)

print(' ')
print(output_classification_report)
print(' ')

confusion_matrix = metrics.confusion_matrix(Y_test, Y_predicted)

print("Confusion Matrix: ")
print(confusion_matrix)
print(' ')
print("Matthews correlation coefficent")
print(metrics.matthews_corrcoef(Y_test, Y_predicted))
print(' ')
print("Normalized Accuracy")
print(metrics.accuracy_score(Y_test,Y_predicted))

 
             precision    recall  f1-score   support

        1.0       1.00      0.94      0.97        32
        2.0       0.94      1.00      0.97        31

avg / total       0.97      0.97      0.97        63

 
Confusion Matrix: 
[[30  2]
 [ 0 31]]
 
Matthews correlation coefficent
0.9384464919119354
 
Normalized Accuracy
0.9682539682539683


# SVM

In [14]:
# implementing variations of support vector machine using grid search
# we display the best set of parameters in the following sections

svm = SVC(random_state=3)

pipeline_svm = Pipeline([('svm', svm),])

parameters_svm = {
    'svm__C': [.001, .01, .1, 1.0, 10.],
    'svm__kernel': ['rbf', 'linear', 'sigmoid'],
    'svm__gamma': [.001, .01, .1, 1.0]
}

In [15]:
grid_search_svm = GridSearchCV(pipeline_svm, parameters_svm, scoring=metrics.make_scorer(metrics.matthews_corrcoef), cv=10, n_jobs=-1, verbose=0)

In [16]:
grid_search_svm.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=3, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svm__C': [0.001, 0.01, 0.1, 1.0, 10.0], 'svm__kernel': ['rbf', 'linear', 'sigmoid'], 'svm__gamma': [0.001, 0.01, 0.1, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(matthews_corrcoef), verbose=0)

In [17]:
number_of_permutations = len(grid_search_svm.cv_results_['params'])
for x in range(number_of_permutations):
    models_meanscores.append((grid_search_svm.cv_results_['params'][x],grid_search_svm.cv_results_['mean_test_score'][x]))

print("Results:")

print("Best Parameters:")
print(grid_search_svm.best_params_)


# predictions from best arrangement from the grid_search
Y_predicted = grid_search_svm.predict(X_test)

output_classification_report = metrics.classification_report(Y_test, Y_predicted)

print(' ')
print(output_classification_report)
print(' ')

confusion_matrix = metrics.confusion_matrix(Y_test, Y_predicted)

print("Confusion Matrix: ")
print(confusion_matrix)
print(' ')
print("Matthews correlation coefficent")
print(metrics.matthews_corrcoef(Y_test, Y_predicted))
print(' ')
print("Normalized Accuracy")
print(metrics.accuracy_score(Y_test,Y_predicted))

Results:
Best Parameters:
{'svm__C': 1.0, 'svm__gamma': 0.001, 'svm__kernel': 'linear'}
 
             precision    recall  f1-score   support

        1.0       1.00      0.97      0.98        32
        2.0       0.97      1.00      0.98        31

avg / total       0.98      0.98      0.98        63

 
Confusion Matrix: 
[[31  1]
 [ 0 31]]
 
Matthews correlation coefficent
0.96875
 
Normalized Accuracy
0.9841269841269841


There is a caveat in SVM, if the number of features and number of samples are comparable then it becomes very crucial and difficult to avoid overfitting. Which we also observe in our case, some parameters arrangements for SVM give out really high scores at a peak of even 1. It will be reasonable to reject some of the top performing SVM models, as they clearly look like they are overfitting. 

For our ensemble, we will discard all the variations of SVM models which result in a staggering 100% accuracy and hence, hinting towards overfitting.

In [18]:
models_meanscores.sort(key=operator.itemgetter(1), reverse=True)

# rejecting the first 12 overfitted models
final_models_meanscores = models_meanscores[12:]
final_models_meanscores

[({'algorithm': 'ball_tree', 'n_neighbors': 15, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'ball_tree', 'n_neighbors': 16, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'ball_tree', 'n_neighbors': 17, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'ball_tree', 'n_neighbors': 18, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'ball_tree', 'n_neighbors': 19, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'kd_tree', 'n_neighbors': 11, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'kd_tree', 'n_neighbors': 12, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'kd_tree', 'n_neighbors': 13, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'kd_tree', 'n_neighbors': 14, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'kd_tree', 'n_neighbors': 15, 'weights': 'distance'},
  0.9924672917790448),
 ({'algorithm': 'kd_tree', 'n_neighbors': 16, 'weights': 'di

# Ensemble

The whole idea behind creating an ensemble is to choose the set of models which are simple and can generalize well, within and outside of the ensemble. So, from the list of models_meanscores(possible parameters) we obtained in the section above, we removed all the models for which we were highly suspicious of overfitting, our primary test for overfitting is to see if the model has a very high mean_test_score(for svm and some parameter arrangements of knn, we received a score of 100%). Once we eliminated all the possible overfitting models, we used Occam's razor rule in order to select one model from a cluster of models with equal mean_test_score. So, for example, if we have a decision tree with a mean_test_score of 98.5% with max_depth of 4 and another decision tree with a mean_test_score of 98.5% with a max_depth of 2, in this case we will choose the model with the max_depth 2. As this model will eventually generalize well on the hidden data. We tried to put together an ensemble of 6 models into a voting classifier and from the prima-facie analysis which we have put together in the following sections, we observe that our ensemble performs better than the set of parameters we have tested for the random forest. 

Specifics: Our ensemble(voting classifier) basically consists of 3 nearest neighbors, 2 svm and 1 decision three models. We observe that our ensemble is much better when compared to the random forest implementation in the following section. Where our ensemble got an normalized accuracy of 98.41%, where as the random forest got a normalized accuracy of 96.82%. 

In [19]:
# in this section we will implement a voting classifier now
# we take top 10 model parameters depeding on the mean-test score
# and we create an esemble of all these classifier to create a voting classifier

kfold = model_selection.KFold(n_splits=10, random_state=10)


# sub-models for the voting classifier
estimators = []
vote_1 = neighbors.KNeighborsClassifier(algorithm='kd_tree', weights='distance', n_neighbors=19)
estimators.append(('label_1', vote_1))
vote_2 = neighbors.KNeighborsClassifier(algorithm='brute', weights='distance', n_neighbors=19)
estimators.append(('label_2', vote_2))
vote_3 = neighbors.KNeighborsClassifier(algorithm='kd_tree', weights='distance', n_neighbors=10)
estimators.append(('label_3', vote_3))
vote_4 = SVC(C=1.0, kernel='rbf', gamma=1.0)
estimators.append(('label_4', vote_4))
vote_5 = SVC(C=10.0, kernel='rbf', gamma=1.0)
estimators.append(('label_5', vote_5))
vote_6 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=2, max_leaf_nodes=5, splitter='best')
estimators.append(('label_6', vote_6))

# creating the voting classifier
ensemble = VotingClassifier(estimators)

ensemble.fit(X_train,Y_train)
Y_predicted = ensemble.predict(X_test)

output_classification_report = metrics.classification_report(Y_test, Y_predicted)

print(' ')
print(output_classification_report)
print(' ')

print("Confusion Matrix: ")
print(confusion_matrix)
print(' ')
print("Matthews correlation coefficent")
print(metrics.matthews_corrcoef(Y_test, Y_predicted))
print(' ')
print("Normalized Accuracy")
print(metrics.accuracy_score(Y_test,Y_predicted))

 
             precision    recall  f1-score   support

        1.0       1.00      0.97      0.98        32
        2.0       0.97      1.00      0.98        31

avg / total       0.98      0.98      0.98        63

 
Confusion Matrix: 
[[31  1]
 [ 0 31]]
 
Matthews correlation coefficent
0.96875
 
Normalized Accuracy
0.9841269841269841


  if diff:


# Random forest

In [20]:
rf = RandomForestClassifier(random_state=10)

pipeline_rf = Pipeline([('rf', rf),])

parameters_rf = {
    'rf__criterion': ['gini', 'entropy'],
    'rf__max_leaf_nodes': [2, 3, 4, 5],
    'rf__max_depth': [2, 3, 4, 5],
    'rf__n_estimators' : list(range(10,20))
}


In [21]:
grid_search_rf = GridSearchCV(pipeline_rf, parameters_rf, scoring=metrics.make_scorer(metrics.matthews_corrcoef), cv=10, n_jobs=-1, verbose=0)

In [22]:
grid_search_rf.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'rf__criterion': ['gini', 'entropy'], 'rf__max_leaf_nodes': [2, 3, 4, 5], 'rf__max_depth': [2, 3, 4, 5], 'rf__n_estimators': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(matthews_corrcoef), verbose=0)

In [23]:
number_of_permutations = len(grid_search_rf.cv_results_['params'])
for x in range(number_of_permutations):
    models_meanscores.append((grid_search_rf.cv_results_['params'][x],grid_search_rf.cv_results_['mean_test_score'][x]))

print("Results:")

print("Best Parameters:")
print(grid_search_rf.best_params_)


Y_predicted = grid_search_rf.predict(X_test)


output_classification_report = metrics.classification_report(Y_test, Y_predicted)

print(' ')
print(output_classification_report)
print(' ')

print("Confusion Matrix: ")
print(confusion_matrix)
print(' ')
print("Matthews correlation coefficent")
print(metrics.matthews_corrcoef(Y_test, Y_predicted))
print(' ')
print("Normalized Accuracy")
print(metrics.accuracy_score(Y_test,Y_predicted))

Results:
Best Parameters:
{'rf__criterion': 'gini', 'rf__max_depth': 3, 'rf__max_leaf_nodes': 5, 'rf__n_estimators': 12}
 
             precision    recall  f1-score   support

        1.0       1.00      0.94      0.97        32
        2.0       0.94      1.00      0.97        31

avg / total       0.97      0.97      0.97        63

 
Confusion Matrix: 
[[31  1]
 [ 0 31]]
 
Matthews correlation coefficent
0.9384464919119354
 
Normalized Accuracy
0.9682539682539683


# AML prediction

In this section we try to predict the AML patient in the unlabelled dataset using the ensemble we have created in the section above. As we already know from the challenge specifications there are in total 20 AML patients in 180 subjects and our ensemble in total predicted that there are 19 AML patients amng 180 subjects. Which is close, but it will be interesting to know how we fare given the ground truth. 

In [24]:
X_test = data.values

In [25]:
X_test = X_test[178:, 0:185]

In [26]:
Y_predictions = ensemble.predict(X_test)

  if diff:


In [27]:
patients_asml = sum([1 for x in Y_predictions if x==2])

In [28]:
print('Number of patients who have ASML as predicted by our ensemble are: ' + str(patients_asml))

Number of patients who have ASML as predicted by our ensemble are: 19


### We have provided a csv file with our prediction team_05_predictions.csv

In [29]:
X_test

array([[0.9337206 , 0.6032317 , 0.05962682, ..., 0.22969565, 0.02093932,
        0.13430102],
       [0.68960045, 0.6215028 , 0.2497939 , ..., 0.09433911, 0.00136427,
        0.77853897],
       [0.65570983, 0.84543794, 0.23691199, ..., 0.23327425, 0.02430728,
        0.05873974],
       ...,
       [0.53357892, 0.68191161, 0.39002643, ..., 0.28722161, 0.03577736,
        0.0643017 ],
       [0.38909039, 0.75139888, 0.41344577, ..., 0.23198819, 0.0185057 ,
        0.12663637],
       [0.79988665, 0.37752655, 0.1894245 , ..., 0.09027561, 0.00122795,
        0.67187818]])

In [None]:
aml_predictions = np.c_[X_test, Y_predictions]
# to_csv
amlpd = pd.DataFrame(aml_predictions)
amlpd.to_csv('Team_05_prediction.csv')

# Conclusion

For the baseline classification, we tested three classification techniques namely, decision trees, k-nearest neighbors and support vector machines. We were able to test an array of parameters for each of them using grdsearchCV in scikit-learn. Further we compared the best results for each of these techniques, and we also created a list with all the parameters arrangements and their corresponding mean_test_scores. We observed that there certain array of parameter arrangements largely for knn and svm which clearly overfit and one primary indication of overfitting is a really high score on the test data. We sorted our list of models_meanscores and removed the models which were clearly showing signs of overfitting(primarily based on a really high score). Then from the top performing models based on the mean_tes_score and occam's razor rule we selected 6 best fit for our ensemble which we thought will generalize well on the hidden data. We further compared the normalized accuracy from our ensemble with an implemetation of random forest and our ensemble performed better than the random forest, normalized accuracy for our ensemble was 98.41%, where as for the random forest it was 96.82%. 

Using this ensemble, we tried predicting the unlabelled dataset and in total 19 subjects were predicted to be AML patients out of 180 subjects in total. 