In [1]:
import pandas as pd
import mrmr
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from statistics import fmean
from sklearn.svm import SVC
from numpy import mean

In [2]:
circles, half_kernel, moons, spiral, two_gaussians33, two_gaussians42, breast_cancer =  pd.read_csv("SampleDatasets/circles0.3.csv"), pd.read_csv("SampleDatasets/halfkernel.csv"), pd.read_csv("SampleDatasets/moons1.csv"), pd.read_csv("SampleDatasets/spiral1.csv"), pd.read_csv("SampleDatasets/twogaussians33.csv"),  pd.read_csv("SampleDatasets/twogaussians42.csv"), pd.read_csv("SampleDatasets/Breastcancer.csv")

In [3]:
def evaluation_function(model_object, x_test, y_test, multi_class=False):
    if not multi_class:
        tn, fp, fn, tp = confusion_matrix(y_test, model_object.predict(x_test)).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        ppv = tp / (tp + fp)
        npv = tn / (fn + tn)
    else:
        list_of_matrix = multilabel_confusion_matrix(y_test, model_object.predict(x_test))
        specificity, sensitivity, accuracy, ppv, npv = list(), list(), list(), list(), list()
        for matrix in list_of_matrix:
            tn, fp, fn, tp = matrix.ravel()
            specificity.append(tn / (tn + fp))
            sensitivity.append(tp / (tp + fn))
            accuracy.append((tp + tn) / (tp + tn + fp + fn))
            ppv.append(tp / (tp + fp))
            npv.append(tn / (fn + tn))
        specificity, sensitivity, accuracy, ppv, npv = fmean(specificity), fmean(sensitivity), fmean(accuracy), fmean(ppv), fmean(npv)

    return specificity, sensitivity, accuracy, ppv, npv

In [4]:
def print_results(model_object, x_test, y_test, multi_class=False):

    evaluation_results = evaluation_function(model_object, x_test, y_test, multi_class)
    print("specificity: {} \nsensitivity: {} \naccuracy: {} \nppv: {} \nnpv: {}".format(evaluation_results[0], evaluation_results[1], evaluation_results[2], evaluation_results[3], evaluation_results[4]))

In [5]:
def perform_kfold(model_object, training_data, training_labels, n_splits=10):

    cv = KFold(n_splits=n_splits, random_state=1, shuffle=True)
    scores = cross_val_score(model_object, training_data, training_labels, scoring='accuracy', cv=cv, n_jobs=-1)
    return mean(scores)

In [6]:
def svm_grid_search(training_data, training_labels, n_folds=10):

    parameter_dict = {'kernel': ('linear', 'poly', 'rbf'), 'degree': range(2, 6), 'gamma': ('scale', 'auto'), 'C': [0.001, 0.01, 0.1, 1]}
    new_classifier = SVC()
    grid = GridSearchCV(new_classifier, param_grid=parameter_dict, cv=n_folds, verbose=1)
    grid.fit(training_data, training_labels)

    return grid.best_params_, grid.best_score_

# Circle0.3

In [7]:
circles_x_train, circles_x_test, circles_y_train, circles_y_test = train_test_split(circles.iloc[:, 0:2], circles['label'], test_size=0.3, random_state=42)

### Decision Tree on Circle0.3 Dataset

In [8]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(circles_x_train, circles_y_train)

In [9]:
decision_tree.tree_.max_depth

5

In [10]:
print_results(decision_tree, circles_x_test, circles_y_test)

specificity: 0.9803921568627451 
sensitivity: 0.9795918367346939 
accuracy: 0.98 
ppv: 0.9795918367346939 
npv: 0.9803921568627451


In [11]:
perform_kfold(decision_tree, circles_x_train, circles_y_train)

0.99

### Random Forest on Circle0.3

In [12]:
random_forest = RandomForestClassifier()
random_forest.fit(circles_x_train, circles_y_train)

In [13]:
print_results(random_forest, circles_x_test, circles_y_test)

specificity: 1.0 
sensitivity: 0.9727891156462585 
accuracy: 0.9866666666666667 
ppv: 1.0 
npv: 0.9745222929936306


In [14]:
perform_kfold(random_forest, circles_x_train, circles_y_train)

0.9957142857142858

### Multi Layer Perceptron on Circle0.3

In [15]:
perceptron_parameters = {'hidden_layer_sizes': (10,10),
                         'activation': 'relu',
                         'solver': 'adam',
                         'max_iter': 400,
                         'learning_rate': 'adaptive'}

perceptron = MLPClassifier(**perceptron_parameters)
perceptron.fit(circles_x_train, circles_y_train)

In [16]:
print_results(perceptron, circles_x_test, circles_y_test)

specificity: 1.0 
sensitivity: 1.0 
accuracy: 1.0 
ppv: 1.0 
npv: 1.0


In [17]:
perform_kfold(perceptron, circles_x_train, circles_y_train)

0.9985714285714286

# Halfkernel

In [18]:
half_kernel_x_train, half_kernel_x_test, half_kernel_y_train, half_kernel_y_test = train_test_split(half_kernel.iloc[:, 0:2], half_kernel['label'], test_size=0.3, random_state=42)

### Decision Tree on Halfkernel Dataset

In [19]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(half_kernel_x_train, half_kernel_y_train)

In [20]:
decision_tree.tree_.max_depth

5

In [21]:
print_results(decision_tree, half_kernel_x_test, half_kernel_y_test)

specificity: 0.9866666666666667 
sensitivity: 0.9933333333333333 
accuracy: 0.99 
ppv: 0.9867549668874173 
npv: 0.9932885906040269


In [22]:
perform_kfold(decision_tree, half_kernel_x_train, half_kernel_y_train)

0.9957142857142858

### Random Forest on Halfkernel

In [23]:
random_forest = RandomForestClassifier()
random_forest.fit(half_kernel_x_train, half_kernel_y_train)

In [24]:
print_results(random_forest, half_kernel_x_test, half_kernel_y_test)

specificity: 0.9866666666666667 
sensitivity: 0.9933333333333333 
accuracy: 0.99 
ppv: 0.9867549668874173 
npv: 0.9932885906040269


In [25]:
perform_kfold(random_forest, half_kernel_x_train, half_kernel_y_train)

0.9971428571428571

### Multi Layer Perceptron on Halfkernel

In [26]:
perceptron_parameters = {'hidden_layer_sizes': (10,10),
                         'activation': 'relu',
                         'solver': 'adam',
                         'max_iter': 400,
                         'learning_rate': 'adaptive'}

perceptron = MLPClassifier(**perceptron_parameters)
perceptron.fit(half_kernel_x_train, half_kernel_y_train)

In [27]:
print_results(perceptron, half_kernel_x_test, half_kernel_y_test)

specificity: 1.0 
sensitivity: 1.0 
accuracy: 1.0 
ppv: 1.0 
npv: 1.0


In [28]:
perform_kfold(perceptron, half_kernel_x_train, half_kernel_y_train)

1.0

# Moons1

In [29]:
moons_x_train, moons_x_test, moons_y_train, moons_y_test = train_test_split(moons.iloc[:, 0:2], moons['label'], test_size=0.3, random_state=42)

### Decision Tree on Moons1

In [30]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(moons_x_train, moons_y_train)

In [31]:
decision_tree.tree_.max_depth

7

In [32]:
print_results(decision_tree, moons_x_test, moons_y_test)

specificity: 1.0 
sensitivity: 0.9869281045751634 
accuracy: 0.9933333333333333 
ppv: 1.0 
npv: 0.9865771812080537


In [33]:
perform_kfold(decision_tree, moons_x_train, moons_y_train)

0.9914285714285714

### Random Forest on Moons1

In [34]:
random_forest = RandomForestClassifier()
random_forest.fit(moons_x_train, moons_y_train)

In [35]:
print_results(random_forest, moons_x_test, moons_y_test)

specificity: 1.0 
sensitivity: 0.9869281045751634 
accuracy: 0.9933333333333333 
ppv: 1.0 
npv: 0.9865771812080537


In [36]:
perform_kfold(random_forest, moons_x_train, moons_y_train)

0.9928571428571429

### Multi Layer Perceptron on Moons1

In [37]:
perceptron_parameters = {'hidden_layer_sizes': (10,10),
                         'activation': 'relu',
                         'solver': 'adam',
                         'max_iter': 500,
                         'learning_rate': 'adaptive'}

perceptron = MLPClassifier(**perceptron_parameters)
perceptron.fit(moons_x_train, moons_y_train)

In [38]:
print_results(perceptron, moons_x_test, moons_y_test)

specificity: 1.0 
sensitivity: 0.9934640522875817 
accuracy: 0.9966666666666667 
ppv: 1.0 
npv: 0.9932432432432432


In [39]:
perform_kfold(perceptron, moons_x_train, moons_y_train)

0.99

# Spiral1

In [40]:
spiral_x_train, spiral_x_test, spiral_y_train, spiral_y_test = train_test_split(spiral.iloc[:, 0:2], spiral['label'], test_size=0.3, random_state=42)

### Decision Tree on Spira1

In [41]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(spiral_x_train, spiral_y_train)

In [42]:
decision_tree.tree_.max_depth

8

In [43]:
print_results(decision_tree, spiral_x_test, spiral_y_test)

specificity: 0.9802631578947368 
sensitivity: 1.0 
accuracy: 0.99 
ppv: 0.9801324503311258 
npv: 1.0


In [44]:
perform_kfold(decision_tree, spiral_x_train, spiral_y_train)

0.9771428571428572

### Random Forest on Spiral1

In [45]:
random_forest = RandomForestClassifier()
random_forest.fit(spiral_x_train, spiral_y_train)

In [46]:
print_results(random_forest, spiral_x_test, spiral_y_test)

specificity: 0.9802631578947368 
sensitivity: 0.9932432432432432 
accuracy: 0.9866666666666667 
ppv: 0.98 
npv: 0.9933333333333333


In [47]:
perform_kfold(random_forest, spiral_x_train, spiral_y_train)

0.9800000000000001

### Multi-layer Perceptron on Spiral1

In [48]:
perceptron_parameters = {'hidden_layer_sizes': (10,10),
                         'activation': 'relu',
                         'solver': 'adam',
                         'max_iter': 700,
                         'learning_rate': 'adaptive'}

perceptron = MLPClassifier(**perceptron_parameters)
perceptron.fit(spiral_x_train, spiral_y_train)

In [49]:
print_results(perceptron, spiral_x_test, spiral_y_test)

specificity: 0.9802631578947368 
sensitivity: 1.0 
accuracy: 0.99 
ppv: 0.9801324503311258 
npv: 1.0


In [50]:
perform_kfold(perceptron, spiral_x_train, spiral_y_train)

0.9857142857142858

# Twogaussian33

In [51]:
gaussian33_x_train, gaussian33_x_test, gaussian33_y_train, gaussian33_y_test = train_test_split(two_gaussians33.iloc[:, 0:2], two_gaussians33['label'], test_size=0.3, random_state=42)

### Decision Tree on Twogaussian33

In [52]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(gaussian33_x_train, gaussian33_y_train)

In [53]:
decision_tree.tree_.max_depth

5

In [54]:
print_results(decision_tree, gaussian33_x_test, gaussian33_y_test)

specificity: 0.972972972972973 
sensitivity: 0.9868421052631579 
accuracy: 0.98 
ppv: 0.974025974025974 
npv: 0.9863013698630136


In [55]:
perform_kfold(decision_tree, gaussian33_x_train, gaussian33_y_train)

0.9871428571428572

### Random Forest on Twogaussian33

In [56]:
random_forest = RandomForestClassifier()
random_forest.fit(gaussian33_x_train, gaussian33_y_train)

In [57]:
print_results(random_forest, gaussian33_x_test, gaussian33_y_test)

specificity: 0.9864864864864865 
sensitivity: 0.993421052631579 
accuracy: 0.99 
ppv: 0.9869281045751634 
npv: 0.9931972789115646


In [58]:
perform_kfold(random_forest, gaussian33_x_train, gaussian33_y_train)

0.9885714285714287

### Multi-layer Perceptron on Twogaussian33

In [59]:
perceptron_parameters = {'hidden_layer_sizes': (10,10),
                         'activation': 'relu',
                         'solver': 'adam',
                         'max_iter': 250,
                         'learning_rate': 'adaptive'}

perceptron = MLPClassifier(**perceptron_parameters)
perceptron.fit(gaussian33_x_train, gaussian33_y_train)

In [60]:
print_results(perceptron, gaussian33_x_test, gaussian33_y_test)

specificity: 0.9864864864864865 
sensitivity: 0.9868421052631579 
accuracy: 0.9866666666666667 
ppv: 0.9868421052631579 
npv: 0.9864864864864865


In [61]:
perform_kfold(perceptron, gaussian33_x_train, gaussian33_y_train)

0.9914285714285714

# Twogaussian42

In [62]:
gaussian42_x_train, gaussian42_x_test, gaussian42_y_train, gaussian42_y_test = train_test_split(two_gaussians42.iloc[:, 0:2], two_gaussians42['label'], test_size=0.3, random_state=42)

### Decision Tree on Twogaussian42

In [63]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(gaussian42_x_train, gaussian42_y_train)

In [64]:
decision_tree.tree_.max_depth

10

In [65]:
print_results(decision_tree, gaussian42_x_test, gaussian42_y_test)

specificity: 0.9455782312925171 
sensitivity: 0.9215686274509803 
accuracy: 0.9333333333333333 
ppv: 0.9463087248322147 
npv: 0.9205298013245033


In [66]:
perform_kfold(decision_tree, gaussian42_x_train, gaussian42_y_train)

0.9128571428571428

### Random Forest on Twogaussian42

In [67]:
random_forest = RandomForestClassifier()
random_forest.fit(gaussian42_x_train, gaussian42_y_train)

In [68]:
print_results(random_forest, gaussian42_x_test, gaussian42_y_test)

specificity: 0.9523809523809523 
sensitivity: 0.9150326797385621 
accuracy: 0.9333333333333333 
ppv: 0.9523809523809523 
npv: 0.9150326797385621


In [69]:
perform_kfold(random_forest, gaussian42_x_train, gaussian42_y_train)

0.9357142857142857

### Multi-layer Perceptron on Twogaussian42

In [70]:
perceptron_parameters = {'hidden_layer_sizes': (10,10),
                         'activation': 'relu',
                         'solver': 'adam',
                         'max_iter': 600,
                         'learning_rate': 'adaptive'}

perceptron = MLPClassifier(**perceptron_parameters)
perceptron.fit(gaussian42_x_train, gaussian42_y_train)

In [71]:
print_results(perceptron, gaussian42_x_test, gaussian42_y_test)

specificity: 0.9591836734693877 
sensitivity: 0.869281045751634 
accuracy: 0.9133333333333333 
ppv: 0.9568345323741008 
npv: 0.8757763975155279


In [72]:
perform_kfold(perceptron, gaussian42_x_train, gaussian42_y_train)

0.9414285714285715

# Breast Cancer Dataset

In [73]:
breast_cancer.head()

Unnamed: 0,STK35,DPYSL4,GJC2,FMNL1,LIG3,CA6,BRPF1,BRMS1,CXCL13,SIAE,...,ADRB1,DRD4,GABRR1,KRT10,PIP,MS4A3,SECISBP2L,EMP3,HOXB1,Class
0,0.425,-0.316,0.465,-0.142,-0.033,0.357,-0.197,-0.28,3.164,-0.086,...,-0.188,-0.015,-0.035,0.722,0.131,-0.545,0.323,-0.4,0.064,Basal
1,0.694,-0.497,0.2,-0.215,0.852,0.3,-0.864,-0.222,6.179,-0.649,...,0.2,-0.416,0.293,-0.795,1.785,-1.224,0.505,-0.275,0.3,Basal
2,0.304,-0.205,0.739,0.176,0.726,0.715,0.298,0.147,3.612,-0.23,...,0.211,0.17,0.156,-0.733,2.414,-0.435,0.047,-0.98,0.233,Basal
3,0.309,-0.561,0.669,-0.514,0.805,0.409,-0.974,0.054,1.28,-0.15,...,-0.532,0.884,0.633,-0.586,0.163,-0.987,0.223,-1.09,0.603,Basal
4,0.732,-0.287,0.311,-0.326,-0.468,1.887,-0.347,-0.193,2.41,0.479,...,0.354,0.596,0.27,-0.282,0.601,0.164,0.04,-0.705,0.054,Basal


In [74]:
breast_cancer.describe()

Unnamed: 0,STK35,DPYSL4,GJC2,FMNL1,LIG3,CA6,BRPF1,BRMS1,CXCL13,SIAE,...,MAS1,ADRB1,DRD4,GABRR1,KRT10,PIP,MS4A3,SECISBP2L,EMP3,HOXB1
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,...,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,0.096057,-0.186918,0.523943,-0.144778,0.314177,0.181025,-0.387842,0.036722,2.539418,0.146006,...,0.074354,-0.004728,1.015829,0.153108,-0.966392,3.626222,-0.405956,0.282146,-0.863665,0.111025
std,0.208323,0.289994,0.350318,0.345459,0.443096,0.494525,0.338736,0.424131,1.636105,0.527961,...,0.431365,0.425453,0.674723,0.221736,0.716756,2.40107,0.405931,0.388707,0.408621,0.223153
min,-0.446,-1.064,-0.306,-1.155,-0.787,-0.791,-1.201,-0.855,-0.075,-1.123,...,-1.015,-1.446,-0.745,-0.428,-2.926,-0.071,-1.796,-0.442,-1.722,-0.591
25%,-0.04,-0.36775,0.263,-0.3665,-0.00125,-0.0545,-0.62275,-0.26675,1.22975,-0.1705,...,-0.23625,-0.2615,0.564,0.028,-1.31825,1.3285,-0.627,0.04375,-1.133,-0.02875
50%,0.0665,-0.188,0.4715,-0.188,0.318,0.119,-0.3895,0.0085,2.28,0.1035,...,0.083,0.011,0.9395,0.155,-0.943,3.7085,-0.388,0.2215,-0.8905,0.103
75%,0.21575,-0.04275,0.736,0.03775,0.625,0.3045,-0.19225,0.24175,3.55475,0.4095,...,0.36375,0.20675,1.4035,0.27,-0.591,5.58975,-0.13475,0.451,-0.61875,0.25075
max,0.934,0.89,1.752,1.87,1.464,3.075,0.849,1.502,6.179,2.156,...,1.17,1.436,2.72,1.169,2.257,8.057,0.948,1.888,0.511,0.642


In [75]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Columns: 13583 entries, STK35 to Class
dtypes: float64(13582), object(1)
memory usage: 16.4+ MB


## mRMR Feature Selection

In [76]:
features = breast_cancer.drop(labels=['Class'], axis=1)
mrmr_features_list = mrmr.mrmr_classif(features, breast_cancer["Class"], 20)
selected_features = features[mrmr_features_list]

100%|██████████| 20/20 [00:30<00:00,  1.50s/it]


In [77]:
cancer_x_train, cancer_x_test, cancer_y_train, cancer_y_test = train_test_split(selected_features, breast_cancer['Class'], test_size=0.3, random_state=42)

In [78]:
svm_grid_search(cancer_x_train, cancer_y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


({'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'},
 0.8545454545454545)

In [79]:
svm_best_classifier = SVC(C=0.1, gamma='scale', kernel='linear')
svm_best_classifier.fit(cancer_x_train, cancer_y_train)

In [80]:
print_results(svm_best_classifier, cancer_x_test, cancer_y_test, multi_class=True)

specificity: 0.9518728421601198 
sensitivity: 0.80995670995671 
accuracy: 0.925 
ppv: 0.8294444444444444 
npv: 0.9561240310077519


In [81]:
perform_kfold(svm_best_classifier, cancer_x_train, cancer_y_train)

0.8636363636363636

## Information Gain Feature Selection

In [82]:
selected_features = SelectKBest(mutual_info_classif, k=20).fit_transform(features, breast_cancer['Class'])

In [83]:
cancer_x_train, cancer_x_test, cancer_y_train, cancer_y_test = train_test_split(selected_features, breast_cancer['Class'], test_size=0.3, random_state=42)

In [84]:
svm_grid_search(cancer_x_train, cancer_y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


({'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'},
 0.8636363636363636)

In [85]:
svm_best_classifier = SVC(C=0.1, degree=3, gamma='auto', kernel='poly')
svm_best_classifier.fit(cancer_x_train, cancer_y_train)

In [86]:
print_results(svm_best_classifier, cancer_x_test, cancer_y_test, multi_class=True)

specificity: 0.9475878849339863 
sensitivity: 0.7929004329004329 
accuracy: 0.9166666666666666 
ppv: 0.7752380952380953 
npv: 0.9476718403547671


In [87]:
perform_kfold(svm_best_classifier, cancer_x_train, cancer_y_train)

0.8454545454545455