In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt

In [None]:
def MetricsOfReliability(clf, data_test_df, data_label_df, clf_predict_df):
    """Print accuracy, sensitivity and specificity; plot confusion matrix and ROC curve.

    Sensitivity and specificity are derived from the confusion matrix with the
    class in row/column 0 treated as the positive class, so they are only
    meaningful for a two-class problem.

    Parameters
    ----------
    clf : fitted estimator
        Classifier (or pipeline) handed to the scikit-learn plotting helpers.
    data_test_df : array-like
        Test features; passed to ``clf`` by the plotting helpers.
    data_label_df : array-like
        True labels belonging to ``data_test_df``.
    clf_predict_df : array-like
        Labels predicted for ``data_test_df``.

    Returns
    -------
    None
        Results are printed and plotted, nothing is returned.
    """
    accuracy = metrics.accuracy_score(data_label_df, clf_predict_df)
    print('accuracy = ' + str(accuracy))

    # Pin the label order to the classes the estimator was trained on, so the
    # matrix keeps its full shape even if a class is missing from the test
    # labels and predictions (otherwise the [1, ...] lookups below would fail).
    # Falls back to scikit-learn's default when the estimator has no classes_.
    confusion_matrix = metrics.confusion_matrix(
        data_label_df, clf_predict_df, labels=getattr(clf, 'classes_', None)
    )
    # Rows are true classes, columns are predicted classes; class 0 is "positive".
    tp = confusion_matrix[0, 0]
    fp = confusion_matrix[1, 0]
    fn = confusion_matrix[0, 1]
    tn = confusion_matrix[1, 1]

    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the Display API and only fall back on older installations.
    if hasattr(metrics.ConfusionMatrixDisplay, 'from_estimator'):
        cm_display = metrics.ConfusionMatrixDisplay.from_estimator(
            clf, data_test_df, data_label_df
        )
    else:
        cm_display = metrics.plot_confusion_matrix(clf, data_test_df, data_label_df)
    # Title the matrix axes explicitly instead of relying on pyplot's current axes.
    cm_display.ax_.set_title('confusion matrix')

    # Guard against a class without any samples (0/0) and report nan instead.
    positives = tp + fn
    sensitivity = tp / positives if positives else float('nan')
    print('sensitivity = ' + str(sensitivity))

    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')
    print('specificity = ' + str(specificity))

    # NOTE(review): per the scikit-learn docs the ROC helpers use classes_[1] as
    # the positive class by default, whereas the figures above use class 0 --
    # confirm this is the intended reading of the curve.
    # Same deprecation story as above: plot_roc_curve was removed in 1.2.
    if hasattr(metrics.RocCurveDisplay, 'from_estimator'):
        metrics.RocCurveDisplay.from_estimator(clf, data_test_df, data_label_df)
    else:
        metrics.plot_roc_curve(clf, data_test_df, data_label_df)

In [None]:
# Load and prepare the data.
# The file has no header row, so columns are named 0..5408; '?' entries are
# read as missing values. Column 5408 is discarded and every row that still
# contains a missing value is removed.
column_ids = list(range(5409))
data_df = (
    pd.read_csv('C:/Users/johan/ML1/p53_old_2010/K8.data', names=column_ids, na_values='?')
    .drop(columns=5408)
    .dropna()
)
# Column 5407 is used as the class label; all columns before it are features.
label = 5407
feature_list = list(range(label))

In [None]:
# Perform the train-test split (80 % training / 20 % test).
# A fixed random_state makes the split -- and every metric computed from it --
# reproducible under Restart Kernel -> Run All.
train_df, test_df = train_test_split(
    data_df, test_size=0.2, train_size=0.8, random_state=42
)


<h2>Support Vector Machine</h2>

In [None]:
# RBF-kernel SVM on standardised features: fit on the training split,
# predict the test split, then report the reliability metrics.
svm_rbf = make_pipeline(StandardScaler(), SVC(kernel='rbf')).fit(
    train_df[feature_list],
    train_df[label],
)
svm_predict_df = svm_rbf.predict(test_df[feature_list])
MetricsOfReliability(
    svm_rbf,
    test_df[feature_list],
    test_df[label],
    svm_predict_df,
)

In [None]:
# Linear-kernel SVM on standardised features: fit, predict, report metrics.
# The model gets its own name so it no longer overwrites (and mislabels) the
# RBF pipeline fitted in the previous cell.
svm_linear = make_pipeline(StandardScaler(), SVC(kernel='linear'))
svm_linear.fit(train_df[feature_list], train_df[label])
svm_linear_predict_df = svm_linear.predict(test_df[feature_list])
MetricsOfReliability(svm_linear, test_df[feature_list], test_df[label], svm_linear_predict_df)

<h2>Random Forest</h2>

In [None]:
# Random forest: fit on the training split, predict the test split and
# report the reliability metrics. random_state fixes the bootstrap samples
# and feature sub-sampling so the numbers are reproducible.
rf = RandomForestClassifier(oob_score=True, random_state=42)
rf.fit(train_df[feature_list], train_df[label])
rf_predict_test_df = rf.predict(test_df[feature_list])
MetricsOfReliability(rf, test_df[feature_list], test_df[label], rf_predict_test_df)
# oob_score_ is an accuracy estimate; the out-of-bag *error* is its complement.
print('out-of-bag-error = ' + str(1 - rf.oob_score_))

In [None]:
# Out-of-bag error and test error for an increasing number of candidate
# features per split (max_features).
# NOTE(review): the earlier comment spoke of the "number of trees", but the
# loop varies max_features, not n_estimators -- confirm which was intended.
rf = RandomForestClassifier(oob_score=True, random_state=42)
test_error = []
oob_error = []
for n in [10, 50, 70]:
    rf.set_params(max_features=n)
    rf.fit(train_df[feature_list], train_df[label])
    # score() and oob_score_ are accuracies; store their complements so the
    # lists really contain error rates, as their names promise.
    test_error.append(1 - rf.score(test_df[feature_list], test_df[label]))
    oob_error.append(1 - rf.oob_score_)
    print(n)  # progress indicator: max_features value just finished

print(test_error)
print(oob_error)