In [1]:
#import the dependencies
import pandas as  pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('ThoraricSurgery.csv')

In [3]:
data.head()

Unnamed: 0,id,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,1,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,2,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,3,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,4,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,5,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


In [4]:
print(data.columns)

Index(['id', 'DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10',
       'PRE11', 'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE',
       'Risk1Yr'],
      dtype='object')


In [5]:
print(data.shape)

(470, 18)


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 18 columns):
id         470 non-null int64
DGN        470 non-null object
PRE4       470 non-null float64
PRE5       470 non-null float64
PRE6       470 non-null object
PRE7       470 non-null object
PRE8       470 non-null object
PRE9       470 non-null object
PRE10      470 non-null object
PRE11      470 non-null object
PRE14      470 non-null object
PRE17      470 non-null object
PRE19      470 non-null object
PRE25      470 non-null object
PRE30      470 non-null object
PRE32      470 non-null object
AGE        470 non-null int64
Risk1Yr    470 non-null object
dtypes: float64(2), int64(2), object(14)
memory usage: 66.2+ KB


In [7]:
data.describe()

Unnamed: 0,id,PRE4,PRE5,AGE
count,470.0,470.0,470.0,470.0
mean,235.5,3.281638,4.568702,62.534043
std,135.821574,0.871395,11.767857,8.706902
min,1.0,1.44,0.96,21.0
25%,118.25,2.6,1.96,57.0
50%,235.5,3.16,2.4,62.0
75%,352.75,3.8075,3.08,69.0
max,470.0,6.3,86.3,87.0


In [8]:
import matplotlib.pyplot as plt
data.hist(bins=500, figsize=(20, 20))
plt.show()

<Figure size 2000x2000 with 4 Axes>

In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# extract numerical attributes and scale it to have zero mean and unit variance  
cols = data.select_dtypes(include=['float64','int64']).columns
sc_train = scaler.fit_transform(data.select_dtypes(include=['float64','int64']))


# turn the result back to a dataframe
sc_traindf = pd.DataFrame(sc_train, columns = cols)
sc_traindf.head()

Unnamed: 0,id,PRE4,PRE5,AGE
0,0.0,0.296296,0.014061,0.590909
1,0.002132,0.403292,0.01078,0.454545
2,0.004264,0.271605,0.013124,0.575758
3,0.006397,0.460905,0.024373,0.5
4,0.008529,0.205761,0.0,0.787879


In [11]:
y = data['Risk1Yr']
x = data.drop(['Risk1Yr'], axis=1)
x.columns

Index(['id', 'DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10',
       'PRE11', 'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE'],
      dtype='object')

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for i in x.columns:
    x[i] = encoder.fit_transform(x[i])
    
# x

In [13]:
from sklearn.feature_selection import RFE
import itertools
rfc = RandomForestClassifier()

# create the RFE model and select 10 attributes
rfe = RFE(rfc, n_features_to_select=10)
rfe = rfe.fit(x, y)

# summarize the selection of the attributes
# feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support())]
# selected_features = [v for i, v in feature_map if i==True]
selected_features = rfe.ranking_
selected_features

array([1, 1, 1, 1, 1, 5, 1, 1, 1, 2, 1, 3, 7, 6, 4, 8, 1])

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05)

In [15]:
#Random Forest Classifier
rf = RandomForestClassifier(n_estimators = 1000, random_state=7)
svm = SVC(kernel='linear')
abc = AdaBoostClassifier(n_estimators=100)
gbc = GradientBoostingClassifier(n_estimators=290)
dt = DecisionTreeClassifier(max_depth=3, min_samples_split=2)
knn = KNeighborsClassifier(n_neighbors=3)
gnb = GaussianNB()

In [16]:
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=7, verbose=0,
                       warm_start=False)

In [25]:
svm.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [26]:
abc.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=None)

In [27]:
gbc.fit(x_train, y_train)
# print("Training Time for Gradient Boosting Classifier:", round(time.time()-t0, 3), "s")

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=290,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [91]:
dt.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [92]:
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [93]:
gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, rf.predict(x_test))
# predict = rf.predict(x_test)
# accuracy = metrics.accuracy_score(y_test, predict)
print("Accuracy: ", accuracy*100)
confusion_matrix = metrics.confusion_matrix(y_test, rf.predict(x_test))
print(confusion_matrix)
# scores = metrics.cross_val_score(rf, x_train, y_train, cv=10)
# print(scores.mean())
classification = metrics.classification_report(y_train, rf.predict(x_train))
classification

Accuracy:  87.5
[[21  0]
 [ 3  0]]


'              precision    recall  f1-score   support\n\n           F       1.00      1.00      1.00       379\n           T       1.00      1.00      1.00        67\n\n    accuracy                           1.00       446\n   macro avg       1.00      1.00      1.00       446\nweighted avg       1.00      1.00      1.00       446\n'

In [29]:
predict = svm.predict(x_test)
accuracy = metrics.accuracy_score(y_test, predict)
print("Accuracy: ", accuracy*100)

Accuracy:  91.42857142857143


In [103]:
predict = abc.predict(x_test)
accuracy = metrics.accuracy_score(y_test, predict)
print("Accuracy: ", accuracy*100)

Accuracy:  79.48717948717949


In [104]:
predict = gbc.predict(x_test)
accuracy = metrics.accuracy_score(y_test, predict)
print("Accuracy: ", accuracy*100)

Accuracy:  76.92307692307693


In [105]:
predict = dt.predict(x_test)
accuracy = metrics.accuracy_score(y_test, predict)
print("Accuracy: ", accuracy*100)

Accuracy:  69.23076923076923


In [106]:
predict = knn.predict(x_test)
accuracy = metrics.accuracy_score(y_test, predict)
print("Accuracy: ", accuracy*100)

Accuracy:  69.23076923076923


In [107]:
predict = gnb.predict(x_test)
accuracy = metrics.accuracy_score(y_test, predict)
print("Accuracy: ", accuracy*100)

Accuracy:  82.05128205128204


In [108]:
# from sklearn.ensemble import VotingClassifier
# #create a dictionary of our models
# estimators_1=[('knn', knn), ('rf', rf)]
# #create our voting classifier, inputting our models
# ensemble_1 = VotingClassifier(estimators_1, voting='hard')

In [109]:
# estimators_2=[('svm', sv), ('rf', rf)]
# ensemble_2 = VotingClassifier(estimators_2, voting='hard')

In [110]:
# estimators_3=[('knn', knn), ('svm', sv)]
# ensemble_3 = VotingClassifier(estimators_3, voting='hard')

In [111]:
# estimators_4=[('adb', abc), ('gdc', gbc)]
# ensemble_4 = VotingClassifier(estimators_4, voting='hard')

In [112]:
# estimators_5=[('svm', sv), ('adb', abc)]
# ensemble_5 = VotingClassifier(estimators_5, voting='hard')

In [113]:
# #fit model to training data
# ensemble_1.fit(x_train, y_train)
# print("Training time:", round(time.time()-t0, 3), "s")
# #test our model on the test data
# # print(ensemble.score(x_test, y_test))
# t1 = time.time()
# predict = ensemble_1.predict(x_test)
# print("Testing Time for Ensemble Classifier:", round(time.time()-t1, 3), "s")
# accuracy = metrics.accuracy_score(y_test, predict)
# print("Accuracy: ", accuracy*100)

In [114]:
# #fit model to training data
# t0 = time.time()
# ensemble_2.fit(x_train, y_train)
# print("Training time:", round(time.time()-t0, 3), "s")
# #test our model on the test data
# # print(ensemble.score(x_test, y_test))
# t1 = time.time()
# predict = ensemble_2.predict(x_test)
# print("Testing Time for Ensemble Classifier:", round(time.time()-t1, 3), "s")
# accuracy = metrics.accuracy_score(y_test, predict)
# print("Accuracy: ", accuracy*100)

In [115]:
# #fit model to training data
# t0 = time.time()
# ensemble_3.fit(x_train, y_train)
# print("Training time:", round(time.time()-t0, 3), "s")
# #test our model on the test data
# # print(ensemble.score(x_test, y_test))
# t1 = time.time()
# predict = ensemble_3.predict(x_test)
# print("Testing Time for Ensemble Classifier:", round(time.time()-t1, 3), "s")
# accuracy = metrics.accuracy_score(y_test, predict)
# print("Accuracy: ", accuracy * 100)

In [116]:
# #fit model to training data
# t0 = time.time()
# ensemble_4.fit(x_train, y_train)
# print("Training time:", round(time.time()-t0, 3), "s")
# #test our model on the test data
# # print(ensemble.score(x_test, y_test))
# t1 = time.time()
# predict = ensemble_4.predict(x_test)
# print("Testing Time for Ensemble Classifier:", round(time.time()-t1, 3), "s")
# accuracy = metrics.accuracy_score(y_test, predict)
# print("Accuracy: ", accuracy * 100)

In [117]:
# #fit model to training data
# t0 = time.time()
# ensemble_5.fit(x_train, y_train)
# print("Training time:", round(time.time()-t0, 3), "s")
# #test our model on the test data
# # print(ensemble.score(x_test, y_test))
# t1 = time.time()
# predict = ensemble_5.predict(x_test)
# print("Testing Time for Ensemble Classifier:", round(time.time()-t1, 3), "s")
# accuracy = metrics.accuracy_score(y_test, predict)
# print("Accuracy: ", accuracy * 100)