In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [26]:
# Load the raw data set from a CSV file in the working directory.
dataset = pd.read_csv('data-original.csv')
# 'Bare Nuclei' contains '?' entries (presumably missing-value markers --
# confirm against the data description). Impute them with 1, then convert the
# column to a numeric dtype: without the conversion the column keeps holding
# strings (object dtype), so the feature matrix built from it would depend on
# the model library coercing strings to numbers implicitly.
dataset['Bare Nuclei'] = pd.to_numeric(dataset['Bare Nuclei'].replace(['?'], '1'))

In [27]:
# Feature matrix: every column except the last one.
# Target vector: the last column only.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [35]:
# Background reading on XGBoost:
# https://machinelearningmastery.com/gentle-introduction-xgboost-applied-machine-learning/
from xgboost import XGBClassifier

# verbosity=0 suppresses the warnings that would otherwise pop up when the
# model is refitted during cross-validation.
classifier = XGBClassifier(verbosity=0)

# Train on the training split; as the last expression of the cell, the fitted
# estimator is also displayed.
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the held-out test split with the fitted classifier.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of correctly classified test samples (shown as the cell output).
accuracy_score(y_test, y_pred)

[[83  2]
 [ 2 53]]


0.9714285714285714

In [37]:
# k-fold cross-validation on the training split (k = 10): one score per fold.
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Report the mean and the spread of the fold scores as percentages.
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 95.71 %
Standard Deviation: 2.67 %


In [39]:
# Grid search: look for the hyper-parameter combination that gives the best
# cross-validated accuracy for the XGBoost classifier.
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter. GridSearchCV accepts a list of
# grids; a single dictionary is enough here, and every combination of the
# values below is evaluated.
parameters = [
    {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6],
    }
]

# Configure the search around the classifier defined earlier.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',  # candidates are ranked by classification accuracy
    cv=10,               # 10 train/test folds per candidate (k-fold CV)
    n_jobs=-1,           # -1 uses all available processors
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Attributes with a trailing underscore are populated by fit().
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_
# Standard deviation of the fold scores for the best-ranked candidate.
best_std = grid_search.cv_results_['std_test_score'][grid_search.best_index_]

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Standard Deviation: {:.2f} %".format(best_std*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 96.79 %
Standard Deviation: 2.08 %
Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.8}
