In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#import dataframe
breast = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', 
                     names=['Class','age','menopause','tumor-size','inv-nodes','node-cap','deg-malig','breast','breast-quad','irradiat'])

In [4]:
breast.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-cap,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [5]:
#Dataset Information
breast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-cap     286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [6]:
#cek jumlah null
breast.isnull().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-cap       0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

In [7]:
#statistik deskriptif
breast.describe() #numerik

Unnamed: 0,deg-malig
count,286.0
mean,2.048951
std,0.738217
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,3.0


In [8]:
breast.describe(include=['O']) #data kategorikal

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-cap,breast,breast-quad,irradiat
count,286,286,286,286,286,286,286,286,286
unique,2,6,3,11,7,3,2,6,2
top,no-recurrence-events,50-59,premeno,30-34,0-2,no,left,left_low,no
freq,201,96,150,60,213,222,152,110,218


In [9]:
X = breast.iloc[:,1:]
y = breast.iloc[:,0]

Train test split

In [10]:
#Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [11]:
#Data preparation (Missing Value)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(X_train)

SimpleImputer(strategy='most_frequent')

In [12]:
#imputing nan value
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [13]:
#Encoding (Kategorikal -> Numeric)
#One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [14]:
X_train

<228x42 sparse matrix of type '<class 'numpy.float64'>'
	with 2052 stored elements in Compressed Sparse Row format>

In [15]:
#model selection (logistic)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

LogisticRegression()

In [16]:
#predicting
y_pred_logreg = logreg.predict(X_test)

In [17]:
#Model evaluation (logreg)
from sklearn.metrics import classification_report
print(classification_report(y_pred_logreg,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.90      0.75      0.82        51
   recurrence-events       0.19      0.43      0.26         7

            accuracy                           0.71        58
           macro avg       0.55      0.59      0.54        58
        weighted avg       0.82      0.71      0.75        58



In [18]:
#Cross validation 
from sklearn.model_selection import cross_val_score
cross_val_score(logreg,X_train, y_train, cv=5)

array([0.73913043, 0.7173913 , 0.7173913 , 0.66666667, 0.73333333])

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [20]:
svc = SVC()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

svc.fit(X_train,y_train)
tree.fit(X_train,y_train)
forest.fit(X_train,y_train)

RandomForestClassifier()

In [21]:
y_pred_svc = svc.predict(X_test)
y_pred_tree = tree.predict(X_test)
y_pred_forest = forest.predict(X_test)

In [22]:
#Model evaluation
print(classification_report(y_pred_svc, y_test)) #kita pilih svc karena random forest dikhawatirkan akan berat

                      precision    recall  f1-score   support

no-recurrence-events       1.00      0.76      0.87        55
   recurrence-events       0.19      1.00      0.32         3

            accuracy                           0.78        58
           macro avg       0.59      0.88      0.59        58
        weighted avg       0.96      0.78      0.84        58



In [23]:
print(classification_report(y_pred_tree, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.71      0.73      0.72        41
   recurrence-events       0.31      0.29      0.30        17

            accuracy                           0.60        58
           macro avg       0.51      0.51      0.51        58
        weighted avg       0.60      0.60      0.60        58



In [24]:
print(classification_report(y_pred_forest,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.95      0.77      0.85        52
   recurrence-events       0.25      0.67      0.36         6

            accuracy                           0.76        58
           macro avg       0.60      0.72      0.61        58
        weighted avg       0.88      0.76      0.80        58



In [25]:
#hyper parameter tuning
svc_linear = SVC(kernel='poly')
svc_linear.fit(X_train,y_train)
y_pred_svcl = svc_linear.predict(X_test)
print(classification_report(y_pred_svcl, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.98      0.77      0.86        53
   recurrence-events       0.25      0.80      0.38         5

            accuracy                           0.78        58
           macro avg       0.61      0.79      0.62        58
        weighted avg       0.91      0.78      0.82        58



In [26]:
#Hyper Param tuning
from sklearn.model_selection import GridSearchCV
params = {
    'C' : (0.1,0.5,1.0,10),
    'kernel' : ('linear','poly','sigmoid','rbf')
}

In [27]:
svc_grid = GridSearchCV(svc, param_grid=params, cv=5)

In [28]:
svc_grid.fit(X_train,y_train)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (0.1, 0.5, 1.0, 10),
                         'kernel': ('linear', 'poly', 'sigmoid', 'rbf')})

In [29]:
svc_grid.best_score_

0.7762318840579709

In [30]:
svc_grid.best_params_

{'C': 1.0, 'kernel': 'poly'}

In [31]:
from pycaret.classification import *
s = setup(breast, target='Class')

In [32]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.765,0.6254,0.3567,0.7833,0.4568,0.3373,0.3919,0.293
rf,Random Forest Classifier,0.72,0.5698,0.29,0.475,0.3495,0.2116,0.2228,0.154
dummy,Dummy Classifier,0.705,0.5,0.0,0.0,0.0,0.0,0.0,0.064
gbc,Gradient Boosting Classifier,0.695,0.5606,0.29,0.4933,0.3421,0.1704,0.1871,0.101
lr,Logistic Regression,0.69,0.6169,0.2367,0.4483,0.2916,0.1277,0.1403,0.73
nb,Naive Bayes,0.69,0.6144,0.15,0.25,0.1806,0.0629,0.0537,0.066
lightgbm,Light Gradient Boosting Machine,0.685,0.5676,0.2733,0.4714,0.3227,0.1468,0.1618,0.084
et,Extra Trees Classifier,0.68,0.5595,0.3067,0.435,0.3515,0.1548,0.1598,0.159
ridge,Ridge Classifier,0.67,0.0,0.2,0.3633,0.2396,0.0702,0.0796,0.059
ada,Ada Boost Classifier,0.66,0.5441,0.22,0.3817,0.2592,0.068,0.0763,0.102


In [33]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…