# Import package

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Data preparation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Classification model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Hyperparams tuning
from sklearn.model_selection import GridSearchCV

# Data understanding

In [2]:
breast = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data',
                     names=['Class','age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat'],
                     )

In [3]:
breast.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [4]:
breast.tail()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no
285,recurrence-events,50-59,ge40,30-34,3-5,no,3,left,left_low,no


In [5]:
breast.sample(5)

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
119,no-recurrence-events,60-69,ge40,15-19,0-2,no,1,left,right_low,no
141,no-recurrence-events,50-59,premeno,20-24,3-5,yes,2,left,left_low,no
78,no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
263,recurrence-events,50-59,lt40,20-24,0-2,?,1,left,left_up,no
265,recurrence-events,30-39,premeno,35-39,9-11,yes,3,left,left_low,no


In [6]:
# Dataset information
breast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-caps    286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [7]:
breast.isnull().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

In [8]:
# Statistik deskriptif
breast.describe()

Unnamed: 0,deg-malig
count,286.0
mean,2.048951
std,0.738217
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,3.0


In [9]:
breast.describe(include=['O'])

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,breast,breast-quad,irradiat
count,286,286,286,286,286,286,286,286,286
unique,2,6,3,11,7,3,2,6,2
top,no-recurrence-events,50-59,premeno,30-34,0-2,no,left,left_low,no
freq,201,96,150,60,213,222,152,110,218


In [10]:
X = breast.iloc[:,1:]
y = breast.iloc[:,0]

In [11]:
X.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [12]:
y.head()

0    no-recurrence-events
1    no-recurrence-events
2    no-recurrence-events
3    no-recurrence-events
4    no-recurrence-events
Name: Class, dtype: object

In [13]:
kolom_penting = ['menopause','breast','irradiat']

In [14]:
breast.loc[:,kolom_penting]

Unnamed: 0,menopause,breast,irradiat
0,premeno,left,no
1,premeno,right,no
2,premeno,left,no
3,ge40,right,no
4,premeno,right,no
...,...,...,...
281,premeno,left,no
282,premeno,left,yes
283,ge40,right,no
284,ge40,left,no


# Train test split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Data preparation

In [16]:
# Missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(X_train)

SimpleImputer(strategy='most_frequent')

In [17]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [18]:
# Encoding = Categorical -> numerical
# OneHotEncoding

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# Models

In [19]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [20]:
y_pred_logreg = logreg.predict(X_test)

# Model evaluation

In [21]:
print(classification_report(y_pred_logreg, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.85      0.78      0.82        60
   recurrence-events       0.24      0.33      0.28        12

            accuracy                           0.71        72
           macro avg       0.54      0.56      0.55        72
        weighted avg       0.75      0.71      0.73        72



In [22]:
# Cross validation
cross_val_score(logreg,X_train, y_train, cv=5)

array([0.74418605, 0.69767442, 0.65116279, 0.69767442, 0.64285714])

# Model selection

In [23]:
svc = SVC()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

svc.fit(X_train, y_train)
tree.fit(X_train, y_train)
forest.fit(X_train, y_train)

y_pred_svc = svc.predict(X_test)
y_pred_tree = tree.predict(X_test)
y_pred_forest = forest.predict(X_test)

In [24]:
print(classification_report(y_pred_svc,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       1.00      0.80      0.89        69
   recurrence-events       0.18      1.00      0.30         3

            accuracy                           0.81        72
           macro avg       0.59      0.90      0.59        72
        weighted avg       0.97      0.81      0.86        72



In [25]:
print(classification_report(y_pred_tree,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.73      0.82      0.77        49
   recurrence-events       0.47      0.35      0.40        23

            accuracy                           0.67        72
           macro avg       0.60      0.58      0.58        72
        weighted avg       0.65      0.67      0.65        72



In [26]:
print(classification_report(y_pred_forest,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.89      0.78      0.83        63
   recurrence-events       0.18      0.33      0.23         9

            accuracy                           0.72        72
           macro avg       0.53      0.56      0.53        72
        weighted avg       0.80      0.72      0.76        72



# Hyper param tuning

In [27]:
svc_linear = SVC(kernel='poly')

In [28]:
svc_linear.fit(X_train, y_train)

SVC(kernel='poly')

In [29]:
y_pred_svc_lin = svc_linear.predict(X_test)

In [30]:
print(classification_report(y_pred_svc_lin,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.95      0.80      0.87        65
   recurrence-events       0.24      0.57      0.33         7

            accuracy                           0.78        72
           macro avg       0.59      0.69      0.60        72
        weighted avg       0.88      0.78      0.81        72



In [31]:
params = {
    'C' : (0.1, 0.5, 1.0, 10.0),
    'kernel' : ('linear','poly','sigmoid','rbf')
}

In [32]:
svc_grid = GridSearchCV(svc, params, cv=5)

In [33]:
svc_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (0.1, 0.5, 1.0, 10.0),
                         'kernel': ('linear', 'poly', 'sigmoid', 'rbf')})

In [34]:
print(svc_grid.best_score_)

0.7335548172757476


In [35]:
print(svc_grid.best_params_)

{'C': 1.0, 'kernel': 'poly'}


# PyCaret

In [36]:
!pip install --pre pycaret

^C





[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
breast.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [38]:
from pycaret.classification import *
s = setup(breast, target='Class')

In [42]:
best = compare_models()
best

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.77,0.7212,0.4367,0.6892,0.5062,0.373,0.4037,0.047
rf,Random Forest Classifier,0.76,0.7665,0.4033,0.6314,0.4546,0.334,0.3654,0.059
lr,Logistic Regression,0.74,0.7261,0.44,0.6155,0.4902,0.3247,0.3457,0.056
ada,Ada Boost Classifier,0.74,0.6733,0.4733,0.5355,0.4925,0.331,0.3326,0.058
et,Extra Trees Classifier,0.735,0.7259,0.4367,0.5776,0.4656,0.3041,0.3258,0.058
lightgbm,Light Gradient Boosting Machine,0.725,0.681,0.39,0.58,0.4547,0.2797,0.2966,0.039
ridge,Ridge Classifier,0.715,0.0,0.3367,0.571,0.3858,0.2241,0.2516,0.034
gbc,Gradient Boosting Classifier,0.705,0.7195,0.3733,0.5017,0.421,0.2318,0.239,0.057
dummy,Dummy Classifier,0.705,0.5,0.0,0.0,0.0,0.0,0.0,0.045
dt,Decision Tree Classifier,0.695,0.6464,0.5267,0.4656,0.4716,0.2674,0.2814,0.043


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [40]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…