# Import package

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Data preparation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Classification model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Hyperparams tuning
from sklearn.model_selection import GridSearchCV

# Data understanding

In [None]:
breast = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data',
                     names=['Class','age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat'],
                     )

In [None]:
breast.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [None]:
breast.tail()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no
285,recurrence-events,50-59,ge40,30-34,3-5,no,3,left,left_low,no


In [None]:
breast.sample(5)

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
154,no-recurrence-events,30-39,lt40,15-19,0-2,no,3,right,left_up,no
89,no-recurrence-events,40-49,premeno,40-44,0-2,no,1,right,left_up,no
108,no-recurrence-events,40-49,premeno,30-34,0-2,no,3,right,right_up,no
21,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no
27,no-recurrence-events,60-69,ge40,25-29,0-2,no,3,right,left_up,no


In [None]:
# Dataset information
breast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-caps    286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [None]:
breast.isnull().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

In [None]:
# Statistik deskriptif
breast.describe()

Unnamed: 0,deg-malig
count,286.0
mean,2.048951
std,0.738217
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,3.0


In [None]:
breast.describe(include=['O'])

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,breast,breast-quad,irradiat
count,286,286,286,286,286,286,286,286,286
unique,2,6,3,11,7,3,2,6,2
top,no-recurrence-events,50-59,premeno,30-34,0-2,no,left,left_low,no
freq,201,96,150,60,213,222,152,110,218


In [None]:
X = breast.iloc[:,1:]
y = breast.iloc[:,0]

In [None]:
X.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [None]:
y.head()

0    no-recurrence-events
1    no-recurrence-events
2    no-recurrence-events
3    no-recurrence-events
4    no-recurrence-events
Name: Class, dtype: object

In [None]:
kolom_penting = ['menopause','breast','irradiat']

In [None]:
breast.loc[:,kolom_penting]

Unnamed: 0,menopause,breast,irradiat
0,premeno,left,no
1,premeno,right,no
2,premeno,left,no
3,ge40,right,no
4,premeno,right,no
...,...,...,...
281,premeno,left,no
282,premeno,left,yes
283,ge40,right,no
284,ge40,left,no


# Train test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Data preparation

In [None]:
# Missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(X_train)

In [None]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# Encoding = Categorical -> numerical
# OneHotEncoding

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# Models

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
y_pred_logreg = logreg.predict(X_test)

# Model evaluation

In [None]:
print(classification_report(y_pred_logreg, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.90      0.74      0.81        61
   recurrence-events       0.27      0.55      0.36        11

            accuracy                           0.71        72
           macro avg       0.59      0.64      0.59        72
        weighted avg       0.80      0.71      0.74        72



In [None]:
# Cross validation
cross_val_score(logreg,X_train, y_train, cv=5)

array([0.62790698, 0.76744186, 0.6744186 , 0.79069767, 0.76190476])

# Model selection

In [None]:
svc = SVC()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

svc.fit(X_train, y_train)
tree.fit(X_train, y_train)
forest.fit(X_train, y_train)

y_pred_svc = svc.predict(X_test)
y_pred_tree = tree.predict(X_test)
y_pred_forest = forest.predict(X_test)

In [None]:
print(classification_report(y_pred_svc,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.98      0.72      0.83        68
   recurrence-events       0.14      0.75      0.23         4

            accuracy                           0.72        72
           macro avg       0.56      0.74      0.53        72
        weighted avg       0.93      0.72      0.80        72



In [None]:
print(classification_report(y_pred_tree,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.82      0.75      0.78        55
   recurrence-events       0.36      0.47      0.41        17

            accuracy                           0.68        72
           macro avg       0.59      0.61      0.60        72
        weighted avg       0.71      0.68      0.69        72



In [None]:
print(classification_report(y_pred_forest,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.88      0.75      0.81        59
   recurrence-events       0.32      0.54      0.40        13

            accuracy                           0.71        72
           macro avg       0.60      0.64      0.60        72
        weighted avg       0.78      0.71      0.73        72



# Hyper param tuning

In [None]:
svc_linear = SVC(kernel='poly')

In [None]:
svc_linear.fit(X_train, y_train)

In [None]:
y_pred_svc_lin = svc_linear.predict(X_test)

In [None]:
print(classification_report(y_pred_svc_lin,y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.94      0.77      0.85        61
   recurrence-events       0.36      0.73      0.48        11

            accuracy                           0.76        72
           macro avg       0.65      0.75      0.67        72
        weighted avg       0.85      0.76      0.79        72



In [None]:
params = {
    'C' : (0.1, 0.5, 1.0, 10.0),
    'kernel' : ('linear','poly','sigmoid','rbf')
}

In [None]:
svc_grid = GridSearchCV(svc, params, cv=5)

In [None]:
svc_grid.fit(X_train, y_train)

In [None]:
print(svc_grid.best_score_)

0.7619047619047619


In [None]:
print(svc_grid.best_params_)

{'C': 0.5, 'kernel': 'poly'}


# PyCaret

In [None]:
!pip install --pre pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
breast.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [None]:
from pycaret.classification import *
s = setup(breast, target='Class')

Unnamed: 0,Description,Value
0,Session id,5228
1,Target,Class
2,Target type,Binary
3,Target mapping,"no-recurrence-events: 0, recurrence-events: 1"
4,Original data shape,"(286, 10)"
5,Transformed data shape,"(286, 40)"
6,Transformed train set shape,"(200, 40)"
7,Transformed test set shape,"(86, 40)"
8,Ordinal features,2
9,Numeric features,1


In [None]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.71,0.6895,0.3067,0.5933,0.3729,0.2102,0.2438,1.073
dummy,Dummy Classifier,0.705,0.5,0.0,0.0,0.0,0.0,0.0,0.338
ridge,Ridge Classifier,0.7,0.0,0.2533,0.5033,0.3171,0.1625,0.1905,0.286
gbc,Gradient Boosting Classifier,0.7,0.655,0.34,0.515,0.3976,0.2099,0.2247,0.73
et,Extra Trees Classifier,0.7,0.6596,0.3567,0.54,0.4076,0.2195,0.2383,0.951
lr,Logistic Regression,0.695,0.6443,0.2367,0.5733,0.313,0.1488,0.1894,1.851
lightgbm,Light Gradient Boosting Machine,0.695,0.6158,0.27,0.5521,0.3308,0.1616,0.1958,0.983
knn,K Neighbors Classifier,0.685,0.5708,0.22,0.5133,0.2817,0.1186,0.1499,0.346
lda,Linear Discriminant Analysis,0.675,0.6073,0.2533,0.4662,0.3044,0.1205,0.1403,0.31
xgboost,Extreme Gradient Boosting,0.665,0.6235,0.3733,0.4367,0.3941,0.1685,0.1726,0.436


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…