## Choosing the right ML algorithms

To choose the right machine learning models, we first need a clear and concise description of our problem. We would like to predict cancer malignancy (the diagnosis) from the size and shape of cell features measured in images of cell samples. The features describe the mean texture, mean radius, area, concavity and other geometrical properties of the cells themselves. Predicting whether a cancer diagnosis is malignant or benign is therefore clearly a classification task.

Since in our dataset we already have the labeled diagnosis, it seems prudent to test some of the supervised models which scikit-learn offers.

In [29]:
# Load data
import pandas as pd
import matplotlib.pyplot as plt

# Read dataset
data_breast_cancer = pd.read_csv("breast_cancer_win/data.csv")

# Quick overview of the raw data (as pandas dataframes)
print(data_breast_cancer.shape)
features = data_breast_cancer.columns
print(len(features))

print("Data features: \n", features)
print(data_breast_cancer.describe())

# Prepare the dataset

# Pick the feature groups by column suffix instead of by position: the
# positional slices were off by one (the "mean" group contained 'diagnosis'
# and missed 'fractal_dimension_mean', the "worst" group missed
# 'radius_worst').
means_labels: list[str] = [
    col for col in data_breast_cancer.columns if col.endswith("_mean")
]
worst_labels: list[str] = [
    col for col in data_breast_cancer.columns if col.endswith("_worst")
]

# Drop the faulty (empty) trailing column and the non-informative id column
data_breast_cancer.drop(columns=["Unnamed: 32", "id"], inplace=True)

# Set the data diagnosis results to integers (malignant -> 1, benign -> 0)
data_breast_cancer['diagnosis'] = data_breast_cancer['diagnosis'].map({"M": 1, "B": 0})

# Split dataframe into two based on diagnosis
dfM = data_breast_cancer[data_breast_cancer['diagnosis'] == 1]
dfB = data_breast_cancer[data_breast_cancer['diagnosis'] == 0]

# Copy the target column and drop it from the feature dataframe
y_all = data_breast_cancer["diagnosis"].copy()
data_breast_cancer.drop(columns=["diagnosis"], inplace=True)


(569, 33)
33
Data features: 
 Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   
std    1.250206e+08     3.524049      4.301036       24.298981   351

Split data into train + test set using KFold strategy

In [47]:
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, train_test_split
from python_src.sandbox import FullReport

# Make a new kfold split and train/test a model
def SplitWithKFold(fit_fun):
    """Iterate over a default KFold split of the dataset and print each fold.

    Reads the module-level ``data_breast_cancer`` (features) and ``y_all``
    (labels) and prints the test index range and the train/test shapes of
    every fold.

    Args:
        fit_fun: Intended training callback. It is accepted but not invoked
            yet; the training step is still a placeholder.
    """
    # Default splits into 5 folds
    kf = KFold()
    print(kf)

    for idx_train, idx_test in kf.split(data_breast_cancer):
        print("Test set: ", idx_test[0], " to: ", idx_test[-1])
        # KFold yields positional indices, so select rows with .iloc on both
        # the features and the labels (plain [] on a Series looks up labels).
        X_train, y_train = data_breast_cancer.iloc[idx_train], y_all.iloc[idx_train]
        X_test, y_test = data_breast_cancer.iloc[idx_test], y_all.iloc[idx_test]
        print(X_train.shape)
        print(X_test.shape)

        # Train model (TODO: fit_fun is not called yet)



Test SGDClassifier using GridSearchCV just as we did in O3

In [51]:
# Use some classifiers
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from time import time

model = SGDClassifier()

# Hyper-parameter grid searched exhaustively by GridSearchCV
tuning_parameters = {
    'loss': ['hinge', 'log_loss', 'perceptron', 'modified_huber', 'squared_error', 'huber'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 100],
    'tol' : [0.0001, 0.001, 0.01, 0.1],
    'max_iter' : [int(1e+5), int(1e+6)]
}


# Default splits into 5 folds
kf = KFold()
print(kf)
kfsplit = kf.split(data_breast_cancer)

CV_layers = 5
VERBOSE = 2
# This part is almost the same as in previous code block, we just swap the model
grid_tuned = GridSearchCV(model,
                          tuning_parameters,
                          cv=CV_layers,
                          scoring='f1_micro',
                          verbose=VERBOSE,
                          n_jobs=-1)

# Outer loop: one full grid search per KFold train/test split
for idx_train, idx_test in kfsplit:
    print("Test set: ", idx_test[0], " to: ", idx_test[-1])
    # KFold yields positional indices, so use .iloc for the labels as well
    # (plain [] on a Series is a label lookup).
    X_train, y_train = data_breast_cancer.iloc[idx_train], y_all.iloc[idx_train]
    X_test, y_test = data_breast_cancer.iloc[idx_test], y_all.iloc[idx_test]
    print(X_train.shape)
    print(X_test.shape)

    # Time the search so the report can show how long it took
    start = time()
    grid_tuned.fit(X_train, y_train)
    t = time() - start

    b0, m0 = FullReport(grid_tuned, X_test, y_test, t)
    print('OK(grid-search)')




KFold(n_splits=5, random_state=None, shuffle=False)
Test set:  0  to:  113
(455, 30)
(114, 30)
Fitting 5 folds for each of 864 candidates, totalling 4320 fits
SEARCH TIME: 464.42 sec

Best model set found on train set:

	best parameters={'alpha': 0.0001, 'loss': 'log_loss', 'max_iter': 1000000, 'penalty': 'l1', 'tol': 0.01}
	best 'f1_micro' score=0.9340659340659341
	best index=42

Best estimator CTOR:
	SGDClassifier(loss='log_loss', max_iter=1000000, penalty='l1', tol=0.01)

Grid scores ('f1_micro') on development set:
	[ 0]: 0.855 (+/-0.168) for {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 100000, 'penalty': 'l2', 'tol': 0.0001}
	[ 1]: 0.914 (+/-0.022) for {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 100000, 'penalty': 'l2', 'tol': 0.001}
	[ 2]: 0.886 (+/-0.082) for {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 100000, 'penalty': 'l2', 'tol': 0.01}
	[ 3]: 0.919 (+/-0.051) for {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 100000, 'penalty': 'l2', 'tol': 0.1}
	[ 4]: 0.866 (+/-0.11

Save our SGDClassifier.

In [55]:

# Chosen model
# NOTE(review): the grid search output reported loss='log_loss' and tol=0.01
# as best for its first fold; this model keeps the default loss and tol.
# Confirm that is intended.
sgd = SGDClassifier(max_iter=100000, penalty='l1')

# A single hold-out split is enough here, because GridSearchCV already
# cross-validates internally on the training part.
# random_state makes the split (and thus the reported scores) reproducible;
# stratify keeps the malignant/benign ratio equal in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_breast_cancer,
    y_all,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=y_all,
)

# Just a container for the data
class Data:
    """Plain holder bundling the train and test splits of a dataset.

    Attributes are stored exactly as passed in: ``x_train``/``x_test`` hold
    the features, ``y_train``/``y_test`` the matching labels.
    """

    def __init__(self, x_train, y_train, x_test, y_test) -> None:
        # Features first, then the labels that belong to them
        self.x_train, self.x_test = x_train, x_test
        self.y_train, self.y_test = y_train, y_test

# Bundle the hold-out split; keywords make the train/test pairing explicit
data = Data(
    x_train=X_train,
    y_train=Y_train,
    x_test=X_test,
    y_test=Y_test,
)


In [62]:
# Candidate hyper-parameters for the ridge classifier search
tuning_parameters = dict(
    alpha=[0.01, 0.1, 1, 2, 10],
    tol=[0.00001, 0.0001, 0.001],
    solver=['svd', 'cholesky', 'lsqr', 'sag'],
)

model = RidgeClassifier()

# Same search configuration as for the SGD classifier, only the estimator
# and its parameter grid differ
grid_tuned = GridSearchCV(
    estimator=model,
    param_grid=tuning_parameters,
    scoring='f1_micro',
    cv=CV_layers,
    n_jobs=-1,
    verbose=VERBOSE,
)

# Sanity check of the split sizes: test part first, then train part
print(data.x_test.shape)
print(data.x_train.shape)

def trainAndReport(gridcv, data: Data):
    """Fit *gridcv* on the training split, time it and report on the test split.

    Args:
        gridcv: Search object exposing ``fit``; it is fitted in place.
        data: Container with ``x_train``, ``y_train``, ``x_test``, ``y_test``.

    Returns:
        The two values produced by ``FullReport``, as a tuple.
    """
    began = time()
    gridcv.fit(data.x_train, data.y_train)
    elapsed = time() - began

    report = FullReport(gridcv, data.x_test, data.y_test, elapsed)
    first, second = report
    print('OK(grid-search)')
    return first, second


# Run the ridge search on the hold-out split and show both reported values
b, m = trainAndReport(gridcv=grid_tuned, data=data)
print(f"results:  {b}")
print(f"results m:  {m}")



(143, 30)
(426, 30)
Fitting 5 folds for each of 60 candidates, totalling 300 fits
SEARCH TIME: 1.57 sec

Best model set found on train set:

	best parameters={'alpha': 0.01, 'solver': 'svd', 'tol': 1e-05}
	best 'f1_micro' score=0.9530232558139534
	best index=0

Best estimator CTOR:
	RidgeClassifier(alpha=0.01, solver='svd', tol=1e-05)

Grid scores ('f1_micro') on development set:
	[ 0]: 0.953 (+/-0.073) for {'alpha': 0.01, 'solver': 'svd', 'tol': 1e-05}
	[ 1]: 0.953 (+/-0.073) for {'alpha': 0.01, 'solver': 'svd', 'tol': 0.0001}
	[ 2]: 0.953 (+/-0.073) for {'alpha': 0.01, 'solver': 'svd', 'tol': 0.001}
	[ 3]: 0.953 (+/-0.073) for {'alpha': 0.01, 'solver': 'cholesky', 'tol': 1e-05}
	[ 4]: 0.953 (+/-0.073) for {'alpha': 0.01, 'solver': 'cholesky', 'tol': 0.0001}
	[ 5]: 0.953 (+/-0.073) for {'alpha': 0.01, 'solver': 'cholesky', 'tol': 0.001}
	[ 6]: 0.951 (+/-0.050) for {'alpha': 0.01, 'solver': 'lsqr', 'tol': 1e-05}
	[ 7]: 0.923 (+/-0.050) for {'alpha': 0.01, 'solver': 'lsqr', 'tol': 0.000

In [64]:
# Ridge classifier built with the parameters the grid search reported as best
rclass = RidgeClassifier(alpha=0.01, solver='svd', tol=1e-05)


0.9440559440559441

In [69]:
# Try dummy classifier for a more realistic comparison of score

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# The dummy classifier puts EVERYTHING in the most frequent training class (here 0 == benign diagnosis, the majority class in this dataset)

def CompareToDummy(data: Data, model):
    """Print accuracy and cross-validation scores of *model* next to a dummy baseline.

    Both classifiers are fitted on the training split, scored with
    ``accuracy_score`` on the test split and with 3-fold ``cross_val_score``
    (accuracy) on the training split. The results are printed; nothing is
    returned.

    Args:
        data: Container with ``x_train``, ``y_train``, ``x_test``, ``y_test``.
        model: Classifier to evaluate; it is fitted in place on the training
            split.
    """
    print("Comparing cross val score and accuracy of model to dummy classifier scores")
    dummy_clf = DummyClassifier()
    dummy_clf.fit(data.x_train, data.y_train)
    dummy_y = dummy_clf.predict(data.x_test)

    model.fit(data.x_train, data.y_train)
    # Predict with the model passed in, not with a module-level classifier
    model_y = model.predict(data.x_test)

    # Accuracy is the fraction of correctly classified samples, see:
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
    a_s = accuracy_score(data.y_test, model_y)
    c_v_s = cross_val_score(model, data.x_train, data.y_train, cv=3, scoring="accuracy")
    a_s_dummy = accuracy_score(data.y_test, dummy_y)
    c_v_s_dummy = cross_val_score(dummy_clf, data.x_train, data.y_train, cv=3, scoring="accuracy")
    print("Model accuracy: ", a_s, " | Dummy accuracy: ", a_s_dummy)
    # Print the model's cross-validation scores themselves, not the variable name
    print("Model cvs: ", c_v_s, " | Dummy cvs: ", c_v_s_dummy)
