# Mount google drive

In [1]:
from google.colab import drive

DRIVE_PATH = '/content/drive'
drive.mount(DRIVE_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Libraries Import

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Constant Values definition

In [3]:
missing_values = ["?"]

columns = [
'Simple code number',
'Clump Thickness',
'Uniformity of Cell Size',
'Uniformity of Cell Shape',
'Marginal Adhesion',
'Single Epithelial Cell Size',
'Bare Nuclei',
'Bland Chromatin',
'Normal Nucleoli',
'Mitoses',
'Class'
]

FILE_PATH = 'MyDrive'
# FILE_PATH = 'MyDrive/machine_learning/data'
FILE_NAME = 'breast-cancer-wisconsin.data'

target_column = 'Class'

# Data Reads

In [4]:
df = pd.read_csv(f"{DRIVE_PATH}/{FILE_PATH}/{FILE_NAME}", names=columns, na_values=missing_values)
df

Unnamed: 0,Simple code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2.0,1,1,1,2
695,841769,2,1,1,1,2,1.0,1,1,1,2
696,888820,5,10,10,3,7,3.0,8,10,2,4
697,897471,4,8,6,4,3,4.0,10,6,1,4


In [5]:
original_df = df
df = original_df.copy()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Simple code number           699 non-null    int64  
 1   Clump Thickness              699 non-null    int64  
 2   Uniformity of Cell Size      699 non-null    int64  
 3   Uniformity of Cell Shape     699 non-null    int64  
 4   Marginal Adhesion            699 non-null    int64  
 5   Single Epithelial Cell Size  699 non-null    int64  
 6   Bare Nuclei                  683 non-null    float64
 7   Bland Chromatin              699 non-null    int64  
 8   Normal Nucleoli              699 non-null    int64  
 9   Mitoses                      699 non-null    int64  
 10  Class                        699 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 60.2 KB


In [7]:
df.isna().sum()

Simple code number              0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

# Preprocessing

In [8]:
# Drop Null
df.dropna(axis=0, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Simple code number           683 non-null    int64  
 1   Clump Thickness              683 non-null    int64  
 2   Uniformity of Cell Size      683 non-null    int64  
 3   Uniformity of Cell Shape     683 non-null    int64  
 4   Marginal Adhesion            683 non-null    int64  
 5   Single Epithelial Cell Size  683 non-null    int64  
 6   Bare Nuclei                  683 non-null    float64
 7   Bland Chromatin              683 non-null    int64  
 8   Normal Nucleoli              683 non-null    int64  
 9   Mitoses                      683 non-null    int64  
 10  Class                        683 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 64.0 KB


In [9]:
# Drop an feature - results higher score
df.drop(["Simple code number"], axis=1, inplace=True)

# Change target(Class) value (2 -> 0 / 4 -> 1)
df[target_column].replace([2, 4], [0, 1], inplace=True)

# Split feature and target data
X, y = df[df.columns.difference([target_column])], df[[target_column]].values.ravel()

In [10]:
# print X
X

Unnamed: 0,Bare Nuclei,Bland Chromatin,Clump Thickness,Marginal Adhesion,Mitoses,Normal Nucleoli,Single Epithelial Cell Size,Uniformity of Cell Shape,Uniformity of Cell Size
0,1.0,3,5,1,1,1,2,1,1
1,10.0,3,5,5,1,2,7,4,4
2,2.0,3,3,1,1,1,2,1,1
3,4.0,3,6,1,1,7,3,8,8
4,1.0,3,4,3,1,1,2,1,1
...,...,...,...,...,...,...,...,...,...
694,2.0,1,3,1,1,1,3,1,1
695,1.0,1,2,1,1,1,2,1,1
696,3.0,8,5,3,2,10,7,10,10
697,4.0,10,4,4,1,6,3,6,8


In [11]:
# print y
df[[target_column]]

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
...,...
694,0
695,0
696,1
697,1


In [12]:
## Find Best model and options
# Run findBestOptions()
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.7, shuffle=True)

# Modeling Functions

In [13]:
# TODO: generate scaled X using scalers
# param scalers -> set: scalers to use (ex: [StandardScaler(), MinMaxScaler()])
# param X -> pd.DataFrame: original dataset to be scaled
# output: dict: scaled datasets
def do_scaling(scalers, X):
  scaled_X = {} # output value

  for scaler in scalers: # do scaling for all scalers
    scaled_X[scaler] = scaler.fit_transform(X.copy())
  
  return scaled_X

In [14]:
# TODO: hyperparameter tuning for decsion tree (with gini)
# param scaler_dict -> dict: dictionary contains the scaler and scaled data used it
# param y -> pd.DataFrame: target dataset
# param cv -> set: CV integers
# param params -> dict: parameters for hyperparameter tuning
def decision_tree_gini(scaler_dict, y, cv, params):
  # Model
  dtc = DecisionTreeClassifier(random_state=1)

  # output
  return_list = []

  # Do the same modeling for input scalers
  for scaler, scaled_X in scaler_dict.items():
    for i in cv: # various cv: from 2 to 10
      gd_sr = GridSearchCV(estimator=dtc, param_grid=params, cv=i, n_jobs=-1)
      gd_sr.fit(scaled_X, y)

      # Push output value
      temp_list = {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler
      }

      # Add model
      temp_model = DecisionTreeClassifier(criterion='gini',
                                          splitter=gd_sr.best_params_['splitter'],
                                          max_depth=gd_sr.best_params_['max_depth'],
                                          min_samples_split=gd_sr.best_params_['min_samples_split'],
                                          random_state=1)
      temp_model.fit(scaled_X, y)
      temp_list['model'] = temp_model
      
      return_list.append(['DecisionTreeClassifier(criterion=\'gini\')', temp_list])

  return return_list

In [15]:
# TODO: hyperparameter tuning for SVM(SVC)
# param scaler_dict -> dict: dictionary contains the scaler and scaled data used it
# param y -> pd.DataFrame: target dataset
# param cv -> set: CV integers
# param params -> dict: parameters for hyperparameter tuning
def find_SVC(scaler_dict, y, cv, params):
  # Model
  svc = SVC(probability=True, random_state=100)

  # Output
  return_list = []

  # Do the same modeling for input scalers
  for scaler, scaled_X in scaler_dict.items():
    for i in cv: # various cv: from 2 to 10
      # Do gridsearch
      gd_sr = GridSearchCV(estimator=svc, param_grid=params, cv=i, n_jobs=-1)
      gd_sr.fit(scaled_X, y)

      # Push output value
      temp_list = {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler
      }

      # Add model
      temp_model = SVC(kernel=gd_sr.best_params_['kernel'],
                       gamma=gd_sr.best_params_['gamma'],
                       C=gd_sr.best_params_['C'],
                       decision_function_shape=gd_sr.best_params_['decision_function_shape'],
                       probability=True,
                       random_state=100)
      temp_model.fit(scaled_X, y)
      temp_list['model'] = temp_model

      return_list.append(['SVC', temp_list])

  return return_list

In [16]:
# TODO: hyperparameter tuning for SVM(SVC)
# param scaler_dict -> dict: dictionary contains the scaler and scaled data used it
# param y -> pd.DataFrame: target dataset
# param cv -> set: CV integers
# param params -> dict: parameters for hyperparameter tuning
def search_LogisticRegression(scaler_dict, y, cv, params):
  # Model
  lr = LogisticRegression(random_state=1)

  # Output
  return_list = []

  # Do the same modeling for input scalers
  for scaler, scaled_X in scaler_dict.items():
    for i in cv: # various cv: from 2 to 10
      # Do gridsearch
      gd_sr = GridSearchCV(estimator=lr, param_grid=params, cv=i, n_jobs=-1)
      gd_sr.fit(scaled_X, y)

      # Push output value
      temp_list = {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler
      }

      # Add model
      temp_model = LogisticRegression(solver=gd_sr.best_params_['solver'],
                       random_state=1,
                       penalty=gd_sr.best_params_['penalty'],
                       C=gd_sr.best_params_['C'])
      temp_model.fit(scaled_X, y)
      temp_list['model'] = temp_model

      return_list.append(['LogisticRegression', temp_list])

  return return_list

In [17]:
def find_Scale_Model(X, y):
  DecisionTreeClassifier_criterion=["gini","entropy"]

  # Expected foramt = [Model name, Dictionary]
  # Dictionary format = {
  # score = best_score
  # param = best_params
  # model = model generated by best params
  # scaler = scaler used   
  # }
  result_list = []

  # *** Parameter descriptions ***
  # C : Controls tradeoff detween smooth decision boundary and classfying training points correctly. So, C adjusts the margin. Smaller C allows more, larger C allows less
  #
  # Gamma: Defines how far the influence of a single training point reaches
  # reach can think of reach as the range of data that affects the curvature of the decision boudary. If the gamma is small, it means the reach is far, and if the gamma is large, the reach is narrow.
  # decision_function_shape : decision_function_shape : Whether to return a one-vs-rest (‘ovr’) decision function of shape as all other classifiers, or the original one-vs-one (‘ovo’) decision function.
  #
  # random_state : Controls the pseudo random number generation for shuffling the data for probability estimates. 
  # probability : gives per-class scores for each sample
  #
  # https://bkshin.tistory.com/entry/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D-3%EC%84%9C%ED%8F%AC%ED%8A%B8-%EB%B2%A1%ED%84%B0-%EB%A8%B8%EC%8B%A0-SVM-%EC%8B%A4%EC%8A%B5?category=1057680
  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

  params = {
      'DTC_Gini': {
          'criterion':['gini'],
          'splitter':['best','random'],
          'max_depth':[2,3,4,5,6,7,8],
          'min_samples_split':[2,3,4,5]
      },
      'SVC': {
          'kernel': ["rbf", "poly", "sigmoid", "linear"],
          'gamma': [1e-3, 1e-2, 1e-1, 1, 1e+1],
          'C': [1, 5, 10, 50, 100],
          'decision_function_shape': ["ovr", "ovo"]
      },
      'LR': {
          'solver': ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
          "penalty": ["l2"],
          'C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
      }
  }

  # CV values
  cv_k = set(range(2,11))

  # Scalers using
  scalers = [StandardScaler(), RobustScaler(), MinMaxScaler(), MaxAbsScaler()]

  # Do scaling
  scaled_dict = do_scaling(scalers, X)

  # Decision Tree with gin
  result_list += decision_tree_gini(scaled_dict, y, cv_k, params['DTC_Gini'])

  # SVC
  result_list += find_SVC(scaled_dict, y, cv_k, params['SVC'])

  # LogisticRegression
  result_list += search_LogisticRegression(scaled_dict, y, cv_k, params['LR'])

  # Sort list descending order to elements score
  result_list.sort(key = lambda i : i[1]['score'], reverse=True)

  return result_list

# Modeling

In [18]:
result = find_Scale_Model(train_X, train_y)



In [19]:
for i in range(5):
  print(result[i])

['SVC', {'score': 0.990311986863711, 'param': {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'rbf'}, 'scaler': MaxAbsScaler(), 'model': SVC(C=5, gamma=1, probability=True, random_state=100)}]
['SVC', {'score': 0.9902380952380951, 'param': {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'rbf'}, 'scaler': MinMaxScaler(), 'model': SVC(C=5, gamma=1, probability=True, random_state=100)}]
['SVC', {'score': 0.9902380952380951, 'param': {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'rbf'}, 'scaler': MaxAbsScaler(), 'model': SVC(C=5, gamma=1, probability=True, random_state=100)}]
['SVC', {'score': 0.9901219512195121, 'param': {'C': 10, 'decision_function_shape': 'ovr', 'gamma': 0.1, 'kernel': 'sigmoid'}, 'scaler': StandardScaler(), 'model': SVC(C=10, gamma=0.1, kernel='sigmoid', probability=True, random_state=100)}]
['SVC', {'score': 0.9901219512195121, 'param': {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'rbf'}, 'scaler

In [20]:
scaled_X = result[0][1]['scaler'].fit_transform(X.copy())
train_X, test_X, train_y, test_y = train_test_split(scaled_X, y, test_size=0.7, shuffle=True)
model = result[0][1]['model'].fit(train_X, train_y)
print("Model score: ", end="")
print(model.score(test_X, test_y))

Model score: 0.964509394572025
