In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# /content/drive/MyDrive/machine_learning/data/breast-cancer-wisconsin.data
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


missing_values = ["?"]

df = pd.read_csv("/content/drive/MyDrive/breast-cancer-wisconsin.data", na_values=missing_values)
# df = pd.read_csv("/content/drive/MyDrive/machine_learning/data/breast-cancer-wisconsin.data")
columns = [
'Simple code number',
'Clump Thickness',
'Uniformity of Cell Size',
'Uniformity of Cell Shape',
'Marginal Adhesion',
'Single Epithelial Cell Size',
'Bare Nuclei',
'Bland Chromatin',
'Normal Nucleoli',
'Mitoses',
'Class'
]
df.columns = columns
print(df)


     Simple code number  Clump Thickness  Uniformity of Cell Size  \
0               1002945                5                        4   
1               1015425                3                        1   
2               1016277                6                        8   
3               1017023                4                        1   
4               1017122                8                       10   
..                  ...              ...                      ...   
693              776715                3                        1   
694              841769                2                        1   
695              888820                5                       10   
696              897471                4                        8   
697              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           4                  5                            7   
1        

In [23]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Simple code number           698 non-null    int64  
 1   Clump Thickness              698 non-null    int64  
 2   Uniformity of Cell Size      698 non-null    int64  
 3   Uniformity of Cell Shape     698 non-null    int64  
 4   Marginal Adhesion            698 non-null    int64  
 5   Single Epithelial Cell Size  698 non-null    int64  
 6   Bare Nuclei                  682 non-null    float64
 7   Bland Chromatin              698 non-null    int64  
 8   Normal Nucleoli              698 non-null    int64  
 9   Mitoses                      698 non-null    int64  
 10  Class                        698 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 60.1 KB
None


In [24]:
df.isna().sum()

Simple code number              0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [25]:
df.dropna(axis=0, inplace = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 682 entries, 0 to 697
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Simple code number           682 non-null    int64  
 1   Clump Thickness              682 non-null    int64  
 2   Uniformity of Cell Size      682 non-null    int64  
 3   Uniformity of Cell Shape     682 non-null    int64  
 4   Marginal Adhesion            682 non-null    int64  
 5   Single Epithelial Cell Size  682 non-null    int64  
 6   Bare Nuclei                  682 non-null    float64
 7   Bland Chromatin              682 non-null    int64  
 8   Normal Nucleoli              682 non-null    int64  
 9   Mitoses                      682 non-null    int64  
 10  Class                        682 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 63.9 KB
None


In [26]:
df.drop(["Simple code number"], axis=1, inplace=True)
# Change target(Class) value (2 -> 0 / 4 -> 1)
df.at[df[df['Class'] == 2].index, 'Class'] = 0
df.at[df[df['Class'] == 4].index, 'Class'] = 1
# Split feature and target data
X = pd.DataFrame(df.iloc[:,0:9], dtype=np.dtype("int64"))
y = df.iloc[:,9]
## Find Best model and options
# Run findBestOptions()
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.7, shuffle=True)



In [27]:
def decision_tree_gini(scalers, X, y):
  params = {'criterion':['gini'],'splitter':['best','random'], 'max_depth':[2,3,4,5,6,7,8],'min_samples_split':[2,3,4,5]}
  dtc = DecisionTreeClassifier()
  return_list = []
  for scaler in scalers:
    X = scaler.fit_transform(X)
    for i in range(2,11):
      gd_sr = GridSearchCV(estimator = dtc, param_grid = params, cv=i, n_jobs=-1)
      gd_sr.fit(X,y)
      temp_list = []
      temp_list.append(gd_sr.best_score_)
      temp_list.append("decision tree with gini")
      temp_list.append(gd_sr.best_params_)
      temp_model = DecisionTreeClassifier(criterion='gini', splitter=gd_sr.best_params_['splitter'], max_depth=gd_sr.best_params_['max_depth'], min_samples_split=gd_sr.best_params_['min_samples_split'])
      temp_model.fit(X,y)
      temp_list.append(temp_model)
      temp_list.append(scaler)
      return_list.append(temp_list)

  return return_list


In [28]:
def find_SVC(scalers,X,y):
  # Gamma: Defines how far the influence of a single training point reaches
  # reach can think of reach as the range of data that affects the curvature of the decision boudary. If the gamma is small, it means the reach is far, and if the gamma is large, the reach is narrow.
  # C : Controls tradeoff detween smooth decision boundary and classfying training points correctly. So, C adjusts the margin. Smaller C allows more, larger C allows less
  # decision_function_shape : decision_function_shape : Whether to return a one-vs-rest (‘ovr’) decision function of shape as all other classifiers, or the original one-vs-one (‘ovo’) decision function.
  # random_state : Controls the pseudo random number generation for shuffling the data for probability estimates. 
  # probability : gives per-class scores for each sample
  # https://bkshin.tistory.com/entry/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D-3%EC%84%9C%ED%8F%AC%ED%8A%B8-%EB%B2%A1%ED%84%B0-%EB%A8%B8%EC%8B%A0-SVM-%EC%8B%A4%EC%8A%B5?category=1057680
  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
  params = {'kernel':["rbf","poly","sigmoid","linear"],'gamma':[0.001,0.01,0.1,1,10], 'C':[1,5,10,50,100],'decision_function_shape':["ovr","ovo"]}
  SVC_kernel=["rbf","poly","sigmoid","linear"]
  SVC_gamma_list=[0.001,0.01,0.1,1,10]
  SVC_C=[1,5,10,50,100]
  SVC_decision_function_shape=["ovr","ovo"]
  dtc = SVC()
  return_list = []
  for scaler in scalers:
    X = scaler.fit_transform(X)
    for i in range(2,11):
      gd_sr = GridSearchCV(estimator = dtc, param_grid = params, cv=i, n_jobs=-1)
      gd_sr.fit(X,y)
      temp_list = []
      temp_list.append(gd_sr.best_score_)
      temp_list.append("SVC")
      temp_list.append(gd_sr.best_params_)
      temp_model = SVC(kernel='rbf',probability=True,random_state=100,gamma=0.01,C=1,decision_function_shape="ovr")
      temp_model.fit(X,y)
      temp_list.append(temp_model)
      temp_list.append(scaler)
      return_list.append(temp_list)
  return return_list

In [29]:
def find_Scale_Model(X,y):
  global best_score,best_cv,best_scaler,best_model
  best_score=-1.0
  DecisionTreeClassifier_criterion=["gini","entropy"]

  LogisticRegression_solver=["lbfgs","newton-cg","liblinear","sag","saga"]
  # result list foramt = [점수, ]
  result_list = []
  cv_k=[2,3,4,5,6,7,8,9,10]
  scalers=[StandardScaler(), RobustScaler(), MinMaxScaler(), MaxAbsScaler()]
  gini_result = decision_tree_gini(scalers, X, y)
  print(gini_result)
  result_list += gini_result
  SVC_result=find_SVC(scalers,X,y)
  print(SVC_result)
  result_list+=SVC_result
  result_list.sort(key = lambda i : i[0], reverse = True)


  return result_list

In [30]:
result = find_Scale_Model(train_X, train_y)

[[0.9607843137254901, 'decision tree with gini', {'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 2, 'splitter': 'random'}, DecisionTreeClassifier(max_depth=6, splitter='random'), StandardScaler()], [0.9607843137254902, 'decision tree with gini', {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 3, 'splitter': 'random'}, DecisionTreeClassifier(max_depth=4, min_samples_split=3, splitter='random'), StandardScaler()], [0.9607843137254902, 'decision tree with gini', {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2, 'splitter': 'random'}, DecisionTreeClassifier(max_depth=3, splitter='random'), StandardScaler()], [0.9608536585365852, 'decision tree with gini', {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5, 'splitter': 'random'}, DecisionTreeClassifier(max_depth=5, min_samples_split=5, splitter='random'), StandardScaler()], [0.9705882352941178, 'decision tree with gini', {'criterion': 'gini', 'max_depth': 8, 'min_samples_split': 3, 'splitter': 'ra

In [32]:
for i in range(5):
  print(result[i])

[0.9803921568627452, 'SVC', {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 0.1, 'kernel': 'sigmoid'}, SVC(C=1, gamma=0.01, probability=True, random_state=100), StandardScaler()]
[0.9802380952380952, 'SVC', {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 0.1, 'kernel': 'sigmoid'}, SVC(C=1, gamma=0.01, probability=True, random_state=100), StandardScaler()]
[0.9754901960784315, 'SVC', {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'sigmoid'}, SVC(C=1, gamma=0.01, probability=True, random_state=100), StandardScaler()]
[0.9754901960784315, 'SVC', {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 0.1, 'kernel': 'sigmoid'}, SVC(C=1, gamma=0.01, probability=True, random_state=100), StandardScaler()]
[0.9754901960784315, 'SVC', {'C': 50, 'decision_function_shape': 'ovr', 'gamma': 0.01, 'kernel': 'sigmoid'}, SVC(C=1, gamma=0.01, probability=True, random_state=100), RobustScaler()]


In [33]:
X = result[0][4].fit_transform(X)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.7, shuffle=True)
model = result[0][3].fit(train_X, train_y)
print("Model score: ", end="")
print(model.score(test_X, test_y))

Model score: 0.9665271966527197
