# find_Scaler_Model

## find_Scaler_Model(train_X, train_Y)
* Parameters:
    - X: array-like of shape (n_samples, n_features)
      - Training vectors
    - y: array-like of shape (n_samples,)
      - Target values (class labels) relative to X for classification
* Returns:
    - result_list: list
      - Returns a list that contains the results of the various combinations of scalers, models, and model hyperparameters.
* Description:
    - Code for our function:

In [None]:
def find_Scaler_Model(X, y):
  """Search scaler / model / hyper-parameter combinations and rank them.

  Args:
    X: training feature data; it is handed to do_scaling together with the
      four scalers used here.
    y: target labels for X.

  Returns:
    list of ``[model name, info]`` entries sorted by descending
    ``info['score']``. ``info`` is a dict with the keys 'score' (best score),
    'param' (best parameters), 'model' (model generated by the best
    parameters) and 'scaler' (scaler used).
  """
  # *** Parameter notes ***
  # C: trade-off between a smooth decision boundary and classifying the
  #    training points correctly, i.e. it adjusts the margin. A smaller C
  #    tolerates more misclassification, a larger C tolerates less.
  # gamma: how far the influence of a single training point reaches. A small
  #    gamma means a far reach, a large gamma means a narrow reach.
  # decision_function_shape: whether to return a one-vs-rest ('ovr') decision
  #    function like all other classifiers, or the original one-vs-one ('ovo')
  #    decision function.
  # random_state: controls the pseudo random number generation for shuffling
  #    the data for probability estimates.
  # probability: gives per-class scores for each sample.
  #
  # https://bkshin.tistory.com/entry/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D-3%EC%84%9C%ED%8F%AC%ED%8A%B8-%EB%B2%A1%ED%84%B0-%EB%A8%B8%EC%8B%A0-SVM-%EC%8B%A4%EC%8A%B5?category=1057680
  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

  # Both decision-tree grids differ only in the split criterion.
  tree_grid_common = {
      'splitter': ['best', 'random'],
      'max_depth': [2, 3, 4, 5, 6, 7, 8],
      'min_samples_split': [2, 3, 4, 5],
  }
  params = {
      'DTC_Gini': {'criterion': ['gini'], **tree_grid_common},
      'DTC_Entropy': {'criterion': ['entropy'], **tree_grid_common},
      'SVC': {
          'kernel': ["rbf", "poly", "sigmoid", "linear"],
          'gamma': [1e-3, 1e-2, 1e-1, 1, 1e+1],
          'C': [1, 5, 10, 50, 100],
          'decision_function_shape': ["ovr", "ovo"],
      },
      'LR': {
          'solver': ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
          "penalty": ["l2"],
          'C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
      },
  }

  # Fold counts tried for cross-validation: 2 through 10.
  fold_counts = set(range(2, 11))

  # One scaled version of X per scaler (do_scaling also adds the unscaled data).
  scaled_dict = do_scaling(
      [StandardScaler(), RobustScaler(), MinMaxScaler(), MaxAbsScaler()], X)

  # Each search helper is paired with the key of the grid it consumes; the
  # order matches the order in which results are accumulated.
  searches = (
      (find_DecisionTree_gini, 'DTC_Gini'),
      (find_DecisionTree_entropy, 'DTC_Entropy'),
      (find_SVC, 'SVC'),
      (find_LogisticRegression, 'LR'),
  )
  collected = []
  for search, grid_key in searches:
    collected.extend(search(scaled_dict, y, fold_counts, params[grid_key]))

  # Highest score first; the sort is stable, so ties keep accumulation order.
  return sorted(collected, key=lambda entry: entry[1]['score'], reverse=True)

* Main function contains 5 sub functions: 
  - do_scaling, find_DecisionTree_gini, find_DecisionTree_entropy, find_SVC, find_LogisticRegression

* Brief description about sub functions:
  - Our main function first calls do_scaling, which returns a dictionary whose keys are the scalers and whose values are the corresponding scaled datasets. Our team also added a "None" key to the dictionary so that non-scaled data is tested as well. The main function then calls find_DecisionTree_gini, find_DecisionTree_entropy, find_SVC, and find_LogisticRegression. Each of these returns a list containing the score, parameters, scaler, and model from each GridSearchCV run. The main function sorts the combined results by score and returns the sorted list. We can then take the best model and validate it on the test dataset.

* code for our sub functions:
  - do_scaling function code

In [None]:
def do_scaling(scalers, X):
  """Fit every scaler on X and collect the transformed datasets.

  Args:
    scalers: iterable of scaler objects exposing ``fit_transform``
      (ex: [StandardScaler(), MinMaxScaler()]). Each scaler is used as a
      dictionary key, so it must be hashable.
    X: original dataset to be scaled; it must provide ``copy()``.

  Returns:
    dict mapping each scaler to ``scaler.fit_transform(X.copy())``, plus the
    string key "None" mapped to an unscaled copy of X.
  """
  # Every scaler receives its own copy of X.
  scaled_X = {scaler: scaler.fit_transform(X.copy()) for scaler in scalers}
  # The non-scaled data is stored under the "None" key, after all scalers.
  scaled_X["None"] = X.copy()
  return scaled_X

* find_DecisionTree_gini function code:

In [None]:
def find_DecisionTree_gini(scaler_dict, y, cv, params):
  """Grid-search a gini decision tree for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``[model name, info]`` pairs, one per (scaler, cv) combination.
    ``info`` holds 'score' (best mean CV score), 'param' (best parameters),
    'scaler' (key taken from scaler_dict) and 'model' (the estimator with
    the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; the grid only overrides the parameters it lists.
  dtc = DecisionTreeClassifier(criterion='gini', random_state=1)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=dtc, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      return_list.append(['DecisionTreeClassifier(criterion=\'gini\')', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of dtc
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list

* find_DecisionTree_entropy function code:

In [None]:
def find_DecisionTree_entropy(scaler_dict, y, cv, params):
  """Grid-search an entropy decision tree for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``[model name, info]`` pairs, one per (scaler, cv) combination.
    ``info`` holds 'score' (best mean CV score), 'param' (best parameters),
    'scaler' (key taken from scaler_dict) and 'model' (the estimator with
    the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; criterion is set here so the search and the returned
  # model use entropy even if the grid does not list 'criterion'.
  dtc = DecisionTreeClassifier(criterion='entropy', random_state=1)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=dtc, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      # The label names the entropy criterion so these entries can be told
      # apart from the gini results in the merged, sorted list.
      return_list.append(['DecisionTreeClassifier(criterion=\'entropy\')', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of dtc
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list

* find_SVC function code:

In [None]:
def find_SVC(scaler_dict, y, cv, params):
  """Grid-search an SVC for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``['SVC', info]`` pairs, one per (scaler, cv) combination.
    ``info`` holds 'score' (best mean CV score), 'param' (best parameters),
    'scaler' (key taken from scaler_dict) and 'model' (the estimator with
    the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; the grid only overrides the parameters it lists.
  svc = SVC(probability=True, random_state=100)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=svc, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      return_list.append(['SVC', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of svc
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list

* find_LogisticRegression function code:

In [None]:
def find_LogisticRegression(scaler_dict, y, cv, params):
  """Grid-search a logistic regression for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``['LogisticRegression', info]`` pairs, one per (scaler, cv)
    combination. ``info`` holds 'score' (best mean CV score), 'param' (best
    parameters), 'scaler' (key taken from scaler_dict) and 'model' (the
    estimator with the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; the grid only overrides the parameters it lists.
  lr = LogisticRegression(random_state=1)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=lr, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      return_list.append(['LogisticRegression', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of lr
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list

## Entire code we developed for this programming homework:
  - Mount google drive

In [None]:
# Colab helper module used to attach Google Drive to the runtime.
from google.colab import drive

# Mount point for the drive; the data file path is built from it when the CSV is read.
DRIVE_PATH = '/content/drive'
drive.mount(DRIVE_PATH)

## Libraries import

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Constant value definition

In [None]:
# Tokens in the raw file that are parsed as missing values.
missing_values = ["?"]

# Name of the label column; it is also the last entry of `columns`.
target_column = 'Class'

# Column names assigned to the data file, in file order.
columns = [
    'Simple code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    target_column,
]

# Location of the data file, relative to the drive mount point.
FILE_PATH = 'MyDrive/machine_learning/data'
FILE_NAME = 'breast-cancer-wisconsin.data'

## Data reads

In [None]:
# Load the data file from the mounted drive. Column names are supplied
# explicitly via `names`, and every token listed in missing_values is
# parsed as NaN.
df = pd.read_csv(
    "/".join((DRIVE_PATH, FILE_PATH, FILE_NAME)),
    names=columns,
    na_values=missing_values,
)
# Display the loaded frame as the cell output.
df

In [None]:
# Keep a handle on the frame as loaded and continue working on a copy.
original_df = df
df = original_df.copy()
# Print column dtypes and non-null counts.
df.info()

# Count the missing values per column (shown as the cell output).
df.isna().sum()

## Preprocessing

In [None]:
# Drop every row that contains at least one missing value, modifying df in place.
df.dropna(axis=0, inplace=True)
# Print the frame summary again to check the result of the drop.
df.info()

In [None]:
# Drop the 'Simple code number' feature - doing so resulted in a higher score
df.drop(["Simple code number"], axis=1, inplace=True)

# Change target(Class) value (2 -> 0 / 4 -> 1).
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that chained form modifies a temporary object and leaves
# df unchanged when pandas Copy-on-Write is active.
df[target_column] = df[target_column].replace([2, 4], [0, 1])

# Split feature and target data. Index.difference returns the remaining column
# names in sorted order, so the columns of X are ordered alphabetically.
X, y = df[df.columns.difference([target_column])], df[[target_column]].values.ravel()
# Display X
X

In [None]:
# Display the target column (as a one-column DataFrame) as the cell output.
df[[target_column]]

In [None]:
## Split the data before searching for the best scaler / model / options
# The training part is what find_Scaler_Model is run on in the Modeling cell.
# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on only 30% - confirm this is intended.
# No random_state is passed, so each run draws a different shuffled partition.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.7, shuffle=True)

## Modeling functions

In [None]:
def do_scaling(scalers, X):
  """Fit every scaler on X and collect the transformed datasets.

  Args:
    scalers: iterable of scaler objects exposing ``fit_transform``
      (ex: [StandardScaler(), MinMaxScaler()]). Each scaler is used as a
      dictionary key, so it must be hashable.
    X: original dataset to be scaled; it must provide ``copy()``.

  Returns:
    dict mapping each scaler to ``scaler.fit_transform(X.copy())``, plus the
    string key "None" mapped to an unscaled copy of X.
  """
  # Every scaler receives its own copy of X.
  scaled_X = {scaler: scaler.fit_transform(X.copy()) for scaler in scalers}
  # The non-scaled data is stored under the "None" key, after all scalers.
  scaled_X["None"] = X.copy()
  return scaled_X
def find_DecisionTree_gini(scaler_dict, y, cv, params):
  """Grid-search a gini decision tree for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``[model name, info]`` pairs, one per (scaler, cv) combination.
    ``info`` holds 'score' (best mean CV score), 'param' (best parameters),
    'scaler' (key taken from scaler_dict) and 'model' (the estimator with
    the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; the grid only overrides the parameters it lists.
  dtc = DecisionTreeClassifier(criterion='gini', random_state=1)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=dtc, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      return_list.append(['DecisionTreeClassifier(criterion=\'gini\')', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of dtc
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list
def find_DecisionTree_entropy(scaler_dict, y, cv, params):
  """Grid-search an entropy decision tree for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``[model name, info]`` pairs, one per (scaler, cv) combination.
    ``info`` holds 'score' (best mean CV score), 'param' (best parameters),
    'scaler' (key taken from scaler_dict) and 'model' (the estimator with
    the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; criterion is set here so the search and the returned
  # model use entropy even if the grid does not list 'criterion'.
  dtc = DecisionTreeClassifier(criterion='entropy', random_state=1)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=dtc, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      # The label names the entropy criterion so these entries can be told
      # apart from the gini results in the merged, sorted list.
      return_list.append(['DecisionTreeClassifier(criterion=\'entropy\')', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of dtc
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list
def find_SVC(scaler_dict, y, cv, params):
  """Grid-search an SVC for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``['SVC', info]`` pairs, one per (scaler, cv) combination.
    ``info`` holds 'score' (best mean CV score), 'param' (best parameters),
    'scaler' (key taken from scaler_dict) and 'model' (the estimator with
    the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; the grid only overrides the parameters it lists.
  svc = SVC(probability=True, random_state=100)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=svc, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      return_list.append(['SVC', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of svc
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list
def find_LogisticRegression(scaler_dict, y, cv, params):
  """Grid-search a logistic regression for every scaled dataset and fold count.

  Args:
    scaler_dict: dict mapping a scaler (or the "None" key used for unscaled
      data) to the feature data produced with it.
    y: target labels aligned with the rows of every dataset in scaler_dict.
    cv: iterable of CV fold counts; one GridSearchCV run is made per value.
    params: parameter grid passed to GridSearchCV as ``param_grid``.

  Returns:
    list of ``['LogisticRegression', info]`` pairs, one per (scaler, cv)
    combination. ``info`` holds 'score' (best mean CV score), 'param' (best
    parameters), 'scaler' (key taken from scaler_dict) and 'model' (the
    estimator with the best parameters, fitted on the whole scaled dataset).
  """
  # Base estimator; the grid only overrides the parameters it lists.
  lr = LogisticRegression(random_state=1)

  return_list = []

  for scaler, scaled_X in scaler_dict.items():
    for i in cv:
      gd_sr = GridSearchCV(estimator=lr, param_grid=params, cv=i, n_jobs=-1,
                           refit=True)
      gd_sr.fit(scaled_X, y)

      return_list.append(['LogisticRegression', {
          'score': gd_sr.best_score_,
          'param': gd_sr.best_params_,
          'scaler': scaler,
          # With refit=True the search has already refitted a clone of lr
          # with the best parameters on all of scaled_X. Reusing it avoids a
          # second fit and keeps the model in sync with every key of the grid
          # instead of a hard-coded subset.
          'model': gd_sr.best_estimator_,
      }])

  return return_list
def find_Scaler_Model(X, y):
  """Search scaler / model / hyper-parameter combinations and rank them.

  Args:
    X: training feature data; it is handed to do_scaling together with the
      four scalers used here.
    y: target labels for X.

  Returns:
    list of ``[model name, info]`` entries sorted by descending
    ``info['score']``. ``info`` is a dict with the keys 'score' (best score),
    'param' (best parameters), 'model' (model generated by the best
    parameters) and 'scaler' (scaler used).
  """
  # *** Parameter notes ***
  # C: trade-off between a smooth decision boundary and classifying the
  #    training points correctly, i.e. it adjusts the margin. A smaller C
  #    tolerates more misclassification, a larger C tolerates less.
  # gamma: how far the influence of a single training point reaches. A small
  #    gamma means a far reach, a large gamma means a narrow reach.
  # decision_function_shape: whether to return a one-vs-rest ('ovr') decision
  #    function like all other classifiers, or the original one-vs-one ('ovo')
  #    decision function.
  # random_state: controls the pseudo random number generation for shuffling
  #    the data for probability estimates.
  # probability: gives per-class scores for each sample.
  #
  # https://bkshin.tistory.com/entry/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D-3%EC%84%9C%ED%8F%AC%ED%8A%B8-%EB%B2%A1%ED%84%B0-%EB%A8%B8%EC%8B%A0-SVM-%EC%8B%A4%EC%8A%B5?category=1057680
  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

  # Both decision-tree grids differ only in the split criterion.
  tree_grid_common = {
      'splitter': ['best', 'random'],
      'max_depth': [2, 3, 4, 5, 6, 7, 8],
      'min_samples_split': [2, 3, 4, 5],
  }
  params = {
      'DTC_Gini': {'criterion': ['gini'], **tree_grid_common},
      'DTC_Entropy': {'criterion': ['entropy'], **tree_grid_common},
      'SVC': {
          'kernel': ["rbf", "poly", "sigmoid", "linear"],
          'gamma': [1e-3, 1e-2, 1e-1, 1, 1e+1],
          'C': [1, 5, 10, 50, 100],
          'decision_function_shape': ["ovr", "ovo"],
      },
      'LR': {
          'solver': ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
          "penalty": ["l2"],
          'C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
      },
  }

  # Fold counts tried for cross-validation: 2 through 10.
  fold_counts = set(range(2, 11))

  # One scaled version of X per scaler (do_scaling also adds the unscaled data).
  scaled_dict = do_scaling(
      [StandardScaler(), RobustScaler(), MinMaxScaler(), MaxAbsScaler()], X)

  # Each search helper is paired with the key of the grid it consumes; the
  # order matches the order in which results are accumulated.
  searches = (
      (find_DecisionTree_gini, 'DTC_Gini'),
      (find_DecisionTree_entropy, 'DTC_Entropy'),
      (find_SVC, 'SVC'),
      (find_LogisticRegression, 'LR'),
  )
  collected = []
  for search, grid_key in searches:
    collected.extend(search(scaled_dict, y, fold_counts, params[grid_key]))

  # Highest score first; the sort is stable, so ties keep accumulation order.
  return sorted(collected, key=lambda entry: entry[1]['score'], reverse=True)

## Modeling

In [None]:
# Run the full search on the training split; entries come back sorted by
# descending score.
result = find_Scaler_Model(train_X, train_y)
# Print the top 5 results (a slice, so a shorter list cannot raise IndexError).
for entry in result[:5]:
  print(entry)

* ['SVC', {'score': 0.9950980392156863, 'param': {'C': 1, 'decision_function_shape': 'ovr', 'gamma': 0.001, 'kernel': 'linear'}, 'scaler': StandardScaler(), 'model': SVC(C=1, gamma=0.001, kernel='linear', probability=True, random_state=100)}]
* ['SVC', {'score': 0.9950980392156863, 'param': {'C': 50, 'decision_function_shape': 'ovr', 'gamma': 0.01, 'kernel': 'rbf'}, 'scaler': RobustScaler(), 'model': SVC(C=50, gamma=0.01, probability=True, random_state=100)}]
* ['SVC', {'score': 0.9950980392156863, 'param': {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 0.001, 'kernel': 'linear'}, 'scaler': MinMaxScaler(), 'model': SVC(C=5, gamma=0.001, kernel='linear', probability=True, random_state=100)}]
* ['SVC', {'score': 0.9950980392156863, 'param': {'C': 50, 'decision_function_shape': 'ovr', 'gamma': 0.1, 'kernel': 'sigmoid'}, 'scaler': MinMaxScaler(), 'model': SVC(C=50, gamma=0.1, kernel='sigmoid', probability=True, random_state=100)}]
* ['SVC', {'score': 0.9950980392156863, 'param': {'C': 5, 'decision_function_shape': 'ovr', 'gamma': 0.001, 'kernel': 'linear'}, 'scaler': MaxAbsScaler(), 'model': SVC(C=5, gamma=0.001, kernel='linear', probability=True, random_state=100)}]


In [None]:
# Evaluate the top-ranked combination on the hold-out rows produced by the
# train/test split that preceded the search. Drawing a fresh split from all of
# X here would put rows already used for model selection into the test set,
# and fitting the scaler on all of X would leak test statistics into the
# preprocessing, so the scaler is fitted on the training rows only and merely
# applied to the test rows.
best = result[0][1]
if best['scaler'] == "None":  # key used by do_scaling for the non-scaled data
  eval_train_X, eval_test_X = train_X.copy(), test_X.copy()
else:
  eval_train_X = best['scaler'].fit_transform(train_X.copy())
  eval_test_X = best['scaler'].transform(test_X.copy())
model = best['model'].fit(eval_train_X, train_y)
print("Model score: ", end="")
print(model.score(eval_test_X, test_y))