#### Auto-SKlearn (AutoML for Scikit-learn)

- Google Colab 에서 코드 실행을 추천합니다. (Windows OS Local에서 실행 및 설치 X)
- Windows OS 에서는 사용 불가 (다음 중 하나를 활용 가능 : Windows 10 bash shell / virtual machine / docker image)
- Installation : http://j.mp/2HWDKUF
- 공식 문서 : http://j.mp/2HRtIUv

#### 1. Installation

In [0]:
# Installation 1 of 3
!sudo apt-get install build-essential swig

In [0]:
# Installation 2 of 3
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install

In [0]:
# Installation 3 of 3
!pip install auto-sklearn

In [0]:
# # import autosklearn 중 numpy or scikit-learn 패키지 관련 에러 발생 시 (아래 항목을 하나씩 차례대로 실행)
# !pip uninstall -y numpy
# !pip install numpy # ==1.9.0
# !pip uninstall -y scikit-learn
# !pip install scikit-learn # ==0.19

#### 2. Basic usage 

In [0]:
# import autosklearn.classification

# cls = autosklearn.classification.AutoSklearnClassifier()
# cls.fit(X_train, y_train)
# predictions = cls.predict(X_test)

#### 3. Training & Test (vs SVM)

In [0]:
import autosklearn.classification
from sklearn import model_selection, datasets, metrics, preprocessing, svm

In [0]:
# X, y = datasets.load_digits(return_X_y=True) # 공식문서에 나와 있는 example에서 사용하는 데이터
X, y = datasets.load_breast_cancer(return_X_y=True) # return_X_y : data & target 을 바로 X & y 로 꽂아줌 (return)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=0)

#### 1) Baseline with SVM & GridSearchCV

In [25]:
sc = preprocessing.StandardScaler()
sc.fit(X_train) 
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

## SVM without GridSearchCV
# model = svm.SVC()
# model.fit(X_train_scaled, y_train)

param_grid = {'C' : [0.1, 1, 10, 100, 1000], 
             'gamma' : [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel' : ['rbf']}

grid = model_selection.GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=1, cv=3)
grid.fit(X_train_scaled, y_train)
print('\nThe best parameters are ', grid.best_params_)

grid_predictions = grid.predict(X_test_scaled)

print()
print(metrics.classification_report(y_test, grid_predictions)) 
print("훈련 세트 정확도: {:.3f}".format(grid.score(X_train_scaled, y_train)))
print("테스트 세트 정확도: {:.3f}".format(grid.score(X_test_scaled, y_test)))

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.



The best parameters are  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        53
           1       0.98      0.99      0.98        90

   micro avg       0.98      0.98      0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143

훈련 세트 정확도: 0.986
테스트 세트 정확도: 0.979


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.7s finished


#### 2) Use AutoSKlearn.Classification

In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
model = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=3600) # default 3600 seconds # use AutoSklearnRegressor() for Regression

In [0]:
model.fit(X_train, y_train) # Feature Scaling 불필요

## the result printed after training finished
# AutoSklearnClassifier(delete_output_folder_after_terminate=True,
#            delete_tmp_folder_after_terminate=True,
#            disable_evaluator_output=False, ensemble_memory_limit=1024,
#            ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
#            exclude_preprocessors=None, get_smac_object_callback=None,
#            include_estimators=None, include_preprocessors=None,
#            initial_configurations_via_metalearning=25, logging_config=None,
#            ml_memory_limit=3072, output_folder=None,
#            per_run_time_limit=360, resampling_strategy='holdout',
#            resampling_strategy_arguments=None, seed=1, shared_mode=False,
#            smac_scenario_args=None, time_left_for_this_task=3600,
#            tmp_folder=None)

In [29]:
prediction = model.predict(X_test) 
print("Accuracy score", metrics.accuracy_score(y_test, prediction)) # other autosklearn built-in metrics @ https://goo.gl/EgzYGZ

Accuracy score 0.972027972027972


In [30]:
statistics = model.sprint_statistics()
print(statistics)

auto-sklearn results:
  Dataset name: 528ab82dffffb5039325fc43a2c64979
  Metric: accuracy
  Best validation score: 0.985816
  Number of target algorithm runs: 1612
  Number of successful target algorithm runs: 1577
  Number of crashed target algorithm runs: 35
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0



In [31]:
trained_models = model.show_models()
print(trained_models)

[(0.060000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'categorical_encoding:__choice__': 'no_encoding', 'classifier:__choice__': 'liblinear_svc', 'imputation:strategy': 'median', 'preprocessor:__choice__': 'select_rates', 'rescaling:__choice__': 'standardize', 'classifier:liblinear_svc:C': 1.0, 'classifier:liblinear_svc:dual': 'False', 'classifier:liblinear_svc:fit_intercept': 'True', 'classifier:liblinear_svc:intercept_scaling': 1, 'classifier:liblinear_svc:loss': 'squared_hinge', 'classifier:liblinear_svc:multi_class': 'ovr', 'classifier:liblinear_svc:penalty': 'l2', 'classifier:liblinear_svc:tol': 0.00010000000000000009, 'preprocessor:select_rates:alpha': 0.0870708305464051, 'preprocessor:select_rates:mode': 'fpr', 'preprocessor:select_rates:score_func': 'f_classif'},
dataset_properties={
  'task': 1,
  'sparse': False,
  'multilabel': False,
  'multiclass': False,
  'target_type': 'classification',
  'signed': False})),
(0.060000, SimpleClassificationPipeline({'ba

In [32]:
models_with_weights = model.get_models_with_weights()
print(models_with_weights)

[(0.06, SimpleClassificationPipeline({'balancing:strategy': 'none', 'categorical_encoding:__choice__': 'no_encoding', 'classifier:__choice__': 'liblinear_svc', 'imputation:strategy': 'median', 'preprocessor:__choice__': 'select_rates', 'rescaling:__choice__': 'standardize', 'classifier:liblinear_svc:C': 1.0, 'classifier:liblinear_svc:dual': 'False', 'classifier:liblinear_svc:fit_intercept': 'True', 'classifier:liblinear_svc:intercept_scaling': 1, 'classifier:liblinear_svc:loss': 'squared_hinge', 'classifier:liblinear_svc:multi_class': 'ovr', 'classifier:liblinear_svc:penalty': 'l2', 'classifier:liblinear_svc:tol': 0.00010000000000000009, 'preprocessor:select_rates:alpha': 0.0870708305464051, 'preprocessor:select_rates:mode': 'fpr', 'preprocessor:select_rates:score_func': 'f_classif'},
dataset_properties={
  'task': 1,
  'sparse': False,
  'multilabel': False,
  'multiclass': False,
  'target_type': 'classification',
  'signed': False})), (0.06, SimpleClassificationPipeline({'balancing:

In [33]:
params = model.get_params()
print(params)

{'delete_output_folder_after_terminate': True, 'delete_tmp_folder_after_terminate': True, 'disable_evaluator_output': False, 'ensemble_memory_limit': 1024, 'ensemble_nbest': 50, 'ensemble_size': 50, 'exclude_estimators': None, 'exclude_preprocessors': None, 'get_smac_object_callback': None, 'include_estimators': None, 'include_preprocessors': None, 'initial_configurations_via_metalearning': 25, 'logging_config': None, 'ml_memory_limit': 3072, 'output_folder': None, 'per_run_time_limit': 360, 'resampling_strategy': 'holdout', 'resampling_strategy_arguments': None, 'seed': 1, 'shared_mode': False, 'smac_scenario_args': None, 'time_left_for_this_task': 3600, 'tmp_folder': None}
