In [1]:
!pip install auto-sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import autosklearn
print('autosklearn: %s' % autosklearn.__version__)

autosklearn: 0.15.0


In [4]:
import pandas as pd
import sklearn.metrics 
from sklearn.model_selection import train_test_split,StratifiedKFold
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import (accuracy,f1,roc_auc,precision,average_precision,recall,log_loss)

Load the dataset 

In [7]:
df = pd.read_csv("https://raw.githubusercontent.com/Rishav-hub/Auto-sklearn/main/bank-additional-full.csv", sep= ";")

In [8]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


##Now preparing the Data
Auto-Sklearn requires us to identify is a column is numerical categorical either in the pandas dataframe or we can do it later in the fit function. Lets convert it now. 

In [9]:
num_cols = ['duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
df[num_cols] = df[num_cols].apply(pd.to_numeric)
df[cat_cols] = df[cat_cols].apply(pd.Categorical)

In [10]:
y = df.pop('y')
X = df.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1, stratify=y)
#This stratify parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify.
#For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, 
#stratify=y will make sure that your random split has 25% of 0's and 75% of 1's

In [11]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

##Starting the classifier


In [16]:
skf = StratifiedKFold(n_splits=5)
  
clf = AutoSklearnClassifier(time_left_for_this_task=200,
                            max_models_on_disc=5,
                            memory_limit = 10240,
                            resampling_strategy=skf,
                            ensemble_size = 3,
                            metric = average_precision,
                            scoring_functions=[roc_auc, average_precision, accuracy, f1, precision, recall, log_loss])

  clf = AutoSklearnClassifier(time_left_for_this_task=200,


##Fitting the classifier

In [17]:
clf.fit(X = X_train,y = y_train)



AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      ensemble_kwargs={'ensemble_size': 3}, ensemble_size=3,
                      max_models_on_disc=5, memory_limit=10240,
                      metric=average_precision, per_run_time_limit=20,
                      resampling_strategy=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                      scoring_functions=[roc_auc, average_precision, accuracy,
                                         f1, precision, recall, log_loss],
                      time_left_for_this_task=200)

In [18]:
df_cv_results = pd.DataFrame(clf.cv_results_).sort_values(by = 'mean_test_score', ascending = False)
df_cv_results

Unnamed: 0,mean_test_score,rank_test_scores,metric_roc_auc,metric_average_precision,metric_accuracy,metric_f1,metric_precision,metric_recall,metric_log_loss,mean_fit_time,params,status,budgets,param_balancing:strategy,param_classifier:__choice__,param_data_preprocessor:__choice__,param_feature_preprocessor:__choice__,param_classifier:adaboost:algorithm,param_classifier:adaboost:learning_rate,param_classifier:adaboost:max_depth,param_classifier:adaboost:n_estimators,param_classifier:bernoulli_nb:alpha,param_classifier:bernoulli_nb:fit_prior,param_classifier:decision_tree:criterion,param_classifier:decision_tree:max_depth_factor,param_classifier:decision_tree:max_features,param_classifier:decision_tree:max_leaf_nodes,param_classifier:decision_tree:min_impurity_decrease,param_classifier:decision_tree:min_samples_leaf,param_classifier:decision_tree:min_samples_split,param_classifier:decision_tree:min_weight_fraction_leaf,param_classifier:extra_trees:bootstrap,param_classifier:extra_trees:criterion,param_classifier:extra_trees:max_depth,param_classifier:extra_trees:max_features,param_classifier:extra_trees:max_leaf_nodes,param_classifier:extra_trees:min_impurity_decrease,param_classifier:extra_trees:min_samples_leaf,param_classifier:extra_trees:min_samples_split,param_classifier:extra_trees:min_weight_fraction_leaf,...,param_feature_preprocessor:nystroem_sampler:n_components,param_feature_preprocessor:pca:keep_variance,param_feature_preprocessor:pca:whiten,param_feature_preprocessor:polynomial:degree,param_feature_preprocessor:polynomial:include_bias,param_feature_preprocessor:polynomial:interaction_only,param_feature_preprocessor:random_trees_embedding:bootstrap,param_feature_preprocessor:random_trees_embedding:max_depth,param_feature_preprocessor:random_trees_embedding:max_leaf_nodes,param_feature_preprocessor:random_trees_embedding:min_samples_leaf,param_feature_preprocessor:random_trees_embedding:min_samples_split,param_feature_preprocessor:random_trees_embedding:min_weight_fraction_leaf,param_feature_preprocessor:random_trees_embedding:n_estimators,param_feature_preprocessor:select_percentile_classification:percentile,param_feature_preprocessor:select_percentile_classification:score_func,param_feature_preprocessor:select_rates_classification:alpha,param_feature_preprocessor:select_rates_classification:score_func,param_classifier:gradient_boosting:n_iter_no_change,param_classifier:gradient_boosting:validation_fraction,param_classifier:lda:shrinkage_factor,param_classifier:libsvm_svc:coef0,param_classifier:libsvm_svc:degree,param_classifier:mlp:validation_fraction,param_classifier:sgd:epsilon,param_classifier:sgd:eta0,param_classifier:sgd:l1_ratio,param_classifier:sgd:power_t,param_data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:n_quantiles,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:output_distribution,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min,param_feature_preprocessor:fast_ica:n_components,param_feature_preprocessor:kernel_pca:coef0,param_feature_preprocessor:kernel_pca:degree,param_feature_preprocessor:kernel_pca:gamma,param_feature_preprocessor:nystroem_sampler:coef0,param_feature_preprocessor:nystroem_sampler:degree,param_feature_preprocessor:nystroem_sampler:gamma,param_feature_preprocessor:select_rates_classification:mode
0,0.0,1,,0.0,,,,,,20.016864,"{'balancing:strategy': 'none', 'classifier:__c...",Timeout,0.0,none,random_forest,feature_type,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.01,,,,,,,,,,,,
1,0.0,1,,0.0,,,,,,20.03233,"{'balancing:strategy': 'weighting', 'classifie...",Timeout,0.0,weighting,random_forest,feature_type,polynomial,,,,,,,,,,,,,,,,,,,,,,,,...,,,,3.0,False,False,,,,,,,,,,,,,,,,,,,,,,0.062054,,,,,,,,,,,,
2,0.0,1,,0.0,,,,,,20.025667,"{'balancing:strategy': 'none', 'classifier:__c...",Timeout,0.0,none,random_forest,feature_type,polynomial,,,,,,,,,,,,,,,,,,,,,,,,...,,,,3.0,True,False,,,,,,,,,,,,,,,,,,,,,,,1062.0,uniform,,,,,,,,,,
3,0.0,1,,0.0,,,,,,20.02024,"{'balancing:strategy': 'weighting', 'classifie...",Timeout,0.0,weighting,gradient_boosting,feature_type,polynomial,,,,,,,,,,,,,,,,,,,,,,,,...,,,,2.0,True,True,,,,,,,,,,,,5.0,,,,,,,,,,0.01,,,,,,,,,,,,
4,0.0,1,,0.0,,,,,,20.016727,"{'balancing:strategy': 'none', 'classifier:__c...",Timeout,0.0,none,mlp,feature_type,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,88.264042,mutual_info,,,,,,,,,,,,,,,,0.768284,0.241008,,,,,,,,
5,0.0,1,,0.0,,,,,,20.028953,"{'balancing:strategy': 'none', 'classifier:__c...",Timeout,0.0,none,adaboost,feature_type,feature_agglomeration,SAMME,0.403408,7.0,280.0,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1440.0,normal,,,,,,,,,,
6,0.0,1,,0.0,,,,,,20.017107,"{'balancing:strategy': 'weighting', 'classifie...",Timeout,0.0,weighting,adaboost,feature_type,no_preprocessing,SAMME.R,0.03389,9.0,374.0,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.003266,,,,,,,,,,,,
7,0.0,1,,0.0,,,,,,20.017824,"{'balancing:strategy': 'weighting', 'classifie...",Timeout,0.0,weighting,mlp,feature_type,feature_agglomeration,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,0.0,1,,0.0,,,,,,15.024815,"{'balancing:strategy': 'none', 'classifier:__c...",Timeout,0.0,none,extra_trees,feature_type,fast_ica,,,,,,,,,,,,,,,False,gini,,0.568423,,0.0,2.0,11.0,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.015768,,,0.747942,0.187013,18.0,,,,,,,


In [21]:
clf.get_models_with_weights()

[(1.0, MyDummyClassifier(config=1,
                    feat_type={'age': 'numerical', 'campaign': 'numerical',
                               'cons.conf.idx': 'numerical',
                               'cons.price.idx': 'numerical',
                               'contact': 'categorical',
                               'day_of_week': 'categorical',
                               'default': 'categorical', 'duration': 'numerical',
                               'education': 'categorical',
                               'emp.var.rate': 'numerical',
                               'euribor3m': 'numerical', 'housing': 'categorical',
                               'job': 'categor...
                                                                 'default': 'categorical',
                                                                 'duration': 'numerical',
                                                                 'education': 'categorical',
                                        

##Refit the Model 

In [22]:
clf.refit(X = X_train, y = y_train)

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      ensemble_kwargs={'ensemble_size': 3}, ensemble_size=3,
                      max_models_on_disc=5, memory_limit=10240,
                      metric=average_precision, per_run_time_limit=20,
                      resampling_strategy=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                      scoring_functions=[roc_auc, average_precision, accuracy,
                                         f1, precision, recall, log_loss],
                      time_left_for_this_task=200)

##Save Model 

In [23]:
import joblib
joblib.dump(clf, 'model.joblib')

['model.joblib']

##Loading & Prediction 

In [25]:
from sklearn.metrics import accuracy_score
model = joblib.load("model.joblib")
y_hat = model.predict(X_test)
acc = accuracy_score(y_test, y_hat)
print("Accuracy: %.3f" % acc)

Accuracy: 0.887
