In [20]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo
import random

## Download Data

### Bank Marketing

In [2]:
bank_marketing = fetch_ucirepo(id=222)
X_bm = bank_marketing.data.features
Y_bm = bank_marketing.data.targets

### Adult Income

In [3]:
adult = fetch_ucirepo(id=2)
X_adult = adult.data.features
Y_adult = adult.data.targets

### Skin Segmentation

In [4]:
skin_segmentation = fetch_ucirepo(id=229) 
X_sk = skin_segmentation.data.features 
Y_sk = skin_segmentation.data.targets

### Credit Card Defaults

In [5]:
default_of_credit_card_clients = fetch_ucirepo(id=350) 
X_cc = default_of_credit_card_clients.data.features 
Y_cc = default_of_credit_card_clients.data.targets

## Clean the Data

### Bank Marketing

In [6]:
X_bm = X_bm.fillna('unknown')
X_bm = pd.get_dummies(X_bm, columns=X_bm.select_dtypes(include=['object']).columns).astype(int)
X_bm = X_bm.to_numpy()
X_bm

array([[  58, 2143,    5, ...,    0,    0,    1],
       [  44,   29,    5, ...,    0,    0,    1],
       [  33,    2,    5, ...,    0,    0,    1],
       ...,
       [  72, 5715,   17, ...,    0,    1,    0],
       [  57,  668,   17, ...,    0,    0,    1],
       [  37, 2971,   17, ...,    1,    0,    0]])

In [27]:
X_bm_short = X_bm[:10000]
X_bm_short

array([[  58, 2143,    5, ...,    0,    0,    1],
       [  58, 2143,    5, ...,    0,    0,    1],
       [  44,   29,    5, ...,    0,    0,    1],
       ...,
       [  53,  315,    5, ...,    0,    0,    1],
       [  29,  220,    4, ...,    0,    0,    1],
       [  55,    0,    5, ...,    0,    0,    1]])

In [7]:
Y_bm.loc[:, 'y'] = Y_bm['y'].map({'yes': 1, 'no': 0})
Y_bm = Y_bm.to_numpy().astype(int)
Y_bm

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]])

In [30]:
Y_bm_short = Y_bm[:10000]

### Adult Income

In [8]:
X_adult = X_adult.dropna()
X_adult = pd.get_dummies(X_adult, columns=X_adult.select_dtypes(include=['object']).columns).astype(int)
X_adult = X_adult.to_numpy()
X_adult

array([[    39,  77516,     13, ...,      1,      0,      0],
       [    50,  83311,     13, ...,      1,      0,      0],
       [    38, 215646,      9, ...,      1,      0,      0],
       ...,
       [    38, 374983,     13, ...,      1,      0,      0],
       [    44,  83891,     13, ...,      1,      0,      0],
       [    35, 182148,     13, ...,      1,      0,      0]])

In [31]:
X_adult_short = X_adult[:10000]

In [9]:
Y_adult.loc[:, 'income'] = Y_adult['income'].map({'<=50K': 0, '<=50K.': 0, '>50K.': 1, '>50K': 1})
Y_adult = Y_adult.to_numpy().astype(int)
Y_adult

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]])

In [32]:
Y_adult_short = Y_adult[:10000]

### Skin Segmentation

In [10]:
X_sk = X_sk.to_numpy()
X_sk

array([[ 74,  85, 123],
       [ 73,  84, 122],
       [ 72,  83, 121],
       ...,
       [163, 162, 112],
       [163, 162, 112],
       [255, 255, 255]])

In [33]:
X_sk_short = X_sk[:10000]

In [11]:
Y_sk.loc[:, 'y'] = Y_sk['y'].map({1: 0, 2: 1})
Y_sk = Y_sk.to_numpy().astype(int)
Y_sk

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [34]:
Y_sk_short = Y_sk[:10000]

### Credit Card Defaults

In [12]:
X_cc = X_cc.to_numpy()
X_cc

array([[ 20000,      2,      2, ...,      0,      0,      0],
       [120000,      2,      2, ...,   1000,      0,   2000],
       [ 90000,      2,      2, ...,   1000,   1000,   5000],
       ...,
       [ 30000,      1,      2, ...,   4200,   2000,   3100],
       [ 80000,      1,      3, ...,   1926,  52964,   1804],
       [ 50000,      1,      2, ...,   1000,   1000,   1000]])

In [35]:
X_cc_short = X_cc[:10000]

In [13]:
Y_cc = Y_cc.to_numpy().astype(int)
Y_cc

array([[1],
       [1],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [36]:
Y_cc_short = Y_cc[:10000]

## Create Model

In [37]:
datasets = {
    'bm': (X_bm_short, Y_bm_short),
    'adult': (X_adult_short, Y_adult_short),
    'sk': (X_sk_short, Y_sk_short),
    'cc': (X_cc_short, Y_cc_short)
}

In [46]:
classifiers = {
    'SVM': {
        'model': SVC(),
        'param_grid': {
            'kernel': ['linear', 'rbf'],
            'C': [0.1, 1],
            'gamma': [0.001, 0.01],
        },
    },
    'Neural Network': {
        'model': MLPClassifier(max_iter=1000),
        'param_grid': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001],
            'learning_rate': ['constant'],
        },
    },
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000),
        'param_grid': {
            'C': [0.01, 0.1, 1],
            'penalty': ['l2'],
        },
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'param_grid': {
            'n_neighbors': [3, 5, 10],
            'weights': ['uniform', 'distance'],
        },
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
        },
    },
}

In [47]:
partitions = {
    '20/80': 0.8,
    '50/50': 0.5,
    '80/20': 0.2
}

In [95]:
Y_cc.shape

Y_ccc = Y_cc.ravel()
Y_ccc.shape

(30000,)

In [104]:
X_bm = X_bm.to_numpy()

In [108]:
Y_bm = Y_bm.astype(int)

In [48]:
results = []

for dataset_name, dataset in datasets.items():
    X, Y = dataset[0], dataset[1]
    Y = Y.ravel()
    for partition_name, test_size in partitions.items():
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
        for clf_name, clf_info in classifiers.items():
            print(f'Processing: {dataset_name}, Classifier: {clf_name}, Partition: {partition_name}')

            grid_search = GridSearchCV(
                estimator=clf_info['model'],
                param_grid=clf_info['param_grid'],
                cv=3,
                scoring='accuracy',
                n_jobs=-1,
                verbose=2,
                return_train_score=True
            )

            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            cv_results = grid_search.cv_results_
            
            #train_accuracy = accuracy_score(y_train, best_model.predict(X_train))
            #test_accuracy = accuracy_score(y_test, best_model.predict(X_test))
            #val_accuracy = grid_search.best_score_

            avg_train_accuracy = np.mean(cv_results['mean_train_score'])
            avg_val_accuracy = np.mean(cv_results['mean_test_score'])
            avg_test_accuracy = np.mean(cv_results['split0_test_score'])

            results.append({
                'Dataset': dataset_name,
                'Classifier': clf_name,
                'Partition': partition_name,
                'Best Hyperparameters': grid_search.best_params_,
                'Train Accuracy (Avg)': avg_train_accuracy,
                'Validation Accuracy (Avg)': avg_val_accuracy,
                'Test Accuracy (Avg)': avg_test_accuracy,
            })

Processing: bm, Classifier: SVM, Partition: 20/80
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.2s
[CV] END ....................C=1, gamma=0.001, kernel=linear; total time= 9.6min
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.3s
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.3s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.3s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   0.2s
[CV] END .....................C=1, gamma=0.01, kernel=linear; total time= 9.6min
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.2s
[CV] END ....................C=1, gamma=0.001, kernel=linear; total time=10.9min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.3s

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Processing: bm, Classifier: KNN, Partition: 20/80
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Processing: bm, Classifier: Random Forest, Partition: 20/80
Fitting 3 folds for each of 18 candidates, totalling 54 fits


KeyboardInterrupt: 

In [49]:
cv_results

{'mean_fit_time': array([0.00121236, 0.00175166, 0.00172853, 0.00213059, 0.00177439,
        0.00190123]),
 'std_fit_time': array([3.58859379e-04, 4.10340517e-04, 7.89697138e-05, 2.47574503e-04,
        6.41664479e-05, 3.00959575e-04]),
 'mean_score_time': array([0.09920828, 0.0403417 , 0.12257735, 0.05972409, 0.13399323,
        0.04910644]),
 'std_score_time': array([0.00368872, 0.00649639, 0.0048959 , 0.00794803, 0.01185344,
        0.00585076]),
 'param_n_neighbors': masked_array(data=[3, 3, 5, 5, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                    'uniform', 'distance'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 3, 'weights': 'uniform'},
  {'n_neighbors': 3, 'weights': 'distance'},
  {'n_neighbors': 5, 'weights': 'uniform'},
  {'n_ne

In [45]:
grid_search.best_score_

0.9635002318660489