<a href="https://colab.research.google.com/github/Leomutz/NIDS1/blob/main/Hyperparameters_tuning_26November2024v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**K-fold CV - Hyperparameter tuning**

https://medium.com/@24littledino/k-fold-cv-hyper-parameter-tuning-in-python-4ad95880e477
- cross validation for hyperparameter tuning: finding the best model with certain hyper-parameters combination
- we can use grid search with for loops
- perform K-fold CV on every model, then select the one with the best average accuracies

- The scoring measure is a dictionary with key *test_score* (an array with one performance measure for each fold). Then take average of these scores as a general representation of model performance
- we use only training data inside the *cross_validate* function

In [10]:
# import needed packages

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from prettytable import PrettyTable

In [11]:
# reading data
data = pd.read_csv('/content/iris.csv')

In [13]:
# view data
data.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [14]:
# Split the data into train and test set

# take four columns
data_chars = data.iloc[:, :4]
# take the target (last column)
target = data.iloc[:, -1]

# split the data
x_train, x_test, y_train, y_test = train_test_split(
    data_chars,target,test_size=0.2, random_state=42)

In [16]:
# cross validation for hyperparameter tuning
hyperpar_score_list = []
for p in range(1,4):
  for neighbor in range(2,7):
    knn = KNeighborsClassifier(p=p, n_neighbors=neighbor)
    scores = cross_validate(knn, x_train, y_train, cv=10, scoring='accuracy')
    mean_score = np.mean(scores['test_score'])
    hyperpar_score_list.append([p, neighbor, mean_score])

In [18]:
# view the hyperparameters obtained
myTable = PrettyTable(['p (distance)', 'Number of neighbors', 'Avg accuracy'])

for i in hyperpar_score_list:
  myTable.add_row([i[0], i[1], round(i[2],2) ])
print(myTable)

+--------------+---------------------+--------------+
| p (distance) | Number of neighbors | Avg accuracy |
+--------------+---------------------+--------------+
|      1       |          2          |     0.92     |
|      1       |          3          |     0.95     |
|      1       |          4          |     0.93     |
|      1       |          5          |     0.93     |
|      1       |          6          |     0.93     |
|      2       |          2          |     0.93     |
|      2       |          3          |     0.95     |
|      2       |          4          |     0.94     |
|      2       |          5          |     0.94     |
|      2       |          6          |     0.96     |
|      3       |          2          |     0.93     |
|      3       |          3          |     0.95     |
|      3       |          4          |     0.95     |
|      3       |          5          |     0.97     |
|      3       |          6          |     0.96     |
+--------------+------------

- The best parameters are p=3 and n_neighbors=5 which gives the average accuracy of 0.97

- Now we can train the model and evaluate the model's performance on the test data

In [19]:
# train the model now
knn = KNeighborsClassifier(p=3, n_neighbors=5)
knn_best_model = knn.fit(x_train, y_train)

print(f'Best Model Testing Score: {round(knn_best_model.score(x_test,y_test)*100,2)}')

Best Model Testing Score: 96.67


In [21]:
# Convert arff file to csv
# https://github.com/mfahadzafar/Convert-Arff-to-CSV/blob/master/arffToCsv.py

def getCSVFromArff(fileName):
	with open(fileName + '.arff', 'r') as fin:
		data = fin.read().splitlines(True)
	i = 0
	cols = []
	for line in data:
		line = line.lower()
		if ('@data' in line):
			i+= 1
			break
		else:
			#print line
			i+= 1
			if (line.startswith('@attribute')):
				if('{' in line):
					cols.append(line[11:line.index('{')-1])
				else:
					cols.append(line[11:line.index(' ', 11)])
	headers = ",".join(cols)
	with open(fileName + '.csv', 'w') as fout:
		fout.write(headers)
		fout.write('\n')
		fout.writelines(data[i:])

getCSVFromArff("/content/adult-census")

In [27]:
adult_census = pd.read_csv('/content/adult-census.csv')

In [28]:
target_name = 'class'
target = adult_census[target_name]
target

Unnamed: 0,class
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
32556,<=50K
32557,>50K
32558,<=50K
32559,<=50K


In [29]:
target.unique()

array(['<=50K', '>50K'], dtype=object)

In [30]:
adult_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               32561 non-null  int64 
 1   age              32561 non-null  int64 
 2   workclass        32561 non-null  object
 3   fnlwgt:          32561 non-null  int64 
 4   education:       32561 non-null  object
 5   education-num:   32561 non-null  int64 
 6   marital-status:  32561 non-null  object
 7   occupation:      32561 non-null  object
 8   relationship:    32561 non-null  object
 9   race:            32561 non-null  object
 10  sex:             32561 non-null  object
 11  capital-gain:    32561 non-null  int64 
 12  capital-loss:    32561 non-null  int64 
 13  hours-per-week:  32561 non-null  int64 
 14  native-country:  32561 non-null  object
 15  class            32561 non-null  object
dtypes: int64(7), object(9)
memory usage: 4.0+ MB


In [32]:
# drop 'education-num' is a duplucate of 'education'
# drop the target column 'class'

data_census = adult_census.drop(columns=[target_name, 'education-num:'])


In [33]:

# view the data
data_census

Unnamed: 0,id,age,workclass,fnlwgt:,education:,marital-status:,occupation:,relationship:,race:,sex:,capital-gain:,capital-loss:,hours-per-week:,native-country:
0,1,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,2,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,3,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,4,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,5,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32557,27,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,32558,40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,32559,58,Private,151910,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,32560,22,Private,201490,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [34]:
# split the data into train and test
data_train,data_test,target_train, target_test = train_test_split(
    data_census, target, random_state=42)

In [35]:
# create a predictive pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

categorical_preprocessor = OrdinalEncoder(
    handle_unknown='use_encoded_value', unknown_value=-1
)

preprocessor = ColumnTransformer(
    [('cat_preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough'
)

In [36]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([('preprocessor', preprocessor),
                  ('classifier', HistGradientBoostingClassifier(random_state=42,
                                                                max_leaf_nodes=4))])

In [37]:
model

**Randomized-search Tuning**

https://inria.github.io/scikit-learn-mooc/python_scripts/parameter_tuning_randomized_search.html

- generate randomly the parameters, which avoids the regularity of the grid
- Use RandomizedSearchCV class allows stochastic search
- log-uniform distribution is used because the parameters interested in take positive values with a batural log scaling
- to optimize 3 or more parameters is better to use random search  

In [40]:
# randomized search

%%time
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

param_distributions = {
    'classifier__l2_regularization': loguniform(1e-6, 1e3),
    'classifier__learning_rate': loguniform(0.001, 10),
    'classifier__max_leaf_nodes': loguniform(2, 256),
    'classifier__min_samples_leaf': loguniform(1, 100),
    'classifier__max_bins': loguniform(2, 255)
}

model_random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    verbose=1,
)

model_random_search.fit(data_train, target_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'class'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_indexing.py", line 361, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'class'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/local/lib/python3.10/dist-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/compose/_column_transformer.py", line 968, in fit_transform
    self._validate_column_callables(X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/compose/_column_transformer.py", line 536, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_indexing.py", line 369, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


In [41]:
# then find the accuracy score on the test set
accuracy = model_random_search.score(data_test, target_test)
print(f'The test accuracy score of the best model is {accuracy:.2f}')

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [42]:
from pprint import pprint

print('The best parameters are:')
pprint(model_random_search.best_params_)

The best parameters are:


AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [45]:
# view the results using attributes cv_results

# get the parameter names
column_results = [f'param_{name}' for name in param_distributions.keys()]
column_results += ['mean_test_score', 'std_test_score', 'rank_test_score']

cv_results = pd.DataFrame(model_random_search.cv_results_)
cv_results = cv_results[column_results].sort_values(
    'mean_test_score', ascending=False
)

def shorten_param(param_name):
  if "__" in param_name:
    return param_name.rsplit("__", 1)[1]
  return param_name

cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

AttributeError: 'RandomizedSearchCV' object has no attribute 'cv_results_'