# Estinmating the cost with cross-validation
We mentioned that there are 3 ways of estimating the cost
* Domain expert provides the cost
* Balance Ratio (we did this in previous notebook)
* Cross-validation: find the cost as hyperparameter

In this notebook, we will find the cost with hyperparemeter tuning and cross-validation.

In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# load data
data = pd.read_csv('../datasets/kdd2004.csv').sample(10000)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
43297,52.05,24.36,-0.66,-24.5,55.0,1460.4,1.28,-1.48,-6.0,-41.5,...,1687.5,-0.32,-1.48,4.0,-38.0,159.5,1.11,-0.03,-0.74,-1
71809,60.09,25.56,-0.2,-26.0,26.0,2030.8,0.67,0.53,10.0,-85.5,...,2740.7,-0.83,-0.33,-1.0,-51.0,346.6,-0.18,0.56,0.45,-1
100193,85.05,25.39,1.67,-9.5,-31.5,3843.0,-0.55,1.2,-12.5,-141.0,...,3072.8,-0.88,0.96,7.0,-167.0,1735.9,-1.6,0.37,0.81,-1
107972,69.28,21.01,-0.29,10.5,25.5,1630.9,-0.5,0.18,-6.5,-54.0,...,1526.0,-0.4,-0.43,-3.0,-41.0,260.6,0.51,0.11,-0.46,-1
27388,78.57,23.76,0.0,-34.0,-8.0,1319.7,-0.29,0.5,-5.5,-70.5,...,1161.6,-0.06,-1.55,-3.0,-23.0,64.5,1.41,0.21,0.01,-1


In [3]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((7000, 74), (3000, 74))

In [4]:
# set up initial random forest
rf = RandomForestClassifier(
    n_estimators=50,
    random_state=39,
    max_depth=2,
    n_jobs=-1,
    class_weight=None,
)

In [5]:
# set up parameter grid search
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 2, 3,],
    'class_weight': [None, {-1: 1, 1:10}, {-1:1, 1:100}],
}

In [6]:
search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=2,
)

search.fit(X_train, y_train)

In [7]:
search.best_score_

0.9889062264744932

In [8]:
search.best_params_

{'class_weight': {-1: 1, 1: 100}, 'max_depth': 2, 'n_estimators': 100}

In [9]:
search.score(X_test, y_test)

0.9806559017883588