# Grid Search with Dask-ML

```bash
conda install -c conda-forge dask-ml -y
```

In [1]:
import time

import cupy as cp

import cudf
import dask
import dask_cudf
import dask.dataframe as dd
dask.config.set({"dataframe.backend": "cudf"})

from dask_ml.model_selection import  GridSearchCV as GSCV
from cuml.model_selection import train_test_split, GridSearchCV
from cuml.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
class Timer:
    """Context manager that prints the wall-clock duration of its ``with`` body.

    Parameters
    ----------
    name : str, default "cpu"
        Label placed at the start of the printed timing line.

    Attributes
    ----------
    start, end : float
        ``time.perf_counter()`` readings taken on entry and on exit.
    execute_time : float
        ``end - start`` in seconds; available once the block has exited.
    """

    def __init__(self, name="cpu"):
        self.name = name

    def __enter__(self):
        self.start = time.perf_counter()
        # Hand the timer back so `with Timer(...) as t:` binds the instance
        # (not None) and `t.execute_time` can be read after the block.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.end = time.perf_counter()
        self.execute_time = self.end - self.start
        print(f"{self.name} execute time : {self.execute_time:.4f} seconds")
        # Falls through returning None, so an exception raised inside the
        # block is still propagated after the timing line is printed.

In [3]:
# Open both Titanic splits as Dask DataFrames under a single timer.
with Timer(name="dask read parquet") as dask_time:
    train_dask, test_dask = (
        dd.read_parquet(parquet_path)
        for parquet_path in ("./data/train.parquet", "./data/test.parquet")
    )

dask read parquet execute time : 0.0375 seconds


In [4]:
def preprocess_data(data):
    """Turn a raw Titanic Dask DataFrame into a numeric feature table.

    Steps, in order:
      1. one-hot encode ``Sex`` and ``Embarked``;
      2. fill missing ``Age`` / ``Fare`` values with that column's median;
      3. drop ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``.

    Parameters
    ----------
    data : dask DataFrame
        Must contain every column named above.

    Returns
    -------
    dask DataFrame
        The transformed collection; any other input columns are passed through.
    """
    categorical_cols = ["Sex", "Embarked"]

    # get_dummies needs known categories, but only for the columns it encodes.
    # A bare categorize() would also scan Name/Ticket/Cabin, which are
    # discarded below, so limit the pass to the two encoded columns.
    data = dd.get_dummies(
        data.categorize(columns=categorical_cols),
        columns=categorical_cols,
    )

    # Fill missing values in Age and Fare with the column median.
    # NOTE(review): the median is taken from whichever frame is passed in, so
    # train and test are imputed with different values — confirm this is intended.
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())

    # Drop identifier / free-text columns that are not used as features.
    data = data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
    return data


In [5]:
# Persist both raw collections with one call so later cells reuse them.
train_dask, test_dask = dask.persist(train_dask, test_dask)

In [6]:
# Apply the same preprocessing to the train and test collections.
# Note: this rebinds train_dask/test_dask, so the cell is not re-runnable
# without re-reading the data (the dropped columns would be missing).
with Timer(name="dask preprocessed") as dask_time:
    train_dask, test_dask = map(preprocess_data, (train_dask, test_dask))

dask preprocessed execute time : 0.3023 seconds


In [7]:
# Display the task graph behind the preprocessed training collection
# (left as the cell's last expression so the notebook renders the widget).
train_dask.visualize()

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

In [8]:
# Persist the preprocessed collections so the preprocessing graph is not
# re-executed by each later .compute().
train_dask, test_dask = dask.persist(train_dask, test_dask)

In [9]:
with Timer(name="Train Valid Split") as dask_time:
    # Materialise the training collection a single time, then separate the
    # features from the "Survived" label on the local frame (one .compute()
    # instead of one per piece).
    train_local = train_dask.compute()
    features_all = train_local.drop("Survived", axis=1)
    target_all = train_local["Survived"]

    # 80/20 hold-out split; the fixed seed keeps the partition reproducible.
    X_train, X_val, y_train, y_val = train_test_split(
        features_all, target_all, test_size=0.2, random_state=42
    )

Train Valid Split execute time : 0.0316 seconds


In [10]:
# XGBoost classifier using the GPU histogram tree builder.
# NOTE(review): XGBoost 2.0+ deprecates tree_method="gpu_hist" in favour of
# tree_method="hist" with device="cuda" — confirm against the installed version.
model = XGBClassifier(tree_method="gpu_hist")

# Search space: 3 * 4 * 4 * 3 = 144 hyperparameter combinations.
param_grid = dict(
    max_depth=[10, 30, 50],
    min_child_weight=[1, 3, 6, 10],
    n_estimators=[200, 300, 500, 1000],
    learning_rate=[0.1, 0.01, 0.001],
)

In [11]:
with Timer(name="grid search with xgb") as dask_time:
    # Exhaustive 5-fold search over param_grid using dask-ml's GridSearchCV
    # (imported as GSCV), scored by accuracy.
    grid_search = GSCV(
        estimator=model,
        param_grid=param_grid,
        scoring="accuracy",
        cv=5,
    )
    grid_search.fit(X_train, y_train.to_numpy())

    # Keep the best estimator found by the search for the cells that follow.
    best_model = grid_search.best_estimator_

grid search with xgb execute time : 542.6457 seconds


In [12]:
with Timer(name="xgb gpu inference") as dask_time:
    # Predict on both splits first ...
    y_pred_train = best_model.predict(X_train)
    y_pred_val = best_model.predict(X_val)

    # ... then score each prediction set against its labels.
    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)

    print("Training Accuracy:", train_accuracy)
    print("Validation Accuracy:", val_accuracy)

Training Accuracy: 0.9214586019515991
Validation Accuracy: 0.8146067261695862
xgb gpu inference execute time : 0.0597 seconds


In [13]:
# Predict on the preprocessed test split (materialised to a local frame first).
test_predictions = best_model.predict(test_dask.compute())

# preprocess_data() dropped PassengerId from test_dask, so recover the real ids
# from the raw file rather than assuming they start at 892 and are contiguous.
passenger_ids = (
    dd.read_parquet("./data/test.parquet", columns=["PassengerId"])["PassengerId"]
    .compute()
    .to_numpy()
)
# Guard against ids and predictions getting out of step.
assert len(passenger_ids) == len(test_predictions), "id/prediction length mismatch"

# Prepare submission file
submission_df = cudf.DataFrame({
    "PassengerId": passenger_ids,
    "Survived": test_predictions
})

# Save submission file
submission_df.to_csv("submission_xgb.csv", index=False)

# CPU Example

In [14]:
# Default-constructed classifier (no tree_method override) for the CPU comparison.
model = XGBClassifier()

# Same 144-combination search space as the GPU run, so the timings are comparable.
param_grid = dict(
    max_depth=[10, 30, 50],
    min_child_weight=[1, 3, 6, 10],
    n_estimators=[200, 300, 500, 1000],
    learning_rate=[0.1, 0.01, 0.001],
)

In [15]:
with Timer(name="grid search with xgb") as dask_time:
    # Exhaustive 5-fold search using the GridSearchCV exported by
    # cuml.model_selection; verbose=1 prints the fold/candidate summary.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring="accuracy",
        cv=5,
        verbose=1,
    )
    grid_search.fit(X_train, y_train.to_numpy())

    # Keep the best estimator found by the search for the evaluation cell.
    best_model = grid_search.best_estimator_

Fitting 5 folds for each of 144 candidates, totalling 720 fits
grid search with xgb execute time : 263.6332 seconds


In [16]:
with Timer(name="xgb cpu inference") as dask_time:
    # Predict on both splits first ...
    y_pred_train = best_model.predict(X_train)
    y_pred_val = best_model.predict(X_val)

    # ... then score each prediction set against its labels.
    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)

    print("Training Accuracy:", train_accuracy)
    print("Validation Accuracy:", val_accuracy)

Training Accuracy: 0.8906030654907227
Validation Accuracy: 0.8202247023582458
xgb cpu inference execute time : 0.0312 seconds
