In [3]:
import scipy.sparse as sp
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the processed data
combined_sparse = sp.load_npz('data/combined_sparse.npz')
df_targets = pd.read_csv('data/df_targets.csv')
df_targets['PRIM_CONTRIBUTORY_CAUSE'] = pd.Categorical(df_targets['PRIM_CONTRIBUTORY_CAUSE'])

# Encode the target variable
label_encoder = LabelEncoder()
df_targets_encoded = pd.DataFrame()
df_targets_encoded['PRIM_CONTRIBUTORY_CAUSE_LABEL'] = label_encoder.fit_transform(df_targets['PRIM_CONTRIBUTORY_CAUSE'].cat.codes)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    combined_sparse,
    df_targets_encoded['PRIM_CONTRIBUTORY_CAUSE_LABEL'].values,
    test_size=0.2,
    random_state=42
)

# Convert the sparse matrix to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set the XGBoost parameters
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'multi:softmax',
    'num_class': len(label_encoder.classes_)
}

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=10)

# Evaluate the model
predictions = model.predict(dtest)
accuracy = (predictions == y_test).mean()
print("Accuracy:", accuracy)


Accuracy: 0.435656836461126


In [4]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [4, 6, 8],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)


486 fits failed out of a total of 729.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
243 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Anaconda3\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "c:\Anaconda3\lib\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36], got [ 0  2  3  4  5  6  7  8  9

Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 4, 'subsample': 0.8}
Best Score: nan
