In [16]:
pip install cmaes

Note: you may need to restart the kernel to use updated packages.


In [17]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd


In [18]:
# Load the dataset
df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')

# Display the DataFrame
print(df.head())

   id  Gender        Age    Height      Weight family_history_with_overweight  \
0   0    Male  24.443011  1.699998   81.669950                            yes   
1   1  Female  18.000000  1.560000   57.000000                            yes   
2   2  Female  18.000000  1.711460   50.165754                            yes   
3   3  Female  20.952737  1.710730  131.274851                            yes   
4   4    Male  31.641081  1.914186   93.798055                            yes   

  FAVC      FCVC       NCP        CAEC SMOKE      CH2O SCC       FAF  \
0  yes  2.000000  2.983297   Sometimes    no  2.763573  no  0.000000   
1  yes  2.000000  3.000000  Frequently    no  2.000000  no  1.000000   
2  yes  1.880534  1.411685   Sometimes    no  1.910378  no  0.866045   
3  yes  3.000000  3.000000   Sometimes    no  1.674061  no  1.467863   
4  yes  2.679664  1.971472   Sometimes    no  1.979848  no  1.967973   

        TUE       CALC                 MTRANS           NObeyesdad  
0  0.976473

In [19]:
# Correct target variable name
target = 'NObeyesdad'

# Splitting the dataset into features and target variable
X = df.drop(target, axis=1)
y = df[target]

# Convert categorical features using one-hot encoding
X = pd.get_dummies(X)

print(X.head())

   id        Age    Height      Weight      FCVC       NCP      CH2O  \
0   0  24.443011  1.699998   81.669950  2.000000  2.983297  2.763573   
1   1  18.000000  1.560000   57.000000  2.000000  3.000000  2.000000   
2   2  18.000000  1.711460   50.165754  1.880534  1.411685  1.910378   
3   3  20.952737  1.710730  131.274851  3.000000  3.000000  1.674061   
4   4  31.641081  1.914186   93.798055  2.679664  1.971472  1.979848   

        FAF       TUE  Gender_Female  ...  SCC_no  SCC_yes  CALC_Frequently  \
0  0.000000  0.976473          False  ...    True    False            False   
1  1.000000  1.000000           True  ...    True    False            False   
2  0.866045  1.673584           True  ...    True    False            False   
3  1.467863  0.780199           True  ...    True    False            False   
4  1.967973  0.931721          False  ...    True    False            False   

   CALC_Sometimes  CALC_no  MTRANS_Automobile  MTRANS_Bike  MTRANS_Motorbike  \
0           

In [20]:
# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [22]:



# Define the objective function to optimize
def objective(trial):
    # Define hyperparameters to search
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 1.0)
    }

    # Initialize XGBoost classifier
    model = xgb.XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss')

    # Fit the model
    model.fit(X_train, y_train_encoded)

    # Predict on the validation set
    y_pred_encoded = model.predict(X_val)

    # Decode the predictions back to original labels
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)

    return accuracy

# Define study
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.CmaEsSampler())

# Optimize hyperparameters
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
best_params['eval_metric'] = 'mlogloss'


[I 2024-02-08 10:46:18,056] A new study created in memory with name: no-name-101d1378-508e-4153-a583-f82bf9d1e84c
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 1.0)
[I 2024-02-08 10:46:21,717] Trial 0 finished with value: 0.8908959537572254 and parameters: {'max_depth': 3, 'learning_rate': 0.028196166685244165, 'n_estimators': 244, 'gamma': 0.0011727934582113232, 'min_child_weight': 0.45510998991845125, 'subsample': 0.9274224235048038, 'colsample_bytree': 0.8510308727453375, 'reg_alpha': 1.3836797326844985e-06, 'reg_lambda': 0.5959741171272507}. Best is trial 0 with value:

In [23]:
# Train the final model with the best hyperparameters
final_model = xgb.XGBClassifier(**best_params, use_label_encoder=False)
final_model.fit(X_train, y_train_encoded)

# Predict on the test set
y_pred_encoded = final_model.predict(X_val)

# Decode the predictions back to original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Calculate the accuracy
accuracy = accuracy_score(y_val, y_pred)
print('Validation Accuracy:', accuracy)


Validation Accuracy: 0.9099229287090559


In [24]:
# Load the test dataset
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')

# Preprocess the test dataset (e.g., one-hot encoding for categorical features)
test_X = pd.get_dummies(test_df)

# Reorder columns in the test dataset to match the order of columns in the training dataset
test_X = test_X.reindex(columns=X.columns, fill_value=0)

# Predict on the test set using the final model
test_y_pred_encoded = final_model.predict(test_X)

# Decode the predictions back to original labels
test_y_pred = label_encoder.inverse_transform(test_y_pred_encoded)

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'NObeyesdad': test_y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)
