In [13]:
pip install cmaes

Note: you may need to restart the kernel to use updated packages.


In [15]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd


In [16]:
# Load the dataset
df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')

# Display the DataFrame
df.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [17]:
# Correct target variable name
target = 'NObeyesdad'

# Splitting the dataset into features and target variable
X = df.drop(target, axis=1)
y = df[target]

# Convert categorical features using one-hot encoding
X = pd.get_dummies(X)

X.head()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,...,SCC_no,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473,False,...,True,False,False,True,False,False,False,False,True,False
1,1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0,True,...,True,False,False,False,True,True,False,False,False,False
2,2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,True,...,True,False,False,False,True,False,False,False,True,False
3,3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199,True,...,True,False,False,True,False,False,False,False,True,False
4,4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,False,...,True,False,False,True,False,False,False,False,True,False


In [18]:
# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [None]:

# Define the objective function to optimize
def objective(trial):
    # Define hyperparameters to search
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),  # Adjusted upper bound
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),  # Narrowed range
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),  # Expanded range
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 100.0),  # Expanded range
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 100.0),  # Expanded range
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),  # Expanded range
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 2.0)  # Expanded range
    }


    # Initialize XGBoost classifier
    model = xgb.XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss')

    # Fit the model
    model.fit(X_train, y_train_encoded)

    # Predict on the validation set
    y_pred_encoded = model.predict(X_val)

    # Decode the predictions back to original labels
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)

    return accuracy

# Define study
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.CmaEsSampler())

# Optimize hyperparameters
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
best_params['eval_metric'] = 'mlogloss'


[I 2024-02-12 10:20:05,157] A new study created in memory with name: no-name-0c3397c2-ff7e-4db9-b7cb-eb25acd1ebb0
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),  # Narrowed range
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 100.0),  # Expanded range
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 100.0),  # Expanded range
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),  # Expanded range
  'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 2.0)  # Expanded range
[I 2024-02-12 10:20:26,585] Trial 0 finished with value: 0.8492292870905588 and parameters: {'max_depth': 3, 'learning_rate': 0.0019308873493836785, 'n_estimators': 1499, 'gamma': 0.03175951030233102, 'min_child_weight': 2.632642074384929e-06, 'subsample': 0.632265114524523, 'colsample_bytree': 0.7709556670473614, '

In [None]:
# Train the final model with the best hyperparameters
final_model = xgb.XGBClassifier(**best_params, use_label_encoder=False)
final_model.fit(X_train, y_train_encoded)

# Predict on the test set
y_pred_encoded = final_model.predict(X_val)

# Decode the predictions back to original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Calculate the accuracy
accuracy = accuracy_score(y_val, y_pred)
print('Validation Accuracy:', accuracy)


In [None]:
# Load the test dataset
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')

# Preprocess the test dataset (e.g., one-hot encoding for categorical features)
test_X = pd.get_dummies(test_df)

# Reorder columns in the test dataset to match the order of columns in the training dataset
test_X = test_X.reindex(columns=X.columns, fill_value=0)

# Predict on the test set using the final model
test_y_pred_encoded = final_model.predict(test_X)

# Decode the predictions back to original labels
test_y_pred = label_encoder.inverse_transform(test_y_pred_encoded)

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'NObeyesdad': test_y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)
