In [None]:
pip install cmaes

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# Load the training data
df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')

# Name of the column holding the class label to predict
target = 'NObeyesdad'

# Split the frame into the label column and the feature columns.
y = df[target]
X = df.drop(columns=[target])

# Keep the row identifier out of the features: it carries no signal about the
# class and would otherwise be fed to the model as a numeric predictor.
# NOTE(review): assumes train.csv has the same 'id' key column that the test
# file is read with further down; errors='ignore' makes this a no-op if not.
X = X.drop(columns=['id'], errors='ignore')

# One-hot encode the categorical feature columns
X = pd.get_dummies(X)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Map the string class labels to integer codes, learning the mapping from the
# training labels and reusing it for the validation labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Objective function that Optuna evaluates once per trial
def objective(trial):
    """Train an XGBoost classifier with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial object used to sample each hyperparameter.

    Returns:
        Accuracy of the fitted model on the validation split
        (``X_val`` / ``y_val``), which the study maximizes.
    """
    # Search space. ``suggest_float`` replaces the deprecated
    # ``suggest_uniform`` / ``suggest_loguniform`` helpers; ``log=True``
    # samples on a log scale exactly as ``suggest_loguniform`` did.
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }

    # Build the classifier for this trial's hyperparameters
    model = xgb.XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss')

    # Fit on the training split using the integer-encoded labels
    model.fit(X_train, y_train_encoded)

    # Predict encoded classes for the validation split
    y_pred_encoded = model.predict(X_val)

    # Decode the predictions so they can be compared with the raw labels
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    # Validation accuracy is the value being optimized
    accuracy = accuracy_score(y_val, y_pred)

    return accuracy

# Create a study that maximizes validation accuracy using CMA-ES sampling.
# The sampler is seeded (same seed as the train/validation split) so that the
# sequence of sampled hyperparameters is repeatable between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.CmaEsSampler(seed=42),
)

# Run the hyperparameter search
study.optimize(objective, n_trials=100)

# Collect the best hyperparameters into a fresh dict (rather than mutating the
# one returned by the study) and add the evaluation metric used during tuning
best_params = {**study.best_params, 'eval_metric': 'mlogloss'}

# Train the final model on the training split with the best hyperparameters
final_model = xgb.XGBClassifier(**best_params, use_label_encoder=False)
final_model.fit(X_train, y_train_encoded)

# Predict on the validation set (the held-out 20% of train.csv)
y_pred_encoded = final_model.predict(X_val)

# Decode the predictions back to original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Report the final model's accuracy on the validation set
accuracy = accuracy_score(y_val, y_pred)
print('Validation Accuracy:', accuracy)


In [None]:
import pandas as pd

# Read the competition's test split
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')

# One-hot encode the test rows, then align the result to the training feature
# matrix: columns absent from X are dropped, columns X has but the test
# encoding lacks are added and filled with 0, and the order follows X.columns.
test_X = pd.get_dummies(test_df).reindex(columns=X.columns, fill_value=0)

# Predict encoded classes with the tuned model and map them back to the
# original label values
test_y_pred_encoded = final_model.predict(test_X)
test_y_pred = label_encoder.inverse_transform(test_y_pred_encoded)

# Pair every test id with its predicted label and write the submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'NObeyesdad': test_y_pred})
submission_df.to_csv('submission.csv', index=False)
