In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


> # Importing Necessary Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

In [3]:
# Read the competition's training and test tables from the Kaggle input directory.
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e2/train.csv')
test_df = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e2/test.csv')

In [4]:
# Separate the label from the features: 'NObeyesdad' alone becomes train_y, and
# the feature frame is train_df without 'id' and without that label column.
train_y = train_df['NObeyesdad']
train_X = train_df.drop(columns=['id', 'NObeyesdad'])

In [5]:
# Columns that are one-hot encoded (via pd.get_dummies) before modelling.
categorical_cols = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
]

In [6]:
# One-hot encode the categorical columns of the training features and of the
# test frame (with its 'id' column removed). Each frame is encoded on its own,
# so the resulting dummy columns can differ between the two.
train_X_encoded = pd.get_dummies(data=train_X, columns=categorical_cols)
test_X_encoded = pd.get_dummies(data=test_df.drop(columns='id'), columns=categorical_cols)

In [7]:
# Keep only the feature columns present in BOTH encoded frames so that train and
# test share an identical schema. The shared columns are collected in
# train_X_encoded's own column order: building the list from a set intersection
# yields an arbitrary order that changes between interpreter sessions (string
# hashing is randomised), which makes the feature order fed to the scaler and
# model non-reproducible.
common_cols = [col for col in train_X_encoded.columns if col in test_X_encoded.columns]
train_X_encoded = train_X_encoded[common_cols]
test_X_encoded = test_X_encoded[common_cols]

In [8]:
# Convert the class labels in train_y to integer codes. The fitted encoder is
# kept so that predicted codes can later be mapped back to the original labels.
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)

In [9]:
# Hold out 20% of the encoded training rows as a validation set; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_X_encoded,
    train_y_encoded,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to the validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_X_encoded)

In [11]:
# Hyperparameter values explored by the grid search: every combination of the
# three lists below (3 x 3 x 3 = 27 candidates) is evaluated.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    n_estimators=[50, 100, 200],
)

In [12]:
# Exhaustive search over param_grid using 5-fold cross-validation on the scaled
# training split, scored by accuracy. The classifier gets an explicit
# random_state (the same seed as the train/validation split) so that re-running
# the notebook repeats the same search instead of relying on an implicit default.
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

In [13]:
# Hyperparameter combination that scored best in the grid search's cross-validation.
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}


In [14]:
# Estimator built with the best hyperparameters found by the grid search. Because
# GridSearchCV was created with its default refit setting, this estimator has
# already been refitted on the data passed to grid_search.fit (X_train_scaled).
best_model = grid_search.best_estimator_

In [15]:
# 5-fold cross-validated accuracy of the tuned model configuration.
# NOTE(review): this scores on the unscaled train_X_encoded over ALL training
# rows (including those held out as the validation split), whereas the grid
# search was run on X_train_scaled — confirm that difference is intended.
cv_scores = cross_val_score(
    estimator=best_model,
    X=train_X_encoded,
    y=train_y_encoded,
    cv=5,
    scoring='accuracy',
)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean()}')


Cross-Validation Scores: [0.90510597 0.9026975  0.91498073 0.90701036 0.90845579]
Mean CV Accuracy: 0.9076500706239274


In [16]:
# Score the tuned model on the held-out validation split and report its
# accuracy rounded to two decimals.
val_predictions = best_model.predict(X_val_scaled)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.91


In [17]:
# Predict integer class codes for the scaled test features, then map those codes
# back to the original label values using the encoder that was fitted on train_y.
test_predictions = best_model.predict(test_scaled)
test_predictions_labels = label_encoder.inverse_transform(test_predictions)


In [18]:
# Assemble the submission table: one row per test 'id' paired with its
# predicted 'NObeyesdad' label.
submission_df = pd.DataFrame(
    data={
        'id': test_df['id'],
        'NObeyesdad': test_predictions_labels,
    }
)

In [19]:
# Write the submission file to the working directory, without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)