### Goal: The goal of this competition is to use various factors to predict obesity risk in individuals, which is related to cardiovascular disease.
### Evaluation: Submissions are evaluated using the **accuracy** score.

In [None]:
# My place is 2524/3587 (Top 70%)
# Public Score: 0.89920
# Private score: 0.89432

### Submission File
For each id row in the test set, you must predict the class value of the target, NObeyesdad. The file should contain a header and have the following format:

| id    | NObeyesdad    |
|-------|---------------|
| 20757 | Normal_Weight |
| 20758 | Normal_Weight |
| 20759 | Normal_Weight |
| 20760 | Normal_Weight |

- id: unique identifier for each individual
- Gender: male / female
- Age: age in years
- Height: height in meters
- Weight: weight in kilograms
- family_history_with_overweight: yes / no
- FAVC: frequently consumes high caloric food: yes / no
- FCVC: frequency of consumption of vegetables: 1 to 3 (1 - never, 2 - sometimes, 3 - always)
- NCP: number of main meals: 1 to 3 (1 - one meal, 2 - two meals, 3 - three meals)
- CAEC: consumption of food between meals: 1 to 4 (1 - never, 2 - sometimes, 3 - frequently, 4 - always)
- SMOKE: yes / no
- CH2O: daily water consumption: 1 to 3 (1 - less than 1 liter, 2 - 1 to 2 liters, 3 - more than 2 liters)
- SCC: calorie consumption monitoring: yes / no
- FAF: physical activity frequency: 1 to 3 (1 - never, 2 - sometimes, 3 - always)
- TUE: time using technology devices: 1 to 4 (1 - less than 1 hour, 2 - 1 to 2 hours, 3 - 2 to 4 hours, 4 - more than 4 hours)
- CALC: consumption of alcohol: yes / no
- MTRANS: means of transportation used most often:
    - 1 - walking
    - 2 - bike
    - 3 - motorbike
    - 4 - public transportation
    - 5 - car
- NObeyesdad: the obesity level of the individual, which is the target variable to predict. The possible values are:
    - Insufficient_Weight
    - Normal_Weight
    - Overweight_Level_I
    - Overweight_Level_II
    - Obesity_Type_I
    - Obesity_Type_II
    - Obesity_Type_III

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Competition input tables.
train_path = '/kaggle/input/playground-series-s4e2/train.csv'
test_path = '/kaggle/input/playground-series-s4e2/test.csv'
# File the final predictions are written to.
submission_path = '/kaggle/working/submission.csv'
# Column of train.csv that holds the label to predict.
target_column = 'NObeyesdad'

In [None]:
# Load the competition train and test tables.
test_data = pd.read_csv(test_path)
train_data = pd.read_csv(train_path)

# `submission_path` is the file this notebook itself writes in its last cell,
# so it does not exist on a fresh run and reading it raised FileNotFoundError.
# Load the sample submission from the competition input directory instead.
# NOTE(review): assumes `sample_submission.csv` sits next to train/test in the
# input dataset - confirm against the competition data listing.
sample_submission_path = '/kaggle/input/playground-series-s4e2/sample_submission.csv'
submission_data = pd.read_csv(sample_submission_path)

In [None]:
train_data.head()

In [None]:
train_data.describe()

In [None]:
train_data.info()

In [None]:
# Separate the feature columns from the label column.
X = train_data.drop(columns=[target_column])
y = train_data[target_column]

# Hold out 20% of the rows; stratifying on y keeps the class proportions the
# same in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42,
)

In [None]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Append binary age-group indicator columns derived from ``Age``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only reads the ``Age`` column of the frame it is given.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with five 0/1 age-group columns added.

        Added columns: ``is_kid`` (Age <= 12), ``is_teen`` (12 < Age <= 19),
        ``is_young_adult`` (19 < Age <= 39), ``is_adult`` (39 < Age <= 59)
        and ``is_senior`` (Age > 59).

        The input frame is left untouched. Writing the new columns straight
        into the caller's frame would silently alter the data being fitted or
        predicted on and triggers pandas chained-assignment warnings when that
        frame is a slice of another one.
        """
        X = X.copy()
        age = X['Age']
        X['is_kid'] = (age <= 12).astype(int)
        X['is_teen'] = ((age > 12) & (age <= 19)).astype(int)
        X['is_young_adult'] = ((age > 19) & (age <= 39)).astype(int)
        X['is_adult'] = ((age > 39) & (age <= 59)).astype(int)
        X['is_senior'] = (age > 59).astype(int)
        return X


In [None]:
# Columns that must not reach the model: `id` is only a row identifier, and
# `Age` is presumably meant to be represented by the engineered age-group
# indicators instead (NOTE(review): confirm that dropping raw Age is intended).
drop_cols = ['Age', 'id']

# A ColumnTransformer applies each entry to its own column list independently,
# so a column listed under both 'num' and 'drop' is still emitted by 'num'.
# The dropped columns therefore have to be excluded from num_cols explicitly.
num_cols = [
    col
    for col in X.select_dtypes(include=[np.number]).columns
    if col not in drop_cols
]
cat_cols = X.select_dtypes(include=[object]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ('drop', 'drop', drop_cols),
    ],
    # The column lists above are computed from the raw X, so features created
    # by an earlier pipeline step appear in none of them. Pass such columns
    # through; with remainder='drop' they were silently discarded.
    remainder='passthrough',
    n_jobs=-1,
)

In [None]:
target_encoder = OrdinalEncoder()

# Learn the class -> code mapping from the TRAINING labels and only apply it
# to the hold-out labels; fitting on the hold-out split (as before) lets the
# evaluation data define the encoding the model is trained against.
# OrdinalEncoder expects a 2-D input, hence the reshape/ravel round trip.
y_train = pd.Series(
    target_encoder.fit_transform(y_train.values.reshape(-1, 1)).ravel(),
    name=target_column,
)
y_test = pd.Series(
    target_encoder.transform(y_test.values.reshape(-1, 1)).ravel(),
    name=target_column,
)

In [None]:
# Feature pipeline, applied in order: add the engineered columns, run the
# per-column preprocessing, then standardise the assembled matrix.
transformer = Pipeline([
    ('feature_engineering', FeatureTransformer()),
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
])

In [None]:
# Shuffled 5-fold stratified splitter with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [None]:
# estimator = HistGradientBoostingClassifier(max_iter=280, learning_rate=0.1, max_depth=5)
# pipeline = make_pipeline(transformer, estimator)
#
# # let's tune the model using RandomizedSearchCV
# param_grid = {
#     'histgradientboostingclassifier__max_iter': [100, 200, 300, 500],
#     'histgradientboostingclassifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'histgradientboostingclassifier__max_depth': [None, 3, 5, 7, 10],
#     'histgradientboostingclassifier__min_samples_leaf': [10, 20, 30, 50],
#     'histgradientboostingclassifier__l2_regularization': [0.0, 0.1, 1.0],
#     'histgradientboostingclassifier__max_bins': [32, 64, 128, 255],
#     'histgradientboostingclassifier__early_stopping': [True, False]
# }
#
# search = RandomizedSearchCV(pipeline,
#                             param_grid,
#                             n_iter=20,
#                             cv=cv,
#                             scoring='accuracy',
#                             n_jobs=-1,
#                             verbose=1,
#                             return_train_score=False)
#
#
# search.fit(X_train, y_train)

In [None]:
# best_model = search.best_estimator_
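# best_score = -search.best_score_  # NOTE: with scoring='accuracy' best_score_ is already higher-is-better; the negation only flips its sign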

In [None]:
# scores = cross_val_score(best_model, X_test, y_test, scoring='accuracy', cv=cv)
# mean_score = scores.mean()
# r2_score = best_model.score(X_test, y_test)
#
# print(f'Best Parameters: {search.best_params_}')
# print(f'Best Cross-Validation Score: {best_score}') # -0.9052752982543538
# print(f'Mean Accuracy: {mean_score}') # 0.8906467748249316
# print(f'R2 Score: {r2_score}') # 0.910645472061657  (NOTE: score() of a classifier pipeline is mean accuracy, not R2)

In [None]:
# Fixed hyper-parameters (presumably the result of the commented-out
# randomized search). The keys still carry the '<step name>__' prefix that
# make_pipeline assigns to the estimator step.
best_params = {
    'histgradientboostingclassifier__min_samples_leaf': 50,
    'histgradientboostingclassifier__max_iter': 100,
    'histgradientboostingclassifier__max_depth': 5,
    'histgradientboostingclassifier__max_bins': 255,
    'histgradientboostingclassifier__learning_rate': 0.1,
    'histgradientboostingclassifier__l2_regularization': 0.1,
    'histgradientboostingclassifier__early_stopping': False,
}

# Strip the step prefix so the values can be handed straight to the
# estimator's constructor, then fit the full pipeline on the training split.
pipeline = make_pipeline(
    transformer,
    HistGradientBoostingClassifier(
        **{key.partition('__')[2]: value for key, value in best_params.items()}
    ),
)
pipeline.fit(X_train, y_train)

best_model = pipeline

In [None]:
# Predict encoded class labels for the competition test set (`test_data`).
# Despite the name, `y_test_pred` is unrelated to the `y_test` hold-out split.
y_test_pred = best_model.predict(test_data)

In [None]:
# Build the submission table: one row per test id, with the numeric
# predictions mapped back to the original class names (the encoder expects a
# 2-D input, hence the reshape/ravel round trip), then write it as CSV.
_id = test_data['id']
submission_df = pd.DataFrame({
    'id': _id,
    target_column: target_encoder.inverse_transform(
        y_test_pred.reshape(-1, 1)
    ).ravel(),
})
submission_df.to_csv(submission_path, index=False)