In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [27]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load data
train = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [29]:
# Encode target variable with a dedicated encoder
le_target = LabelEncoder()
train['NObeyesdad'] = le_target.fit_transform(train['NObeyesdad'])
target_classes = le_target.classes_  # Store target label classes

# Encode other categorical variables using new LabelEncoder instances for each
categorical_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
for col in categorical_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    # For test data, handle unseen categories by mapping them to -1
    test[col] = test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

# Feature selection
X = train.drop(columns=['id', 'NObeyesdad'])
y = train['NObeyesdad']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1234)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_data = scaler.transform(test.drop(columns=['id']))



In [30]:
# Define models and hyperparameter grids
models = {
    'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [3, 5, 10]}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10]}),
    'Bagging': (BaggingClassifier(), {'n_estimators': [10, 50, 100]}),
    'Boosting': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]})
}

best_models = {}

# Train models using GridSearchCV
for name, (model, params) in models.items():
    print(f'Training {name}...')
    grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Validation Accuracy: {grid_search.best_score_}')


Training Decision Tree...
Best parameters for Decision Tree: {'max_depth': 10}
Validation Accuracy: 0.8711303526842211
Training Random Forest...
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Validation Accuracy: 0.8923273547405352
Training Bagging...
Best parameters for Bagging: {'n_estimators': 100}
Validation Accuracy: 0.8870882409406071
Training Boosting...
Best parameters for Boosting: {'learning_rate': 0.2, 'n_estimators': 200}
Validation Accuracy: 0.9034078830988322


In [33]:
# Generate predictions and create submission files
for name, model in best_models.items():
    predictions = model.predict(test_data)
    # Ensure predictions are within the known range
    predictions = np.clip(predictions, 0, len(target_classes) - 1)
    # Convert numeric predictions back to obesity category labels
    predictions_labels = le_target.inverse_transform(predictions.astype(int))
    submission = pd.DataFrame({'id': test['id'], 'NObeyesdad': predictions_labels})
    submission.to_csv(f'submission_{name}.csv', index=False)
    print(f'Submission file for {name} created.')


Submission file for Decision Tree created.
Submission file for Random Forest created.
Submission file for Bagging created.
Submission file for Boosting created.


In [37]:
import shutil

#This code just changes the names of the submission output to submit 
#Submission 1: Decision Tree
# Copy the file while keeping the original
#shutil.copy('submission_Decision Tree.csv', 'submission.csv')

#Submission 2: Random Forest
# Copy the file while keeping the original
#shutil.copy('submission_Random Forest.csv', 'submission.csv')

#Submission 3: Bagging
# Copy the file while keeping the original
#shutil.copy('submission_Bagging.csv', 'submission.csv')

#Submission 4: Boosting
# Copy the file while keeping the original
shutil.copy('submission_Boosting.csv', 'submission.csv')

'submission.csv'