# Load Libraries

Overview

This challenge is part of the "2024 Kaggle Playground Series". These competitions use synthetic data, so it is expected that some noise generated during data creation will impact overall model quality.

Goal: use various factors to predict obesity risk in individuals ('Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight' 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I' 'Obesity_Type_I')

In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
import lightgbm as lgb
from sklearn.metrics import accuracy_score

2024-04-15 11:10:26.219723: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-15 11:10:26.219813: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-15 11:10:26.221474: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load data

In [2]:
train_df1 = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')
test_df1 = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')
print(train_df1.info())
train_df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


# EDA and feature engineering

In [3]:
print("Unique values in 'CAEC':", train_df1['CAEC'].unique())
print("Unique values in 'SMOKE':", train_df1['SMOKE'].unique())
print("Unique values in 'CALC':", train_df1['CALC'].unique())
print("Unique values in 'MTRANS':", train_df1['MTRANS'].unique())
print("Unique values in 'NObeyesdad':", train_df1['NObeyesdad'].unique())
print("Unique values in 'Smoke':", train_df1['SMOKE'].unique())
print("Unique values in 'SCC':", train_df1['SCC'].unique())

Unique values in 'CAEC': ['Sometimes' 'Frequently' 'no' 'Always']
Unique values in 'SMOKE': ['no' 'yes']
Unique values in 'CALC': ['Sometimes' 'no' 'Frequently']
Unique values in 'MTRANS': ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']
Unique values in 'NObeyesdad': ['Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight'
 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I'
 'Obesity_Type_I']
Unique values in 'Smoke': ['no' 'yes']
Unique values in 'SCC': ['no' 'yes']


In [4]:
#variable remaping

transport_remap = {'Public_Transportation': 'motor', 'Automobile': 'motor', 'Motorbike': 'motor', 'Walking' : 'physical', 'Bike' : 'physical'}
alchool_remap = {'Always': 'Frequently'}
train_df1['MTRANS'] = train_df1['MTRANS'].replace(transport_remap)
test_df1['MTRANS'] = test_df1['MTRANS'].replace(transport_remap)
test_df1['CALC'] = test_df1['CALC'].replace(alchool_remap)

In [5]:
train_dummies = pd.get_dummies(train_df1[['Gender','SCC', 'SMOKE', 'CAEC', 'CALC', 'MTRANS', 'family_history_with_overweight', 'FAVC']], drop_first=True)
test_dummies = pd.get_dummies(test_df1[['Gender','SCC', 'SMOKE', 'CAEC', 'CALC', 'MTRANS', 'family_history_with_overweight', 'FAVC']], drop_first=True)
train_dummies.head()

Unnamed: 0,Gender_Male,SCC_yes,SMOKE_yes,CAEC_Frequently,CAEC_Sometimes,CAEC_no,CALC_Sometimes,CALC_no,MTRANS_physical,family_history_with_overweight_yes,FAVC_yes
0,True,False,False,False,True,False,True,False,False,True,True
1,False,False,False,True,False,False,False,True,False,True,True
2,False,False,False,False,True,False,False,True,False,True,True
3,False,False,False,False,True,False,True,False,False,True,True
4,True,False,False,False,True,False,True,False,False,True,True


In [6]:
train_df2 = pd.concat([train_df1, train_dummies], axis=1)
test_df2 = pd.concat([test_df1, test_dummies], axis=1)
train_df2.drop(['Gender','SCC', 'CAEC', 'SMOKE', 'CALC', 'MTRANS', 'family_history_with_overweight', 'FAVC'], axis=1, inplace=True)
test_df2.drop(['Gender','SCC', 'CAEC', 'SMOKE', 'CALC', 'MTRANS', 'family_history_with_overweight', 'FAVC'], axis=1, inplace=True)
train_df2.head()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,...,SCC_yes,SMOKE_yes,CAEC_Frequently,CAEC_Sometimes,CAEC_no,CALC_Sometimes,CALC_no,MTRANS_physical,family_history_with_overweight_yes,FAVC_yes
0,0,24.443011,1.699998,81.66995,2.0,2.983297,2.763573,0.0,0.976473,Overweight_Level_II,...,False,False,False,True,False,True,False,False,True,True
1,1,18.0,1.56,57.0,2.0,3.0,2.0,1.0,1.0,Normal_Weight,...,False,False,True,False,False,False,True,False,True,True
2,2,18.0,1.71146,50.165754,1.880534,1.411685,1.910378,0.866045,1.673584,Insufficient_Weight,...,False,False,False,True,False,False,True,False,True,True
3,3,20.952737,1.71073,131.274851,3.0,3.0,1.674061,1.467863,0.780199,Obesity_Type_III,...,False,False,False,True,False,True,False,False,True,True
4,4,31.641081,1.914186,93.798055,2.679664,1.971472,1.979848,1.967973,0.931721,Overweight_Level_II,...,False,False,False,True,False,True,False,False,True,True


In [7]:
train_df2['BMI'] = train_df2['Weight'] / train_df2['Height']**2
test_df2['BMI'] = test_df2['Weight'] / test_df2['Height']**2

In [8]:
all_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Gender_Male', 'SCC_yes', 'SMOKE_yes', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no', 'CALC_Sometimes', 'CALC_no', 'MTRANS_physical', \
                'family_history_with_overweight_yes', 'FAVC_yes', 'BMI']

used_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Gender_Male', 'SCC_yes', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no', 'CALC_Sometimes', 'CALC_no', 'MTRANS_physical', \
                'family_history_with_overweight_yes', 'FAVC_yes', 'BMI']


X = train_df2[used_features].values
X_test = test_df2[used_features].values
y = train_df2['NObeyesdad']

In [9]:
print(X.shape, y.shape)
print(X_test.shape)

(20758, 19) (20758,)
(13840, 19)


In [18]:
# Encode labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(y_encoded.dtype)


# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

y_train = y_train.astype(int)
y_val = y_val.astype(int)
print(np.isnan(y_train).any())
print(np.isnan(y_val).any())

int64
False
False


In [19]:
#modeling with light GBM


# Create a LightGBM dataset
train_dataset = lgb.Dataset(X_train, label=y_train)

# Set hyperparameters
params = {
    'objective': 'multiclass',
    'metric': 'multi_error',  
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'num_class': 7  
}

# Train the model
num_round = 100
bst = lgb.train(params, train_dataset, num_round)

# Make predictions on the validation set
predictions = bst.predict(X_val, num_iteration=bst.best_iteration)
# Convert predicted probabilities to class labels
predicted_labels = np.argmax(predictions, axis=1)

# Evaluate the model
accuracy = accuracy_score(y_val, predicted_labels)
print(f'Accuracy: {accuracy}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2301
[LightGBM] [Info] Number of data points in the train set: 16606, number of used features: 19
[LightGBM] [Info] Start training from score -2.107657
[LightGBM] [Info] Start training from score -1.907572
[LightGBM] [Info] Start training from score -1.964755
[LightGBM] [Info] Start training from score -1.855022
[LightGBM] [Info] Start training from score -1.635117
[LightGBM] [Info] Start training from score -2.146046
[LightGBM] [Info] Start training from score -2.107657
Accuracy: 0.9063102119460501


In [12]:
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

In [13]:
y_pred_classes = np.argmax(y_pred, axis=1)

In [14]:
y_pred_classes

array([3, 5, 4, ..., 0, 1, 3])

In [15]:
# Convert predictions back to original labels
predicted_labels_sub = label_encoder.inverse_transform(y_pred_classes)
predicted_labels_sub

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [16]:
# Create the submission DataFrame
output = pd.DataFrame({'id': test_df1['id'], 'NObeyesdad': predicted_labels_sub})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [17]:
output.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
