<a href="https://www.kaggle.com/code/sajjadhajian/xgboost-multi-class-prediction-of-obesity-risk?scriptVersionId=163030129" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from xgboost import XGBClassifier 


# Load the competition train and test tables
file_path = '/kaggle/input/playground-series-s4e2/train.csv'
file_path2 = '/kaggle/input/playground-series-s4e2/test.csv'

# 'id' is removed from both frames so it is not used as a feature.
df = pd.read_csv(file_path).drop(columns='id')

# The test ids are set aside (with a fresh 0..n-1 index) because the
# submission file needs them next to the predictions.
df2 = pd.read_csv(file_path2)
submission_id = df2.pop('id').reset_index(drop=True)


# Optimize datatypes: store every object-dtype column as 'category'
for frame in (df, df2):
    # Collect the column names first, then convert them one by one in place.
    object_columns = [name for name in frame.columns if frame[name].dtype == 'object']
    for name in object_columns:
        frame[name] = frame[name].astype('category')

        
# Category -> integer label tables.
# Target classes (only present in the training frame).
NObeyesdad_label = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6,
}

# Feature columns (present in both the training and the test frame).
family_history_with_overweight_label = {'yes': 2, 'no': 1}
FAVC_label = {'no': 1, 'yes': 2}
CAEC_label = {'Sometimes': 2, 'Frequently': 3, 'Always': 4, 'no': 1}
SMOKE_label = {'no': 1, 'yes': 2}
CALC_label = {'no': 1, 'Sometimes': 2, 'Frequently': 3, 'Always': 4}
SCC_label = {'no': 2, 'yes': 1}
MTRANS_label = {
    'Public_Transportation': 3,
    'Automobile': 5,
    'Walking': 1,
    'Bike': 2,
    'Motorbike': 4,
}

# Encode the target.
df['NObeyesdad'] = df['NObeyesdad'].map(NObeyesdad_label).astype('int8')

# Encode each feature with its own table, applying the same table to the
# training frame and then to the test frame so both share one encoding.
feature_label_maps = {
    'family_history_with_overweight': family_history_with_overweight_label,
    'FAVC': FAVC_label,
    'CAEC': CAEC_label,
    'SMOKE': SMOKE_label,
    'CALC': CALC_label,
    'SCC': SCC_label,
    'MTRANS': MTRANS_label,
}
for feature, label_map in feature_label_maps.items():
    for frame in (df, df2):
        frame[feature] = frame[feature].map(label_map).astype('int8')



# One-hot encoding
#
# The training and test frames must end up with exactly the same feature
# columns. Encoding them independently does not guarantee that: category
# levels are inferred per frame, so a level missing from one split (or a
# different first level being removed by drop_first) would yield different
# dummy columns. The test categories are therefore pinned to the training
# categories before encoding, and the test columns are aligned to the
# training feature columns afterwards.

# Characters replaced by '_' in column names
# (presumably because XGBoost rejects feature names containing them).
regex = re.compile(r"\[|\]|<")


def _one_hot_encode(frame):
    """Return a copy of *frame* with category columns replaced by dummies.

    Non-category columns come first, in their original order, followed by
    the dummy columns (first level of each category dropped). '[', ']' and
    '<' in string column names are replaced by '_'.
    """
    category_columns = frame.select_dtypes(include='category').columns
    dummies = pd.get_dummies(
        frame[category_columns],
        columns=category_columns,
        drop_first=True,
        dtype=int,
        sparse=True,
    )
    encoded = pd.concat([frame.select_dtypes(exclude='category'), dummies], axis=1)
    encoded.columns = [
        regex.sub("_", name) if isinstance(name, str) else name
        for name in encoded.columns
    ]
    return encoded


# Give every shared categorical column the training category set, so both
# frames produce the same dummy columns. Test values unseen in training
# become missing and are encoded as all-zero dummies.
for name in df.select_dtypes(include='category').columns:
    if name in df2.columns:
        df2[name] = pd.Categorical(df2[name], categories=df[name].cat.categories)

df = _one_hot_encode(df)
df2 = _one_hot_encode(df2)

# Keep exactly the training feature columns, in training order. A feature
# missing from the test frame raises KeyError here instead of failing later
# inside the model.
df2 = df2[[name for name in df.columns if name != 'NObeyesdad']]


# Separate the target from the feature matrix
y = df['NObeyesdad']
X = df.drop(columns=['NObeyesdad'])


# Modeling

# Multi-class model trained on all rows; the number of classes is taken from
# the distinct values present in y.
n_classes = y.nunique()
model = XGBClassifier(
    objective='multi:softprob',
    num_class=n_classes,
    eval_metric='merror',
    random_state=1,
)
model.fit(X, y)

# Integer label -> category name, used to turn predictions back into names.
label_to_category = dict(enumerate([
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
]))

# Improve the predictions
#
# Three binary models, each trained only on the rows of two neighbouring
# classes. They are used at submission time to re-classify rows that the
# main model assigns to either class of the pair.


def _fit_pair_model(features, labels, lower, upper):
    """Fit a binary classifier separating label *lower* (0) from *upper* (1).

    Only rows whose label is *lower* or *upper* are used for training.
    """
    in_pair = labels.isin([lower, upper])
    pair_target = (labels[in_pair] == upper).astype('int')
    pair_model = XGBClassifier(objective='binary:logistic', random_state=1, eval_metric='error')
    pair_model.fit(features[in_pair], pair_target)
    return pair_model


# In each classes_* list, index 0 is the name for prediction 0 and index 1
# the name for prediction 1 of the matching model.
classes_23 = ['Overweight_Level_I', 'Overweight_Level_II']
model_23 = _fit_pair_model(X, y, 2, 3)

classes_34 = ['Overweight_Level_II', 'Obesity_Type_I']
model_34 = _fit_pair_model(X, y, 3, 4)

classes_12 = ['Normal_Weight', 'Overweight_Level_I']
model_12 = _fit_pair_model(X, y, 1, 2)
            

# Submission
# First-pass prediction from the multi-class model, as category names.
new_pred = pd.Series(model.predict(df2)).map(label_to_category).astype('str')

# Refinement cascade: each stage takes the rows whose current prediction is
# one of the two classes of a pair and lets the matching binary model decide
# between them. Stages run in this fixed order and each one sees the output
# of the previous one, so a row can be revised more than once.
#
# Every stage predicts all of its rows in a single batched call rather than
# one model call per row; which rows are selected and what they are
# assigned is unchanged.
refinement_stages = (
    (model_23, classes_23),
    (model_34, classes_34),
    (model_12, classes_12),
)
for pair_model, pair_classes in refinement_stages:
    selected = new_pred.isin(pair_classes).to_numpy()
    if not selected.any():
        # Nothing to refine at this stage; skip predicting on an empty frame.
        continue
    # The binary prediction (0 or 1) is the index into pair_classes.
    pair_codes = pair_model.predict(df2[selected])
    new_pred[selected] = np.asarray(pair_classes, dtype=object)[pair_codes]
        


# Assemble the submission table: test ids next to the predicted class names
XGBoost_submission = pd.concat(
    [submission_id.rename('id'), new_pred.rename('NObeyesdad')],
    axis=1,
)

print('submission:', XGBoost_submission)

# Write the submission to disk without the index column
XGBoost_submission.to_csv('submission.csv', index=False)

submission:           id           NObeyesdad
0      20758      Obesity_Type_II
1      20759   Overweight_Level_I
2      20760     Obesity_Type_III
3      20761       Obesity_Type_I
4      20762     Obesity_Type_III
...      ...                  ...
13835  34593  Overweight_Level_II
13836  34594   Overweight_Level_I
13837  34595  Insufficient_Weight
13838  34596        Normal_Weight
13839  34597      Obesity_Type_II

[13840 rows x 2 columns]
