In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e5/sample_submission.csv
/kaggle/input/playground-series-s5e5/train.csv
/kaggle/input/playground-series-s5e5/test.csv


In [2]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn models for Regression
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor,
)
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_squared_log_error,
)
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Other machine learning libraries
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Preprocessing and pipelines
from sklearn.pipeline import make_pipeline , Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer


In [31]:
# Load the competition's training and test tables from the Kaggle input directory.
# NOTE(review): this cell carries execution count 31 while its neighbours are 2 and 4,
# i.e. it was re-run after later cells; confirm the notebook with Restart & Run All.
train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')

In [4]:
# Render the first rows of the training data with a light-grey body and a
# blue, bold header, formatting values with a precision of 2.
styled_df = (
    train.head()
    .style
    .set_properties(
        **{
            "background-color": "#f9f9f9",
            "color": "#333333",
            "border": "1px solid #ccc",
            "font-weight": "normal",
        }
    )
    .set_table_styles(
        [
            {
                "selector": "th",
                "props": [
                    ("background-color", "#1976d2"),
                    ("color", "white"),
                    ("font-weight", "bold"),
                ],
            }
        ]
    )
    .format(precision=2)
)
styled_df

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [5]:
# Count missing values in each column of the training data.
train.isna().sum()

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [7]:
# Check which numeric columns contain outliers (IQR rule)
def outlier(df, factor=1.5):
    """Return the names of the numeric columns that contain IQR outliers.

    A value is treated as an outlier when it lies below ``Q1 - factor * IQR``
    or above ``Q3 + factor * IQR``, where ``Q1`` and ``Q3`` are the column's
    0.25 and 0.75 quantiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Only the columns selected by
        ``select_dtypes(include='number')`` are checked.
    factor : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        conventional 1.5 * IQR rule; larger values flag fewer columns.

    Returns
    -------
    list
        Labels of the numeric columns, in column order, that have at least
        one value outside the fences.
    """
    outlier_col = []
    for col in df.select_dtypes(include='number').columns:
        values = df[col]
        # One quantile call yields both quartiles, in the order requested.
        q1, q3 = values.quantile([0.25, 0.75])
        spread = factor * (q3 - q1)
        lower_bound = q1 - spread
        upper_bound = q3 + spread

        # A single value outside the fences is enough to flag the column.
        if ((values < lower_bound) | (values > upper_bound)).any():
            outlier_col.append(col)
    return outlier_col

# Run the IQR check on the training frame and report which columns were flagged.
outlier_col = outlier(train)
print(f"These are the columns which has outlier values {outlier_col}")

# In the first iteration we train with the outliers left in; in the next one we will look at how to handle them.

These are the columns which has outlier values ['Height', 'Weight', 'Heart_Rate', 'Body_Temp', 'Calories']


In [8]:
# Separate the categorical and numerical columns for the pipeline.
def seperate_col(df):
    """Split a frame's column labels into numeric and non-numeric groups.

    Columns are classified with ``select_dtypes(include='number')``, the same
    selector ``outlier`` uses. Comparing the dtype against ``object`` instead
    would label every non-object column (categorical, pandas string dtype,
    ...) as numeric and send it to the numeric imputer/scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple of (list, list)
        ``(num_col, cat_col)``: labels of the numeric columns and labels of
        all remaining columns, each in the frame's column order.
    """
    numeric = set(df.select_dtypes(include='number').columns)
    num_col = [col for col in df.columns if col in numeric]
    cat_col = [col for col in df.columns if col not in numeric]
    return num_col, cat_col
    
# Split off the target, then drop it and the row identifier from the features.
# `train` is overwritten here, so the cell is guarded against being re-run:
# on a second run 'Calories' is already gone, the previously extracted `y`
# is kept, and the drop is a no-op instead of raising KeyError.
if 'Calories' in train.columns:
    y = train['Calories']
train = train.drop(columns=['Calories', 'id'], errors='ignore')

# Column groups that feed the numeric / categorical preprocessing pipelines.
num_col, cat_col = seperate_col(train)
print(f"These are the numberical columns in the dataframe: {num_col}")
print(f"These are the categorical columns in the dataframe: {cat_col}")


These are the numberical columns in the dataframe: ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
These are the categorical columns in the dataframe: ['Sex']


In [10]:
# Create the preprocessing pipelines for the numerical and categorical columns


# Numeric columns: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode (unknown categories are ignored rather than raising).
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)



In [11]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_col),
        ('cat', cat_pipeline, cat_col),
    ]
)

In [12]:
# Baseline model: preprocessing followed by a single decision tree.
# The tree is seeded (same seed as the train/test split) because scikit-learn
# randomly permutes the features at each split, so ties between equally good
# splits could otherwise be resolved differently from run to run.
model_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('Regressor', DecisionTreeRegressor(random_state=42))
])


In [14]:
# 1. Train-test split: hold out 20% of the rows, using a fixed seed
X = train[[*num_col, *cat_col]]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Peek at the first training targets; the non-sequential index shows the split shuffled the rows.
y_train.head()

453635     25.0
11651      67.0
431999     86.0
529211      5.0
110925    122.0
Name: Calories, dtype: float64

In [17]:
# 2-3. Fit the pipeline on the training split, then predict the held-out rows
#      (fit() returns the fitted pipeline, so the two calls can be chained)
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# 4. Evaluate with MSE on the held-out split
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 26.17


In [22]:
# Preview the first rows of the test data.
# NOTE(review): the saved output of this cell has no `id` column, yet a later cell
# drops `id` from `test` -- the output looks stale from an earlier kernel state; re-run to confirm.
test.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,male,45,177.0,81.0,7.0,87.0,39.8
1,male,26,200.0,97.0,20.0,101.0,40.5
2,female,29,188.0,85.0,16.0,102.0,40.4
3,female,39,172.0,73.0,20.0,107.0,40.6
4,female,30,173.0,67.0,16.0,94.0,40.5


In [33]:
# Try the baseline model on the test file for submission:
# drop the `id` column so that only the feature columns are passed to the pipeline.
# NOTE(review): despite its name, `validation_set` is built from the competition test file,
# not from a validation split of the training data.
validation_set = test.drop(columns=['id'])

In [34]:
# Predict with the baseline pipeline, then wrap the predictions in a
# one-column DataFrame so they can be combined with the ids.
y_pred_v1 = model_pipeline.predict(validation_set)
y_validate = pd.DataFrame(y_pred_v1)


In [37]:
# Put the test ids next to the predictions and label the two columns
# the way the submission file expects.
y_submission = pd.concat(
    [test['id'].reset_index(drop=True), y_validate],
    axis=1,
).set_axis(['id', 'Calories'], axis=1)

In [28]:
# Load the sample submission to check the expected output format (columns: id, Calories).
# NOTE(review): the name `x` is easy to confuse with the feature matrix `X`;
# consider a descriptive name once no other cell depends on it.
x = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
x.head()

Unnamed: 0,id,Calories
0,750000,88.283
1,750001,88.283
2,750002,88.283
3,750003,88.283
4,750004,88.283
