In [1]:
# Install any required libraries that are not already available in Colab
!pip install xgboost



In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error
from math import sqrt

# Load data
train = pd.read_csv("/content/drive/MyDrive/train.csv")
test = pd.read_csv("/content/drive/MyDrive/test.csv")

# Encode 'Sex'
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

# Features and target
features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
target = 'Calories'

X = train[features]
y = train[target]

# Train/Val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Predict on validation
val_preds = model.predict(X_val)

# Clip predictions to ensure positivity
val_preds = np.clip(val_preds, a_min=1e-5, a_max=None)

# Evaluate using RMSLE safely
epsilon = 1e-5
rmsle = sqrt(mean_squared_log_error(y_val + epsilon, val_preds + epsilon))
print(f"Validation RMSLE: {rmsle:.4f}")

# Predict on test set
test_preds = model.predict(test[features])
test_preds = np.clip(test_preds, a_min=1e-5, a_max=None)  # Clip again

# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'Calories': test_preds
})
submission.to_csv("submission1.csv", index=False)

print("✅ Submission file created.")

Validation RMSLE: 0.0812
✅ Submission file created.


In [16]:
submission.head(-5)

Unnamed: 0,id,Calories
0,750000,26.941383
1,750001,108.466202
2,750002,86.273491
3,750003,127.535469
4,750004,73.953224
...,...,...
249990,999990,5.968144
249991,999991,197.575272
249992,999992,62.536209
249993,999993,40.842762


In [19]:
# Step 1: Install required libraries
!pip install catboost lightgbm --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
# Step 2: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Step 3: Load data (upload your train.csv and test.csv in Colab)


train = pd.read_csv("/content/drive/MyDrive/train.csv")
test = pd.read_csv("/content/drive/MyDrive/test.csv")

In [23]:
# Step 4: Preprocessing and Feature Engineering
def preprocess(df):
    # Encode Sex
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # Add engineered features
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)
    df['MassTime'] = df['Weight'] * df['Duration']
    df['HRxDur'] = df['Heart_Rate'] * df['Duration']
    df['TempDiff'] = df['Body_Temp'] - 37  # Deviation from normal body temp

    return df

train = preprocess(train)
test = preprocess(test)

# Features and Target
features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
            'BMI', 'MassTime', 'HRxDur', 'TempDiff']
target = 'Calories'

X = train[features]
y = np.log1p(train[target])  # Log transform target

# Train/Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Step 5: Train CatBoost Model
cat_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    verbose=0
)
cat_model.fit(X_train, y_train)

# Predict on validation
cat_val_preds = np.expm1(cat_model.predict(X_val))
cat_test_preds = np.expm1(cat_model.predict(test[features]))

In [25]:
# Step 6: Train LightGBM Model
lgbm_model = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    random_state=42
)
lgbm_model.fit(X_train, y_train)

# Predict on validation
lgbm_val_preds = np.expm1(lgbm_model.predict(X_val))
lgbm_test_preds = np.expm1(lgbm_model.predict(test[features]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 11
[LightGBM] [Info] Start training from score 4.141163


In [26]:
# Step 7: Evaluate Individual Models
def rmsle(y_true, y_pred):
    return sqrt(mean_squared_log_error(y_true, np.clip(y_pred, 1e-5, None)))

print(f"CatBoost Val RMSLE: {rmsle(train.loc[y_val.index, 'Calories'], cat_val_preds):.5f}")
print(f"LightGBM Val RMSLE: {rmsle(train.loc[y_val.index, 'Calories'], lgbm_val_preds):.5f}")

# Step 8: Ensemble (Average Predictions)
ensemble_val_preds = (cat_val_preds + lgbm_val_preds) / 2
ensemble_test_preds = (cat_test_preds + lgbm_test_preds) / 2

print(f"Ensemble Val RMSLE: {rmsle(train.loc[y_val.index, 'Calories'], ensemble_val_preds):.5f}")

CatBoost Val RMSLE: 0.06043
LightGBM Val RMSLE: 0.06019
Ensemble Val RMSLE: 0.05967


In [27]:
# Step 9: Create Submission File
submission = pd.DataFrame({
    'id': test['id'],
    'Calories': np.clip(ensemble_test_preds, 0.1, None)  # Clip low values
})

submission.to_csv("submission2.csv", index=False)

print("\n✅ Final submission file created.")


✅ Final submission file created.


SUBMISSION - 3

In [28]:
# Step 1: Install required libraries
!pip install catboost lightgbm xgboost --quiet

In [29]:
# Step 2: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

In [30]:

train = pd.read_csv("/content/drive/MyDrive/train.csv")
test = pd.read_csv("/content/drive/MyDrive/test.csv")

In [31]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (750000, 9)
Test shape: (250000, 8)


In [32]:
# Step 4: Feature Engineering
def preprocess(df):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # Derived Features
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)
    df['MassTime'] = df['Weight'] * df['Duration']
    df['HRxDur'] = df['Heart_Rate'] * df['Duration']
    df['TempDiff'] = df['Body_Temp'] - 37
    df['HR_BMI_Interaction'] = df['Heart_Rate'] * df['BMI']
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 50, 70, 100], labels=[0,1,2,3]).astype(int)

    return df

train = preprocess(train)
test = preprocess(test)

features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
            'BMI', 'MassTime', 'HRxDur', 'TempDiff', 'HR_BMI_Interaction', 'AgeGroup']
target = 'Calories'

X = train[features]
y = np.log1p(train[target].values)
X_test = test[features]

print("Features used:", features)

Features used: ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI', 'MassTime', 'HRxDur', 'TempDiff', 'HR_BMI_Interaction', 'AgeGroup']


In [33]:
# Step 5: Define RMSLE metric
def rmsle(y_true, y_pred):
    return sqrt(mean_squared_log_error(y_true, np.clip(y_pred, 1e-5, None)))

In [34]:
# Step 6: Initialize models with better params
cat_params = {
    'iterations': 600,
    'learning_rate': 0.03,
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'RMSE',
    'verbose': 0
}

lgbm_params = {
    'n_estimators': 600,
    'learning_rate': 0.03,
    'num_leaves': 40,
    'max_depth': 7,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'random_state': 42
}

xgb_params = {
    'n_estimators': 600,
    'learning_rate': 0.03,
    'max_depth': 6,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'eval_metric': 'rmse',
    'verbosity': 0
}

models = [
    ('cat', CatBoostRegressor(**cat_params)),
    ('lgbm', LGBMRegressor(**lgbm_params)),
    ('xgb', XGBRegressor(**xgb_params))
]

In [35]:
# Step 7: 5-Fold Cross Validation + Ensemble
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros((len(X), len(models)))
test_preds = np.zeros((len(X_test), len(models)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold+1}")
    X_trn, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_trn, y_val = y[train_idx], y[val_idx]

    for i, (name, model) in enumerate(models):
        print(f"Training {name}...")
        model.fit(X_trn, y_trn)

        # OOF Predictions
        oof_preds[val_idx, i] = np.expm1(model.predict(X_val))

        # Test Predictions
        test_preds[:, i] += np.expm1(model.predict(X_test)) / NFOLDS


Fold 1
Training cat...
Training lgbm...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1427
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 13
[LightGBM] [Info] Start training from score 4.141163
Training xgb...

Fold 2
Training cat...
Training lgbm...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1432
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 13
[LightGBM] [Info] Start training from score 4.141466
Training xgb...

Fold 3
Training cat...
Training lgbm...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093478 seconds.
You can set `force_col_wise=true` to remove the overhea

In [36]:
# Step 8: Evaluate individual model performance on OOF
actual = train[target].values

for i, (name, _) in enumerate(models):
    score = rmsle(actual, oof_preds[:, i])
    print(f"{name.upper()} OOF RMSLE: {score:.5f}")

CAT OOF RMSLE: 0.06081
LGBM OOF RMSLE: 0.06043
XGB OOF RMSLE: 0.06039


In [37]:
# Step 9: Ensemble all models (simple average)
ensemble_oof = oof_preds.mean(axis=1)
ensemble_test = test_preds.mean(axis=1)

# Final Evaluation
final_score = rmsle(actual, ensemble_oof)
print(f"\nEnsemble OOF RMSLE: {final_score:.5f}")


Ensemble OOF RMSLE: 0.05987


In [38]:
# Step 10: Generate Submission
submission = pd.DataFrame({
    'id': test['id'],
    'Calories': np.clip(ensemble_test, 0.1, None)
})

submission.to_csv("submission_optimized.csv", index=False)
print("\n✅ Optimized submission file created.")

# Optional: Download
from google.colab import files
files.download("submission_optimized.csv")


✅ Optimized submission file created.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SUBMISSION - 4

In [45]:
# Step 1: Install TensorFlow
!pip install tensorflow --quiet

In [46]:
# Step 2: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.metrics import mean_squared_log_error
from math import sqrt

In [47]:

train = pd.read_csv("/content/drive/MyDrive/train.csv")
test = pd.read_csv("/content/drive/MyDrive/test.csv")

In [56]:
# Step 4: Feature Engineering
def preprocess(df):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)
    df['MassTime'] = df['Weight'] * df['Duration']
    df['HRxDur'] = df['Heart_Rate'] * df['Duration']
    df['TempDiff'] = df['Body_Temp'] - 37
    return df

train = preprocess(train)
test = preprocess(test)

features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
            'BMI', 'MassTime', 'HRxDur', 'TempDiff']
target = 'Calories'

X = train[features].values
y = train[target].values
X_test = test[features].values

In [57]:
# Step 5: Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [58]:
# Step 6: Split for validation
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [62]:
def rmsle(y_true, y_pred):
    # Use a large number instead of None for the upper clip bound
    first_log = tf.math.log(tf.clip_by_value(y_true + 1.0, 1e-8, 1e9)) # Use 1.0 to ensure float operation
    second_log = tf.math.log(tf.clip_by_value(y_pred + 1.0, 1e-8, 1e9)) # Use 1.0 to ensure float operation
    return tf.sqrt(tf.reduce_mean(tf.square(first_log - second_log)))

In [63]:
model = models.Sequential([
    layers.Dense(128, activation='relu', kernel_regularizer='l2'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

# Use RMSProp or Adam with a good learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss=rmsle)

In [64]:
# Step 9: Train the model
early_stop = callbacks.EarlyStopping(patience=20, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

print("Model trained.")

Model trained.


In [66]:
# Step 10: Evaluate on Validation Set
val_preds = model.predict(X_val).flatten()


[1m4688/4688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step


In [67]:
# Step 11: Predict and Submit
test_preds = model.predict(X_test_scaled).flatten()

submission = pd.DataFrame({
    'id': test['id'],
    'Calories': np.clip(test_preds, 0.1, None)
})

submission.to_csv("submission_mlp_rmsle.csv", index=False)
print("\n✅ Submission file created.")

from google.colab import files
files.download("submission_mlp_rmsle.csv")

[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step

✅ Submission file created.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>