In [195]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error

In [196]:
# Read the four CSV inputs from the current working directory, in this order.
submission, train, test, original = (
    pd.read_csv(csv_name)
    for csv_name in (
        "sample_submission.csv",
        "train.csv",
        "test.csv",
        "original.csv",
    )
)

In [197]:
# Name of the label column: split off from the features before training and
# written into the submission frame at the end.
target = "Calories"

In [198]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [199]:
# Remove the row identifier so it is not fed to the model as a feature.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after "id" is already gone is a no-op
# instead of a KeyError.
train = train.drop(columns="id", errors="ignore")
test = test.drop(columns="id", errors="ignore")

In [200]:
# Display the training frame (pandas truncates the rendered rows).
train

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [201]:
# Encode Sex as a boolean flag (True where the value is "male").
# The dtype guard makes the cell safe to re-run: comparing an already-boolean
# column against "male" would silently turn every row into False.
for frame in (train, test):
    if not pd.api.types.is_bool_dtype(frame["Sex"]):
        frame["Sex"] = frame["Sex"] == "male"
train

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,True,36,189.0,82.0,26.0,101.0,41.0,150.0
1,False,64,163.0,60.0,8.0,85.0,39.7,34.0
2,False,51,161.0,64.0,7.0,84.0,39.8,29.0
3,True,20,192.0,90.0,25.0,105.0,40.7,140.0
4,False,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...
749995,True,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,False,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,True,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,True,45,182.0,91.0,17.0,102.0,40.3,109.0


In [202]:
# Separate the label column from the feature columns.
trainY = train[target]
trainX = train.drop(columns=[target])

In [203]:
# Keep 75% of the rows for fitting and hold out the rest for validation.
# A fixed random_state makes the split, and therefore the validation score,
# reproducible across kernel restarts.
X_train, X_val, Y_train, Y_val = train_test_split(
    trainX, trainY, train_size=0.75, random_state=42
)

In [204]:
# Baseline LightGBM regressor, fitted on the training split only so the
# held-out rows can be used for scoring.
lgbm_base = LGBMRegressor(
    max_depth=10,
    learning_rate=0.1,
    n_estimators=100,
    n_jobs=-1,
)
lgbm_base.fit(X_train, Y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 363
[LightGBM] [Info] Number of data points in the train set: 562500, number of used features: 7
[LightGBM] [Info] Start training from score 88.330066


In [205]:
# Predict on the held-out rows and floor the result at zero: burned calories
# cannot be negative.
Y_pred = np.clip(lgbm_base.predict(X_val), a_min=0, a_max=None)
Y_pred

array([ 12.9084716 ,  82.82838543, 121.30924637, ...,  89.77926821,
        41.55256916,  67.34613671], shape=(187500,))

In [206]:
# Root mean squared logarithmic error of the clipped validation predictions.
root_mean_squared_log_error(y_true=Y_val, y_pred=Y_pred)

np.float64(0.07266083737242206)

In [207]:
# Refit the same estimator on the full training set (training and validation
# rows together) before predicting on the test set. This overwrites the model
# that was scored above.
lgbm_base.fit(trainX, trainY)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 357
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 7
[LightGBM] [Info] Start training from score 88.282781


In [208]:
# Predict on the test features and floor the result at zero: burned calories
# cannot be negative.
test_pred = np.clip(lgbm_base.predict(test), a_min=0, a_max=None)

In [209]:
# Write the test predictions into the submission template's target column.
# NOTE(review): this assigns positionally, so it assumes test.csv rows are in
# the same order as sample_submission.csv — confirm.
submission[target] = test_pred
submission

Unnamed: 0,id,Calories
0,750000,27.153256
1,750001,109.386770
2,750002,87.916319
3,750003,128.172563
4,750004,74.251612
...,...,...
249995,999995,26.508327
249996,999996,9.042930
249997,999997,73.001986
249998,999998,166.767869


In [210]:
# Save the submission to disk; index=False leaves the DataFrame index out of
# the file.
submission.to_csv("submission.csv", index=False)