### Data prep

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict, cross_val_score

In [2]:
import pandas as pd
df = pd.read_csv('D:\IMPORTANT\AIML\Data\calories.csv')

In [3]:
print(df.columns)

Index(['Unnamed: 0', 'age', 'weight(kg)', 'height(m)', 'gender', 'BMI', 'BMR',
       'activity_level', 'calories_to_maintain_weight', 'BMI_tags', 'Label'],
      dtype='object')


In [4]:
print(df.shape)

(10726, 11)


In [5]:
df = df.drop('Unnamed: 0' , axis=1)

In [6]:
df = df.drop(['BMI_tags', 'Label'], axis=1)

In [7]:
print(df.columns)

Index(['age', 'weight(kg)', 'height(m)', 'gender', 'BMI', 'BMR',
       'activity_level', 'calories_to_maintain_weight'],
      dtype='object')


In [8]:
print(df.describe())

                age    weight(kg)     height(m)           BMI           BMR  \
count  10726.000000  10726.000000  10726.000000  10726.000000  10726.000000   
mean      26.778575     61.647728      1.540276     24.788317   1442.049923   
std       18.201675     24.254566      0.186042      6.954637    250.686166   
min        2.000000     12.005536      0.856328     10.342285    667.941000   
25%       13.000000     45.538375      1.564845     19.557137   1320.322500   
50%       23.000000     61.323460      1.631282     23.056039   1493.165000   
75%       36.000000     83.484585      1.632234     31.334294   1602.488000   
max       90.000000     98.799820      1.633223     37.142927   1914.374000   

       activity_level  calories_to_maintain_weight  
count    10726.000000                 10726.000000  
mean         1.487451                  2118.574075  
std          0.239625                   388.425427  
min          1.200000                   953.540900  
25%          1.300000  

In [9]:
X = df[['age', 'weight(kg)', 'height(m)', 'gender', 'BMI', 'BMR',
       'activity_level']]

y = df['calories_to_maintain_weight']

In [10]:
print(X.head(10))

   age  weight(kg)  height(m) gender        BMI      BMR  activity_level
0    2   16.097862   0.932025      F  18.531612  958.584             1.2
1    4   14.619374   0.916687      F  17.397496  932.383             1.7
2    4   17.899918   0.997288      F  17.997414  977.578             1.9
3    3   13.532880   1.022786      F  12.936609  944.689             1.9
4    4   17.039484   1.053977      M  15.338909  799.229             1.9
5    3   12.032662   1.078630      F  10.342285  939.781             1.9
6    2   17.794901   1.070393      M  15.531358  831.386             1.9
7    3   15.789179   0.912256      F  18.972577  947.560             1.3
8    3   13.300541   1.068686      F  11.645791  950.262             1.7
9    4   15.504307   0.900704      F  19.111207  938.161             1.3


In [11]:
X['gender'] = (X['gender'] == 'M').astype(int)
print(X['gender'])

0        0
1        0
2        0
3        0
4        1
        ..
10721    0
10722    0
10723    0
10724    0
10725    1
Name: gender, Length: 10726, dtype: int32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['gender'] = (X['gender'] == 'M').astype(int)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75,random_state=42)

### Feature engineering


In [13]:
print(X.head(10))

   age  weight(kg)  height(m)  gender        BMI      BMR  activity_level
0    2   16.097862   0.932025       0  18.531612  958.584             1.2
1    4   14.619374   0.916687       0  17.397496  932.383             1.7
2    4   17.899918   0.997288       0  17.997414  977.578             1.9
3    3   13.532880   1.022786       0  12.936609  944.689             1.9
4    4   17.039484   1.053977       1  15.338909  799.229             1.9
5    3   12.032662   1.078630       0  10.342285  939.781             1.9
6    2   17.794901   1.070393       1  15.531358  831.386             1.9
7    3   15.789179   0.912256       0  18.972577  947.560             1.3
8    3   13.300541   1.068686       0  11.645791  950.262             1.7
9    4   15.504307   0.900704       0  19.111207  938.161             1.3


In [14]:
print(X_train.isnull().sum())

age               0
weight(kg)        0
height(m)         0
gender            0
BMI               0
BMR               0
activity_level    0
dtype: int64


In [15]:
rf_model = RandomForestRegressor(
    n_estimators=200,        # Start with 200 trees
    max_depth=15,            # Set a moderate depth
    min_samples_split=4,     # Prevent overfitting by requiring 4 samples to split
    min_samples_leaf=2,      # Leaf nodes must have at least 2 samples
    max_features='sqrt',     # Consider sqrt of total features when splitting
    bootstrap=True,          # Use bootstrapped samples
    random_state=42          # For reproducibility
)

In [16]:
rf_model.fit(X_train, y_train)

In [17]:
y_pred = rf_model.predict(X_test)

In [18]:
mae = mean_absolute_error(y_test, y_pred)
print(mae)

15.706151146493129


In [19]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(100 - mape)

99.99209955700644


In [20]:
mse = mean_squared_error(y_test, y_pred)
print(mse)

575.9768613295624


In [21]:
y_cvscore = -cross_val_score(rf_model, X_train, y_train, cv=3,n_jobs=-1,scoring='neg_mean_absolute_error' )

In [22]:
print(y_cvscore)

[19.9927677  18.74765741 19.29461512]
