In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer

In [7]:
train = pd.read_csv("/kaggle/input/playground-series-s5e5/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e5/test.csv")

#print(train.head)
#print(test.head)

#preprocess the datata, filter features [Sex, Age, ,Height, weight, Duration, Hear_rate, Body_Temp]
#output/Prediction: Calories
features_train = train[['Sex', 'Age',  'Weight', 'Duration', 'Heart_Rate', 'Calories']]
print(features_train.head())


#preprocess features
features_train.loc[:, "Sex"] = features_train["Sex"].apply(lambda x: 1.0 if x == 'male' else 0.0)
print(features_train.head())


#extract women/men calories to replace the nan. values with the mean (calories)
women = features_train[(features_train["Sex"] == 0.0) & (features_train["Calories"].notna())]
men = features_train[(features_train["Sex"] == 1.0) & (features_train["Calories"].notna())]



women_counter = len(women)
men_counter = len(men)

women_calories = women["Calories"].tolist()
men_calories = men["Calories"].tolist()

men_avg_calories = sum(men_calories)/men_counter
women_avg_calories = sum(women_calories)/women_counter

print(men_avg_calories)
print(women_avg_calories)

# replace nan. values with women average calories 
features_train.loc[
    (features_train["Sex"] == 0.0) & (features_train["Calories"].isna()),
    "Calories"
] = women_avg_calories

# replace nan. values with men average calories 
features_train.loc[
    (features_train["Sex"] == 1.0) & (features_train["Calories"].isna()),
    "Calories"
] = men_avg_calories

labels_train = features_train[["Calories"]]
features_train = features_train.drop("Calories", axis=1)

print(labels_train.head())
print(features_train.head())

#create Intensity Feature
features_train["Intensity"] = features_train["Heart_Rate"] * features_train["Duration"]




#Preprocess test data
features_test = test[['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']]
features_test.loc[:, "Sex"] = features_test["Sex"].apply(lambda x: 1.0 if x == 'male' else 0.0)
features_test["Intensity"] = features_test["Heart_Rate"] * features_test["Duration"]
features_test = features_test.drop("Height", axis=1)
features_test = features_test.drop('Body_Temp', axis=1)


print(features_test.head())
print(features_train.head())









      Sex  Age  Weight  Duration  Heart_Rate  Calories
0    male   36    82.0      26.0       101.0     150.0
1  female   64    60.0       8.0        85.0      34.0
2  female   51    64.0       7.0        84.0      29.0
3    male   20    90.0      25.0       105.0     140.0
4  female   38    61.0      25.0       102.0     146.0
   Sex  Age  Weight  Duration  Heart_Rate  Calories
0  1.0   36    82.0      26.0       101.0     150.0
1  0.0   64    60.0       8.0        85.0      34.0
2  0.0   51    64.0       7.0        84.0      29.0
3  1.0   20    90.0      25.0       105.0     140.0
4  0.0   38    61.0      25.0       102.0     146.0
89.03368342867219
87.53476116586509
   Calories
0     150.0
1      34.0
2      29.0
3     140.0
4     146.0
   Sex  Age  Weight  Duration  Heart_Rate
0  1.0   36    82.0      26.0       101.0
1  0.0   64    60.0       8.0        85.0
2  0.0   51    64.0       7.0        84.0
3  1.0   20    90.0      25.0       105.0
4  0.0   38    61.0      25.0       102.

In [8]:
#Start with random Forest classifier
random_forest = RandomForestRegressor()
random_forest.fit(features_train, labels_train)

  random_forest.fit(features_train, labels_train)


In [9]:

#predict Random Forest
y_pred = random_forest.predict(features_test)
# create new df with [id, calories]
result_df = pd.DataFrame({
    'id': test['id'], 
    'Calories': y_pred
})

# save result in csv
result_df.to_csv('predictions_RF.csv', index=False)

