In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import shap
import joblib
from ydata_profiling import ProfileReport
from langchain_ollama import OllamaLLM
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from lime.lime_tabular import LimeTabularExplainer
from treeinterpreter import treeinterpreter as ti
import joblib 

In [2]:
def generate_verbose_explanation(row_pred, row_actual, top_features, top_contributions):
    """Build a Markdown-flavoured narrative for one Resting BPM prediction.

    Args:
        row_pred: Predicted resting BPM; rendered with one decimal place.
        row_actual: Measured resting BPM; rendered with one decimal place.
        top_features: Feature names, in the order they should be listed.
        top_contributions: Signed contribution of each feature, paired
            positionally with ``top_features``. Pairing uses ``zip``, so any
            surplus items in the longer sequence are ignored.

    Returns:
        The report as a single string: a header, the predicted-vs-actual
        line, one bullet per feature, and a closing summary paragraph.
    """
    parts = [
        "Crew AI Analysis Report:\n\n",
        f"- The model predicted a Resting BPM of **{row_pred:.1f}**, "
        f"compared to the actual measured value of **{row_actual:.1f}**.\n\n"
        "Key contributing factors include:\n",
    ]

    for feature, contribution in zip(top_features, top_contributions):
        # Anything that is not strictly positive (including exactly zero) is
        # worded as "decreased".
        direction = "increased" if contribution > 0 else "decreased"
        # The verb already carries the sign, so print the magnitude only;
        # otherwise a negative value reads "decreased ... by -0.50 BPM".
        parts.append(
            f"• **{feature}** {direction} the prediction by "
            f"**{abs(contribution):.2f} BPM**.\n"
        )

    parts.append(
        "\nOverall, the prediction results from the combined influence of physiological, lifestyle, "
        "and demographic factors captured in the dataset. This explanation highlights how specific "
        "inputs push the predicted Resting BPM upward or downward."
    )

    return "".join(parts)


In [3]:
# load and set target
# Columns removed before modelling. pct_HRR is dropped so the model has to
# predict the target rather than calculate it from that column; the remaining
# entries are dropped by the same single call.
DROPPED_COLUMNS = [
    'pct_HRR',
    'Avg_BPM',
    'pct_maxHR',
    'Max_BPM',
    'Session_Duration (hours)',
    'Calories_Burned',
    'Workout_Type',
    'Fat_Percentage',
    'Experience_Level',
    'Physical exercise',
    'Carbs',
    'Proteins',
    'Fats',
    'meal_name',
    'meal_type',
    'sugar_g',
    'cholesterol_mg',
    'serving_size_g',
    'cooking_method',
    'prep_time_min',
    'rating',
    'Name of Exercise',
    'Sets',
    'Reps',
    'Benefit',
    'Burns Calories (per 30 min)',
    'Target Muscle Group',
    'Equipment Needed',
    'Difficulty Level',
    'Body Part',
    'Type of Muscle',
    'Workout',
    'cal_from_macros',
    'pct_carbs',
    'protein_per_kg',
    'cal_balance',
    'lean_mass_kg',
    'expected_burn',
    'Burns Calories (per 30 min)_bc',
    'Burns_Calories_Bin',
    'sodium_mg',
    'cook_time_min',
    'BMI_calc',
    'Calories',
]

# One drop call instead of one frame copy per column; a missing column still
# raises KeyError, exactly as the per-column drops did.
df = pd.read_csv("data.csv").drop(columns=DROPPED_COLUMNS)

# Separate the regression target from the feature matrix.
target = "Resting_BPM"
X = df.drop(columns=[target])
y = df[target]

In [4]:
# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# validation and test sets; both splits use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Export the test features (not yet one-hot encoded at this point) together
# with their true target values.
# NOTE(review): a later cell writes X_test to this same path, so this file is
# replaced on a full run — confirm which version the app is meant to read.
test_df = X_test.assign(**{"Resting_BPM": y_test})
test_df.to_csv("resting_bpm_app/test_data.csv", index=False)



In [5]:
# convert bools to ints
# The set of boolean columns is decided on the training split and applied to
# all three splits. astype() with a mapping returns new frames, which avoids
# assigning into the objects handed back by train_test_split.
bool_cols = X_train.select_dtypes(include='bool').columns
bool_to_int = {col: int for col in bool_cols}
X_train = X_train.astype(bool_to_int)
X_val = X_val.astype(bool_to_int)
X_test = X_test.astype(bool_to_int)

# one-hot encode all remaining categoricals
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
# Only the training split decides which level is the dropped baseline.
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
# Validation/test keep every level here. If drop_first were applied per split,
# a split that happens to lack the training baseline level would drop a
# different level, and the reindex below would then zero-fill that column,
# silently encoding those rows as the baseline.
X_val = pd.get_dummies(X_val, columns=categorical_cols)
X_test = pd.get_dummies(X_test, columns=categorical_cols)

# align validation and test columns with training: the baseline level and any
# level unseen in training are discarded, and training levels absent from a
# split are added as all-zero columns.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [6]:
# Train a 100-tree random forest regressor on the encoded training split;
# the seed is fixed so repeated runs build the same forest.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [7]:
# val and test
# R² on the validation and test splits (the regressor's default .score()).
val_score = model.score(X_val, y_val)
test_score = model.score(X_test, y_test)

print("Validation R² score:", val_score)
print("Test R² score:", test_score)

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Predict once and reuse the result for both error metrics instead of running
# the forest over the test set twice.
y_test_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_test_pred)
# scikit-learn reports MAPE as a fraction (0.01 means 1%), not scaled to 0-100.
mape = mean_absolute_percentage_error(y_test, y_test_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Absolute Percentage Error (MAPE):", mape)

# Export the encoded test features and, separately, their labels.
# NOTE(review): this replaces the test_data.csv written right after the
# train/test split (un-encoded features plus target) — confirm which layout
# the app expects.
X_test.to_csv("resting_bpm_app/test_data.csv", index=False)
y_test.to_csv("resting_bpm_app/test_labels.csv", index=False)

Validation R² score: 0.9847663060102245
Test R² score: 0.9821238859698859
Mean Absolute Error (MAE): 0.46991709999999953
Mean Absolute Percentage Error (MAPE): 0.007756389841079219


In [8]:
# Rank the forest's impurity-based feature importances, largest first.
importances = (
    pd.Series(model.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
print("\nTop 10 important features:")
print(importances.head(10))
# In the recorded run Age ranks first; cholesterol_mg is dropped during
# loading, so it cannot appear in this list.


Top 10 important features:
Age                              0.214515
Height (m)                       0.175600
Weight (kg)                      0.161852
BMI                              0.158613
Water_Intake (liters)            0.149014
Daily meals frequency            0.072743
Workout_Frequency (days/week)    0.066460
diet_type_Low-Carb               0.000270
diet_type_Vegan                  0.000230
diet_type_Paleo                  0.000183
dtype: float64


In [9]:
#tidy data
# Force every column to numeric; values that cannot be parsed become NaN and
# are then replaced by 0.
X_train_num = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test_num = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# Optional: align columns just in case
X_test_num = X_test_num[X_train_num.columns]

# set how many rows
N = 100
# Maximum number of contributions listed per row. The earlier note said
# "top five", but the code has always kept up to 50, which lists every
# feature when the frame has fewer than 50 columns.
# NOTE(review): value left unchanged — confirm whether 5 was intended.
TOP_K = 50
# Predictions below this value are labelled "Normal", otherwise "Elevated".
ELEVATED_BPM_THRESHOLD = 80

# .iloc[:N] yields fewer than N rows when the test split is smaller than N.
X_test_sample = X_test_num.iloc[:N]
y_test_sample = y_test.iloc[:N]

#predictions with contributers in mind
predictions, bias, contributions = ti.predict(model, X_test_sample)

results_list = []

# Iterate over the rows actually present rather than range(N), which raised
# IndexError whenever the test split held fewer than N rows.
for pred_row, row_actual, row_contrib in zip(predictions, y_test_sample, contributions):
    row_pred = pred_row[0]

    # Indices of the largest-magnitude contributions, biggest first.
    top_idx = np.argsort(np.abs(row_contrib))[-TOP_K:][::-1]
    top_features = X_test_sample.columns[top_idx]
    top_contributions = row_contrib[top_idx]

    explanation = ", ".join([f"{f} ({c:+.2f})" for f, c in zip(top_features, top_contributions)])
    decision = "Normal" if row_pred < ELEVATED_BPM_THRESHOLD else "Elevated"

    results_list.append({
        "Predicted_Resting_BPM": row_pred,
        "Actual_Resting_BPM": row_actual,
        "Decision": decision,
        "Explanation": explanation
    })

#saving
results = pd.DataFrame(results_list)
results.to_csv("resting_bpm_results_sample.csv", index=False)

print("Results saved to 'resting_bpm_results_sample.csv'")




Results saved to 'resting_bpm_results_sample.csv'


In [10]:
# Write the encoded training column names, in order, to a CSV file for the app.
columns_path = 'resting_bpm_app/model/columns.csv'
training_columns = X_train.columns.to_series()
training_columns.to_csv(columns_path, index=False)


In [11]:
# Serialize the fitted forest to disk; the call is left as the final
# expression so the notebook still echoes the list of written paths.
model_path = "resting_bpm_app/model/resting_bpm_model.joblib"
joblib.dump(model, model_path)

['resting_bpm_app/model/resting_bpm_model.joblib']