In \[11\]:

    import pandas
    import os

    # Workbench-generated query for dataset "final_dataset", domain
    # "fitbit_sleep_daily_summary" (All of Us Registered Tier Dataset v7).
    # The CDR dataset id is read from the workspace environment; indexing
    # os.environ raises KeyError if WORKSPACE_CDR is not set.
    workspace_cdr = os.environ["WORKSPACE_CDR"]

    dataset_28067247_fitbit_sleep_daily_summary_sql = f"""
        SELECT
            sleep_daily_summary.person_id,
            sleep_daily_summary.sleep_date,
            sleep_daily_summary.is_main_sleep,
            sleep_daily_summary.minute_in_bed,
            sleep_daily_summary.minute_asleep,
            sleep_daily_summary.minute_after_wakeup,
            sleep_daily_summary.minute_awake,
            sleep_daily_summary.minute_restless,
            sleep_daily_summary.minute_deep,
            sleep_daily_summary.minute_light,
            sleep_daily_summary.minute_rem,
            sleep_daily_summary.minute_wake 
        FROM
            `{workspace_cdr}.sleep_daily_summary` sleep_daily_summary 
            LIMIT 100000"""

    # The BigQuery Storage API is used only when the environment advertises it.
    storage_api_enabled = "BIGQUERY_STORAGE_API_ENABLED" in os.environ

    dataset_28067247_fitbit_sleep_daily_summary_df = pandas.read_gbq(
        dataset_28067247_fitbit_sleep_daily_summary_sql,
        dialect="standard",
        use_bqstorage_api=storage_api_enabled,
        progress_bar_type="tqdm_notebook",
    )

    # Preview the first rows as the cell's rich output.
    dataset_28067247_fitbit_sleep_daily_summary_df.head(n=5)

    Downloading:   0%|          | 0/100000 [00:00<?, ?rows/s]

Out\[11\]:

|     | person_id | sleep_date | is_main_sleep | minute_in_bed | minute_asleep | minute_after_wakeup | minute_awake | minute_restless | minute_deep | minute_light | minute_rem | minute_wake |
|-----|-----------|------------|---------------|---------------|---------------|---------------------|--------------|-----------------|-------------|--------------|------------|-------------|
| 0   | 1672313   | 2015-06-03 | false         | 10            | 0             | 0                   | 0            | 4               | \<NA\>      | \<NA\>       | \<NA\>     | \<NA\>      |
| 1   | 1764975   | 2014-09-17 | false         | 75            | 0             | 0                   | 0            | 69              | \<NA\>      | \<NA\>       | \<NA\>     | \<NA\>      |
| 2   | 2004003   | 2015-12-02 | false         | 127           | 0             | 0                   | 0            | 99              | \<NA\>      | \<NA\>       | \<NA\>     | \<NA\>      |
| 3   | 1688324   | 2014-03-06 | false         | 17            | 0             | 0                   | 0            | 16              | \<NA\>      | \<NA\>       | \<NA\>     | \<NA\>      |
| 4   | 2530374   | 2016-08-25 | false         | 13            | 0             | 0                   | 0            | 11              | \<NA\>      | \<NA\>       | \<NA\>     | \<NA\>      |

In \[20\]:

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.impute import SimpleImputer

    # Build the modelling frame from the sleep query result.
    # Sleep quality is minute_asleep / minute_in_bed, so rows where time in bed is
    # zero or either value is missing would give an inf/NaN target that the
    # regressors cannot be fitted on; those rows are dropped up front.
    # .copy() keeps the raw query result untouched when the target column is added.
    raw_sleep_df = dataset_28067247_fitbit_sleep_daily_summary_df
    has_valid_target = (
        raw_sleep_df['minute_in_bed'].notna()
        & raw_sleep_df['minute_in_bed'].ne(0)
        & raw_sleep_df['minute_asleep'].notna()
    )
    sleep_df = raw_sleep_df[has_valid_target.fillna(False)].copy()

    # Calculate sleep quality (percentage of time in bed spent asleep), floored at 0
    sleep_df['sleep_quality'] = (
        (sleep_df['minute_asleep'] / sleep_df['minute_in_bed']) * 100
    ).clip(lower=0)

    # Define features (X) and target (y)
    # NOTE(review): the target is computed directly from 'minute_asleep' and
    # 'minute_in_bed', which are both features, so a very high R2 is expected and
    # says little about predictive value on independent data.
    X = sleep_df[['minute_in_bed', 'minute_awake', 'minute_restless', 'minute_asleep']]
    # Plain float64 target so the scikit-learn estimators and metrics get an ordinary numeric array
    y = sleep_df['sleep_quality'].astype(float)

    # Handle missing feature values by filling them with the column mean (if any exist)
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

    # Initialize and fit the Gradient Boosting model
    gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    gbr.fit(X_train, y_train)

    # Initialize and fit the Random Forest model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Make predictions with both models on training and testing data
    y_train_pred_gbr = gbr.predict(X_train)
    y_test_pred_gbr = gbr.predict(X_test)

    y_train_pred_rf = rf.predict(X_train)
    y_test_pred_rf = rf.predict(X_test)

    # R2 for the Random Forest on both splits. These two values were referenced
    # below without being defined, which raised NameError on a fresh kernel.
    train_r2_rf = r2_score(y_train, y_train_pred_rf)
    test_r2_rf = r2_score(y_test, y_test_pred_rf)

    # The reported "accuracy" is R2 expressed as a percentage
    train_accuracy_rf = train_r2_rf * 100
    test_accuracy_rf = test_r2_rf * 100

    # Display Random Forest accuracy results
    print("\nRandom Forest Regressor Accuracy:")
    print(f"Train Accuracy: {train_accuracy_rf:.2f}%")
    print(f"Test Accuracy: {test_accuracy_rf:.2f}%")

    Random Forest Regressor Accuracy:
    Train Accuracy: 99.95%
    Test Accuracy: 99.53%

In \[22\]:

    def _read_float(prompt):
        """Prompt on stdin and return the reply parsed as a float."""
        return float(input(prompt))

    print("\nEnter the values for prediction:")
    minute_in_bed = _read_float("Minute in bed: ")
    minute_awake = _read_float("Minute awake: ")
    minute_restless = _read_float("Minute restless: ")
    minute_asleep = _read_float("Minute asleep: ")

    # Single-row frame whose columns carry the same names, in the same order,
    # as the values entered above.
    new_data = pd.DataFrame(
        [[minute_in_bed, minute_awake, minute_restless, minute_asleep]],
        columns=['minute_in_bed', 'minute_awake', 'minute_restless', 'minute_asleep'],
    )

    # Run the input through the already-fitted imputer, then the Random Forest.
    new_data_imputed = imputer.transform(new_data)
    pred_rf = rf.predict(new_data_imputed)

    print(f"Predicted Sleep Quality (Random Forest): {pred_rf[0]:.2f}")

    Enter the values for prediction:
    Minute in bed: 30
    Minute awake: 4
    Minute restless: 4
    Minute asleep: 4
    Predicted Sleep Quality (Random Forest): 18.70

In \[32\]:

    import pandas
    import os

    # Workbench-generated query for dataset "final_dataset", domain
    # "fitbit_activity" (All of Us Registered Tier Dataset v7).
    # The CDR dataset id is read from the workspace environment; indexing
    # os.environ raises KeyError if WORKSPACE_CDR is not set.
    workspace_cdr = os.environ["WORKSPACE_CDR"]

    dataset_28067247_fitbit_activity_sql = f"""
        SELECT
            activity_summary.person_id,
            activity_summary.date,
            activity_summary.activity_calories,
            activity_summary.calories_bmr,
            activity_summary.calories_out,
            activity_summary.elevation,
            activity_summary.fairly_active_minutes,
            activity_summary.floors,
            activity_summary.lightly_active_minutes,
            activity_summary.marginal_calories,
            activity_summary.sedentary_minutes,
            activity_summary.steps,
            activity_summary.very_active_minutes 
        FROM
            `{workspace_cdr}.activity_summary` activity_summary 
            LIMIT 100000"""

    # The BigQuery Storage API is used only when the environment advertises it.
    storage_api_enabled = "BIGQUERY_STORAGE_API_ENABLED" in os.environ

    dataset_28067247_fitbit_activity_df = pandas.read_gbq(
        dataset_28067247_fitbit_activity_sql,
        dialect="standard",
        use_bqstorage_api=storage_api_enabled,
        progress_bar_type="tqdm_notebook",
    )

    # Preview the first rows as the cell's rich output.
    dataset_28067247_fitbit_activity_df.head(n=5)

    Downloading:   0%|          | 0/100000 [00:00<?, ?rows/s]

Out\[32\]:

|     | person_id | date       | activity_calories | calories_bmr | calories_out | elevation | fairly_active_minutes | floors | lightly_active_minutes | marginal_calories | sedentary_minutes | steps | very_active_minutes |
|-----|-----------|------------|-------------------|--------------|--------------|-----------|-----------------------|--------|------------------------|-------------------|-------------------|-------|---------------------|
| 0   | 1653707   | 2020-10-28 | 468.0             | 2476.0       | 2820.0       | NaN       | 72.0                  | \<NA\> | 0.0                    | 288.0             | 1368.0            | 8583  | 0.0                 |
| 1   | 1872905   | 2016-10-27 | 276.0             | 1742.0       | 1955.0       | NaN       | 52.0                  | \<NA\> | 0.0                    | 156.0             | 1388.0            | 0     | 0.0                 |
| 2   | 1032025   | 2018-11-28 | 184.0             | 1295.0       | 1442.0       | NaN       | 41.0                  | \<NA\> | 0.0                    | 123.0             | 1399.0            | 0     | 0.0                 |
| 3   | 2878490   | 2018-07-06 | 326.0             | 1144.0       | 1404.0       | NaN       | 83.0                  | \<NA\> | 0.0                    | 166.0             | 1357.0            | 12711 | 0.0                 |
| 4   | 2658608   | 2015-09-03 | 369.0             | 1397.0       | 1694.0       | NaN       | 95.0                  | \<NA\> | 0.0                    | 190.0             | 1345.0            | 0     | 0.0                 |

In \[29\]:

    import pandas
    import os

    # Workbench-generated query for dataset "final_dataset", domain
    # "fitbit_heart_rate_summary" (All of Us Registered Tier Dataset v7).
    # The CDR dataset id is read from the workspace environment; indexing
    # os.environ raises KeyError if WORKSPACE_CDR is not set.
    workspace_cdr = os.environ["WORKSPACE_CDR"]

    dataset_28067247_fitbit_heart_rate_summary_sql = f"""
        SELECT
            heart_rate_summary.person_id,
            heart_rate_summary.date,
            heart_rate_summary.zone_name,
            heart_rate_summary.min_heart_rate,
            heart_rate_summary.max_heart_rate,
            heart_rate_summary.minute_in_zone,
            heart_rate_summary.calorie_count 
        FROM
            `{workspace_cdr}.heart_rate_summary` heart_rate_summary
            LIMIT 100000"""

    # The BigQuery Storage API is used only when the environment advertises it.
    storage_api_enabled = "BIGQUERY_STORAGE_API_ENABLED" in os.environ

    dataset_28067247_fitbit_heart_rate_summary_df = pandas.read_gbq(
        dataset_28067247_fitbit_heart_rate_summary_sql,
        dialect="standard",
        use_bqstorage_api=storage_api_enabled,
        progress_bar_type="tqdm_notebook",
    )

    # Preview the first rows as the cell's rich output.
    dataset_28067247_fitbit_heart_rate_summary_df.head(n=5)

    Downloading:   0%|          | 0/100000 [00:00<?, ?rows/s]

Out\[29\]:

|     | person_id | date       | zone_name    | min_heart_rate | max_heart_rate | minute_in_zone | calorie_count |
|-----|-----------|------------|--------------|----------------|----------------|----------------|---------------|
| 0   | 3068090   | 2016-08-12 | Out of Range | 30             | 88             | \<NA\>         | NaN           |
| 1   | 1000184   | 2020-04-10 | Out of Range | 30             | 90             | \<NA\>         | 638.0424      |
| 2   | 1000184   | 2020-06-23 | Out of Range | 30             | 90             | \<NA\>         | 887.3826      |
| 3   | 1533946   | 2019-05-14 | Out of Range | 30             | 95             | \<NA\>         | NaN           |
| 4   | 2137532   | 2019-04-27 | Out of Range | 30             | 15             | 0              | 0.0000        |

In \[46\]:

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

    # Feature columns shared by the calorie regressor and both obesity classifiers
    activity_feature_cols = ['steps', 'fairly_active_minutes', 'very_active_minutes', 'sedentary_minutes']

    # Keep only the columns the models need and drop incomplete rows.
    # .copy() yields an independent frame, so adding the label column below
    # cannot write back into the raw query result.
    activity_df = (
        dataset_28067247_fitbit_activity_df[activity_feature_cols + ['activity_calories']]
        .dropna()
        .copy()
    )

    # Create a synthetic obesity status column (1 for obese, 0 for not obese)
    # NOTE(review): this label is a rule over 'sedentary_minutes' (a model feature)
    # and 'activity_calories', not a clinical measurement, so the classifier
    # accuracies reported below mostly measure how well the rule itself is recovered.
    activity_df['obesity'] = (
        (activity_df['sedentary_minutes'] > 480) |  # More than 8 hours sedentary
        (activity_df['activity_calories'] < 200)    # Low activity calories
    ).astype(int)

    # Define features (X) and target (y) for activity calories prediction
    X_activity = activity_df[activity_feature_cols]
    y_activity = activity_df['activity_calories']

    # Split the data into training and testing sets for activity calories
    X_train_activity, X_test_activity, y_train_activity, y_test_activity = train_test_split(X_activity, y_activity, test_size=0.2, random_state=42)

    # Initialize and fit the model for activity calories using Gradient Boosting.
    # Seeded like every other model in this notebook so re-runs are reproducible.
    model_activity = GradientBoostingRegressor(random_state=42)
    model_activity.fit(X_train_activity, y_train_activity)

    # Make predictions on the test set for activity calories
    y_pred_activity = model_activity.predict(X_test_activity)

    # Calculate R-squared for activity calories (accuracy is not applicable to regression)
    r2_activity = r2_score(y_test_activity, y_pred_activity)

    # Allow user to input values for prediction
    user_steps = int(input("Enter number of steps: "))
    user_fairly_active_minutes = int(input("Enter fairly active minutes: "))
    user_very_active_minutes = int(input("Enter very active minutes: "))
    user_sedentary_minutes = int(input("Enter sedentary minutes: "))

    # Create a single-row DataFrame for the user's input (same column names as the training features)
    user_input_activity = pd.DataFrame({
        'steps': [user_steps],
        'fairly_active_minutes': [user_fairly_active_minutes],
        'very_active_minutes': [user_very_active_minutes],
        'sedentary_minutes': [user_sedentary_minutes]
    })

    # Predict the activity calories based on the user's input
    user_activity_calories_prediction = model_activity.predict(user_input_activity)

    # Display the predicted activity calories
    print(f"Predicted Activity Calories: {user_activity_calories_prediction[0]:.2f}")

    # Define features (X) and target (y) for obesity prediction
    X_obesity = activity_df[activity_feature_cols]
    y_obesity = activity_df['obesity']

    # Split the data into training and testing sets for obesity prediction
    X_train_obesity, X_test_obesity, y_train_obesity, y_test_obesity = train_test_split(X_obesity, y_obesity, test_size=0.2, random_state=42)

    # Initialize and fit the Random Forest model for obesity, then score it on the test set
    model_obesity_rf = RandomForestClassifier(random_state=42)
    model_obesity_rf.fit(X_train_obesity, y_train_obesity)
    y_pred_obesity_rf = model_obesity_rf.predict(X_test_obesity)
    accuracy_obesity_rf = accuracy_score(y_test_obesity, y_pred_obesity_rf)

    # Initialize and fit the Gradient Boosting model for obesity, then score it on the test set
    model_obesity_gb = GradientBoostingClassifier(random_state=42)
    model_obesity_gb.fit(X_train_obesity, y_train_obesity)
    y_pred_obesity_gb = model_obesity_gb.predict(X_test_obesity)
    accuracy_obesity_gb = accuracy_score(y_test_obesity, y_pred_obesity_gb)

    # Display accuracy for the obesity predictions
    print("Obesity Prediction Accuracy (Random Forest):")
    print(f"Accuracy: {accuracy_obesity_rf:.2f}")
    print("Obesity Prediction Accuracy (Gradient Boosting):")
    print(f"Accuracy: {accuracy_obesity_gb:.2f}")

    # Probability of the positive class (column 1) for the user's input, as a percentage
    user_obesity_prediction_proba_rf = model_obesity_rf.predict_proba(user_input_activity)[:, 1]
    user_obesity_prediction_percentage_rf = user_obesity_prediction_proba_rf[0] * 100

    user_obesity_prediction_proba_gb = model_obesity_gb.predict_proba(user_input_activity)[:, 1]
    user_obesity_prediction_percentage_gb = user_obesity_prediction_proba_gb[0] * 100

    # Show both probabilities, since either one can trigger the advice below
    print(f"Predicted Obesity Probability (Random Forest): {user_obesity_prediction_percentage_rf:.2f}%")
    print(f"Predicted Obesity Probability (Gradient Boosting): {user_obesity_prediction_percentage_gb:.2f}%")

    # Suggest strategies when EITHER model puts the probability above 50%.
    # (The condition previously tested the Gradient Boosting value twice and
    # ignored the Random Forest result.)
    if user_obesity_prediction_percentage_rf > 50 or user_obesity_prediction_percentage_gb > 50:
        print("Suggestions to Overcome Obesity:")
        print("- Increase daily physical activity, aiming for at least 150 minutes of moderate aerobic activity per week.")
        print("- Incorporate strength training exercises at least twice a week.")
        print("- Monitor your caloric intake and consider a balanced diet with whole foods.")
        print("- Stay hydrated and reduce sugary beverages.")
        print("- Seek support from healthcare professionals or nutritionists for personalized guidance.")
    else:
        print("Keep up the good work! Maintaining a healthy lifestyle will help you stay fit.")

    Enter number of steps: 789
    Enter fairly active minutes: 77
    Enter very active minutes: 99
    Enter sedentary minutes: 9
    Predicted Activity Calories: 1794.06
    Obesity Prediction Accuracy (Random Forest):
    Accuracy: 1.00
    Obesity Prediction Accuracy (Gradient Boosting):
    Accuracy: 1.00
    Predicted Obesity Probability (Gradient Boosting): 4.30%
    Keep up the good work! Maintaining a healthy lifestyle will help you stay fit.

In \[31\]:

    import pandas as pd
    import os
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.metrics import classification_report, accuracy_score
    from sklearn.impute import SimpleImputer



    # Step 2: Data Preprocessing
    # Start from the frame produced by the heart-rate query cell
    # (dataset_28067247_...). The previous code referenced a dataset_95898450_...
    # frame that is never created in this notebook and raised NameError on a
    # fresh kernel. Rows without a zone label are dropped, and .copy() keeps the
    # raw query result unmodified so this cell can be re-run safely.
    heart_rate_df = dataset_28067247_fitbit_heart_rate_summary_df.dropna(subset=['zone_name']).copy()

    # Convert `minute_in_zone` and `calorie_count` to numeric, coercing errors to NaN
    imputed_cols = ['minute_in_zone', 'calorie_count']
    for numeric_col in imputed_cols:
        heart_rate_df[numeric_col] = pd.to_numeric(heart_rate_df[numeric_col], errors='coerce')

    # Imputation to handle missing values (column mean)
    # NOTE: this rebinds the notebook-level name `imputer`.
    imputer = SimpleImputer(strategy='mean')
    heart_rate_df[imputed_cols] = imputer.fit_transform(heart_rate_df[imputed_cols])

    # Step 3: Map zone_name to numeric class codes
    zone_mapping = {
        "Out of Range": 0,
        "Fat Burn": 1,
        "Cardio": 2,
        "Peak": 3
    }
    zone_codes = heart_rate_df['zone_name'].map(zone_mapping)
    # Fail early with a readable message if the data contains a zone we do not know,
    # instead of passing mixed/NaN labels on to the classifiers.
    if zone_codes.isna().any():
        unknown_zones = sorted(heart_rate_df.loc[zone_codes.isna(), 'zone_name'].astype(str).unique())
        raise ValueError(f"Unmapped zone_name values: {unknown_zones}")
    heart_rate_df['zone_name'] = zone_codes.astype(int)

    # Step 4: Features and target variable
    X = heart_rate_df[['min_heart_rate', 'max_heart_rate', 'minute_in_zone', 'calorie_count']]
    y = heart_rate_df['zone_name']

    # Step 5: Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Normalize the features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Step 6: Train Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Step 7: Train Gradient Boosting model
    gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gb_model.fit(X_train, y_train)

    # Step 8: Evaluate the models on the held-out split
    rf_y_pred = rf_model.predict(X_test)
    gb_y_pred = gb_model.predict(X_test)

    print("Random Forest Accuracy:", accuracy_score(y_test, rf_y_pred))
    print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_y_pred))

    print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_y_pred))
    print("\nGradient Boosting Classification Report:\n", classification_report(y_test, gb_y_pred))

    # Function to predict zone name based on user input
    def predict_zone_name(model, min_heart_rate, max_heart_rate, minute_in_zone, calorie_count):
        """Predict the heart-rate zone name for a single observation.

        Uses the notebook-level `scaler` and `zone_mapping` defined in this cell.

        Parameters:
            model: fitted classifier exposing `predict`; it is given the scaled features.
            min_heart_rate, max_heart_rate, minute_in_zone, calorie_count:
                numeric feature values, in the column order the scaler was fitted on.

        Returns:
            The zone name (a key of `zone_mapping`) whose numeric code the model predicts.

        Raises:
            KeyError: if the model predicts a code that is not a value of `zone_mapping`.
        """
        # A one-row frame with the training column names avoids scikit-learn's
        # "X does not have valid feature names" warning from the scaler.
        user_input = pd.DataFrame(
            [[min_heart_rate, max_heart_rate, minute_in_zone, calorie_count]],
            columns=['min_heart_rate', 'max_heart_rate', 'minute_in_zone', 'calorie_count'],
        )
        scaled_input = scaler.transform(user_input)  # Scale the input
        predicted_code = int(model.predict(scaled_input)[0])  # Numeric class code
        # Decode through the inverted mapping rather than by list position, so the
        # result stays correct even if the codes are not 0..n-1 in insertion order.
        code_to_zone = {code: name for name, code in zone_mapping.items()}
        return code_to_zone[predicted_code]

    # Risk label shown to the user for each heart-rate zone name
    risk_mapping = dict([
        ("Out of Range", "Potential Risk"),
        ("Fat Burn", "Low Risk"),
        ("Cardio", "Moderate Risk"),
        ("Peak", "Higher Risk"),
    ])

    def _read_float(prompt):
        """Prompt on stdin and return the reply parsed as a float."""
        return float(input(prompt))

    # Collect the four feature values interactively
    print("Enter the following details for prediction:")
    min_heart_rate = _read_float("Minimum Heart Rate: ")
    max_heart_rate = _read_float("Maximum Heart Rate: ")
    minute_in_zone = _read_float("Minutes in Zone: ")
    calorie_count = _read_float("Calorie Count: ")

    # Zone predicted by the Random Forest for the values entered above
    predicted_zone_name_rf = predict_zone_name(
        rf_model,
        min_heart_rate,
        max_heart_rate,
        minute_in_zone,
        calorie_count,
    )
    print("Predicted Zone Name (Random Forest):", predicted_zone_name_rf)

    # Look up and show the risk label that goes with that zone
    risk_suggestion_rf = risk_mapping[predicted_zone_name_rf]
    print("Risk Suggestion (Random Forest):", risk_suggestion_rf)

    Random Forest Accuracy: 0.9976

    Random Forest Classification Report:
                   precision    recall  f1-score   support

               0       1.00      1.00      1.00      6418
               1       0.99      0.95      0.97       761
               2       0.99      1.00      0.99      3398
               3       1.00      1.00      1.00      9423

        accuracy                           1.00     20000
       macro avg       0.99      0.99      0.99     20000
    weighted avg       1.00      1.00      1.00     20000

    Gradient Boosting Accuracy: 0.9955

    Gradient Boosting Classification Report:
                   precision    recall  f1-score   support

               0       1.00      1.00      1.00      6418
               1       0.98      0.90      0.94       761
               2       0.98      1.00      0.99      3398
               3       1.00      1.00      1.00      9423

        accuracy                           1.00     20000
       macro avg       0.99      0.97      0.98     20000
    weighted avg       1.00      1.00      1.00     20000

    Enter the following details for prediction:
    Minimum Heart Rate: 30
    Maximum Heart Rate: 88
    Minutes in Zone: 0
    Calorie Count: 0
    Predicted Zone Name (Random Forest): Out of Range
    Risk Suggestion (Random Forest): Potential Risk

    /opt/conda/lib/python3.10/site-packages/sklearn/base.py:464: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
      warnings.warn(

In \[ \]: