In [14]:
#Create Dataframe

import pandas as pd
df = pd.read_excel("~/Desktop/Research/LMU_Wellness/data/Wellness_Database_May19.xlsx", sheet_name="Wellness Responses")

sore_areas = ["Neck", "Back", "Shoulders", "Chest", "Arms", "Hip Flexors", "Glutes", "Hamstrings", "Quadricps", "Adductors", "Calves", "Feet"]

columns_to_drop = ['Athlete Name','Timestamp', 'Athlete ID #', 'Data ID', 'Week ID', 'Week ID Refined', 'Date Value', 'Year ID', 'Season ID', 'Injury Refined', 'Position', 'Classification', 'Stress RA', 'Stress StdDev', 'Stress Z-Score', 'Stress Wellness Score', 'Sleep Quality RA', 'Sleep Quality StdDev', 'Sleep Quality Z-Score', 'Sleep Quality Wellness Score', 'Sleep Quantity RA', 'Sleep Quantity StdDev', 'Sleep Quantity Z-Score', 'Sleep Quantity Wellness Score', 'Soreness RA', 'Soreness StdDev', 'Soreness Z-Score', 'Soreness Wellness Score', 'Hydrate RA', 'Hydrate StdDev', 'Hydrate Z-Score', 'Hydrate Wellness Score', 'Fuel RA', 'Fuel StdDev', 'Fuel Z-Score', 'Fuel Wellness Score', 'Readiness Score']
df = df.drop(columns_to_drop, axis=1)

df['No Injury'] = ((df['What is your injury status?'] == 'Full = I have no injury').astype(int))*10
df['Some Injury'] = ((df['What is your injury status?'] == 'Limited = I need a modification during lift / practice').astype(int))*10
df['Injury'] = ((df['What is your injury status?'] == 'Out = I have an injury').astype(int))*10

for area in sore_areas:
    df[area] = df.apply(lambda row: row["How sore are you?"] if (isinstance(row["Select where you are sore:"], str) and area in row["Select where you are sore:"]) else 0, axis=1)

# Drop the original "Select where you are sore:" column
df = df.drop("Select where you are sore:", axis=1)

# Drop the original column
df = df.drop('What is your injury status?', axis=1)

df = df.dropna(subset=["How well did you hydrate?"])

In [15]:
#Remove outliers
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

# Prepare the data
X = df.drop(["What is your readiness score? "], axis=1)  # Features
y = df["What is your readiness score? "]  # Target variable

# Add a constant term to the features
X = sm.add_constant(X)

# Fit the ordinary least squares (OLS) model
model = sm.OLS(y, X)
results = model.fit()

# Compute leverage values
leverage = OLSInfluence(results).hat_matrix_diag

# Compute Cook's distance
cooks_d = OLSInfluence(results).cooks_distance[0]

# Set a threshold for identifying influential points
threshold = 4 / len(X)  # You can adjust the threshold as needed

# Identify influential points based on Cook's distance and leverage
influential_points = np.where((cooks_d > threshold) | (leverage > np.mean(leverage) + 2 * np.std(leverage)))

# Remove values
for i in range(len(influential_points[0])):
    df = df.drop(index = (influential_points[0][i]+1061) ) 

  x = pd.concat(x[::order], 1)


In [4]:
#Get features

all_columns = df.columns
features = []
for col in all_columns:
    if (col != 'What is your readiness score? '):
        features.append(col)

features


['How stressed are you?',
 'How well did you sleep?',
 'How many hours did you sleep?',
 'How sore are you?',
 'How well did you hydrate?',
 'How well did you fuel?',
 'Neck',
 'Back',
 'Shoulders',
 'Chest',
 'Arms',
 'Hip Flexors',
 'Glutes',
 'Hamstrings',
 'Quadricps',
 'Adductors',
 'Calves',
 'Feet',
 'No Injury',
 'Some Injury',
 'Injury']

In [30]:
#Get all possible combinations of features

import itertools

features_combinations = []

for i in range(len(features)):
    if i == 0:
        continue
    for combination in (list(itertools.combinations(features, i))):
        features_combinations.append(combination)
    
features_combinations

[('How stressed are you?',),
 ('How well did you sleep?',),
 ('How many hours did you sleep?',),
 ('How sore are you?',),
 ('How well did you hydrate?',),
 ('How well did you fuel?',),
 ('Neck',),
 ('Back',),
 ('Shoulders',),
 ('Chest',),
 ('Arms',),
 ('Hip Flexors',),
 ('Glutes',),
 ('Hamstrings',),
 ('Quadricps',),
 ('Adductors',),
 ('Calves',),
 ('Feet',),
 ('No Injury',),
 ('Some Injury',),
 ('Injury',),
 ('How stressed are you?', 'How well did you sleep?'),
 ('How stressed are you?', 'How many hours did you sleep?'),
 ('How stressed are you?', 'How sore are you?'),
 ('How stressed are you?', 'How well did you hydrate?'),
 ('How stressed are you?', 'How well did you fuel?'),
 ('How stressed are you?', 'Neck'),
 ('How stressed are you?', 'Back'),
 ('How stressed are you?', 'Shoulders'),
 ('How stressed are you?', 'Chest'),
 ('How stressed are you?', 'Arms'),
 ('How stressed are you?', 'Hip Flexors'),
 ('How stressed are you?', 'Glutes'),
 ('How stressed are you?', 'Hamstrings'),
 ('

In [47]:
#Create Models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

best_model = None
best_rmse = float('inf')

# Split the data into training and test sets
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Iterate over each feature combination
for feature_comb in features_combinations:
    # Prepare the training data
    X_train_comb = X_train[list(feature_comb)]

    # Fit a linear regression model
    model = LinearRegression()
    model.fit(X_train_comb, y_train)

    # Prepare the test data
    X_test_comb = X_test[list(feature_comb)]

    # Predict using the model
    y_pred = model.predict(X_test_comb)

    # Calculate RMSE (Root Mean Squared Error)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    # Check if this model performs better than the previous best model
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(best_feature_comb)
print(best_rmse)

Best Model:
LinearRegression()


In [49]:
feature_names = list(X_train[list(best_feature_comb)].columns)

# Get the coefficients of the best model
coefficients = best_model.coef_

# Create a dictionary to map feature names to coefficients
feature_coefficients = dict(zip(feature_names, coefficients))

# Print the feature names and their corresponding coefficients
for feature, coefficient in feature_coefficients.items():
    print(f"{feature}: {coefficient}")

How well did you sleep?: 2.011942635963661
How sore are you?: 2.9401086144334423
How well did you fuel?: 0.010838726549540212
Neck: -2.147326896305411
Back: 3.186033560849018
Chest: 1.8883971688141166
Glutes: 3.474729943382447
No Injury: 6.136152172254745


In [50]:
best_rmse

14.088701248644005

In [56]:
X_test_comb = X_test[list(best_feature_comb)]
score = best_model.score(X_test_comb, y_test)
score

0.22971830590142395

In [52]:
#Create Models

best_score_model = None
best_score = -float('inf')

# Split the data into training and test sets
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Iterate over each feature combination
for feature_comb in features_combinations:
    # Prepare the training data
    X_train_comb = X_train[list(feature_comb)]

    # Fit a linear regression model
    model = LinearRegression()
    model.fit(X_train_comb, y_train)

    # Prepare the test data
    X_test_comb = X_test[list(feature_comb)]

    # Predict using the model
    y_pred = model.predict(X_test_comb)

    # Calculate score (R^2)
    score = model.score(X_test_comb, y_test)

    # Check if this model performs better than the previous best model
    if score > best_score:
        best_score = score
        best_score_model = model
        best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(best_score_model)
print("Score:")
print(best_score)

Best Model:
LinearRegression()
Score:
0.22971830590142395


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

feature_comb = ['How sore are you?', 'How stressed are you?', 'How well did you sleep?', 'Glutes', 'Hamstrings', 'How many hours did you sleep?', 'Shoulders', 'No Injury']

# Split the data into training and test sets
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_comb = X_train[feature_comb]

    # Fit a linear regression model
model = LinearRegression()
model.fit(X_train_comb, y_train)

    # Prepare the test data
X_test_comb = X_test[feature_comb]

    # Predict using the model
y_pred = model.predict(X_test_comb)

    # Calculate score (R^2)
score = model.score(X_test_comb, y_test)

In [17]:
feature_names = list(X_train[feature_comb].columns)

# Get the coefficients of the best model
coefficients = model.coef_

# Create a dictionary to map feature names to coefficients
feature_coefficients = dict(zip(feature_names, coefficients))

# Print the feature names and their corresponding coefficients
for feature, coefficient in feature_coefficients.items():
    print(f"{feature}: {coefficient}")

How sore are you?: 2.91948847737702
How stressed are you?: 1.0706141707220151
How well did you sleep?: 0.8104104321431078
Glutes: 0.715736609450274
Hamstrings: -0.17985339002541129
How many hours did you sleep?: 0.4553288543597745
Shoulders: 0.7195921420761577
No Injury: 0.47473537899120016


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

feature_comb = ['How sore are you?', 'How stressed are you?', 'How well did you sleep?', 'How many hours did you sleep?', 'No Injury']

# Split the data into training and test sets
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44969)

X_train_comb = X_train[feature_comb]

    # Fit a linear regression model
model = LinearRegression()
model.fit(X_train_comb, y_train)

    # Prepare the test data
X_test_comb = X_test[feature_comb]

    # Predict using the model
y_pred = model.predict(X_test_comb)

    # Calculate score (R^2)
score = model.score(X_test_comb, y_test)

In [22]:
feature_names = list(X_train[feature_comb].columns)

# Get the coefficients of the best model
coefficients = model.coef_

# Create a dictionary to map feature names to coefficients
feature_coefficients = dict(zip(feature_names, coefficients))

# Print the feature names and their corresponding coefficients
for feature, coefficient in feature_coefficients.items():
    print(f"{feature}: {coefficient}")

How sore are you?: 2.3993384368806767
How stressed are you?: 1.1123991225221017
How well did you sleep?: 0.6040513434989981
How many hours did you sleep?: 1.1409442656092776
No Injury: 0.4462223814040878


In [23]:
score

0.3062935587772292