In [39]:
import pandas as pd
import numpy as np

# Load the survey responses and keep only rows that have a hydration answer.
df = pd.read_excel(
    "~/Desktop/Research/LMU_Wellness/data/Wellness_Database_May19.xlsx",
    sheet_name="Wellness Responses",
).dropna(subset=["How well did you hydrate?"])

# Body areas searched for (as substrings) in the free-text soreness answer.
sore_areas = [
    "Neck", "Back", "Shoulders", "Chest", "Arms", "Hip Flexors",
    "Glutes", "Hamstrings", "Quadricps", "Adductors", "Calves", "Feet",
]


def _soreness_for(row, area):
    """Return the row's soreness rating when `area` occurs in its sore-area text, else 0."""
    selected = row["Select where you are sore:"]
    if isinstance(selected, str) and area in selected:
        return row["How sore are you?"]
    return 0


# One column per body area: the overall soreness rating if the area was named, 0 otherwise.
for area in sore_areas:
    df[area] = df.apply(_soreness_for, axis=1, args=(area,))

# The free-text column has been expanded into the per-area columns above.
df = df.drop(columns="Select where you are sore:")

# Identifier, date and pre-computed score columns that are removed from the frame.
columns_to_drop = [
    'Athlete Name', 'Timestamp', 'Athlete ID #', 'Data ID',
    'Week ID', 'Week ID Refined', 'Date Value', 'Year ID', 'Season ID',
    'Injury Refined', 'Position', 'Classification',
    'Stress RA', 'Stress StdDev', 'Stress Z-Score', 'Stress Wellness Score',
    'Sleep Quality RA', 'Sleep Quality StdDev', 'Sleep Quality Z-Score', 'Sleep Quality Wellness Score',
    'Sleep Quantity RA', 'Sleep Quantity StdDev', 'Sleep Quantity Z-Score', 'Sleep Quantity Wellness Score',
    'Soreness RA', 'Soreness StdDev', 'Soreness Z-Score', 'Soreness Wellness Score',
    'Hydrate RA', 'Hydrate StdDev', 'Hydrate Z-Score', 'Hydrate Wellness Score',
    'Fuel RA', 'Fuel StdDev', 'Fuel Z-Score', 'Fuel Wellness Score',
    'Readiness Score',
]
df = df.drop(columns=columns_to_drop)

# 0/1 indicator column for each injury-status answer.
injury_flags = {
    'No Injury': 'Full = I have no injury',
    'Some Injury': 'Limited = I need a modification during lift / practice',
    'Injury': 'Out = I have an injury',
}
for flag_column, answer in injury_flags.items():
    df[flag_column] = (df['What is your injury status?'] == answer).astype(int)

# The original answer column is replaced by the indicator columns.
df = df.drop(columns='What is your injury status?')

In [6]:
# Show the prepared frame; the cell's last expression is rendered as its output.
df

Unnamed: 0,How stressed are you?,How well did you sleep?,How many hours did you sleep?,How sore are you?,How well did you hydrate?,How well did you fuel?,What is your readiness score?,Neck,Back,Shoulders,...,Hip Flexors,Glutes,Hamstrings,Quadricps,Adductors,Calves,Feet,No Injury,Some Injury,Injury
1061,7,7,9.0,6,6.0,6.0,65.0,6,6,6,...,6,0,6,0,6,0,0,0,1,0
1062,9,9,8.0,9,9.0,8.0,90.0,0,0,0,...,9,9,9,9,0,0,0,1,0,0
1063,9,8,8.0,6,6.0,9.0,8.0,0,0,6,...,6,6,6,6,0,0,0,1,0,0
1064,6,9,9.0,5,7.0,7.0,75.0,0,0,0,...,0,0,5,5,0,0,0,1,0,0
1065,9,9,9.0,5,7.0,8.0,86.0,0,0,0,...,5,0,5,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,10,10,8.0,4,10.0,10.0,75.0,0,0,0,...,4,0,4,4,0,0,0,1,0,0
1544,10,10,8.0,8,10.0,10.0,90.0,0,0,0,...,0,0,8,0,0,8,0,1,0,0
1545,6,8,9.0,5,8.0,7.0,81.0,0,5,5,...,5,0,5,5,0,0,5,1,0,0
1546,9,2,5.0,8,8.0,4.0,50.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
#Get features

# Every column except the regression target is a candidate predictor.
# Note that the target's column name ends with a trailing space.
all_columns = df.columns
features = [col for col in all_columns if col != 'What is your readiness score? ']

features

['How stressed are you?',
 'How well did you sleep?',
 'How many hours did you sleep?',
 'How sore are you?',
 'How well did you hydrate?',
 'How well did you fuel?',
 'Neck',
 'Back',
 'Shoulders',
 'Chest',
 'Arms',
 'Hip Flexors',
 'Glutes',
 'Hamstrings',
 'Quadricps',
 'Adductors',
 'Calves',
 'Feet',
 'No Injury',
 'Some Injury',
 'Injury']

In [8]:
#Get all possible combinations of features

import itertools

# Enumerate every non-empty subset of `features`, smallest subsets first.
# The upper bound is len(features) + 1 so that the subset containing ALL
# features is generated too; stopping at len(features) silently left that
# candidate out of the search.
# NOTE: this materialises 2**len(features) - 1 tuples, so memory use grows
# quickly as features are added.
features_combinations = [
    combination
    for size in range(1, len(features) + 1)
    for combination in itertools.combinations(features, size)
]

features_combinations

[('How stressed are you?',),
 ('How well did you sleep?',),
 ('How many hours did you sleep?',),
 ('How sore are you?',),
 ('How well did you hydrate?',),
 ('How well did you fuel?',),
 ('Neck',),
 ('Back',),
 ('Shoulders',),
 ('Chest',),
 ('Arms',),
 ('Hip Flexors',),
 ('Glutes',),
 ('Hamstrings',),
 ('Quadricps',),
 ('Adductors',),
 ('Calves',),
 ('Feet',),
 ('No Injury',),
 ('Some Injury',),
 ('Injury',),
 ('How stressed are you?', 'How well did you sleep?'),
 ('How stressed are you?', 'How many hours did you sleep?'),
 ('How stressed are you?', 'How sore are you?'),
 ('How stressed are you?', 'How well did you hydrate?'),
 ('How stressed are you?', 'How well did you fuel?'),
 ('How stressed are you?', 'Neck'),
 ('How stressed are you?', 'Back'),
 ('How stressed are you?', 'Shoulders'),
 ('How stressed are you?', 'Chest'),
 ('How stressed are you?', 'Arms'),
 ('How stressed are you?', 'Hip Flexors'),
 ('How stressed are you?', 'Glutes'),
 ('How stressed are you?', 'Hamstrings'),
 ('

In [9]:
#Create Models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Track the lowest-RMSE model seen so far and the feature subset that produced it.
best_model = None
best_feature_comb = None
best_rmse = float('inf')

# Hold out 20% of the rows; every candidate subset is scored on this same split.
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): the winner is picked by its score on the test split, so the
# reported RMSE is an optimistic estimate; a separate validation split or
# cross-validation would be needed for an unbiased figure.
for feature_comb in features_combinations:
    comb_columns = list(feature_comb)
    X_train_comb = X_train[comb_columns]
    X_test_comb = X_test[comb_columns]

    # Fit ordinary least squares on this subset and predict the held-out rows.
    model = LinearRegression()
    model.fit(X_train_comb, y_train)
    y_pred = model.predict(X_test_comb)

    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Keep this model if it beats the best one so far.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(best_feature_comb)
print(best_rmse)

Best Model:
('How well did you sleep?', 'How sore are you?', 'How well did you fuel?', 'Neck', 'Back', 'Chest', 'Hip Flexors', 'Glutes', 'No Injury')
13.881245008367674


In [10]:
# Pair each predictor of the RMSE-selected model with its fitted coefficient.
feature_names = X_train[list(best_feature_comb)].columns.tolist()
coefficients = best_model.coef_
feature_coefficients = {name: weight for name, weight in zip(feature_names, coefficients)}

# One "name: coefficient" line per predictor, in the order the model was fitted.
for feature in feature_coefficients:
    coefficient = feature_coefficients[feature]
    print(f"{feature}: {coefficient}")

How well did you sleep?: 1.999955713470663
How sore are you?: 2.715358491177603
How well did you fuel?: 0.07019636527968842
Neck: -0.6780266983445491
Back: 0.6403778240182539
Chest: 0.43108060917651453
Hip Flexors: -0.2979231424198361
Glutes: 0.5948260403690995
No Injury: 6.074781166385044


In [11]:
# R^2 of the RMSE-selected model on the held-out rows.
X_test_comb = X_test.loc[:, list(best_feature_comb)]
score = best_model.score(X_test_comb, y_test)
score

0.2522360973069526

In [12]:
#Create Models

# Track the highest-R^2 model seen so far and the feature subset that produced it.
best_score_model = None
best_score_feature_comb = None
best_score = -float('inf')

# Hold out 20% of the rows; every candidate subset is scored on this same split.
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): the winner is picked by its score on the test split, so the
# reported R^2 is an optimistic estimate of out-of-sample performance.
for feature_comb in features_combinations:
    comb_columns = list(feature_comb)
    X_train_comb = X_train[comb_columns]
    X_test_comb = X_test[comb_columns]

    # Fit a linear regression model on this subset.
    model = LinearRegression()
    model.fit(X_train_comb, y_train)

    # score() predicts internally, so no separate predict() call is needed;
    # the previously computed `y_pred` was never read.
    score = model.score(X_test_comb, y_test)

    # Keep this model if it beats the best one so far.
    if score > best_score:
        best_score = score
        best_score_model = model
        best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(best_score_model)
print("Score:")
print(best_score)

Best Model:
LinearRegression()
Score:
0.2522360973069526


In [13]:
# Pair each predictor of the R^2-selected model with its fitted coefficient.
feature_names = X_train[list(best_score_feature_comb)].columns.tolist()
coefficients = best_score_model.coef_
feature_coefficients = {name: weight for name, weight in zip(feature_names, coefficients)}

# One "name: coefficient" line per predictor, in the order the model was fitted.
for feature in feature_coefficients:
    coefficient = feature_coefficients[feature]
    print(f"{feature}: {coefficient}")

How well did you sleep?: 1.999955713470663
How sore are you?: 2.715358491177603
How well did you fuel?: 0.07019636527968842
Neck: -0.6780266983445491
Back: 0.6403778240182539
Chest: 0.43108060917651453
Hip Flexors: -0.2979231424198361
Glutes: 0.5948260403690995
No Injury: 6.074781166385044


In [28]:
## PART 2 (AVERAGE SORENESS)
import pandas as pd
import numpy as np

# Load the survey responses and keep only rows that have a hydration answer.
df = pd.read_excel(
    "~/Desktop/Research/LMU_Wellness/data/Wellness_Database_May19.xlsx",
    sheet_name="Wellness Responses",
).dropna(subset=["How well did you hydrate?"])

# Body areas searched for (as substrings) in the free-text soreness answer.
sore_areas = [
    "Neck", "Back", "Shoulders", "Chest", "Arms", "Hip Flexors",
    "Glutes", "Hamstrings", "Quadricps", "Adductors", "Calves", "Feet",
]


def _average_soreness_for(row, area):
    """Return the row's soreness rating split across its listed entries when `area` is named, else 0."""
    selected = row["Select where you are sore:"]
    if isinstance(selected, str) and area in selected:
        # Number of commas + 1 is used as the count of entries in the answer.
        return row["How sore are you?"] / (selected.count(",") + 1)
    return 0


# One column per body area: the rating divided by the entry count, or 0 if the area is absent.
for area in sore_areas:
    df[area] = df.apply(_average_soreness_for, axis=1, args=(area,))

# The free-text column has been expanded into the per-area columns above.
df = df.drop(columns="Select where you are sore:")

# Identifier, date and pre-computed score columns that are removed from the frame.
columns_to_drop = [
    'Athlete Name', 'Timestamp', 'Athlete ID #', 'Data ID',
    'Week ID', 'Week ID Refined', 'Date Value', 'Year ID', 'Season ID',
    'Injury Refined', 'Position', 'Classification',
    'Stress RA', 'Stress StdDev', 'Stress Z-Score', 'Stress Wellness Score',
    'Sleep Quality RA', 'Sleep Quality StdDev', 'Sleep Quality Z-Score', 'Sleep Quality Wellness Score',
    'Sleep Quantity RA', 'Sleep Quantity StdDev', 'Sleep Quantity Z-Score', 'Sleep Quantity Wellness Score',
    'Soreness RA', 'Soreness StdDev', 'Soreness Z-Score', 'Soreness Wellness Score',
    'Hydrate RA', 'Hydrate StdDev', 'Hydrate Z-Score', 'Hydrate Wellness Score',
    'Fuel RA', 'Fuel StdDev', 'Fuel Z-Score', 'Fuel Wellness Score',
    'Readiness Score',
]
df = df.drop(columns=columns_to_drop)

# 0/1 indicator column for each injury-status answer.
injury_flags = {
    'No Injury': 'Full = I have no injury',
    'Some Injury': 'Limited = I need a modification during lift / practice',
    'Injury': 'Out = I have an injury',
}
for flag_column, answer in injury_flags.items():
    df[flag_column] = (df['What is your injury status?'] == answer).astype(int)

# The original answer column is replaced by the indicator columns.
df = df.drop(columns='What is your injury status?')



In [29]:
# Show the Part 2 frame; the cell's last expression is rendered as its output.
df

Unnamed: 0,How stressed are you?,How well did you sleep?,How many hours did you sleep?,How sore are you?,How well did you hydrate?,How well did you fuel?,What is your readiness score?,Neck,Back,Shoulders,...,Hip Flexors,Glutes,Hamstrings,Quadricps,Adductors,Calves,Feet,No Injury,Some Injury,Injury
1061,7,7,9.0,6,6.0,6.0,65.0,0.857143,0.857143,0.857143,...,0.857143,0.00,0.857143,0.000000,0.857143,0.0,0.000000,0,1,0
1062,9,9,8.0,9,9.0,8.0,90.0,0.000000,0.000000,0.000000,...,2.250000,2.25,2.250000,2.250000,0.000000,0.0,0.000000,1,0,0
1063,9,8,8.0,6,6.0,9.0,8.0,0.000000,0.000000,1.200000,...,1.200000,1.20,1.200000,1.200000,0.000000,0.0,0.000000,1,0,0
1064,6,9,9.0,5,7.0,7.0,75.0,0.000000,0.000000,0.000000,...,0.000000,0.00,1.666667,1.666667,0.000000,0.0,0.000000,1,0,0
1065,9,9,9.0,5,7.0,8.0,86.0,0.000000,0.000000,0.000000,...,2.500000,0.00,2.500000,0.000000,0.000000,0.0,0.000000,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,10,10,8.0,4,10.0,10.0,75.0,0.000000,0.000000,0.000000,...,1.333333,0.00,1.333333,1.333333,0.000000,0.0,0.000000,1,0,0
1544,10,10,8.0,8,10.0,10.0,90.0,0.000000,0.000000,0.000000,...,0.000000,0.00,4.000000,0.000000,0.000000,4.0,0.000000,1,0,0
1545,6,8,9.0,5,8.0,7.0,81.0,0.000000,0.714286,0.714286,...,0.714286,0.00,0.714286,0.714286,0.000000,0.0,0.714286,1,0,0
1546,9,2,5.0,8,8.0,4.0,50.0,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.0,0.000000,1,0,0


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Track the best model under each criterion (lowest RMSE, highest R^2) along
# with the feature subset that produced it.
best_model = None
best_feature_comb = None
best_rmse = float('inf')
best_score_model = None
best_score_feature_comb = None
best_score = -float('inf')

# Hold out 20% of the rows; every candidate subset is scored on this same split.
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): winners are picked by their score on the test split, so the
# reported RMSE / R^2 are optimistic estimates of out-of-sample performance.
for feature_comb in features_combinations:
    comb_columns = list(feature_comb)
    X_train_comb = X_train[comb_columns]
    X_test_comb = X_test[comb_columns]

    # Fit ordinary least squares on this subset and predict the held-out rows.
    model = LinearRegression()
    model.fit(X_train_comb, y_train)
    y_pred = model.predict(X_test_comb)

    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    score = model.score(X_test_comb, y_test)

    # Keep this model if it beats the best one so far under either criterion.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_feature_comb = feature_comb

    if score > best_score:
        best_score = score
        best_score_model = model
        best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(best_feature_comb)
print(best_rmse)

Best Model:
('How well did you sleep?', 'How sore are you?', 'Neck', 'Back', 'Chest', 'Hip Flexors', 'Glutes', 'Adductors', 'No Injury')
13.838017270306073


In [44]:
# Highest R^2 recorded by the model-search loop; shown as the cell output.
best_score

0.2568860853468552

> Random State Optimization with data set 2

In [36]:
# Columns to drop: everything in `df` that is not one of the selected
# predictors (this includes the target column).
nonfeatures = list(
    set(df.columns).difference(
        {
            'How well did you sleep?',
            'How sore are you?',
            'Neck',
            'Back',
            'Chest',
            'Hip Flexors',
            'Glutes',
            'Adductors',
            'No Injury',
        }
    )
)

In [38]:
X = df.drop(nonfeatures, axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable

# NOTE(review): picking the random_state that maximises test R^2 selects the
# most favourable train/test split; the resulting score describes that one
# split and is not an estimate of how the model generalises.

# Start from -inf rather than 0 so that, if every split yields a negative
# R^2, the true best split is still reported instead of a placeholder 0.
highest_score = -float('inf')
best_n = 0
for n in range(100000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = n)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score once per split; the value is reused for both the comparison and
    # the update instead of calling score() a second time.
    split_score = model.score(X_test, y_test)
    if split_score > highest_score:
        highest_score = split_score
        best_n = n

print("Best Score is", highest_score, "at Random State", best_n)

Best Score is 0.5072822137729255 at Random State 47151


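> Random State Optimization with data set 1 (re-run the Part 1 preprocessing cell first so that `df` holds data set 1 again; after Part 2, `df` still holds the average-soreness data)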

In [40]:
# Columns to drop: everything in `df` that is not one of the selected
# predictors (this includes the target column).
nonfeatures = list(
    set(df.columns).difference(
        {
            'How well did you sleep?',
            'How sore are you?',
            'How well did you fuel?',
            'Neck',
            'Back',
            'Chest',
            'Hip Flexors',
            'Glutes',
            'No Injury',
        }
    )
)

In [41]:
X = df.drop(nonfeatures, axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable

# NOTE(review): picking the random_state that maximises test R^2 selects the
# most favourable train/test split; the resulting score describes that one
# split and is not an estimate of how the model generalises.

# Start from -inf rather than 0 so that, if every split yields a negative
# R^2, the true best split is still reported instead of a placeholder 0.
highest_score = -float('inf')
best_n = 0
for n in range(100000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = n)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score once per split; the value is reused for both the comparison and
    # the update instead of calling score() a second time.
    split_score = model.score(X_test, y_test)
    if split_score > highest_score:
        highest_score = split_score
        best_n = n

print("Best Score is", highest_score, "at Random State", best_n)

Best Score is 0.49537556135300753 at Random State 47151
