In [12]:
import pandas as pd

# Load the "Wellness Responses" sheet and keep only the rows that have an
# answer to the hydration question.
# NOTE(review): the workbook path is hard-coded to a per-user Desktop folder;
# consider a configurable data directory so the notebook runs elsewhere.
df = (
    pd.read_excel(
        "~/Desktop/Research/LMU_Wellness/data/Wellness_Database_May19.xlsx",
        sheet_name="Wellness Responses",
    )
    .dropna(subset=["How well did you hydrate?"])
)

# Body areas that become one numeric column each.
# NOTE(review): "Quadricps" is spelled this way on purpose here -- it is used as
# both the substring searched for and the resulting column name; presumably it
# matches the survey's option text. Verify before "correcting" it.
sore_areas = ["Neck", "Back", "Shoulders", "Chest", "Arms", "Hip Flexors", "Glutes", "Hamstrings", "Quadricps", "Adductors", "Calves", "Feet"]

# Spread each row's overall soreness rating evenly across the areas listed in
# "Select where you are sore:" (a comma-separated string). Areas that are not
# listed get 0. Computed column-wise instead of with one row-wise
# `df.apply(..., axis=1)` pass per area.

# Non-string cells (e.g. missing answers) count as "nothing selected".
selected_text = (
    df["Select where you are sore:"]
    .map(lambda value: value if isinstance(value, str) else "")
    .astype(object)  # keeps the .str accessor usable even on an empty frame
)

# Number of listed areas = number of commas + 1, so the divisor is never zero.
soreness_share = df["How sore are you?"] / (selected_text.str.count(",") + 1)

for area in sore_areas:
    # Literal substring test (regex=False), equivalent to `area in text`.
    area_selected = selected_text.str.contains(area, regex=False)
    df[area] = soreness_share.where(area_selected, 0)

# The free-text soreness selection is no longer needed once it has been
# expanded into per-area columns.
df = df.drop(columns="Select where you are sore:")

# Identifier / bookkeeping columns and pre-computed summary statistics that are
# removed from the frame.
columns_to_drop = [
    'Timestamp', 'Athlete ID #', 'Data ID', 'Week ID', 'Week ID Refined',
    'Date Value', 'Year ID', 'Season ID', 'Injury Refined', 'Position',
    'Classification',
    'Stress RA', 'Stress StdDev', 'Stress Z-Score', 'Stress Wellness Score',
    'Sleep Quality RA', 'Sleep Quality StdDev', 'Sleep Quality Z-Score', 'Sleep Quality Wellness Score',
    'Sleep Quantity RA', 'Sleep Quantity StdDev', 'Sleep Quantity Z-Score', 'Sleep Quantity Wellness Score',
    'Soreness RA', 'Soreness StdDev', 'Soreness Z-Score', 'Soreness Wellness Score',
    'Hydrate RA', 'Hydrate StdDev', 'Hydrate Z-Score', 'Hydrate Wellness Score',
    'Fuel RA', 'Fuel StdDev', 'Fuel Z-Score', 'Fuel Wellness Score',
    'Readiness Score',
]
df = df.drop(columns=columns_to_drop)

# One 0/1 indicator column per injury-status answer (added in this order).
injury_status = df['What is your injury status?']
injury_flags = {
    'No Injury': 'Full = I have no injury',
    'Some Injury': 'Limited = I need a modification during lift / practice',
    'Injury': 'Out = I have an injury',
}
for flag_column, answer_text in injury_flags.items():
    df[flag_column] = injury_status.eq(answer_text).astype(int)

# The raw status text is replaced by the indicators above; the name column gets
# an identifier-friendly label so it can be used with attribute access.
df = (
    df
    .drop(columns='What is your injury status?')
    .rename(columns={"Athlete Name": "AthleteName"})
)

In [13]:
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

# Influence-based outlier removal: regress the readiness score on every other
# remaining column and drop the rows flagged by Cook's distance or leverage.

# Prepare the data
X = df.drop(["What is your readiness score? ", "AthleteName"], axis=1)  # Features
y = df["What is your readiness score? "]  # Target variable

# Add a constant term to the features
X = sm.add_constant(X)

# Fit the ordinary least squares (OLS) model
model = sm.OLS(y, X)
results = model.fit()

# Build the influence diagnostics once and read both measures from them.
influence = OLSInfluence(results)
leverage = influence.hat_matrix_diag
cooks_d = influence.cooks_distance[0]  # first element holds the distances

# Cut-off for Cook's distance (4 / n); adjust as needed.
threshold = 4 / len(X)

# Rows are flagged when Cook's distance exceeds the cut-off OR leverage is more
# than two standard deviations above its mean.
influential_points = np.where((cooks_d > threshold) | (leverage > np.mean(leverage) + 2 * np.std(leverage)))

# np.where returns row POSITIONS. Translate them to index labels through
# df.index instead of adding a fixed offset: a hard-coded offset is only right
# while the index is contiguous and starts at that value, which the earlier
# dropna does not guarantee.
df = df.drop(index=df.index[influential_points[0]])

  x = pd.concat(x[::order], 1)


In [14]:
# One sub-frame per athlete, selected by exact match on the name column.
athlete_name = df["AthleteName"]
df_park = df.loc[athlete_name == 'Kailey Park']
df_christensen = df.loc[athlete_name == 'Makiya Christensen']
df_tinsley = df.loc[athlete_name == 'Megan Tinsley']
df_santen = df.loc[athlete_name == 'Alice Santen']

In [15]:
# The name column has done its job as a filter; remove it so that only numeric
# columns remain in the per-athlete frames and in the pooled frame.
# NOTE(review): this cell is not idempotent -- running it a second time raises
# because "AthleteName" is already gone.
df_park = df_park.drop(columns="AthleteName")
df_christensen = df_christensen.drop(columns="AthleteName")
df_tinsley = df_tinsley.drop(columns="AthleteName")
df_santen = df_santen.drop(columns="AthleteName")
df = df.drop(columns="AthleteName")

In [6]:
import itertools

# Every column except the target is a candidate predictor.
all_columns = df.columns
features = [col for col in all_columns if col != 'What is your readiness score? ']

# All non-empty feature subsets, smallest first. The upper bound is
# len(features) + 1 so that the subset containing every feature is included;
# stopping at len(features) would silently leave it out.
# NOTE(review): this list holds 2**len(features) - 1 tuples, so it grows very
# quickly with the number of columns.
features_combinations = []
for subset_size in range(1, len(features) + 1):
    features_combinations.extend(itertools.combinations(features, subset_size))

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Exhaustive feature-subset search on the `df_park` athlete frame: fit a linear
# regression for every entry of `features_combinations` and remember the subset
# with the lowest test RMSE and the subset with the highest test score.
# NOTE(review): subsets are ranked on the same held-out split that is then
# reported, so the printed "best" score is optimistically biased.
park_best_model = None
park_best_rmse = float('inf')
park_best_feature_comb = None
park_best_score = -float('inf')
park_best_score_model = None
park_best_score_feature_comb = None

# Split the data into training and test sets
X = df_park.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df_park["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for feature_comb in features_combinations:
    selected_columns = list(feature_comb)
    X_train_comb = X_train[selected_columns]
    X_test_comb = X_test[selected_columns]

    # A fresh estimator per subset: re-fitting one shared instance would make
    # every stored "best model" reference end up pointing at the last fit.
    model = LinearRegression()
    model.fit(X_train_comb, y_train)

    y_pred = model.predict(X_test_comb)

    # RMSE as the square root of the MSE; this avoids the `squared=False`
    # keyword, which newer scikit-learn releases no longer accept.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    # Best subset by test RMSE
    if rmse < park_best_rmse:
        park_best_rmse = rmse
        park_best_model = model
        park_best_feature_comb = feature_comb

    score = model.score(X_test_comb, y_test)

    # Best subset by test score
    if score > park_best_score:
        park_best_score = score
        park_best_score_model = model
        park_best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(park_best_score_feature_comb)
print(park_best_score)

Best Model:
('How stressed are you?', 'How well did you hydrate?', 'How well did you fuel?', 'Back', 'Hip Flexors', 'Hamstrings', 'Quadricps', 'Adductors', 'Calves')
0.6201388063992196


In [7]:
# Display the `df_park` athlete frame that feeds the searches on this athlete.
# NOTE(review): consider `.head()` if this frame grows, to keep the output short.
df_park

Unnamed: 0,How stressed are you?,How well did you sleep?,How many hours did you sleep?,How sore are you?,How well did you hydrate?,How well did you fuel?,What is your readiness score?,Neck,Back,Shoulders,...,Hip Flexors,Glutes,Hamstrings,Quadricps,Adductors,Calves,Feet,No Injury,Some Injury,Injury
1065,9,9,9.0,5,7.0,8.0,86.0,0.0,0.0,0.0,...,2.5,0.0,2.5,0.0,0.0,0.0,0.0,1,0,0
1082,9,9,9.0,2,7.0,7.0,88.0,0.0,0.0,0.0,...,0.0,0.0,0.666667,0.666667,0.0,0.666667,0.0,1,0,0
1090,9,9,9.0,4,9.0,8.0,92.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1,0,0
1093,9,7,7.0,8,6.0,7.0,82.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1,0,0
1111,9,8,8.0,8,7.0,9.0,85.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
1124,9,8,8.0,3,6.0,7.0,72.0,0.0,0.75,0.0,...,0.75,0.0,0.75,0.0,0.0,0.0,0.0,1,0,0
1138,8,8,8.0,6,7.0,8.0,74.0,0.0,0.0,0.0,...,3.0,0.0,3.0,0.0,0.0,0.0,0.0,1,0,0
1154,9,9,9.0,3,9.0,9.0,88.0,0.0,0.6,0.0,...,0.0,0.6,0.6,0.6,0.0,0.0,0.0,1,0,0
1164,8,9,9.0,4,7.0,9.0,86.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1,0,0
1166,6,8,8.0,8,7.0,7.0,74.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1,0,0


In [19]:
# Refit the chosen feature subset on `df_park` under many different train/test
# splits and report the split seed that gives the highest test score.
# NOTE(review): picking the seed with the best held-out score is cherry-picking
# a favourable split; the printed value is not an unbiased performance
# estimate. Cross-validation on a fixed protocol would be more defensible.
selected_features = ['How stressed are you?', 'How well did you hydrate?', 'How well did you fuel?', 'Back', 'Hip Flexors', 'Hamstrings', 'Quadricps', 'Adductors', 'Calves']
features_to_drop = set(df_park.columns) - set(selected_features)
X = df_park.drop(list(features_to_drop), axis=1)  # Features (excluding target variable)
y = df_park["What is your readiness score? "]  # Target variable

highest_score = 0
best_n = 0
for n in range(1,10000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = n)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the value for comparison and bookkeeping.
    split_score = model.score(X_test, y_test)
    if split_score > highest_score:
        highest_score = split_score
        best_n = n
print("Best Score is", highest_score, "at Random State", best_n)

Best Score is 0.8833315686586599 at Random State 131


In [10]:
# Exhaustive feature-subset search on the `df_christensen` athlete frame: fit a
# linear regression for every entry of `features_combinations` and remember the
# subset with the lowest test RMSE and the subset with the highest test score.
# NOTE(review): subsets are ranked on the same held-out split that is then
# reported, so the printed "best" score is optimistically biased.
christensen_best_model = None
christensen_best_rmse = float('inf')
christensen_best_feature_comb = None
christensen_best_score = -float('inf')
christensen_best_score_model = None
christensen_best_score_feature_comb = None

# Split the data into training and test sets
X = df_christensen.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df_christensen["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for feature_comb in features_combinations:
    selected_columns = list(feature_comb)
    X_train_comb = X_train[selected_columns]
    X_test_comb = X_test[selected_columns]

    # A fresh estimator per subset: re-fitting one shared instance would make
    # every stored "best model" reference end up pointing at the last fit.
    model = LinearRegression()
    model.fit(X_train_comb, y_train)

    y_pred = model.predict(X_test_comb)

    # RMSE as the square root of the MSE; this avoids the `squared=False`
    # keyword, which newer scikit-learn releases no longer accept.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    # Best subset by test RMSE
    if rmse < christensen_best_rmse:
        christensen_best_rmse = rmse
        christensen_best_model = model
        christensen_best_feature_comb = feature_comb

    score = model.score(X_test_comb, y_test)

    # Best subset by test score
    if score > christensen_best_score:
        christensen_best_score = score
        christensen_best_score_model = model
        christensen_best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(christensen_best_score_feature_comb)
print(christensen_best_score)

Best Model:
('How well did you sleep?', 'How sore are you?', 'How well did you hydrate?', 'How well did you fuel?', 'Chest', 'Hip Flexors', 'Quadricps')
0.3175360187548518


In [30]:
# Refit the chosen feature subset on `df_christensen` under many different
# train/test splits and report the split seed that gives the highest test score.
# NOTE(review): picking the seed with the best held-out score is cherry-picking
# a favourable split; the printed value is not an unbiased performance
# estimate. Cross-validation on a fixed protocol would be more defensible.
selected_features = ['How well did you sleep?', 'How sore are you?', 'How well did you hydrate?', 'How well did you fuel?', 'Chest', 'Hip Flexors', 'Quadricps']
features_to_drop = set(df_christensen.columns) - set(selected_features)
X = df_christensen.drop(list(features_to_drop), axis=1)  # Features (excluding target variable)
y = df_christensen["What is your readiness score? "]  # Target variable

highest_score = 0
best_n = 0
for n in range(100000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = n)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the value for comparison and bookkeeping.
    split_score = model.score(X_test, y_test)
    if split_score > highest_score:
        highest_score = split_score
        best_n = n
print("Best Score is", highest_score, "at Random State", best_n)

Best Score is 0.8809759339300821 at Random State 51367


In [8]:
# Display the `df_christensen` athlete frame that feeds the searches on this athlete.
# NOTE(review): consider `.head()` if this frame grows, to keep the output short.
df_christensen

Unnamed: 0,How stressed are you?,How well did you sleep?,How many hours did you sleep?,How sore are you?,How well did you hydrate?,How well did you fuel?,What is your readiness score?,Neck,Back,Shoulders,...,Hip Flexors,Glutes,Hamstrings,Quadricps,Adductors,Calves,Feet,No Injury,Some Injury,Injury
1061,7,7,9.0,6,6.0,6.0,65.0,0.857143,0.857143,0.857143,...,0.857143,0.0,0.857143,0.0,0.857143,0.0,0.0,0,1,0
1085,8,8,8.0,6,4.0,6.0,80.0,1.2,1.2,0.0,...,1.2,0.0,0.0,1.2,1.2,0.0,0.0,0,1,0
1089,7,8,8.0,5,5.0,5.0,80.0,0.0,1.25,0.0,...,1.25,1.25,0.0,1.25,0.0,0.0,0.0,0,1,0
1091,7,9,9.0,7,5.0,7.0,75.0,2.333333,2.333333,0.0,...,2.333333,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
1097,6,7,8.0,8,6.0,7.0,80.0,0.0,2.666667,0.0,...,2.666667,0.0,0.0,0.0,2.666667,0.0,0.0,0,1,0
1115,8,9,9.0,6,5.0,6.0,75.0,0.0,1.5,1.5,...,1.5,0.0,0.0,0.0,1.5,0.0,0.0,0,1,0
1127,7,7,8.0,5,6.0,7.0,79.0,0.714286,0.714286,0.714286,...,0.714286,0.0,0.714286,0.714286,0.0,0.0,0.0,0,1,0
1137,7,6,8.0,4,6.0,6.0,65.0,0.571429,0.571429,0.571429,...,0.571429,0.571429,0.571429,0.571429,0.0,0.0,0.0,0,1,0
1148,7,7,8.0,4,6.0,6.0,60.0,0.0,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.0,0.0,0,1,0
1163,7,8,8.0,5,6.0,6.0,75.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0


In [11]:
# Exhaustive feature-subset search on the `df_tinsley` athlete frame: fit a
# linear regression for every entry of `features_combinations` and remember the
# subset with the lowest test RMSE and the subset with the highest test score.
# NOTE(review): subsets are ranked on the same held-out split that is then
# reported, so the printed "best" score is optimistically biased.
tinsley_best_model = None
tinsley_best_rmse = float('inf')
tinsley_best_feature_comb = None
tinsley_best_score = -float('inf')
tinsley_best_score_model = None
tinsley_best_score_feature_comb = None

# Split the data into training and test sets
X = df_tinsley.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df_tinsley["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for feature_comb in features_combinations:
    selected_columns = list(feature_comb)
    X_train_comb = X_train[selected_columns]
    X_test_comb = X_test[selected_columns]

    # A fresh estimator per subset: re-fitting one shared instance would make
    # every stored "best model" reference end up pointing at the last fit.
    model = LinearRegression()
    model.fit(X_train_comb, y_train)

    y_pred = model.predict(X_test_comb)

    # RMSE as the square root of the MSE; this avoids the `squared=False`
    # keyword, which newer scikit-learn releases no longer accept.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    # Best subset by test RMSE
    if rmse < tinsley_best_rmse:
        tinsley_best_rmse = rmse
        tinsley_best_model = model
        tinsley_best_feature_comb = feature_comb

    score = model.score(X_test_comb, y_test)

    # Best subset by test score
    if score > tinsley_best_score:
        tinsley_best_score = score
        tinsley_best_score_model = model
        tinsley_best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(tinsley_best_score_feature_comb)
print(tinsley_best_score)

Best Model:
('How well did you sleep?', 'How well did you fuel?', 'Neck', 'Shoulders', 'Hip Flexors', 'Glutes', 'Quadricps', 'Adductors')
0.7062230448566962


In [34]:
# Refit the chosen feature subset on `df_tinsley` under many different
# train/test splits and report the split seed that gives the highest test score.
# NOTE(review): picking the seed with the best held-out score is cherry-picking
# a favourable split; the printed value is not an unbiased performance
# estimate. Cross-validation on a fixed protocol would be more defensible.
selected_features = ['How well did you sleep?', 'How well did you fuel?', 'Neck', 'Shoulders', 'Hip Flexors', 'Glutes', 'Quadricps', 'Adductors']
features_to_drop = set(df_tinsley.columns) - set(selected_features)
X = df_tinsley.drop(list(features_to_drop), axis=1)  # Features (excluding target variable)
y = df_tinsley["What is your readiness score? "]  # Target variable

highest_score = 0
best_n = 0
for n in range(100000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = n)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the value for comparison and bookkeeping.
    split_score = model.score(X_test, y_test)
    if split_score > highest_score:
        highest_score = split_score
        best_n = n
print("Best Score is", highest_score, "at Random State", best_n)

Best Score is 0.8712243739806672 at Random State 8048


In [9]:
# Display the `df_tinsley` athlete frame that feeds the searches on this athlete.
# NOTE(review): consider `.head()` if this frame grows, to keep the output short.
df_tinsley

Unnamed: 0,How stressed are you?,How well did you sleep?,How many hours did you sleep?,How sore are you?,How well did you hydrate?,How well did you fuel?,What is your readiness score?,Neck,Back,Shoulders,...,Hip Flexors,Glutes,Hamstrings,Quadricps,Adductors,Calves,Feet,No Injury,Some Injury,Injury
1070,10,8,7.0,7,6.0,7.0,99.0,0.0,2.333333,2.333333,...,0.0,2.333333,0.0,0.0,0.0,0.0,0.0,1,0,0
1077,10,9,10.0,7,7.0,7.0,98.0,0.0,0.0,2.333333,...,0.0,2.333333,2.333333,0.0,0.0,0.0,0.0,1,0,0
1088,10,9,10.0,8,6.0,7.0,99.0,0.0,0.0,0.0,...,0.0,4.0,4.0,0.0,0.0,0.0,0.0,1,0,0
1092,10,9,10.0,9,6.0,6.0,99.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
1103,10,8,8.0,9,5.0,8.0,99.0,0.0,0.0,0.0,...,0.0,4.5,4.5,0.0,0.0,0.0,0.0,0,1,0
1120,10,9,9.0,7,7.0,7.0,98.0,0.0,0.0,0.0,...,0.0,3.5,3.5,0.0,0.0,0.0,0.0,1,0,0
1125,10,9,8.0,6,7.0,8.0,99.0,0.0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,1,0,0
1145,10,10,9.0,8,7.0,9.0,99.0,0.0,4.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1,0,0
1160,10,10,10.0,9,7.0,9.0,98.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,1,0,0
1169,10,9,10.0,9,7.0,8.0,99.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,1,0,0


In [12]:
# Exhaustive feature-subset search on the `df_santen` athlete frame: fit a
# linear regression for every entry of `features_combinations` and remember the
# subset with the lowest test RMSE and the subset with the highest test score.
# NOTE(review): subsets are ranked on the same held-out split that is then
# reported, so the printed "best" score is optimistically biased.
santen_best_model = None
santen_best_rmse = float('inf')
santen_best_feature_comb = None
santen_best_score = -float('inf')
santen_best_score_model = None
santen_best_score_feature_comb = None

# Split the data into training and test sets
X = df_santen.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df_santen["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for feature_comb in features_combinations:
    selected_columns = list(feature_comb)
    X_train_comb = X_train[selected_columns]
    X_test_comb = X_test[selected_columns]

    # A fresh estimator per subset: re-fitting one shared instance would make
    # every stored "best model" reference end up pointing at the last fit.
    model = LinearRegression()
    model.fit(X_train_comb, y_train)

    y_pred = model.predict(X_test_comb)

    # RMSE as the square root of the MSE; this avoids the `squared=False`
    # keyword, which newer scikit-learn releases no longer accept.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    # Best subset by test RMSE
    if rmse < santen_best_rmse:
        santen_best_rmse = rmse
        santen_best_model = model
        santen_best_feature_comb = feature_comb

    score = model.score(X_test_comb, y_test)

    # Best subset by test score
    if score > santen_best_score:
        santen_best_score = score
        santen_best_score_model = model
        santen_best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(santen_best_score_feature_comb)
print(santen_best_score)

Best Model:
('How well did you sleep?', 'How sore are you?', 'Neck', 'Glutes', 'Hamstrings', 'Quadricps', 'Adductors', 'Feet', 'No Injury')
0.5040120437995107


In [33]:
# Refit the chosen feature subset on `df_santen` under many different
# train/test splits and report the split seed that gives the highest test score.
# NOTE(review): picking the seed with the best held-out score is cherry-picking
# a favourable split; the printed value is not an unbiased performance
# estimate. Cross-validation on a fixed protocol would be more defensible.
selected_features = ['How well did you sleep?', 'How sore are you?', 'Neck', 'Glutes', 'Hamstrings', 'Quadricps', 'Adductors', 'Feet', 'No Injury']
features_to_drop = set(df_santen.columns) - set(selected_features)
X = df_santen.drop(list(features_to_drop), axis=1)  # Features (excluding target variable)
y = df_santen["What is your readiness score? "]  # Target variable

highest_score = 0
best_n = 0
for n in range(100000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = n)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the value for comparison and bookkeeping.
    split_score = model.score(X_test, y_test)
    if split_score > highest_score:
        highest_score = split_score
        best_n = n
print("Best Score is", highest_score, "at Random State", best_n)

Best Score is 0.9768155830274283 at Random State 99986


In [10]:
# Display the `df_santen` athlete frame that feeds the searches on this athlete.
# NOTE(review): consider `.head()` if this frame grows, to keep the output short.
df_santen

Unnamed: 0,How stressed are you?,How well did you sleep?,How many hours did you sleep?,How sore are you?,How well did you hydrate?,How well did you fuel?,What is your readiness score?,Neck,Back,Shoulders,...,Hip Flexors,Glutes,Hamstrings,Quadricps,Adductors,Calves,Feet,No Injury,Some Injury,Injury
1064,6,9,9.0,5,7.0,7.0,75.0,0.0,0.0,0.0,...,0.0,0.0,1.666667,1.666667,0.0,0.0,0.0,1,0,0
1078,6,8,9.0,7,6.0,7.0,84.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.5,0.0,0.0,0.0,1,0,0
1104,3,3,5.0,7,7.0,8.0,72.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,0.0,0.0,0.0,1,0,0
1113,6,8,8.0,6,5.0,8.0,78.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1,0,0
1123,9,6,9.0,6,7.0,8.0,71.0,0.0,0.0,0.0,...,0.0,0.0,3.0,3.0,0.0,0.0,0.0,1,0,0
1143,7,7,8.0,3,5.0,7.0,65.0,0.0,0.0,0.0,...,0.0,0.0,0.75,0.75,0.0,0.0,0.75,1,0,0
1147,8,7,8.0,5,8.0,6.0,66.0,0.0,0.0,0.0,...,0.0,0.0,1.25,1.25,0.0,0.0,1.25,1,0,0
1171,6,5,6.0,8,5.0,8.0,78.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
1183,8,9,9.0,6,6.0,7.0,76.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1,0,0
1205,6,8,6.0,7,8.0,6.0,84.0,0.0,0.0,0.0,...,0.0,3.5,0.0,0.0,0.0,0.0,0.0,1,0,0
