In [1]:
import pandas as pd

# Load the "Wellness Responses" sheet and discard every row that has no
# answer to the hydration question.
df = pd.read_excel(
    "~/Desktop/Research/LMU_Wellness/data/Wellness_Database_May19.xlsx",
    sheet_name="Wellness Responses",
).dropna(subset=["How well did you hydrate?"])

# Body areas that get their own soreness column.
# NOTE(review): "Quadricps" looks like a misspelling of "Quadriceps". If the
# survey option is spelled "Quadriceps", this column is always 0 -- confirm
# against the spreadsheet before renaming.
sore_areas = ["Neck", "Back", "Shoulders", "Chest", "Arms", "Hip Flexors", "Glutes", "Hamstrings", "Quadricps", "Adductors", "Calves", "Feet"]

# Free-text selection of sore areas; anything that is not a string (e.g. a
# blank cell read as NaN) is treated as "nothing selected".
sore_selection = df["Select where you are sore:"].map(
    lambda value: value if isinstance(value, str) else ""
)

# The overall soreness rating is split evenly across the selected areas.
# The number of selected areas is (number of commas + 1). Both quantities are
# the same for every area, so they are computed once instead of per area/row.
areas_selected = sore_selection.str.count(",") + 1
soreness_share = df["How sore are you?"] / areas_selected

for area in sore_areas:
    # Plain substring match (regex=False), the same test as `area in text`.
    area_mentioned = sore_selection.str.contains(area, regex=False)
    # Rows that do not mention the area get 0, the others get their share.
    df[area] = soreness_share.where(area_mentioned, 0)

# Drop the original "Select where you are sore:" column
df = df.drop("Select where you are sore:", axis=1)

# Columns removed before modelling, assembled in three parts:
# 1) individually named columns,
leading_columns = [
    'Athlete Name', 'Timestamp', 'Athlete ID #', 'Data ID',
    'Week ID', 'Week ID Refined', 'Date Value', 'Year ID', 'Season ID',
    'Injury Refined', 'Position', 'Classification',
]
# 2) the "<metric> <statistic>" columns, one per metric/statistic pair,
metric_names = ['Stress', 'Sleep Quality', 'Sleep Quantity', 'Soreness', 'Hydrate', 'Fuel']
statistic_names = ['RA', 'StdDev', 'Z-Score', 'Wellness Score']
metric_statistic_columns = [
    f'{metric} {statistic}'
    for metric in metric_names
    for statistic in statistic_names
]
# 3) the 'Readiness Score' column.
columns_to_drop = leading_columns + metric_statistic_columns + ['Readiness Score']

df = df.drop(columns=columns_to_drop)

# Turn the injury-status answer into three indicator columns. Each column is
# 10 where the answer equals the given text and 0 otherwise.
injury_status = df['What is your injury status?']
injury_indicators = {
    'No Injury': 'Full = I have no injury',
    'Some Injury': 'Limited = I need a modification during lift / practice',
    'Injury': 'Out = I have an injury',
}
for indicator_column, answer_text in injury_indicators.items():
    df[indicator_column] = injury_status.eq(answer_text).astype(int) * 10

# Drop the original column
df = df.drop(columns='What is your injury status?')

In [3]:
#Remove outliers
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

# Fit an OLS model of the readiness score on all remaining columns and drop
# the rows that are flagged as influential by Cook's distance or leverage.

# Prepare the data
X = df.drop(["What is your readiness score? "], axis=1)  # Features
y = df["What is your readiness score? "]  # Target variable

# Add a constant term to the features
X = sm.add_constant(X)

# Fit the ordinary least squares (OLS) model
model = sm.OLS(y, X)
results = model.fit()

# Build the influence diagnostics once and reuse them for both measures
influence = OLSInfluence(results)

# Compute leverage values
leverage = influence.hat_matrix_diag

# Compute Cook's distance (first element of the returned pair)
cooks_d = influence.cooks_distance[0]

# Set a threshold for identifying influential points
threshold = 4 / len(X)  # You can adjust the threshold as needed

# Identify influential points based on Cook's distance and leverage.
# np.where returns *positions* within X, not index labels.
influential_points = np.where((cooks_d > threshold) | (leverage > np.mean(leverage) + 2 * np.std(leverage)))

# Remove values. Positions are translated to index labels through X.index
# (X has the same rows as df), instead of adding a hard-coded offset that is
# only right when the index happens to be contiguous from one fixed label.
df = df.drop(index=X.index[influential_points[0]])

  x = pd.concat(x[::order], 1)


In [4]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.model_selection import train_test_split

In [6]:
# Columns kept as predictors for the model comparison below.
selected_features = {
    'How stressed are you?',
    'How well did you sleep?',
    'How many hours did you sleep?',
    'How sore are you?',
    'How well did you fuel?',
    'No Injury',
}

# Every other column of df (the target included) is removed from X.
nonfeatures = [column for column in df.columns if column not in selected_features]

X = df.drop(columns=nonfeatures)
y = df["What is your readiness score? "]  # Target variable

In [7]:
# For each model, try 100000 different train/test splits (one per
# random_state) and report the split with the highest test-set R^2.
#
# NOTE(review): choosing the random_state by test score makes the reported
# number an optimistic maximum, not an estimate of generalisation
# performance. Consider reporting a cross-validated mean instead.
models = [LinearRegression(), Lasso(), ElasticNet(), Ridge(), BayesianRidge()]

for model in models:
    best_n = 0
    best_score = -float('inf')
    for n in range(100000):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = n)
        model.fit(X_train, y_train)

        # Score once and reuse the value (scoring re-predicts the test set).
        score = model.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_n = n

    print(model, "best score is", best_score, "at state", best_n)
    

LinearRegression() best score is 0.5541777446047151 at state 52976
Lasso() best score is 0.5296093231327741 at state 30627
ElasticNet() best score is 0.5120029171263851 at state 30627
Ridge() best score is 0.5540788715107233 at state 52976
BayesianRidge() best score is 0.5475433938659682 at state 52976


In [11]:
import itertools

# Every column except the target is a candidate feature.
all_columns = df.columns
features = [col for col in all_columns if col != 'What is your readiness score? ']

# All non-empty feature subsets, as tuples, ordered by subset size.
# The size range ends at len(features) so that the full feature set is
# included as well (range(len(features)) stops one short of it).
# The number of subsets is 2**len(features) - 1, so this list grows very
# quickly with the number of columns.
features_combinations = list(
    itertools.chain.from_iterable(
        itertools.combinations(features, size)
        for size in range(1, len(features) + 1)
    )
)
    

In [14]:
from sklearn.metrics import mean_squared_error

# Exhaustive search over feature subsets: fit a Ridge model on each subset
# and remember the subset with the lowest test RMSE and the one with the
# highest test R^2.
#
# NOTE(review): the winning subset is chosen on the same test split that is
# used to report its error, so the printed RMSE / R^2 are optimistic.

# Trackers are initialised up front so the prints below cannot hit an
# undefined name when there is nothing to iterate over.
best_model = None
best_feature_comb = None
best_rmse = float('inf')
best_score_model = None
best_score_feature_comb = None
best_score = -float('inf')

# Split the data into training and test sets
X = df.drop(["What is your readiness score? "], axis=1)  # Features (excluding target variable)
y = df["What is your readiness score? "]  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Iterate over each feature combination
for feature_comb in features_combinations:
    comb_columns = list(feature_comb)

    # Prepare the training and test data for this subset
    X_train_comb = X_train[comb_columns]
    X_test_comb = X_test[comb_columns]

    # Fit a ridge regression model (default regularisation strength)
    model = Ridge()
    model.fit(X_train_comb, y_train)

    # Predict using the model
    y_pred = model.predict(X_test_comb)

    # Calculate RMSE (Root Mean Squared Error) & Score (R^2).
    # The square root is taken explicitly because the `squared=False`
    # argument of mean_squared_error is deprecated in newer scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    score = model.score(X_test_comb, y_test)

    # Check if this model performs better than the previous best model
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_feature_comb = feature_comb

    if score > best_score:
        best_score = score
        best_score_model = model
        best_score_feature_comb = feature_comb

# Print the best model
print("Best Model:")
print(best_feature_comb)
print(best_rmse)

Best Model:
('How well did you sleep?', 'How sore are you?', 'Neck', 'Back', 'Chest', 'Hip Flexors', 'Glutes', 'Adductors', 'No Injury')
13.843503707538167


In [15]:
# Display the highest test-set R^2 recorded by the feature-subset search above.
best_score

0.2562967153533754