# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
#!pip install seaborn
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge 
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler

# The wellness CSV is parsed once and sliced twice (it used to be read from
# disk once per slice). Each slice is copied so the two frames stay
# independent of each other and of the full table.
wellness = pd.read_csv('Wellness_Database_3_6.csv')

# Original Dataset (rows 0-1060)
data = wellness.iloc[0:1061].copy()

# New Dataset (rows 1061-1309)
# NOTE(review): presumably row 1061 is where the new survey starts - confirm.
df = wellness.iloc[1061:1310].copy()

# Heatmap

In [None]:
# Correlation heatmap of the new-survey rows, saved to heatmap.png.
#
# Only numeric/boolean columns are correlated. Older pandas dropped other
# columns silently inside DataFrame.corr(); pandas >= 2.0 raises on them
# instead, so the selection is made explicit here and works on either version.
# NOTE(review): whether df actually holds non-numeric columns is not visible
# from this cell; the selection is a no-op if it does not.
numeric_df = df.select_dtypes(include=['number', 'bool'])

ax = plt.axes()
sns.heatmap(numeric_df.corr(), annot=True, cmap='rocket_r', ax=ax)
ax.set_title('Correlation Heatmap')

plt.savefig("heatmap.png")
plt.show()

# **Testing Regressions**

In [None]:
# Splitting data to only select entries starting on 2/2/23, date of new survey.
# Rows 1061-1232 and columns 2-11 of the CSV are kept. The slice is copied so
# the column edits below act on an independent frame rather than on a view of
# the full table (the chained-assignment warning was hidden by the global
# warnings filter).
performance = pd.read_csv('performance_data.csv').iloc[1061:1233, 2:12].copy()

# Creating athlete_id to test individual regressions:
# strip the 'Athlete ' prefix and keep the remainder as an int.
athlete_id = [
    int(label.replace('Athlete ', ''))
    for label in performance['Athlete ID #']
]
performance['Athlete ID #'] = athlete_id

# Getting rid of qualitative data
performance = performance.drop(
    columns=['What is your injury status?', 'Select where you are sore:']
)

# Creating perf_X (performance data independent variables) and perf_y (performance data target variable).
# After the drop above, column 0 is 'Athlete ID #', columns 1-6 are used as
# features and column 7 as the target.
# NOTE(review): selection is positional - verify against the CSV header if the
# column layout ever changes.
perf_X = performance.iloc[:, 1:7]
perf_y = performance.iloc[:, 7]

In [None]:
# Naive testing all of the models on both normalized data (models_norm) and non-normalized data (models)
#
# Every model is fit on one fixed 80/20 split (random_state = 42) and its
# R^2 on the held-out 20% is printed. The cross_val_score call that used to
# run for every model was removed: its result was never read, so it only
# added extra fits per model.
#
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions the models_norm line raises TypeError.
# It is left as-is because switching to a scaler pipeline changes the fitted
# coefficients - confirm the target sklearn version before migrating.

models_norm = [LinearRegression(normalize = True), Lasso(normalize = True), ElasticNet(normalize = True), Ridge(normalize = True),  BayesianRidge(normalize = True)]
models = [LinearRegression(), Lasso(), ElasticNet(), Ridge(), BayesianRidge()]
X_train, X_test, y_train, y_test = train_test_split(perf_X, perf_y, test_size = 0.2, random_state = 42)

for model in models_norm:
    model.fit(X_train, y_train)
    print("Model score for ", model, " (normalized): ", model.score(X_test, y_test))
    print(' ')
print(' ')
print(' ')
for model in models:
    model.fit(X_train, y_train)
    print("Model score for ", model, " :", model.score(X_test, y_test))
    print(' ')

# Feature Selection - Classification Methods

In [None]:
# Fitting the data to an ExtraTreesClassifier model to find most important features.
# The estimator is seeded (same seed as the train/test split used in this
# file) so the printed importances are reproducible from run to run; unseeded,
# they change on every execution.
# NOTE(review): a classifier treats every distinct perf_y value as its own
# class - confirm that is intended for this target.

model = ExtraTreesClassifier(random_state=42)
model.fit(perf_X, perf_y)
print(model.feature_importances_)

In [None]:
# Fitting the data to a RandomForestClassifier model to find most important features.
# The estimator is seeded (same seed as the train/test split used in this
# file) so the printed importances are reproducible from run to run; unseeded,
# they change on every execution.
# NOTE(review): a classifier treats every distinct perf_y value as its own
# class - confirm that is intended for this target.

model = RandomForestClassifier(random_state=42)
model.fit(perf_X, perf_y)
print(model.feature_importances_)

In [None]:
# Fitting the data to a GradientBoostingClassifier model to find most important features.
# The estimator is seeded (same seed as the train/test split used in this
# file) so the printed importances are reproducible from run to run.
# NOTE(review): a classifier treats every distinct perf_y value as its own
# class - confirm that is intended for this target.

model = GradientBoostingClassifier(random_state=42)
model.fit(perf_X, perf_y)
print(model.feature_importances_)

# Feature Selection - RFE

In [None]:
# Using RFE on all of the models to find the selected features for each model.
# Each estimator (five plain linear models followed by their normalize=True
# variants) is wrapped in recursive feature elimination that removes one
# feature per step until three remain. For each one the boolean support mask
# and the per-feature ranking are printed.
# NOTE(review): the `normalize` argument no longer exists in scikit-learn >= 1.2.

plain_estimators = [LinearRegression(), Lasso(), ElasticNet(), Ridge(), BayesianRidge()]
normalized_estimators = [LinearRegression(normalize = True), Lasso(normalize = True), ElasticNet(normalize = True), Ridge(normalize = True), BayesianRidge(normalize = True)]
estimators = plain_estimators + normalized_estimators

for estimator in estimators:
    selector = RFE(estimator, n_features_to_select=3, step=1).fit(perf_X, perf_y)
    print('Best Features for', estimator)
    print(selector.support_)
    print(selector.ranking_)
    print(' ')

# Random State Optimization using Features Selected from Classification Methods

In [None]:
# NOTE(review): the `normalize` argument no longer exists in scikit-learn >= 1.2.
models_norm = [LinearRegression(normalize = True), Lasso(normalize = True), ElasticNet(normalize = True), Ridge(normalize = True),  BayesianRidge(normalize = True)]
models = [LinearRegression(), Lasso(), ElasticNet(), Ridge(), BayesianRidge()]

# Positional feature columns of perf_X used throughout this cell.
classifier_cols = [0, 3, 4]


def _best_split_score(model, cols, n_states=1000):
    """Return (best R^2, random_state) over `n_states` different 80/20 splits.

    `model` is refit on every split using only the perf_X columns in `cols`
    and scored on that split's held-out rows. The search starts from -inf
    rather than 0, so a model whose R^2 is never positive still reports its
    real best split instead of a made-up score of 0 at state 0.

    NOTE(review): choosing the split by its test score is optimistic - the
    reported value is a best case, not an estimate of generalisation.
    """
    features = perf_X.iloc[:, cols]
    best_score, best_state = float("-inf"), None
    for state in range(n_states):
        X_train, X_test, y_train, y_test = train_test_split(
            features, perf_y, test_size=0.2, random_state=state
        )
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)  # scored once per split
        if score > best_score:
            best_score, best_state = score, state
    return best_score, best_state


# Iterating through 1000 different random states to find the highest scoring state for each model, normalized
for model in models_norm:
    highest_score, best_n = _best_split_score(model, classifier_cols)
    print("Best Score for ", model, " is ", highest_score, " at Random State ", best_n)

print(' ')

# Iterating through 1000 different random states to find the highest scoring state for each model, non-normalized
# (this loop was missing - only a stray `\\` was left in its place; it mirrors
# the normalized loop above on the `models` list)
for model in models:
    highest_score, best_n = _best_split_score(model, classifier_cols)
    print("Best Score for ", model, " is ", highest_score, " at Random State ", best_n)

# Random State Optimization using Features Selected from RFE

In [None]:
# NOTE(review): the `normalize` argument no longer exists in scikit-learn >= 1.2.
models_norm = [LinearRegression(normalize = True), Lasso(normalize = True), ElasticNet(normalize = True), Ridge(normalize = True),  BayesianRidge(normalize = True)]
models = [LinearRegression(), Lasso(), ElasticNet(), Ridge(), BayesianRidge()]
# Positional perf_X columns per normalized model, in the same order as models_norm.
cols_norm = [[1,2,3],[3,4,5],[1,4,5],[1,2,3],[1,2,3]]
# Positional perf_X columns shared by all non-normalized models.
cols_plain = [1, 2, 3]


def _best_split_fit(model, cols, n_states=1000):
    """Search `n_states` 80/20 splits and describe the best-scoring fit.

    Returns (best R^2, random_state, coef_, intercept_), where the
    coefficients and intercept are the ones fitted on the best split.
    Previously the printed coefficients came from whichever split was fitted
    last (state 999), so they did not match the reported score.

    The search starts from -inf rather than 0, so a model whose R^2 is never
    positive still reports its real best split.

    NOTE(review): choosing the split by its test score is optimistic - the
    reported value is a best case, not an estimate of generalisation.
    """
    features = perf_X.iloc[:, cols]
    best = (float("-inf"), None, None, None)
    for state in range(n_states):
        X_train, X_test, y_train, y_test = train_test_split(
            features, perf_y, test_size=0.2, random_state=state
        )
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)  # scored once per split
        if score > best[0]:
            # Copy coef_ so a later refit cannot alter the saved values.
            best = (score, state, np.copy(model.coef_), model.intercept_)
    return best


# Normalized models, each with its own column selection
for model, cols in zip(models_norm, cols_norm):
    highest_score, best_n, best_coef, best_intercept = _best_split_fit(model, cols)
    print("Best Score for ", model, " is ", highest_score, " at Random State ", best_n)
    print(best_coef)
    print(best_intercept)

print(' ')

# Non-normalized models, all on the same three columns
for model in models:
    highest_score, best_n, best_coef, best_intercept = _best_split_fit(model, cols_plain)
    print("Best Score for ", model, " is ", highest_score, " at Random State ", best_n)
    print(best_coef)
    print(best_intercept)

# Creating Scatterplots

In [None]:
# Scatter plot of Stress, Soreness and Hydration
# (feature set taken from the classifier importances).
# perf_X columns 0, 3 and 4 go on the x, y and z axes; the figure is written
# to plot1.png.
x_vals = perf_X.iloc[:, 0]
y_vals = perf_X.iloc[:, 3]
z_vals = perf_X.iloc[:, 4]

ax1 = plt.axes(projection='3d')
ax1.scatter(x_vals, y_vals, z_vals, color='blue')
ax1.set(xlabel='Stress', ylabel='Soreness', zlabel='Hydration')
plt.savefig("plot1.png")

In [None]:
# Scatter plot of Sleep Quality, Sleep Quantity, and Soreness
# (feature set from RFE on the non-normalized models).
# perf_X columns 1, 2 and 3 go on the x, y and z axes; the figure is written
# to plot2.png.
x_vals = perf_X.iloc[:, 1]
y_vals = perf_X.iloc[:, 2]
z_vals = perf_X.iloc[:, 3]

ax2 = plt.axes(projection='3d')
ax2.scatter(x_vals, y_vals, z_vals, color='green')
ax2.set(xlabel='Sleep Quality', ylabel='Sleep Quantity', zlabel='Soreness')
plt.savefig("plot2.png")

In [None]:
# Scatter plot of Fuel, Soreness and Hydration
# (feature set associated with Lasso).
# perf_X columns 3, 4 and 5 go on the x, y and z axes; the figure is written
# to plot3.png.
x_vals = perf_X.iloc[:, 3]
y_vals = perf_X.iloc[:, 4]
z_vals = perf_X.iloc[:, 5]

ax4 = plt.axes(projection='3d')
ax4.scatter(x_vals, y_vals, z_vals, color='red')
ax4.set(xlabel='Soreness', ylabel='Hydration', zlabel='Fuel')
plt.savefig("plot3.png")

In [None]:
# Scatter plot of Sleep Quality, Hydration and Fuel
# (feature set associated with Elastic Net).
# perf_X columns 1, 4 and 5 go on the x, y and z axes; the figure is written
# to plot4.png.
x_vals = perf_X.iloc[:, 1]
y_vals = perf_X.iloc[:, 4]
z_vals = perf_X.iloc[:, 5]

ax5 = plt.axes(projection='3d')
ax5.scatter(x_vals, y_vals, z_vals, color='purple')
ax5.set(xlabel='Sleep Quality', ylabel='Hydration', zlabel='Fuel')
plt.savefig("plot4.png")

# Testing Regressions by Athlete

In [None]:
# Author's note: there wasn't enough data for this to be successful; the cell
# originally stopped with an error message.
# But the methodology could be useful in the future to build a model for each athlete.
#
# Changes from the original cell:
#   * per-athlete frames use their own names, so the module-level `data`,
#     `perf_X` and `perf_y` are no longer overwritten by the last athlete;
#   * athletes with too few rows to split are skipped with a message;
#   * the second loop no longer labels non-normalized models "(normalized)";
#   * the unused cross_val_score calls were removed.
# NOTE(review): confirm the skip below is what resolves the original error.
# NOTE(review): the `normalize` argument no longer exists in scikit-learn >= 1.2.
athletes = performance['Athlete ID #'].unique()

# Smallest group for which a 70/30 split leaves two rows on each side
# (R^2 is undefined on a single test row).
min_rows_per_athlete = 4

for athlete in athletes:
    print('Athlete', athlete)
    athlete_rows = performance[performance['Athlete ID #'] == athlete]
    if len(athlete_rows) < min_rows_per_athlete:
        print('Skipped: only', len(athlete_rows), 'row(s) available')
        print(' ')
        continue

    athlete_X = athlete_rows.iloc[:, 1:7]
    athlete_y = athlete_rows.iloc[:, 7]

    models_norm = [LinearRegression(normalize = True), Lasso(normalize = True), ElasticNet(normalize = True), Ridge(normalize = True), KernelRidge(), BayesianRidge(normalize = True)]
    models = [LinearRegression(), Lasso(), ElasticNet(), Ridge(), KernelRidge(), BayesianRidge()]
    X_train, X_test, y_train, y_test = train_test_split(athlete_X, athlete_y, test_size = 0.3, random_state = 42)
    for model in models_norm:
        model.fit(X_train, y_train)
        print("Model score for ", model, " (normalized): ", model.score(X_test, y_test))
        print(' ')
    print(' ')
    print(' ')
    for model in models:
        model.fit(X_train, y_train)
        print("Model score for ", model, " :", model.score(X_test, y_test))
        print(' ')