# 1. SciKit Basic Models

In [2]:
'''
This is a notebook that tests various machine learning models that aims to predict the future top-100 average of track and field events.
Created on Monday, April 1st, 2024
Author: Matt Goeckel
'''
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
print("Libraries Imported")

Libraries Imported


In [3]:
# Load the data
df = pd.read_csv('top100avg.csv')

# Preprocess the data
le = LabelEncoder()
# For 'Division'
le_division = LabelEncoder()
df['Division'] = le_division.fit_transform(df['Division'])
print("Division Label Encoding:")
print(list(le_division.classes_))
print(list(le_division.transform(le_division.classes_)))

# For 'Sex'
le_sex = LabelEncoder()
df['Sex'] = le_sex.fit_transform(df['Sex'])
print("\nSex Label Encoding:")
print(list(le_sex.classes_))
print(list(le_sex.transform(le_sex.classes_)))

# For 'Event'
le_event = LabelEncoder()
df['Event'] = le_event.fit_transform(df['Event'])
print("\nEvent Label Encoding:")
print(list(le_event.classes_))
print(list(le_event.transform(le_event.classes_)))

# Split the data into features and target
X = df.iloc[:, :4]  # columns 1-4 are your features
y = df.iloc[:, 4]  # column 5 is your target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Division Label Encoding:
['Kansas 1A', 'Kansas 3A', 'Kansas 6A', 'NAIA', 'NCAA D-I', 'NCAA D-II', 'NCAA D-III', 'World']
[0, 1, 2, 3, 4, 5, 6, 7]

Sex Label Encoding:
['Men', 'Women']
[0, 1]

Event Label Encoding:
['10000m', '100m', '1500m', '1600m', '200m', '3200m', '400m', '5000m', '800m', 'Discus', 'High Jump', 'Javelin', 'Long Jump', 'Pole Vault', 'Shot Put', 'Triple Jump']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


<table>
<tr><th colspan="3">Label Encoding Key</th></tr>
<tr><td>

| Label | Division   |
|-------|------------|
| 0     | Kansas 1A  |
| 1     | Kansas 3A  |
| 2     | Kansas 6A  |
| 3     | NAIA       |
| 4     | NCAA D-I   |
| 5     | NCAA D-II  |
| 6     | NCAA D-III |
| 7     | World      |

</td><td>

| Label | Sex   |
|-------|-------|
| 0     | Men   |
| 1     | Women |

</td><td>

| Label | Event       |
|-------|-------------|
| 0     | 10000m      |
| 1     | 100m        |
| 2     | 1500m       |
| 3     | 1600m       |
| 4     | 200m        |
| 5     | 3200m       |
| 6     | 400m        |
| 7     | 5000m       |
| 8     | 800m        |
| 9     | Discus      |
| 10    | High Jump   |
| 11    | Javelin     |
| 12    | Long Jump   |
| 13    | Pole Vault  |
| 14    | Shot Put    |
| 15    | Triple Jump |

</td></tr> </table>

In [4]:
# Uncomment the next line if the lasso model fails to converge with the default number of iterations
#lasso_model = make_pipeline(StandardScaler(), Lasso(max_iter=10000))

In [5]:
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Decision Tree Regression', DecisionTreeRegressor()),
    ('Random Forest Regression', RandomForestRegressor()),
    ('Support Vector Regression', SVR()),
    ('Gradient Boosting Regression', GradientBoostingRegressor())
]

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(name)
    print('RMSE:', root_mean_squared_error(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('R^2:', r2_score(y_test, y_pred))

Linear Regression
RMSE: 117.72176625653266
MAE: 95.72363523424741
R^2: 0.7138179530957249
Ridge Regression
RMSE: 117.72401692321075
MAE: 95.72756766322412
R^2: 0.7138070102331695
Lasso Regression
RMSE: 117.87647712529565
MAE: 96.03982107122336
R^2: 0.713065253436234
Decision Tree Regression
RMSE: 22.52998777274723
MAE: 13.167539267015707
R^2: 0.9895178406222208
Random Forest Regression
RMSE: 17.50715885984372
MAE: 10.880157068062829
R^2: 0.9936706342626285
Support Vector Regression
RMSE: 221.3954409020273
MAE: 184.3238384000814
R^2: -0.012199674860394527
Gradient Boosting Regression
RMSE: 39.44550688691538
MAE: 28.18553169021882
R^2: 0.9678690231545828


Model Performance
| Model                     | RMSE       | MAE        | R^2       |
|---------------------------|------------|------------|-----------|
| Linear Regression         | 117.72177  | 95.72364   | 0.71382   |
| Ridge Regression          | 117.72402  | 95.72757   | 0.71381   |
| Lasso Regression          | 117.89145  | 96.06475   | 0.71299   |
| Decision Tree Regression  | 22.18599   | 13.01396   | 0.98984   |
| Random Forest Regression  | 17.06535   | 10.74354   | 0.99399   |
| Support Vector Regression | 221.39544  | 184.32384  | -0.01220  |
| Gradient Boosting Regression | 39.44551 | 28.18553   | 0.96787   |

In [6]:
# Create an instance of the Random Forest Regression model
model = RandomForestRegressor()

# Fit the model with the training data
model.fit(X_train, y_train)

Let's try and run a simulation for an event in 2024

In [52]:
import pointCalculator

# Create a new data point with the given parameters
data23 = pd.DataFrame({'Event': [6], 'Division': [7], 'Sex': [0], 'Year': [2023]})

# Use the model to predict the score value for the new data point
score = model.predict(data23)

time = pointCalculator.calculateResult('Men', '400m', score)

# Print the predicted score value
print('Men 400m World 2023: ' + str(score))
print('Time: ' + str(time))

# Create a new data point with the given parameters
data24 = pd.DataFrame({'Event': [6], 'Division': [7], 'Sex': [0], 'Year': [2024]})

# Use the model to predict the score value for the new data point
score = model.predict(data24)

time = pointCalculator.calculateResult('Men', '400m', score)

# Print the predicted score value
print('Men 400m World 2024: ' + str(score))
print('Time: ' + str(time))


JSON Loaded
Men 400m World 2023: [1185.88]
Time: 44.9193788710826
Men 400m World 2024: [1185.88]
Time: 44.9193788710826


Model is accurate only for 2010-2023. All years outside this range get fitted to 2010 (if older) or 2023 (if future)

# 2. Time Series Models


In [None]:
import prophet

# Prepare data for Prophet
prophet_data = train_data.reset_index().rename(columns={'index':'ds', 'Recorded Value':'y'})

# Instantiate Prophet model
prophet_model = prophet()

# Fit the model to the data
prophet_model.fit(prophet_data)

# Make future predictions
future = prophet_model.make_future_dataframe(periods=len(test_data))
prophet_forecast = prophet_model.predict(future)

# Extract predicted values for the test set
prophet_predictions = prophet_forecast[-len(test_data):]['yhat'].values

# Calculate RMSE for Prophet model
prophet_rmse = np.sqrt(mean_squared_error(y_test, prophet_predictions))
print("Prophet RMSE:", prophet_rmse)

In [54]:
from darts import TimeSeries
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.models import RNNModel, TCNModel, ARIMA, ExponentialSmoothing, VARIMA
from darts.metrics import mape

In [55]:
# Create a list to store TimeSeries objects
training_series = []

# Iterate over each row in the DataFrame and create a TimeSeries object
for _, row in df.iterrows():
    # Extract the values from the row starting from the fourth element
    values = row.values[4:]
    # Create a TimeSeries object using the extracted values
    time_series = TimeSeries.from_values(values)
    # Append the created TimeSeries object to the list
    training_series.append(time_series)

# Example of training a VARIMA model
varima_model = VARIMA()
varima_model.fit(training_series)

# Example of forecasting with the trained model
forecast = varima_model.predict(n=1)
print('Forecast:', forecast)

AttributeError: 'list' object has no attribute 'last_values'

In [None]:
# Example of evaluating a VAR model
print('MAPE:', mape(var_model.predict(1), validation_series))

In [None]:
# Example of forecasting with a VAR model
forecast = var_model.predict(1)
print('Forecast:', forecast)

In [56]:
import numpy as np
import pandas as pd
from statsmodels.tsa.api import VAR, VARMAX
from statsmodels.tsa.statespace.varmax import VARMAXOrder
from statsmodels.tsa.statespace.varmax import VARMAXResults
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.statespace.varmax import VARMAXResults
from statsmodels.tsa.statespace.varmax import VARMAXResults
from statsmodels.tsa.statespace.varmax import VARMAXResults
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping

# Load your dataset into a pandas DataFrame
# Replace this with your actual data loading code
data = pd.read_csv('your_data.csv')

# Preprocess the data
# Assuming 'Division', 'Gender', and 'Event' are categorical variables
data = pd.get_dummies(data, columns=['Division', 'Gender', 'Event'])

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Standardize the input features
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data.drop(columns=['Recorded Value']))
test_data_scaled = scaler.transform(test_data.drop(columns=['Recorded Value']))

# Split the features and target variable
X_train = train_data_scaled
y_train = train_data['Recorded Value'].values
X_test = test_data_scaled
y_test = test_data['Recorded Value'].values

# VAR model
var_model = VAR(train_data)
var_results = var_model.fit()

# VARIMA model
order = VARMAXOrder(selection=15)
varima_model = VARMAX(train_data, order=order)
varima_results = varima_model.fit()

# LSTM model
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

lstm_model = Sequential()
lstm_model.add(LSTM(50, activation='relu', input_shape=(1, X_train.shape[1])))
lstm_model.add(Dense(1))
lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.fit(X_train_lstm, y_train, epochs=100, verbose=0)

# GRU model
gru_model = Sequential()
gru_model.add(GRU(50, activation='relu', input_shape=(1, X_train.shape[1])))
gru_model.add(Dense(1))
gru_model.compile(optimizer='adam', loss='mse')
gru_model.fit(X_train_lstm, y_train, epochs=100, verbose=0)

# CNN model
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

cnn_model = Sequential()
cnn_model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(50, activation='relu'))
cnn_model.add(Dense(1))
cnn_model.compile(optimizer='adam', loss='mse')
cnn_model.fit(X_train_cnn, y_train, epochs=100, verbose=0)

# Evaluate models
var_predictions = var_results.forecast(train_data.values, steps=len(test_data))
varima_predictions = varima_results.forecast(len(test_data))
lstm_predictions = lstm_model.predict(X_test_lstm)
gru_predictions = gru_model.predict(X_test_lstm)
cnn_predictions = cnn_model.predict(X_test_cnn)

# Calculate RMSE for each model
var_rmse = np.sqrt(mean_squared_error(y_test, var_predictions))
varima_rmse = np.sqrt(mean_squared_error(y_test, varima_predictions))
lstm_rmse = np.sqrt(mean_squared_error(y_test, lstm_predictions))
gru_rmse = np.sqrt(mean_squared_error(y_test, gru_predictions))
cnn_rmse = np.sqrt(mean_squared_error(y_test, cnn_predictions))

print("VAR RMSE:", var_rmse)
print("VARIMA RMSE:", varima_rmse)
print("LSTM RMSE:", lstm_rmse)
print("GRU RMSE:", gru_rmse)
print("CNN RMSE:", cnn_rmse)


ImportError: cannot import name 'VARMAXOrder' from 'statsmodels.tsa.statespace.varmax' (c:\Users\mattg\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\statespace\varmax.py)