In [1]:
import pandas as pd

# Read the source table. The code below relies on its 'Year', 'Points',
# 'Division', 'Sex' and 'Event' columns.
df = pd.read_csv('top100avg_nofield.csv')

# Category labels whose yearly averages are tracked.
divisions = [
    'World', 'NCAA D-I', 'NCAA D-II', 'NCAA D-III', 'NAIA',
    'Kansas 1A', 'Kansas 3A', 'Kansas 6A',
]
genders = ['Men', 'Women']
# Only running events are enabled; the field events remain switched off:
# 'High Jump', 'Pole Vault', 'Long Jump', 'Triple Jump', 'Shot Put', 'Discus', 'Javelin'
events = [
    '100m', '200m', '400m', '800m', '1500m',
    '1600m', '3200m', '5000m', '10000m',
]

# TODO: INSERT TRAIN/TEST SPLIT HERE

# average_points maps a label to a Series of mean 'Points' indexed by 'Year'.
# 'Overall' is computed from every row; every other entry only from the rows
# whose category column equals that label. Insertion order is:
# 'Overall', then divisions, then genders, then events.
average_points = {'Overall': df.groupby('Year')['Points'].mean()}

for column, labels in (('Division', divisions), ('Sex', genders), ('Event', events)):
    for label in labels:
        matching_rows = df.loc[df[column] == label]
        average_points[label] = matching_rows.groupby('Year')['Points'].mean()

print(average_points)

{'Overall': Year
2010    842.867925
2011    850.471698
2012    857.792453
2013    854.584906
2014    863.886792
2015    868.018868
2016    871.547170
2017    871.198113
2018    867.056604
2019    868.103774
2021    869.047170
2022    881.018868
2023    891.405660
Name: Points, dtype: float64, 'World': Year
2010    1149.642857
2011    1156.071429
2012    1161.000000
2013    1154.500000
2014    1153.642857
2015    1162.214286
2016    1165.642857
2017    1161.785714
2018    1160.214286
2019    1164.428571
2021    1176.071429
2022    1178.571429
2023    1183.000000
Name: Points, dtype: float64, 'NCAA D-I': Year
2010    1053.714286
2011    1058.357143
2012    1063.142857
2013    1060.285714
2014    1066.214286
2015    1069.928571
2016    1070.714286
2017    1075.214286
2018    1077.857143
2019    1075.571429
2021    1089.214286
2022    1099.285714
2023    1103.714286
Name: Points, dtype: float64, 'NCAA D-II': Year
2010    934.428571
2011    940.428571
2012    946.142857
2013    948.571429
2

In [2]:
from sklearn.linear_model import LinearRegression

def _fit_yearly_trend(yearly_means):
    """Fit a straight line through one yearly-average series.

    Parameters
    ----------
    yearly_means : Series
        Its index supplies the regressor values and its values the targets.

    Returns
    -------
    list
        ``[slope, intercept]`` of the fitted line.
    """
    years = yearly_means.index.values.reshape(-1, 1)
    targets = yearly_means.values.reshape(-1, 1)
    fitted = LinearRegression().fit(years, targets)
    # The target is passed as a column, so the fitted attributes carry an
    # extra dimension that is indexed away here.
    return [fitted.coef_[0][0], fitted.intercept_[0]]


# One [slope, intercept] pair per entry of average_points, in the same order.
equations = {
    label: _fit_yearly_trend(yearly_means)
    for label, yearly_means in average_points.items()
}

print(equations)


{'Overall': [2.719263433068304, -4616.739526473332], 'World': [2.1570608142754613, -3185.533483745261], 'NCAA D-I': [3.514998461696238, -6012.95464567737], 'NCAA D-II': [4.111245000512767, -7328.504384165724], 'NCAA D-III': [2.3457594092913543, -3809.020869654394], 'NAIA': [1.4580555840426592, -2053.8864219054394], 'Kansas 1A': [-2.6738155061019357, 5929.118838238808], 'Kansas 3A': [6.14758315386456, -11816.798875329025], 'Kansas 6A': [4.6947535295525205, -8771.969340751368], 'Men': [3.194442563220418, -5576.641949640391], 'Women': [2.2440843029161957, -3656.8371033062836], '100m': [4.397792534099067, -7937.16654702082], '200m': [3.5833408111988505, -6318.7795001794675], '400m': [1.9232322325915283, -2997.963657573581], '800m': [2.092942390524048, -3385.4077306173713], '1500m': [2.3471284996410615, -3731.6219310839897], '1600m': [3.4568676716917923, -6451.342964824121], '3200m': [-1.2316941852117744, 2946.3668939937806], '5000m': [2.865362526920318, -4816.05516870065], '10000m': [3.080

In [15]:
def _project_points(frame, trend_equations):
    """Project a value for every row of ``frame`` from the fitted trend lines.

    Each row gets its own line: the 'Overall' line shifted by how far the
    row's division, event and sex lines each deviate from 'Overall'. The
    shift is applied separately to slope and intercept, and the resulting
    line is evaluated at the row's 'Year'.

    Parameters
    ----------
    frame : DataFrame
        Must provide the 'Division', 'Event', 'Sex' and 'Year' columns.
    trend_equations : dict
        Maps a label to ``[slope, intercept]``. Must contain 'Overall' and
        every label that occurs in the three category columns.

    Returns
    -------
    list
        One projected value per row, in row order.

    Raises
    ------
    KeyError
        If a category label in ``frame`` has no entry in ``trend_equations``.
    """
    # Two-column table (slope, intercept) indexed by label, so a whole column
    # of labels can be looked up in one call instead of once per row.
    coefficients = pd.DataFrame.from_dict(
        trend_equations, orient='index', columns=['slope', 'intercept']
    )
    baseline = coefficients.loc['Overall'].to_numpy()

    def lookup(column):
        # .loc with a list of labels raises KeyError on any unknown label,
        # matching the failure mode of a plain dict lookup.
        return coefficients.loc[frame[column].tolist()].to_numpy()

    by_division = lookup('Division')
    by_event = lookup('Event')
    by_sex = lookup('Sex')

    # Same left-to-right arithmetic as the scalar formula, applied to the
    # slope column and the intercept column at once.
    adjusted = (
        baseline
        + (by_division - baseline)
        + (by_event - baseline)
        + (by_sex - baseline)
    )
    projected = adjusted[:, 0] * frame['Year'].to_numpy() + adjusted[:, 1]
    return projected.tolist()


# Notebook-level names kept so that any other cell reading them still works.
overall = equations['Overall']
overall_slope = overall[0]
overall_intercept = overall[1]

# Parallel lists: projection_list[i] is the projection for the i-th row of df
# and actual_list[i] is that row's observed 'Points'.
projection_list = _project_points(df, equations)
actual_list = df['Points'].tolist()

print(projection_list)
print(actual_list)


[951.7827052087541, 954.0775810124942, 956.3724568162334, 958.6673326199734, 960.9622084237126, 963.2570842274517, 965.5519600311918, 967.846835834931, 970.141711638671, 972.4365874424102, 977.0263390498894, 979.3212148536286, 981.6160906573687, 961.3674483313762, 962.7119658748115, 964.0564834182464, 965.4010009616818, 966.7455185051172, 968.0900360485525, 969.4345535919879, 970.7790711354232, 972.1235886788586, 973.468106222294, 976.1571413091647, 977.5016588526, 978.8461763960349, 1127.1696655205196, 1131.5214842019122, 1135.8733028833049, 1140.2251215646975, 1144.576940246092, 1148.9287589274845, 1153.2805776088771, 1157.6323962902698, 1161.9842149716642, 1166.3360336530568, 1175.0396710158439, 1179.3914896972365, 1183.743308378629, 1136.754408643139, 1140.1558690642278, 1143.5573294853166, 1146.9587899064054, 1150.3602503274942, 1153.761710748583, 1157.1631711696718, 1160.5646315907607, 1163.9660920118495, 1167.3675524329383, 1174.1704732751168, 1177.5719336962056, 1180.9733941172

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Score the projections against the observed points.
# NOTE(review): both lists appear to be built from every row of df, the same
# rows the trend lines were fitted on, so these look like in-sample scores
# until the train/test split is added. Confirm before reporting them.

# Root mean squared error: square root of the mean squared residual.
squared_error = mean_squared_error(actual_list, projection_list)
rmse = np.sqrt(squared_error)

# Mean absolute error.
mae = mean_absolute_error(actual_list, projection_list)

# Coefficient of determination.
r2 = r2_score(actual_list, projection_list)

print(f'RMSE: {rmse}', f'MAE: {mae}', f'R^2: {r2}', sep='\n')

RMSE: 121.1660632895053
MAE: 88.08426964395458
R^2: 0.7019781797318259
