In [1]:
import pandas as pd
import numpy as np
import utils
import matplotlib.pyplot as plt
import matplotlib
import sklearn as sk

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE, SpectralEmbedding
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure, silhouette_score, mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Notebook-wide display settings: let pandas render up to 500 rows per frame
# and draw every matplotlib figure with the "ggplot" style sheet.
pd.set_option("display.max_rows", 500)
plt.style.use("ggplot")

In [2]:
# Load the two athlete-result tables (the "M" file into mdf, the "F" file into
# fdf). low_memory=False makes pandas parse each file in one pass rather than
# in chunks.
mdf, fdf = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('M_athlete_results.csv', 'F_athlete_results.csv')
)

In [3]:
# String marks to float times in seconds
# Adds a 'Time' column to both result tables: utils.Time(mark) for rows whose
# event is a track event and whose mark passes utils.ismark, None otherwise.
# The track-event collection is fetched once here instead of once per row.
# NOTE(review): assumes utils.event_classification() returns the same mapping
# on every call -- confirm against utils.
track_events = utils.event_classification()['Track']
for results in (mdf, fdf):
    results['Time'] = [
        utils.Time(mark) if (event in track_events and utils.ismark(mark)) else None
        for event, mark in zip(results['Event'], results['Mark'])
    ]

In [4]:
# String events to float distance in meters
# Adds a 'Distance' column to both result tables: utils.string_to_distance(event)
# for rows whose event is a track event, None otherwise.
# The track-event collection is fetched once here instead of once per row.
# NOTE(review): assumes utils.event_classification() returns the same mapping
# on every call -- confirm against utils.
track_events = utils.event_classification()['Track']
for results in (mdf, fdf):
    results['Distance'] = [
        utils.string_to_distance(event) if event in track_events else None
        for event in results['Event']
    ]

In [5]:
# Pass both result tables through the DataFrame constructor under new names.
mdf_dataframe, fdf_dataframe = pd.DataFrame(mdf), pd.DataFrame(fdf)

# Cross-country rows for Colorado Mines, one frame per results table
# (csm_m from mdf, csm_f from fdf).
csm_m, csm_f = (
    table.loc[table['School'].eq('COLORADO MINES') & table['Season'].eq('Cross Country')]
    for table in (mdf, fdf)
)

# Rows of mdf recorded at the 2019 NCAA Division II Cross Country Championships.
natys2019_m = mdf.loc[
    mdf['Meet_Name'].eq('NCAA Division II Cross Country Championships')
    & mdf['Year'].eq(2019)
]
natys2019df_m = pd.DataFrame(natys2019_m)

In [6]:
# Display csm_m: the rows of mdf with School 'COLORADO MINES' and Season
# 'Cross Country'.
csm_m

Unnamed: 0,Name,Athlete ID,Grade,Academic_Year,School,Conference,Meet_ID,Meet_Name,Meet_Start,Meet_End,Year,Season,Event,Mark,Place,Prelim/Final,Time,Distance
8397,CHRIS CATHCART,6765183,SO,2.0,COLORADO MINES,Rocky Mountain AC,61281,2020 RMAC Cross Country Championships,"Oct 24, 2020","Oct 24, 2020",2020,Cross Country,8K,24:27.9,4.0,,1467.9,8000.0
8400,CHRIS CATHCART,6765183,SO,2.0,COLORADO MINES,Rocky Mountain AC,58946,12th Annual UCCS Rust-Buster - 2019,"Sep 7, 2019","Sep 7, 2019",2019,Cross Country,8K,24:46.3,1.0,,1486.3,8000.0
8406,NATHAN DAVIS,6907863,SO,2.0,COLORADO MINES,Rocky Mountain AC,15905,Maverick Open,"Oct 26, 2019","Oct 26, 2019",2019,Cross Country,8K,26:14.1,20.0,,1574.1,8000.0
8407,NATHAN DAVIS,6907863,SO,2.0,COLORADO MINES,Rocky Mountain AC,15218,2019 FHSU Tiger Open,"Oct 12, 2019","Oct 12, 2019",2019,Cross Country,8K,25:35.7,28.0,,1535.7,8000.0
8408,NATHAN DAVIS,6907863,SO,2.0,COLORADO MINES,Rocky Mountain AC,16384,12th Annual UCCS Rust-Buster - 2019,"Sep 7, 2019","Sep 7, 2019",2019,Cross Country,8K,26:57.7,50.0,,1617.7,8000.0
8413,LUC HAGEN,5979587,SR,4.0,COLORADO MINES,Rocky Mountain AC,52971,NCAA Division II Cross Country Championships,"Nov 23, 2019","Nov 23, 2019",2019,Cross Country,10K,30:13.5,15.0,,1813.5,10000.0
8414,LUC HAGEN,5979587,SR,4.0,COLORADO MINES,Rocky Mountain AC,59432,NCAA Division II South Central Region Cross Co...,"Nov 9, 2019","Nov 9, 2019",2019,Cross Country,10K,31:39.7,12.0,,1899.7,10000.0
8415,LUC HAGEN,5979587,SR,4.0,COLORADO MINES,Rocky Mountain AC,63963,RMAC Cross Country Championships,"Oct 26, 2019","Oct 26, 2019",2019,Cross Country,8K,23:58.8,6.0,,1438.8,8000.0
8416,LUC HAGEN,5979587,SR,4.0,COLORADO MINES,Rocky Mountain AC,54828,Chile Pepper XC Fesitval,"Oct 5, 2019","Oct 5, 2019",2019,Cross Country,8K,24:30.6,28.0,,1470.6,8000.0
8417,LUC HAGEN,5979587,SR,4.0,COLORADO MINES,Rocky Mountain AC,14511,12th Annual UCCS Rust-Buster - 2019,"Sep 7, 2019","Sep 7, 2019",2019,Cross Country,8K,24:47.5,5.0,,1487.5,8000.0


In [7]:
# Build r19: for every row of natys2019df_m (2019 nationals results), gather
# all of that athlete's 2019 cross-country rows from mdf. Athletes are kept in
# the order they appear in natys2019df_m, and an athlete listed more than once
# there contributes their rows once per listing.

# The season/year condition is the same for every athlete, so evaluate it once.
xc_2019_mask = (mdf['Season'] == "Cross Country") & (mdf['Year'] == 2019)

athlete_frames = [
    mdf.loc[(mdf['Athlete ID'] == athlete_id) & xc_2019_mask]
    for athlete_id in natys2019df_m['Athlete ID']
]

# Concatenate a single time rather than re-copying a growing frame on every
# iteration. pd.concat raises on an empty list, so fall back to an empty frame
# when there are no nationals rows.
r19 = pd.concat(athlete_frames) if athlete_frames else pd.DataFrame()

In [8]:
# Setting up data for machine learning models
# Build natsPlaces: one entry per row of r19 (in r19's row order) holding the
# 'Place' that row's athlete recorded at the 2019 NCAA Division II Cross
# Country Championships.

# Filter the nationals rows once instead of re-scanning mdf for every r19 row.
nationals_2019 = mdf.loc[
    (mdf['Meet_Name'] == 'NCAA Division II Cross Country Championships')
    & (mdf["Year"] == 2019)
]
# Keep the first nationals row per athlete so the lookup index is unique.
place_by_athlete = (
    nationals_2019
    .drop_duplicates('Athlete ID')
    .set_index('Athlete ID')['Place']
)

if r19.empty:
    natsPlaces = []
else:
    # An athlete without a nationals row is an error, not a silent NaN.
    unmatched = ~r19['Athlete ID'].isin(place_by_athlete.index)
    if unmatched.any():
        raise KeyError(
            "No 2019 nationals result for Athlete ID(s): "
            f"{sorted(r19.loc[unmatched, 'Athlete ID'].unique().tolist())}"
        )
    natsPlaces = r19['Athlete ID'].map(place_by_athlete).tolist()


In [9]:
# Attach the nationals place gathered in natsPlaces to r19 as 'NatsPlace',
# labelled with r19's own index.
r19['NatsPlace'] = pd.Series(data=natsPlaces, index=r19.index)

# modelData keeps every r19 row except those whose meet is the nationals
# championship itself; the bare name on the last line renders it.
modelData = r19[r19['Meet_Name'].ne('NCAA Division II Cross Country Championships')]
modelData

Unnamed: 0,Name,Athlete ID,Grade,Academic_Year,School,Conference,Meet_ID,Meet_Name,Meet_Start,Meet_End,Year,Season,Event,Mark,Place,Prelim/Final,Time,Distance,NatsPlace
853,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,52063,NCAA Division II Southeast Region Cross Countr...,"Nov 9, 2019","Nov 9, 2019",2019,Cross Country,10K,31:49.4,11.0,,1909.4,10000.0,57.0
854,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,16479,2019 Conference Carolinas XC Championships,"Oct 26, 2019","Oct 26, 2019",2019,Cross Country,8K,25:50.9,2.0,,1550.9,8000.0,57.0
855,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,11448,Royals XC Challenge,"Oct 11, 2019","Oct 11, 2019",2019,Cross Country,8K,24:16.2,9.0,,1456.2,8000.0,57.0
856,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,15167,Lehigh Paul Short Run (College),"Oct 5, 2019","Oct 5, 2019",2019,Cross Country,8K,24:59.6,177.0,,1499.6,8000.0,57.0
857,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,13733,Wingate University Bulldog Stampede,"Sep 21, 2019","Sep 21, 2019",2019,Cross Country,8K,25:56.7,3.0,,1556.7,8000.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20212,JESSE BECKER,6421717,Unknown,,GRAND VALLEY ST.,,13720,NCAA Division II Midwest Region Cross Country ...,"Nov 9, 2019","Nov 9, 2019",2019,Cross Country,10K,31:53.6,10.0,,1913.6,10000.0,76.0
20213,JESSE BECKER,6421717,Unknown,,GRAND VALLEY ST.,,13826,2019 GLIAC Cross Country Championships,"Oct 26, 2019","Oct 26, 2019",2019,Cross Country,8K,24:44.4,14.0,,1484.4,8000.0,76.0
20214,JESSE BECKER,6421717,Unknown,,GRAND VALLEY ST.,,15461,Lewis Crossover,"Oct 12, 2019","Oct 12, 2019",2019,Cross Country,8K,25:33.1,8.0,,1533.1,8000.0,76.0
20215,JESSE BECKER,6421717,Unknown,,GRAND VALLEY ST.,,15400,Ohio State - Buckeye Preview,"Sep 28, 2019","Sep 28, 2019",2019,Cross Country,8K,25:23.3,24.0,,1523.3,8000.0,76.0


In [10]:
# Machine Learning Models 
# Trying first Model : 
