In [225]:
import pandas as pd
import numpy as np
import utils
import matplotlib.pyplot as plt
import matplotlib
import sklearn as sk

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE, SpectralEmbedding
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure, silhouette_score, mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Apply matplotlib's "ggplot" style sheet to figures created after this point.
plt.style.use("ggplot")
# Raise pandas' display limit so frames of up to 500 rows are shown without truncation.
pd.set_option('display.max_rows', 500)

In [226]:
# Read the two athlete-results CSV files with identical parser options.
csv_read_options = {'low_memory': False}
mdf = pd.read_csv('M_athlete_results.csv', **csv_read_options)
fdf = pd.read_csv('F_athlete_results.csv', **csv_read_options)

In [227]:
# String marks to float times in seconds
# Fetch the 'Track' event collection once instead of calling
# utils.event_classification() for every row.
# NOTE(review): assumes utils.event_classification() is deterministic and
# side-effect free -- confirm in utils.
track_events = utils.event_classification()['Track']

# Same conversion for both result tables: a row gets utils.Time(mark) only when
# its event is a track event and utils.ismark(mark) is truthy; otherwise None.
for results_df in (mdf, fdf):
    results_df['Time'] = [
        utils.Time(m) if (e in track_events and utils.ismark(m)) else None
        for e, m in zip(results_df['Event'], results_df['Mark'])
    ]

In [228]:
# String events to float distance in meters
# Fetch the 'Track' event collection once instead of calling
# utils.event_classification() for every row.
# NOTE(review): assumes utils.event_classification() is deterministic and
# side-effect free -- confirm in utils.
track_events = utils.event_classification()['Track']

# Same conversion for both result tables: track events are passed to
# utils.string_to_distance; every other event gets None.
for results_df in (mdf, fdf):
    results_df['Distance'] = [
        utils.string_to_distance(e) if e in track_events else None
        for e in results_df['Event']
    ]

In [229]:
# DataFrame wrappers around the two loaded tables.
mdf_dataframe, fdf_dataframe = pd.DataFrame(mdf), pd.DataFrame(fdf)

# Rows whose School is 'COLORADO MINES' and whose Season is 'Cross Country'.
is_mines_xc_m = mdf['School'].eq('COLORADO MINES') & mdf['Season'].eq('Cross Country')
is_mines_xc_f = fdf['School'].eq('COLORADO MINES') & fdf['Season'].eq('Cross Country')
csm_m = mdf.loc[is_mines_xc_m]
csm_f = fdf.loc[is_mines_xc_f]

# Rows of mdf from the 2019 'NCAA Division II Cross Country Championships' meet.
is_nats_2019_m = (
    mdf['Meet_Name'].eq('NCAA Division II Cross Country Championships')
    & mdf['Year'].eq(2019)
)
natys2019_m = mdf.loc[is_nats_2019_m]
natys2019df_m = pd.DataFrame(natys2019_m)

In [None]:
# Display the mdf rows for COLORADO MINES in the Cross Country season.
csm_m

In [240]:
# For every row in natys2019df_m, collect that athlete's 2019 cross-country
# rows from mdf, keeping the athlete order of natys2019df_m.
# The season/year mask does not depend on the athlete, so compute it once.
is_xc_2019 = (mdf['Season'] == "Cross Country") & (mdf['Year'] == 2019)

athlete_frames = [
    mdf.loc[(mdf['Athlete ID'] == athlete_id) & is_xc_2019]
    for athlete_id in natys2019df_m['Athlete ID']
]

# Concatenate once instead of growing the frame inside the loop (which copies
# all accumulated rows on every iteration). pd.concat raises on an empty list,
# so fall back to the empty frame the original loop would have produced.
r19 = pd.concat(athlete_frames) if athlete_frames else pd.DataFrame()

In [296]:
# Setting up data for machine learning models
# Select the 2019 championship rows once, rather than re-filtering all of mdf
# for every row of r19.
nats_2019_rows = mdf.loc[
    (mdf['Meet_Name'] == 'NCAA Division II Cross Country Championships')
    & (mdf["Year"] == 2019)
]

# Athlete ID -> Place lookup. keep='first' mirrors the original behaviour of
# taking the first matching championship row for an athlete.
place_by_athlete = (
    nats_2019_rows
    .drop_duplicates(subset='Athlete ID', keep='first')
    .set_index('Athlete ID')['Place']
    .to_dict()
)

# One entry per r19 row, in r19 row order. r19.get(...) yields no entries when
# r19 has no 'Athlete ID' column (e.g. an empty frame). An athlete without a
# championship row raises KeyError, as the per-row lookup did before.
natsPlaces = [place_by_athlete[athlete_id] for athlete_id in r19.get('Athlete ID', [])]


57.0
852
57.0
57.0
853
57.0
57.0
854
57.0
57.0
855
57.0
57.0
856
57.0
57.0
857
57.0
108.0
1683
108.0
108.0
1684
108.0
108.0
1685
108.0
108.0
1686
108.0
108.0
1687
108.0
108.0
1688
108.0
108.0
1689
108.0
141.0
4141
141.0
141.0
4142
141.0
141.0
4143
141.0
141.0
4144
141.0
141.0
4145
141.0
141.0
4146
141.0
163.0
4264
163.0
163.0
4265
163.0
163.0
4266
163.0
163.0
4267
163.0
163.0
4268
163.0
163.0
4269
163.0
181.0
4348
181.0
181.0
4349
181.0
181.0
4350
181.0
181.0
4351
181.0
181.0
4352
181.0
181.0
4353
181.0
13.0
4393
13.0
13.0
4394
13.0
13.0
4395
13.0
13.0
4396
13.0
13.0
4397
13.0
13.0
4398
13.0
125.0
5854
125.0
125.0
5855
125.0
125.0
5856
125.0
125.0
5857
125.0
125.0
5858
125.0
125.0
5859
125.0
5.0
5900
5.0
5.0
5901
5.0
5.0
5902
5.0
5.0
5903
5.0
5.0
5904
5.0
5.0
5905
5.0
240.0
5931
240.0
240.0
5932
240.0
240.0
5933
240.0
240.0
5934
240.0
240.0
5935
240.0
240.0
5936
240.0
162.0
5961
162.0
162.0
5962
162.0
162.0
5963
162.0
162.0
5964
162.0
162.0
5965
162.0
162.0
5966
162.0
84.0
6060
84.0
84

244.0
14007
244.0
244.0
14008
244.0
244.0
14009
244.0
244.0
14010
244.0
253.0
14011
253.0
253.0
14012
253.0
253.0
14013
253.0
253.0
14014
253.0
253.0
14015
253.0
71.0
14026
71.0
71.0
14027
71.0
71.0
14028
71.0
71.0
14029
71.0
71.0
14030
71.0
71.0
14031
71.0
40.0
14097
40.0
40.0
14098
40.0
40.0
14099
40.0
40.0
14100
40.0
40.0
14101
40.0
40.0
14102
40.0
40.0
14103
40.0
256.0
14126
256.0
256.0
14127
256.0
256.0
14128
256.0
256.0
14129
256.0
256.0
14130
256.0
256.0
14131
256.0
256.0
14132
256.0
70.0
14134
70.0
70.0
14135
70.0
70.0
14136
70.0
70.0
14137
70.0
70.0
14138
70.0
70.0
14139
70.0
78.0
14161
78.0
78.0
14162
78.0
78.0
14163
78.0
78.0
14164
78.0
78.0
14165
78.0
78.0
14166
78.0
258.0
14173
258.0
258.0
14174
258.0
258.0
14175
258.0
258.0
14176
258.0
258.0
14177
258.0
258.0
14178
258.0
258.0
14179
258.0
45.0
14185
45.0
45.0
14186
45.0
45.0
14187
45.0
45.0
14188
45.0
45.0
14189
45.0
45.0
14190
45.0
210.0
14259
210.0
210.0
14260
210.0
210.0
14261
210.0
210.0
14262
210.0
210.0
14263
210.0


46.0
16765
46.0
128.0
16771
128.0
128.0
16772
128.0
128.0
16773
128.0
128.0
16774
128.0
128.0
16775
128.0
128.0
16776
128.0
225.0
16808
225.0
225.0
16809
225.0
225.0
16810
225.0
225.0
16811
225.0
225.0
16812
225.0
225.0
16813
225.0
196.0
16841
196.0
196.0
16842
196.0
196.0
16843
196.0
196.0
16844
196.0
196.0
16845
196.0
98.0
16906
98.0
98.0
16907
98.0
98.0
16908
98.0
98.0
16909
98.0
98.0
16910
98.0
98.0
16911
98.0
98.0
16912
98.0
116.0
16932
116.0
116.0
16933
116.0
116.0
16934
116.0
116.0
16935
116.0
116.0
16936
116.0
116.0
16937
116.0
233.0
16955
233.0
233.0
16956
233.0
233.0
16957
233.0
233.0
16958
233.0
233.0
16959
233.0
233.0
16960
233.0
233.0
16961
233.0
233.0
16962
233.0
88.0
16986
88.0
88.0
16987
88.0
88.0
16988
88.0
88.0
16989
88.0
88.0
16990
88.0
88.0
16991
88.0
29.0
17027
29.0
29.0
17028
29.0
29.0
17029
29.0
29.0
17030
29.0
29.0
17031
29.0
29.0
17032
29.0
36.0
17051
36.0
36.0
17052
36.0
36.0
17053
36.0
36.0
17054
36.0
36.0
17055
36.0
36.0
17056
36.0
36.0
17057
36.0
148.0
1712

180.0
19325
180.0
180.0
19326
180.0
180.0
19327
180.0
180.0
19328
180.0
180.0
19329
180.0
224.0
19368
224.0
224.0
19369
224.0
224.0
19370
224.0
224.0
19371
224.0
224.0
19372
224.0
224.0
19373
224.0
193.0
19419
193.0
193.0
19420
193.0
193.0
19421
193.0
193.0
19422
193.0
193.0
19423
193.0
193.0
19424
193.0
193.0
19425
193.0
219.0
19470
219.0
219.0
19471
219.0
219.0
19472
219.0
219.0
19473
219.0
219.0
19474
219.0
219.0
19475
219.0
63.0
19490
63.0
63.0
19491
63.0
63.0
19492
63.0
63.0
19493
63.0
63.0
19494
63.0
63.0
19495
63.0
63.0
19496
63.0
205.0
19565
205.0
205.0
19566
205.0
205.0
19567
205.0
205.0
19568
205.0
119.0
19623
119.0
119.0
19624
119.0
119.0
19625
119.0
119.0
19626
119.0
121.0
19662
121.0
121.0
19663
121.0
121.0
19664
121.0
121.0
19665
121.0
121.0
19666
121.0
121.0
19667
121.0
121.0
19668
121.0
192.0
19720
192.0
192.0
19721
192.0
192.0
19722
192.0
192.0
19723
192.0
192.0
19724
192.0
192.0
19725
192.0
192.0
19726
192.0
104.0
19752
104.0
104.0
19753
104.0
104.0
19754
104.0
104.0


In [297]:
# Attach the championship place to every r19 row. natsPlaces holds one value
# per r19 row in row order, so a direct positional assignment is sufficient
# (no intermediate Series keyed by r19.index is needed).
r19['NatsPlace'] = natsPlaces

# Model inputs: every race except the championship itself. Take an explicit
# copy so later edits to modelData do not operate on a slice of r19.
modelData = r19.loc[r19['Meet_Name'] != 'NCAA Division II Cross Country Championships'].copy()
modelData

Unnamed: 0,Name,Athlete ID,Grade,Academic_Year,School,Conference,Meet_ID,Meet_Name,Meet_Start,Meet_End,Year,Season,Event,Mark,Place,Prelim/Final,Time,Distance,NatsPlace
853,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,52063,NCAA Division II Southeast Region Cross Countr...,"Nov 9, 2019","Nov 9, 2019",2019,Cross Country,10K,31:49.4,11.0,,1909.4,10000.0000,57.0
854,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,16479,2019 Conference Carolinas XC Championships,"Oct 26, 2019","Oct 26, 2019",2019,Cross Country,8K,25:50.9,2.0,,1550.9,8000.0000,57.0
855,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,11448,Royals XC Challenge,"Oct 11, 2019","Oct 11, 2019",2019,Cross Country,8K,24:16.2,9.0,,1456.2,8000.0000,57.0
856,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,15167,Lehigh Paul Short Run (College),"Oct 5, 2019","Oct 5, 2019",2019,Cross Country,8K,24:59.6,177.0,,1499.6,8000.0000,57.0
857,JACOB GONZALEZ,6419804,SR,4,MOUNT OLIVE,Conference Carolinas,13733,Wingate University Bulldog Stampede,"Sep 21, 2019","Sep 21, 2019",2019,Cross Country,8K,25:56.7,3.0,,1556.7,8000.0000,57.0
1684,BLAKE JONES,6876234,FR,1,ILLINOIS-SPRINGFIELD,GLVC,17078,NCAA Division II Midwest Region Cross Country ...,"Nov 9, 2019","Nov 9, 2019",2019,Cross Country,10K,31:42.3,8.0,,1902.3,10000.0000,108.0
1685,BLAKE JONES,6876234,FR,1,ILLINOIS-SPRINGFIELD,GLVC,15400,2019 GLVC Cross Country Championships,"Oct 26, 2019","Oct 26, 2019",2019,Cross Country,8K,25:32.0,6.0,,1532.0,8000.0000,108.0
1686,BLAKE JONES,6876234,FR,1,ILLINOIS-SPRINGFIELD,GLVC,15008,Lewis Crossover,"Oct 12, 2019","Oct 12, 2019",2019,Cross Country,8K,26:19.2,51.0,,1579.2,8000.0000,108.0
1687,BLAKE JONES,6876234,FR,1,ILLINOIS-SPRINGFIELD,GLVC,16706,Loyola Lakefront Collegiate Invitational,"Sep 28, 2019","Sep 28, 2019",2019,Cross Country,8K,25:37.8,15.0,,1537.8,8000.0000,108.0
1688,BLAKE JONES,6876234,FR,1,ILLINOIS-SPRINGFIELD,GLVC,59775,MSU Spartan Invitational,"Sep 13, 2019","Sep 13, 2019",2019,Cross Country,8K,25:25.8,13.0,,1525.8,8000.0000,108.0


In [None]:
# Machine learning models
# First model attempt (TODO: not implemented yet):
