In [1]:
import pandas as pd
import requests
import re
import numpy as np
from bs4 import BeautifulSoup

In [2]:
url="https://www.tfrrs.org/results/75224/m/NCAA_Division_I_Outdoor_Track__Field_Championships"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.102 Safari/537.36"}
response = requests.get(url, headers=headers)

if response.status_code < 300 and response.status_code >= 200:
    # panda's read_html doesn't accept percent colspan arguments
    HTML = response.text.replace('colspan="100%"', 'colspan="3"')
else:
    HTML = None
    raise Exception("Could not retrieve", response.status_code)

soup = BeautifulSoup(HTML, "html.parser")
event_names = [event.text.strip().replace("\n", " ") for event in soup.find_all("h3", class_="font-weight-500 pl-5")]
finals = [event for event in event_names if not any(x in event for x in ["Preliminaries", "Jump Off", "Decathlon", "4 x"])]
finals

["Men's 100 Meters Finals",
 "Men's 200 Meters Finals",
 "Men's 400 Meters Finals",
 "Men's 800 Meters Finals",
 "Men's 1500 Meters Finals",
 "Men's 5000 Meters",
 "Men's 10,000 Meters",
 "Men's 110 Hurdles Finals",
 "Men's 400 Hurdles Finals",
 "Men's 3000 Steeplechase Finals",
 'Men High Jump',
 "Men's  Pole Vault",
 "Men's  Long Jump",
 "Men's  Triple Jump",
 "Men's  Shot Put",
 "Men's  Discus",
 "Men's  Hammer",
 "Men's  Javelin"]

In [88]:
pd.options.mode.chained_assignment = None

finals_name_map = {
    "Men's 100 Meters Finals": "100",
    "Men's 200 Meters Finals": "200",
    "Men's 400 Meters Finals": "400",
    "Men's 800 Meters Finals": "800",
    "Men's 1500 Meters Finals": "1500",
    "Men's 5000 Meters": "5000",
    "Men's 10,000 Meters": "10000",
    "Men's 110 Hurdles Finals": "110H",
    "Men's 400 Hurdles Finals": "400H",
    "Men's 3000 Steeplechase Finals": "3000S",
    "Men High Jump": "HJ",
    "Men's  Pole Vault": "PV",
    "Men's  Long Jump": "LJ",
    "Men's  Triple Jump": "TJ",
    "Men's  Shot Put": "SP",
    "Men's  Discus": "DT",
    "Men's  Hammer": "HT",
    "Men's  Javelin": "JT"
}

prelims_name_map = {
    # prelims
    "Men's 100 Meters Preliminaries": "100",
    "Men's 200 Meters Preliminaries": "200",
    "Men's 400 Meters Preliminaries": "400",
    "Men's 800 Meters Preliminaries": "800",
    "Men's 1500 Meters Preliminaries": "1500",
    "Men's 110 Hurdles Preliminaries": "110H",
    "Men's 400 Hurdles Preliminaries": "400H",
    "Men's 3000 Steeplechase Preliminaries": "3000S"
}

all_athletes = pd.DataFrame(
    {"name": pd.Series(dtype="str"),
    "all_american": pd.Series(dtype="int"),
    "event": pd.Series(dtype="str")}
)

for tag in soup.find_all("div", class_="col-lg-12"):
    text = str(tag).replace("\n", " ")
    event = re.search("<h3\s*.*>(\s*.*)<\/h3>", text).group(1).strip()
    
    if event in prelims_name_map or event in finals_name_map:

        dfs = pd.read_html(text)    
        if len(dfs) > 1:
            raise Exception("Too many HTML tables")
        df = dfs[0]
        
        df["valid_name"] = df["NAME"].apply(lambda x: re.sub("'|\.|,|-|\s|FOUL", "", x).isalpha())
        df = df[df["valid_name"]]
        df["NAME"] = df["NAME"].apply(lambda x: x.replace("  ", " ").upper())
        
        if event in finals_name_map:
            df["all_american"] = df["PL"].apply(lambda x: 1 if x <= 8 else 0)
            df["event"] = finals_name_map[event]
        else:
            df["all_american"] = 0
            df["event"] = prelims_name_map[event]

        df = df[["NAME", "all_american", "event"]]
        df.columns = ["name", "all_american", "event"]
        
        all_athletes = pd.concat([all_athletes, df], ignore_index=True)

pd.options.mode.chained_assignment = "warn"

all_athletes = all_athletes.drop_duplicates() # there are duplicates for 9th place finisher in finals
all_athletes["row_num"] = all_athletes.groupby(["name", "event"])["all_american"].transform('count') # row count over partition by name, event
all_athletes = all_athletes[~((all_athletes["row_num"] == 2) & (all_athletes["all_american"] == 0))] # throw out prelims for all americans
all_athletes.drop(["row_num"], axis=1, inplace=True)
all_athletes

Unnamed: 0,name,all_american,event
0,JOSEPH FAHNBULLEH,1,100
1,FAVOUR ASHE,1,100
2,JAVONTE HARDING,1,100
3,JOVAUGHN MARTIN,1,100
4,DEDRICK VANOVER,1,100
...,...,...,...
503,MICHAEL WHITTAKER,0,JT
504,JARED O'RILEY,0,JT
505,SCOTT CAMPBELL,0,JT
506,JAMES MCNANEY,0,JT


In [89]:
# sum should be 8, count should be 24
#all_athletes.to_csv("./data/processed/events_and_target.csv", index=False)
#all_athletes[all_athletes["event"] == "400H"]
all_athletes.groupby("event").agg({"all_american": ["count", "sum"]})["all_american"].reset_index() # only 23 participants in 400H

Unnamed: 0,event,count,sum
0,100,24,8
1,10000,24,8
2,110H,24,8
3,1500,24,8
4,200,24,8
5,3000S,24,8
6,400,24,8
7,400H,23,8
8,5000,24,8
9,800,24,8


In [16]:
def get_html(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.102 Safari/537.36"}
    response = requests.get(url, headers=headers)

    if response.status_code < 300 and response.status_code >= 200:
        # panda's read_html doesn't accept percent colspan arguments
        HTML = response.text.replace('colspan="100%"', 'colspan="3"')
    else:
        HTML = None
        raise Exception("Could not retrieve", response.status_code)

    soup = BeautifulSoup(HTML, "html.parser")
    reduced = str(soup.find("div", class_="panel panel-bordered"))

    highschool_cutoff = re.search('↓Competing for\s+<a href="https:\/\/www.directathletics.com', reduced)
    if highschool_cutoff:
        print(url)
        reduced = reduced[:highschool_cutoff.span(0)[0]]

    return reduced

# HTML = get_html("https://tfrrs.org/athletes/7531871/Florida/Joseph_Fahnbulleh.html")
#HTML = get_html("https://tfrrs.org/athletes/7546466/Texas_Arlington/Arthur_Petersen.html")
#HTML = get_html("https://tfrrs.org/athletes/7526270/Maryland/Caleb_Dean.html")

In [155]:
def get_athlete_info(html_string, local=False):
    if local:
        with open(html_string, "r") as f:
            html_string = f.read()

    soup = BeautifulSoup(html_string, "html.parser")
    info = soup.find("div", class_="panel-heading").get_text().replace("\n", "")

    name, school = re.split("\(.+?\)", info)[:2] # school MIAMI (FLA.) goes to MIAMI but idc
    name = name.strip().replace("  ", " ")
    school = school.strip()

    grade_year = re.search("\(.+?\)", info).group()
    grade, year = re.split("-|/", grade_year[1:-1])

    data = {
        "name": name,
        "grade": grade if grade != "RS" else "SR",
        #"year": int(year) if year.isnumeric() else year,
        "year": int(year) if year.isnumeric() else 4,
        "school": school
    }

    return data


In [18]:
# valid_events = [
#     '60', '100', '200', '400', '800', '1500', 'Mile', '3000S', '5000', '10000', '8k', '10k'
#     '60H', '110H', '400H', '4x100', '4x400', 'DMR',
#     'JT', 'HT', 'WT', 'DT', 'SP',
#     'PV', 'HJ', 'LJ', 'TJ',
#     'Dec', 'Hep', #'Pent'
# ]

events = {
    "races": [
        "100",
        "200",
        "400",
        "800",
        "1500",
        "3000S",
        "5000",
        "10000",
        "110H",
        "400H",
    ],
    "jumps": ["HJ", "PV", "LJ", "TJ"],
    "throws": ["SP", "DT", "HT", "JT"]
    # "multi": [
    #    "Dec", "Hept"
    # ]
}

ncaa_champ_events = events["races"] + events["jumps"] + events["throws"]

# from https://en.wikipedia.org/wiki/List_of_United_States_collegiate_records_in_track_and_field
# ignoring PS records if higher and accepting foreign born athletes
ncaa_records = {
    "100": 9.82,
    "200": 19.69,
    "400": 43.61,
    "800": 60.0 + 43.25,
    "1500": 3 * 60.0 + 33.74,
    "3000S": 8 * 60.0 + 18.88,  # other record is hand timed and too fast
    "5000": 13 * 60.0 + 6.32,
    "10000": 27 * 60.0 + 8.49,
    "110H": 12.98,
    "400H": 47.02,
    "4x100": 37.97,
    "4x400": 2 * 60.0 + 58.53,
    #'DMR': 9*60.0+20.10,
    "HJ": 2.38,
    "PV": 6.00,
    "LJ": 8.74,
    "TJ": 17.57,
    "JT": 89.10,
    "SP": 22.0,
    "DT": 68.73,
    "HT": 81.94,
    #'Dec': 8720,
}


In [23]:
def parse_event_mark(mark):
    # try to make pandas use float to avoid importing all of numpy
    if isinstance(mark, np."float") or isinstance(mark, float):
        return float(mark)

    # Some results are just the float
    if mark.isalpha():
        return mark

    no_mark = re.match("([A-Z]+)\s*(\(-?\d.\d\)|[A-Z]+)", str(mark))
    if no_mark:
        return no_mark.group(1)

    # possibly irrelevant
    elif mark.replace(".", "").isnumeric():
        return float(mark)

    else:
        # Don't want feet conversion or wind right now
        endChars = ["m", "W", "w", "(", "W"]
        for char in endChars:
            if char in mark:
                return float(mark[0 : mark.index(char)])

    # Unaccounted for
    return mark

def convert_time_mark(mark):
    match = re.match("(\d{1,2}):(\d{1,2}.\d{1,2})", str(mark))
    return mark if not match else 60*float(match.group(1))+float(match.group(2))

def scale_mark(df):
    if str(df["raw_mark"]).isalpha():
        return df["raw_mark"]
    if df["event"] in events["races"]:
        return ncaa_records[df["event"]] / df["convert_mark"]
    else:
        return df["convert_mark"] / ncaa_records[df["event"]]

In [24]:
def get_event_history(HTML, local=False):
    if local:
        with open(HTML, "r") as f:
            HTML = f.read()
    dfs = pd.read_html(HTML)

    l = [
        df.values
        for df in dfs[1:]
        if not df.empty and str(df.values[0][0]).strip() in ncaa_champ_events# and int(df.columns[0][-4:]) >= 2017 \# exclude highschool data
    ]
    flat_list = [item for sublist in l for item in sublist]

    df = pd.DataFrame(flat_list, columns=["event", "raw_mark", "place"])
    df["event"] = df["event"].astype(str)
    df = df[df["event"].isin(ncaa_champ_events)] #re-apply incase they did multiple events in on meet

    df["raw_mark"] = df["raw_mark"].apply(lambda x: parse_event_mark(x))
    df["convert_mark"] = df["raw_mark"].apply(lambda x: convert_time_mark(x))
    df["scaled_mark"] = df.apply(lambda x: scale_mark(x), axis=1)

    return df

In [25]:
# Define functions for descriptive statistics and feature extraction
def no_mark_rate(x):
    return sum([1 if str(mark).isalpha() else 0 for mark in x]) / len(x)


def q1(x):
    return x.quantile(0.25)


def q3(x):
    return x.quantile(0.75)


def mean_three_recent(x):
    num_marks = min(3, len(x))
    return np.mean(x.loc[-num_marks:])


def mean_three_best(x):
    num_marks = min(3, len(x))
    return np.mean(sorted(x)[-num_marks])


def mean_three_worst(x):
    num_marks = min(3, len(x))
    return np.mean(sorted(x)[:num_marks])


def compute_statistics(df):

    # Get rate of no marking then remove no-mark columns
    nm_df = (
        df.groupby("event")
        .agg({"scaled_mark": [no_mark_rate, "count"]})["scaled_mark"]
        .reset_index()
    )

    df["convert_mark"] = df["convert_mark"].astype(str)
    df = df[~df["convert_mark"].str.contains("[A-Z]+", na=False, regex=True)]
    df = df.astype({"convert_mark": "float", "scaled_mark": "float"})

    # add feature PB this season
    stats_df = (
        df.groupby("event")
        .agg({
            "scaled_mark": [
                "min",
                "max",
                "mean",
                "std",
                "median",
                q1,
                q3,
                mean_three_recent,
                mean_three_best,
                mean_three_worst,
            ]
        })["scaled_mark"]
        .reset_index()
    )
    stats_df["percent_diff_recent_best"] = abs(stats_df["mean_three_recent"] - stats_df["mean_three_best"]) / ((stats_df["mean_three_recent"] + stats_df["mean_three_best"]) / 2)
    stats_df["percent_diff_recent_worst"] = abs(stats_df["mean_three_recent"] - stats_df["mean_three_worst"]) / ((stats_df["mean_three_recent"] + stats_df["mean_three_worst"]) / 2)

    return stats_df.merge(nm_df, how="left", on="event")


## Local stuff

In [135]:
def get_athlete_pages(meet_url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.102 Safari/537.36"}
    response = requests.get(meet_url, headers=headers)
    results_html = response.text.replace('colspan="100%"', 'colspan="3"')

    results_soup = BeautifulSoup(results_html, "html.parser")
    athlete_pages = [link["href"] for link in results_soup.select("a[href*=athletes]")]

    return set(athlete_pages)

athlete_pages = get_athlete_pages("https://tfrrs.org/results/75224/m/NCAA_Division_I_Outdoor_Track__Field_Championships")

In [152]:
#for page in list(athlete_pages):
for page in athlete_pages:
    file_name = page.split("/")[6][:-5]
    with open("./data/"+file_name, "w") as f:
        f.write(get_html(page))

https://www.tfrrs.org/athletes/6986872/Houston/Jordan_Booker.html
https://www.tfrrs.org/athletes/7984404/Houston/Adam_Mason.html
https://tfrrs.org/athletes/6543213/Sam_Houston/Clayton_Fritsch.html
https://tfrrs.org/athletes/6547485/Wichita_State/Taran_Taylor.html
https://tfrrs.org/athletes/7526270/Maryland/Caleb_Dean.html
https://www.tfrrs.org/athletes/7019840/Texas_Tech/Jacolby_Shelton.html
https://tfrrs.org/athletes/6996705/Wichita_State/Michael_Bryan.html
https://tfrrs.org/athletes/7495800/Georgia/Matthew_Boling.html
https://tfrrs.org/athletes/6547850/Houston/Quivell_Jordan-Bacot.html
https://www.tfrrs.org/athletes/7495800/Georgia/Matthew_Boling.html
https://tfrrs.org/athletes/7388209/Arizona_State/Vincent__Mauri.html


In [156]:
import os

# from the web
# try:
#     for i, athlete_page in enumerate(athlete_pages):
#         if (i % 20) == 0:
#             print(i, athlete_page)
#         html = get_html(athlete_page)
#         athlete_info = get_athlete_info(html)
# except:
#     print(athlete_page)

# local
X = pd.DataFrame({
    'name': pd.Series(dtype="str"),
    'all_american': pd.Series(dtype="int"),
    'event': pd.Series(dtype="str"),
    'min': pd.Series(dtype="float"),
    'max': pd.Series(dtype="float"),
    'mean': pd.Series(dtype="float"),
    'std': pd.Series(dtype="float"),
    'median': pd.Series(dtype="float"),
    'q1': pd.Series(dtype="float"),
    'q3': pd.Series(dtype="float"),
    'mean_three_recent': pd.Series(dtype="float"),
    'mean_three_best': pd.Series(dtype="float"),
    'mean_three_worst': pd.Series(dtype="float"),
    'percent_diff_recent_best': pd.Series(dtype="float"),
    'percent_diff_recent_worst': pd.Series(dtype="float"),
    'no_mark_rate': pd.Series(dtype="float"),
    'count': pd.Series(dtype="int"),
    'grade': pd.Series(dtype="str"),
    'year': pd.Series(dtype="int"),
    'school': pd.Series(dtype="str")
})

try:
    for page in os.listdir("./data/html/"):
        info_dict = get_athlete_info("./data/html/"+page, local=True)
        event_df = get_event_history("./data/html/"+page, local=True)
        stats_df = compute_statistics(event_df)
        info_df = pd.DataFrame(info_dict, index=range(len(stats_df)))
        full_df = pd.merge(all_athletes, stats_df.join(info_df), on=["name", "event"], how="inner")
        X = pd.concat([X, full_df], ignore_index=True)
except Exception as e:
    print(e)
    print(page)

In [192]:
X.drop(["q1", "q3"], axis=1).corr()
print("Default data highest correlations to target")
print(sorted(X.corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")

print("Races data highest correlations to target")
print(sorted(X[X["event"].isin(events["races"])].corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")

print("Jumps data highest correlations to target")
print(sorted(X[X["event"].isin(events["jumps"])].corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")

print("Throws data highest correlations to target")
print(sorted(X[X["event"].isin(events["throws"])].corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")

Default data highest correlations to target
[0.2388225799065916, 0.21672901806617573, 0.19566722208845774, 0.1619775394304868, 0.1591473098431233]

Races data highest correlations to target
[0.45971676480878604, 0.43555160664427806, 0.39455926378968326, 0.29382988561881695, 0.2854967814823146]

Jumps data highest correlations to target
[0.4906239489924711, 0.4814069139830118, 0.4705346286953792, 0.4361764528515468, 0.43617645285154605]

Throws data highest correlations to target
[0.4928573172847412, 0.4918374039661947, 0.48898830502945273, 0.4242505003943985, 0.41122150858457834]



  X.drop(["q1", "q3"], axis=1).corr()
  print(sorted(X.corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")
  print(sorted(X[X["event"].isin(events["races"])].corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")
  print(sorted(X[X["event"].isin(events["jumps"])].corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")
  print(sorted(X[X["event"].isin(events["throws"])].corr()["all_american"], reverse=True)[1:6], sep=" ", end="\n\n")


In [81]:
#df_names.difference({x.replace("  ", " ") for x in info_names})

{'OMAMUYOWI ERHIRE'}

In [138]:
#test_html = get_html("https://www.tfrrs.org/athletes/7871291/Cornell/Rhys_Hammond.html")
test_html = "./data/html/Favour_Ashe"
info_dict = get_athlete_info(test_html, local=True)
event_df = get_event_history(test_html, local=True)
stats_df = compute_statistics(event_df)
info_df = pd.DataFrame(info_dict, index=range(len(stats_df)))
full_df = pd.merge(all_athletes, stats_df.join(info_df), on=["name", "event"], how="inner")
full_df

Unnamed: 0,name,all_american,event,min,max,mean,std,median,q1,q3,mean_three_recent,mean_three_best,mean_three_worst,percent_diff_recent_best,percent_diff_recent_worst,no_mark_rate,count,grade,year,school
0,FAVOUR ASHE,1,100,0.964637,1.003064,0.97443,0.011425,0.971316,0.968442,0.974206,0.97443,0.974206,0.966855,0.00023,0.007804,0.0,9,FR,1,TENNESSEE
