In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix
import datetime
import dateutil.parser
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the results table, discarding every row that has any missing value.
dataset = pd.read_csv("track.csv").dropna(axis = 0, how = "any")
print(dataset.columns)

Index(['Gender', 'Event', 'Location', 'Year', 'Medal', 'Name', 'Nationality',
       'Result'],
      dtype='object')


In [None]:
# Number of rows recorded for each event.
dataset["Event"].value_counts()

Marathon Men                84
100M Men                    82
800M Men                    81
1500M Men                   81
110M Hurdles Men            80
400M Men                    80
200M Men                    78
Decathlon Men               76
400M Hurdles Men            75
3000M Steeplechase Men      72
10000M Men                  69
5000M Men                   69
Long Jump Men               63
High Jump Men               62
Discus Throw Men            60
100M Women                  60
Shot Put Men                60
Pole Vault Men              59
50Km Race Walk Men          57
Triple Jump Men             54
Hammer Throw Men            54
Discus Throw Women          53
200M Women                  51
Javelin Throw Men           50
Javelin Throw Women         50
20Km Race Walk Men          48
800M Women                  45
High Jump Women             45
Shot Put Women              41
400M Women                  39
Long Jump Women             35
100M Hurdles Women          34
1500M Wo

In [None]:
def time_convert_hour(time):
  """Convert an ``H:MM:SS`` result string into whole seconds.

  time: text in ``%H:%M:%S`` form, optionally followed by ":" and one more
    field (what a fractional part becomes once "." has been rewritten to ":").

  Returns hours * 3600 + minutes * 60 + seconds as an int; the optional
  fourth field is discarded.
  Raises ValueError if the remaining text does not match ``%H:%M:%S``.
  """
  fields = time.split(":")
  if len(fields) == 4:
    # Drop the trailing field whatever its width, rather than slicing off a
    # fixed number of characters, so "2:06:32:0" and "2:06:32:00" both work.
    time = ":".join(fields[:3])
  dt = datetime.datetime.strptime(time, "%H:%M:%S")
  return dt.hour * 3600 + dt.minute * 60 + dt.second

def time_convert_minute(time):
  """Convert an ``MM:SS`` result string into whole seconds.

  time: text in ``%M:%S`` form, optionally followed by ":" and one more
    field (what a fractional part becomes once "." has been rewritten to ":").

  Returns minutes * 60 + seconds as an int; the optional third field is
  discarded.
  Raises ValueError if the remaining text does not match ``%M:%S``.
  """
  fields = time.split(":")
  if len(fields) == 3:
    # Drop the trailing field by position instead of slicing three characters:
    # a one-character field ("13:05:5") used to leave "13:0", which parsed
    # without error but lost the seconds.
    time = ":".join(fields[:2])
  dt = datetime.datetime.strptime(time, "%M:%S")
  return dt.minute * 60 + dt.second

def time_convert_second(time):
  """Convert a short-event result string into seconds as a float.

  time: either the literal text "None", a plain number of seconds
    (e.g. "9.63"), or minutes and seconds separated by ":" with an optional
    fractional part (e.g. "1:45.3" or "1:45").

  Returns np.nan for "None", otherwise the duration in seconds.
  Raises ValueError if the text matches none of the accepted forms.
  """
  if time == "None":
    return np.nan

  if ":" in time:
    # Decide by the separator rather than by string length, so that values
    # such as "10.325" (long but plain seconds) and "1:45" (short but
    # minute-based) are each routed to the right parser.
    fmt = "%M:%S.%f" if "." in time else "%M:%S"
    dt = datetime.datetime.strptime(time, fmt)
    return dt.minute * 60 + dt.second + dt.microsecond / 1000000

  return float(time)

In [None]:
def pair_event(event1, event2, event1_distance, event2_distance):
  """Fit a no-intercept linear model relating speeds in two events.

  event1, event2: result tables with at least "Name" and "Result" columns.
  event1_distance, event2_distance: distance of each event; speed is computed
    as distance / Result.

  The tables are joined on "Name", rows missing either result are dropped,
  and event2 speed is regressed on event1 speed through the origin.
  Returns the fitted model's ``coef_`` array.
  """
  by_name_1 = event1.set_index("Name")
  by_name_2 = event2.set_index("Name")
  paired = by_name_1.join(by_name_2, lsuffix = "_event1", rsuffix = "_event2")
  paired = paired[["Result_event1", "Result_event2"]].dropna()

  paired = paired.assign(
    event1_speed = event1_distance / paired["Result_event1"],
    event2_speed = event2_distance / paired["Result_event2"],
  )

  model = LinearRegression(fit_intercept=False)
  model.fit(paired[["event1_speed"]], paired["event2_speed"])

  return model.coef_

In [None]:
def _extract_event(event_name, converter, dots_to_colons = False):
  """Return the rows of ``dataset`` for one event with "Result" converted.

  event_name: exact value of the "Event" column to select.
  converter: function applied to every cleaned "Result" string.
  dots_to_colons: when True, "." is also rewritten to ":" before conversion.
  """
  # Take an explicit copy so that assigning "Result" below never targets a
  # filtered slice of the shared `dataset` frame.
  event = dataset[dataset.Event == event_name].copy()
  # "-" and "h" are rewritten to ":" so every result uses one separator.
  cleaned = event.Result.str.replace("-", ":", regex = False).str.replace("h", ":", regex = False)
  if dots_to_colons:
    cleaned = cleaned.str.replace(".", ":", regex = False)
  event["Result"] = cleaned.apply(converter)
  return event


def data_extract(gender):
  """Build one converted result table per running event for ``gender``.

  gender: suffix used in the "Event" column, e.g. "Men" or "Women".

  Returns a tuple of eight DataFrames in the order 100M, 200M, 400M, 800M,
  1500M, 5000M, 10000M, Marathon. In each, "Result" has been passed through
  time_convert_second (100M-1500M), time_convert_minute (5000M, 10000M) or
  time_convert_hour (Marathon).
  """
  short_events = [
    _extract_event(f"{distance}M {gender}", time_convert_second)
    for distance in (100, 200, 400, 800, 1500)
  ]
  long_events = [
    _extract_event(f"{distance}M {gender}", time_convert_minute, dots_to_colons = True)
    for distance in (5000, 10000)
  ]
  marathon = _extract_event(f"Marathon {gender}", time_convert_hour, dots_to_colons = True)

  return (*short_events, *long_events, marathon)

In [None]:
# Gender label read by the analysis cells below.
gender = "Men"
# Race distances in metres, shortest first; 42195 is the marathon.
events = [100, 200, 400, 800, 1500, 5000, 10000, 42195]

def get_conversion(gender):
  """Compute pairwise speed conversion rates between running distances.

  gender: "Men" or "Women"; passed on to data_extract. For "Women" the last
    distance (the marathon) is left out.

  For every pair (shorter, longer) of distances, pair_event is fitted and its
  first coefficient stored. Pairs whose fit raises are skipped.

  Returns a DataFrame indexed by ("Distance 1", "Distance 2") with a single
  "conversion rate" column. If no pair could be fitted, the frame is empty.
  """
  events = [100, 200, 400, 800, 1500, 5000, 10000, 42195]
  event_names = list(data_extract(gender))

  if gender == "Women":
    events = events[:-1]
    event_names = event_names[:-1]

  conversion = []
  for i in range(len(events)):
    for j in range(i + 1, len(events)):
      try:
        conversion_rates = pair_event(event_names[i], event_names[j], events[i], events[j])[0]
      except Exception:
        # Best effort: a pair that cannot be fitted (for example because the
        # joined data is empty) is left out. Catching Exception rather than
        # using a bare except lets KeyboardInterrupt/SystemExit propagate.
        continue

      conversion.append({"conversion rate": conversion_rates, "Distance 1": events[i], "Distance 2": events[j]})

  # Naming the columns up front keeps set_index working even when every pair
  # was skipped and the record list is empty.
  conversion = pd.DataFrame(conversion, columns = ["conversion rate", "Distance 1", "Distance 2"])

  return conversion.set_index(["Distance 1", "Distance 2"])

In [None]:
# Use the notebook-level `gender` setting instead of repeating the literal,
# so this table stays in step with the `gender` check in the next cell.
conversion = get_conversion(gender)

In [None]:
def _chain_speeds(conversion, distances, shortcuts = None):
  """Chain pairwise conversion rates into speeds relative to the first distance.

  conversion: table indexed by (shorter distance, longer distance) whose
    first column holds the conversion rate.
  distances: ordered distances; the first one is assigned speed 1.
  shortcuts: optional {distance: base distance} mapping. A listed distance is
    derived directly from its base instead of from its immediate predecessor.

  Returns one speed per entry of ``distances``, in the same order.
  """
  shortcuts = shortcuts or {}
  relative = {}
  for position, distance in enumerate(distances):
    if position == 0:
      relative[distance] = 1
      continue
    base = shortcuts.get(distance, distances[position - 1])
    relative[distance] = relative[base] * conversion.loc[base, distance].values[0]
  return [relative[distance] for distance in distances]


# Every distance derived from the one immediately before it.
speed_list = _chain_speeds(conversion, events)

# Same chain, except that 400 is derived directly from 100.
speed_list2 = _chain_speeds(conversion, events, {400: 100})

# Same chain, except that for men 10000 is derived directly from 1500.
speed_list3 = _chain_speeds(conversion, events, {10000: 1500} if gender == "Men" else None)

px.line(x = events, y = [speed_list, speed_list2, speed_list3])

In [None]:
conversion_men = get_conversion("Men")
conversion_women = get_conversion("Women")

# Compare over every distance except the last one in `events`.
shared_distances = events[:-1]

# Both chains start at 1 for the first distance and are multiplied by the
# conversion rate of each consecutive pair of distances.
speed_men = 1
speed_women = 1
speed_list_men = [speed_men]
speed_list_women = [speed_women]
for shorter, longer in zip(shared_distances, shared_distances[1:]):
  speed_men *= conversion_men.loc[shorter, longer].values[0]
  speed_women *= conversion_women.loc[shorter, longer].values[0]
  speed_list_men.append(speed_men)
  speed_list_women.append(speed_women)

conversion_plot = px.line(
  x = shared_distances,
  y = [speed_list_men, speed_list_women],
  title = "Conversion Rates Across Distances for Men and Women",
)

conversion_plot.update_layout(
  xaxis_title = "Distance (m)",
  yaxis_title = "Conversion Rate",
  legend_title_text = "Gender",
)

# Label the two traces in the order their y-series were passed in.
for trace, label in zip(conversion_plot.data, ["Men", "Women"]):
  trace.name = label

conversion_plot.show()

In [None]:
# Show the chained speeds next to the distances they correspond to.
print (speed_list, events)

def exponential(x, amplitude = 0.4, decay = 1.4, offset = 0.65):
  """Evaluate ``amplitude * exp(-decay * x / 1000) + offset``.

  x: scalar or sequence of values (the notebook passes the `events` list).
  amplitude, decay, offset: curve parameters. The defaults reproduce the
    constants that were previously hard-coded in the body.

  Returns a NumPy value with the same shape as ``np.array(x)``.
  """
  return amplitude * np.exp(-decay * np.array(x) / 1000) + offset

# Plot the chained speeds together with the exponential() curve evaluated at the same distances.
px.line(x = events, y = [speed_list, exponential(events)])

[1, 0.9956348768061271, 0.8908976653337459, 0.7607736332467596, 0.6791532669182347, 0.6043173403932832, 0.578556728356537, 0.5044525306623887] [100, 200, 400, 800, 1500, 5000, 10000, 42195]


In [None]:
# `marathon` is only ever a local variable inside other functions, so this
# cell raised NameError. Fetch the table here; it is the last item that
# data_extract returns.
*_, marathon = data_extract(gender)

# Smallest "Result" per year; only the plotted column is aggregated.
fastest_by_year = marathon.groupby("Year", as_index = False)["Result"].min()

fig = px.bar(fastest_by_year, x = "Year", y = "Result")
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

NameError: ignored

In [None]:
# `ten_k` is not defined at notebook scope, so fetch it here; it is the
# second-to-last item that data_extract returns.
*_, ten_k, _ = data_extract(gender)

# Smallest "Result" per nationality; only the plotted column is aggregated.
fastest_by_nationality = ten_k.groupby("Nationality", as_index = False)["Result"].min()

fig = px.bar(fastest_by_nationality, x = "Nationality", y = "Result")
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
# The per-event tables are not defined at notebook scope, so build them here
# before using them.
one_hundred, two_hundred, four_hundred, eight_hundred, fifteen_hundred, five_k, ten_k, marathon = data_extract(gender)

# (label, shorter-event table, longer-event table, shorter distance, longer distance)
adjacent_pairs = [
  ("100 - 200 ", one_hundred, two_hundred, 100, 200),
  ("200 - 400 ", two_hundred, four_hundred, 200, 400),
  ("400 - 800 ", four_hundred, eight_hundred, 400, 800),
  ("800 - 1500 ", eight_hundred, fifteen_hundred, 800, 1500),
  ("1500 - 5k ", fifteen_hundred, five_k, 1500, 5000),
  ("5k - 10k ", five_k, ten_k, 5000, 10000),
  ("10k - Marathon ", ten_k, marathon, 10000, 42195),
]

for label, shorter_event, longer_event, shorter_distance, longer_distance in adjacent_pairs:
  print(label, pair_event(shorter_event, longer_event, shorter_distance, longer_distance))