In [1]:
import numpy as np
import pandas as pd

In [2]:
def computeELO(elos, ranks, k):
  """Return post-event Elo ratings for a field of competitors in one event.

  Each competitor's actual score is their rank mapped linearly onto [0, 1]
  (rank 1 -> 1.0, rank n -> 0.0). Their expected score is the mean of the
  pairwise logistic Elo expectations against every other competitor.

  Args:
    elos: array-like of pre-event ratings, one per competitor.
    ranks: array-like of finishing ranks aligned with `elos`; 1 is best.
      Ranks outside 1..n produce scores outside [0, 1].
    k: step size applied to (actual score - expected score).

  Returns:
    A new float ndarray of updated ratings in the same order as `elos`.
    With zero or one competitor there is nobody to compare against, so the
    ratings are returned unchanged (as a copy).
  """
  # Coerce so lists and pandas Series work as well as ndarrays.
  elos = np.asarray(elos, dtype=float)
  n = len(elos)
  if n <= 1:
    # Copy so the caller never receives an alias of its own input.
    return elos.copy()

  ranks = np.asarray(ranks, dtype=float)
  scores = (n - ranks) / (n - 1)

  # diff[i, j] = (elo_i - elo_j) / 400; expected[i, j] = P(i beats j).
  diff = (elos.reshape(-1, 1) - elos.reshape(1, -1)) / 400
  expected = 1 / (1 + 10 ** (-diff))

  # A competitor does not face themselves: blank the diagonal before averaging.
  np.fill_diagonal(expected, np.nan)
  expected = np.nanmean(expected, axis=1)

  return elos + k * (scores - expected)

In [18]:
# Input: previous version of the dataset.
df = pd.read_csv('IndyCar_dataset_v11.csv')

# Elo settings.
baseELO = 1500.0   # rating given to a driver/team with no earlier event
k = 64             # update step passed to computeELO for drivers
tk = 48            # update step passed to computeELO for teams

# Rolling-window lengths (in events).
ritmo = 5          # window for the recent-form averages (DRFAvg, TRP)
DNFwindow = 20     # window for the rolling DNF rates (DNFRate, TeamDNFRate)

# Column order of the exported dataset.
ordered_cols = [
    # driver identity and driver-level features
    "DriverName", "DriverID", "Rookie",
    "DRFAvg", "DTAvg", "DTTAvg", "DNFRate", "TDNFRate", "DriverElo",
    "PositionStart",
    # team identity and team-level features
    "TeamName", "TeamID", "TRP", "TTP", "TeamDNFRate", "TeamElo",
    # event description
    "EventName", "Track", "TrackID", "EventTrackType", "EventTrackTypeID",
    "EventDate", "EventDateFormatted", "EventID", "Era", "EraID",
    # result of the entry
    "Status", "StatusID", "FieldSize", "PositionFinish",
    "NormalizedPositionFinish",
]

In [19]:
# Every shift/rolling/expanding feature below relies on rows being in
# chronological order, so parse the dates and sort before anything else.
event_dates = pd.to_datetime(df["EventDate"])
df["EventDate"] = event_dates
df = df.sort_values(by=["EventDate", "EventID"])
df = df.reset_index(drop=True)

In [20]:
# Rescale the finishing position by field size: 1st place -> 0.0, last -> 1.0.
places_behind_winner = df["PositionFinish"] - 1
places_available = df["FieldSize"] - 1
df["NormalizedPositionFinish"] = places_behind_winner / places_available

In [21]:
def _recent_prior_mean(finishes):
  """Mean of up to `ritmo` previous values; shift(1) excludes the current row."""
  return finishes.shift(periods=1).rolling(window=ritmo, min_periods=1).mean()


# Driver recent form: rolling mean of the driver's previous normalized finishes.
finishes_by_driver = df.groupby("DriverID")["NormalizedPositionFinish"]
df["DRFAvg"] = finishes_by_driver.transform(_recent_prior_mean)

In [22]:
# Driver-at-track average: mean of all the driver's earlier normalized finishes
# at the same track (shift(1) keeps the current race out of its own feature).
finishes_by_driver_track = df.groupby(["DriverID", "TrackID"])["NormalizedPositionFinish"]
df["DTAvg"] = finishes_by_driver_track.transform(
    lambda past: past.shift(periods=1).expanding().mean()
)

In [23]:
# Driver-on-track-type average: mean of all the driver's earlier normalized
# finishes on the same track type, excluding the current race.
finishes_by_driver_tracktype = df.groupby(["DriverID", "EventTrackTypeID"])[
    "NormalizedPositionFinish"
]
df['DTTAvg'] = finishes_by_driver_tracktype.transform(
    lambda past: past.shift(periods=1).expanding().mean()
)

In [24]:
def _recent_prior_status_mean(status):
  """Mean of up to `DNFwindow` previous values; shift(1) excludes the current row."""
  return status.shift(periods=1).rolling(window=DNFwindow, min_periods=1).mean()


# Rolling mean of the driver's previous StatusID values.
# NOTE(review): reads as a DNF rate only if StatusID is a 0/1 flag -- confirm.
status_by_driver = df.groupby("DriverID")["StatusID"]
df["DNFRate"] = status_by_driver.transform(_recent_prior_status_mean)

In [25]:
# Mean of all the driver's earlier StatusID values at the same track,
# excluding the current race.
# NOTE(review): reads as a DNF rate only if StatusID is a 0/1 flag -- confirm.
status_by_driver_track = df.groupby(["DriverID", "TrackID"])["StatusID"]
df["TDNFRate"] = status_by_driver_track.transform(
    lambda past: past.shift(periods=1).expanding().mean()
)

In [26]:
# Ratings are replayed event by event, so the frame must be in chronological order.
df["EventDate"] = pd.to_datetime(df["EventDate"])
df = df.sort_values(by=["EventDate", "EventID"]).reset_index(drop=True)

df["DriverElo"] = np.nan
driverElo = {}  # DriverID -> rating after the most recently processed event

# sort=False keeps the events in the order they appear in the sorted frame.
for event_id, race in df.groupby("EventID", sort=False):
  entrants = race["DriverID"].values
  finishing_ranks = race["PositionFinish"].astype(float).values

  # Drivers seen for the first time start from baseELO.
  pre_race = np.array(
      [driverElo.get(driver, baseELO) for driver in entrants], dtype=float
  )

  # Record the rating held *before* the race, so the feature never
  # contains information from the race's own result.
  df.loc[race.index, "DriverElo"] = pre_race

  post_race = computeELO(pre_race, finishing_ranks, k)
  driverElo.update(zip(entrants, map(float, post_race)))


In [27]:
# Remove any team features already present so they can be rebuilt from scratch
# and merged back without column-name clashes. errors="ignore" keeps this cell
# re-runnable when the columns are absent (e.g. it was already executed, or the
# input file never had them).
df = df.drop(columns=["TRP", "TTP", "TeamDNFRate", "TeamElo"], errors="ignore")
df["EventDate"] = pd.to_datetime(df["EventDate"])

# One row per (event, team), in chronological order.
teamdf = (
    df.groupby(["EventID", "TeamID"], as_index=False)
    .agg(
        # mean normalized finish across the team's entries in the event
        teamPerf=("NormalizedPositionFinish", "mean"),
        EventTrackTypeID=("EventTrackTypeID", "first"),
        EventDate=("EventDate", "first"),
        # largest StatusID among the team's entries
        # NOTE(review): presumably StatusID is a 0/1 DNF flag, making this
        # "any entry failed to finish" -- confirm.
        teamDNF=("StatusID", "max")
    ).sort_values(["EventDate", "EventID", "TeamID"]).reset_index(drop=True)
)

In [28]:
# Team recent performance: rolling mean over up to `ritmo` previous events
# of the team's teamPerf; shift(1) excludes the current event.
perf_by_team = teamdf.groupby("TeamID")["teamPerf"]
teamdf["TRP"] = perf_by_team.transform(
    lambda past: past.shift(periods=1).rolling(window=ritmo, min_periods=1).mean()
)

In [29]:
# Order rows chronologically within each (team, track type) pair.
ttp_sort_keys = ["TeamID", "EventTrackTypeID", "EventDate", "EventID"]
teamdf = teamdf.sort_values(by=ttp_sort_keys).reset_index(drop=True)

# Team-on-track-type performance: mean of all the team's earlier teamPerf
# values on the same track type, excluding the current event.
perf_by_team_tracktype = teamdf.groupby(["TeamID", "EventTrackTypeID"])["teamPerf"]
teamdf["TTP"] = perf_by_team_tracktype.transform(
    lambda past: past.shift(periods=1).expanding().mean()
)

In [30]:
# Back to chronological order before computing the rolling feature.
teamdf = teamdf.sort_values(by=["EventDate", "EventID", "TeamID"])
teamdf = teamdf.reset_index(drop=True)

# Rolling mean over up to `DNFwindow` previous events of the team's teamDNF
# value; shift(1) excludes the current event.
dnf_by_team = teamdf.groupby("TeamID")["teamDNF"]
teamdf["TeamDNFRate"] = dnf_by_team.transform(
    lambda past: past.shift(periods=1).rolling(window=DNFwindow, min_periods=1).mean()
)


In [31]:
df["EventDate"] = pd.to_datetime(df["EventDate"])
df = df.sort_values(by=["EventDate", "EventID", "TeamID"]).reset_index(drop=True)

teamdf["TeamElo"] = np.nan
teamElo = {}  # TeamID -> rating after the most recently processed event

# Replay events in the row order of teamdf; sort=False keeps groups in
# order of first appearance.
for event_id, team_rows in teamdf.groupby("EventID", sort=False):
  entrants = team_rows["TeamID"].values

  # Lower teamPerf ranks better (rank 1 = smallest value).
  # NOTE(review): method="first" breaks ties by row order within the event,
  # so tied teams get different ranks -- confirm this is intended.
  team_ranks = team_rows["teamPerf"].rank(method="first").astype(float).values

  # Teams seen for the first time start from baseELO.
  pre_event = np.array(
      [teamElo.get(team, baseELO) for team in entrants], dtype=float
  )

  # Record the rating held *before* the event, so the feature never
  # contains information from the event's own result.
  teamdf.loc[team_rows.index, "TeamElo"] = pre_event

  post_event = computeELO(pre_event, team_ranks, tk)
  teamElo.update(zip(entrants, map(float, post_event)))

In [32]:
# Attach the per-(event, team) features to every driver row. A left join
# keeps all rows of df even if a team key has no match in teamdf.
team_keys = ["EventID", "TeamID"]
team_feature_cols = ["TRP", "TTP", "TeamDNFRate", "TeamElo"]
team_features = teamdf[team_keys + team_feature_cols]
df = df.merge(team_features, how="left", on=team_keys)

# Final ordering of rows and columns for export.
df["EventDate"] = pd.to_datetime(df["EventDate"])
final_sort_keys = ["EventDate", "EventID", "TeamID", "DriverID"]
df = df.sort_values(by=final_sort_keys).reset_index(drop=True)[ordered_cols]

In [33]:
# The index is a throwaway RangeIndex (the frame was reset_index(drop=True)
# before export), so leave it out; otherwise it is written as an unnamed first
# column and comes back as "Unnamed: 0" on the next pd.read_csv.
df.to_csv("IndyCar_dataset_v12.csv", index=False)