# F1 Lap-by-Lap Race Prediction
Data from http://ergast.com/mrd/db/  

This model uses an LSTM to predict the positions, laptimes, pitstops and statuses of 20 drivers. 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
import torch, torch.nn as nn, torch.optim as optim, torch.nn.functional as F
import math


In [None]:
# Colab-only: mount Google Drive so the paths under /content/drive used by the
# rest of the notebook (database CSVs, saved checkpoints) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the Ergast CSV files; every path in this notebook is built
# by string concatenation onto it, so the trailing slash is required.
db_dir = '/content/drive/My Drive/f1ml/db/'

In [None]:
# The time format in the Ergast database is MM:SS.ms
def time_to_int(t):
  """Convert an Ergast time value to seconds.

  Args:
    t: a float (returned as-is when finite) or anything whose ``str()`` looks
      like ``SS.ms``, ``MM:SS.ms`` or ``H:MM:SS.ms``.

  Returns:
    The time in seconds as a float, or ``None`` when the value is missing:
    a NaN/inf float, a string containing ``\\N`` (the database's null marker),
    or a string without a decimal point.
  """
  # Floats are already seconds. The original compared `t == float` (the type
  # itself), so this shortcut never fired.
  if isinstance(t, float):
    return t if math.isfinite(t) else None
  text = str(t)
  if '\\N' in text or '.' not in text:
    return None
  # Fold the colon-separated fields base-60 so hours are handled as well.
  seconds = 0.0
  for part in text.split(':'):
    seconds = seconds * 60 + float(part)
  return seconds

In [None]:
# Sanity check: a plain float should come back as the same number of seconds.
time_to_int(22.)

22.0

## Data preparation

Each csv file contains information of one race, and each row contains information of a lap. 

v1: wrong constructor standing  
v2: fixed constructor standing, removed quali field  
v3: changed statuses to stick after retiring, instead of defaulting to 0; drivers are ordered by qualifying position, instead of final position  
v4: changed inpit to pitting at the previous lap

In [None]:
# Open relevant files
races = pd.read_csv(db_dir + 'races.csv')
d_standings = pd.read_csv(db_dir + 'driver_standings.csv')
c_standings = pd.read_csv(db_dir + 'constructor_standings.csv')
quali = pd.read_csv(db_dir + 'qualifying.csv')
pit_stops = pd.read_csv(db_dir + 'pit_stops.csv')
lap_times = pd.read_csv(db_dir + 'lap_times.csv')
results = pd.read_csv(db_dir + 'results.csv')

# Export one CSV per race (one row per lap, row 0 = state before the start)
# for every season from 2001 onwards; 2021 races are skipped inside the loop.
races_newer = races.query('year  >= 2001')
rids = races_newer['raceId']
for i in tqdm(rids):
  # Get relevant entries to this race
  race_row = races.query(f'raceId == {i}')
  year = race_row['year'].item()
  if (year == 2021):
    continue
  circuit = race_row['circuitId'].item()
  os.makedirs(db_dir + f'races/{year}', exist_ok=True)
  # Skip races that were already exported by an earlier run.
  if os.path.exists(db_dir + f'races/{year}/race{i}.csv'):
    continue
  # Championship standings *before* this race.
  if (race_row['round'].item() > 1):
    # NOTE(review): assumes the previous round has raceId == i - 1 — confirm
    # this holds for every season in races.csv.
    d_standing_br = d_standings.query(f'raceId == {i-1}')
    c_standing_br = c_standings.query(f'raceId == {i-1}')
  else:
    # First round: use the standings after the last race of the previous season.
    prev_s = races.query(f'year == {year - 1}').sort_values(by=['round']).reset_index()
    prev_last_race = prev_s['raceId'].iloc[-1]
    d_standing_br  = d_standings.query(f'raceId == {prev_last_race}')
    c_standing_br  = c_standings.query(f'raceId == {prev_last_race}')
  quali_info = quali.query(f'raceId == {i}')
  r_laptimes = lap_times.query(f'raceId == {i}')
  r_pitstops = pit_stops.query(f'raceId == {i}') # only available from 2012...
  r_results = results.query(f'raceId == {i}')
  r_results_sorted = r_results.sort_values(by=['position']).reset_index()
  num_of_laps = r_results_sorted['laps'].iloc[0]
  r_results_sorted_grid = r_results.sort_values(by=['grid']).reset_index()

  # Fixed column layout: circuitId followed by seven fields for each of 20 slots.
  columns = ['circuitId']
  for k in range(20):
    columns.extend([
            f'driverId{k+1}',
            f'driverStanding{k+1}',
            f'constructorStanding{k+1}',
            f'position{k+1}',
            f'inPit{k+1}',
            f'status{k+1}',
            f'laptime{k+1}'
    ])

  # Per-driver values that do not change from lap to lap are looked up once,
  # in grid order, instead of once per lap.
  drivers_static = []
  for driver_id in r_results_sorted_grid['driverId']:
    entry = r_results.query(f'driverId == {driver_id}')
    constructorId = entry['constructorId'].item()
    d_s_p = d_standing_br.query(f'driverId == {driver_id}')['position']
    c_s_p = c_standing_br.query(f'constructorId == {constructorId}')['position']
    drivers_static.append({
        'id': driver_id,
        # Drivers/constructors without a standing default to 20 / 10.
        'd_standing': d_s_p.item() if not d_s_p.empty else 20,
        'c_standing': c_s_p.item() if not c_s_p.empty else 10,
        'laps': entry['laps'].item(),
        'statusId': entry['statusId'].item(),
        'quali': quali_info.query(f'driverId == {driver_id}'),
    })

  # Each lap is a row, each race is a dataframe. Rows are collected in a list
  # and turned into a frame once (DataFrame.append was removed in pandas 2.0).
  rows = []
  for lap_num in range(0, num_of_laps + 1):
    row = {'circuitId': circuit}
    for j, drv in enumerate(drivers_static):
      driver_id = drv['id']

      # Position: qualifying position on lap 0, otherwise the lap chart.
      if (lap_num == 0):
        p = drv['quali']['position']
      else:
        p = r_laptimes.query(f'driverId == {driver_id} & lap == {lap_num}')['position']
      p = p.item() if not p.empty else 21 # 21 means retired / no position

      inp = r_pitstops.query(f'driverId == {driver_id} & lap == {lap_num}')
      pit = 0 if inp.empty else 1

      if (lap_num == 0):
        # Zeroth lap laptime is quali laptime: first usable of q3, q2, q1.
        laptime = 0
        for session in ('q3', 'q2', 'q1'):
          cell = drv['quali'][session]
          seconds = time_to_int(cell.item()) if not cell.empty else None
          if seconds:
            laptime = seconds
            break
        status = 0 # 0 when in race or before race start
      elif (drv['laps'] <= lap_num): # check if driver has retired
        status = drv['statusId']
        laptime = 0
      else:
        status = 0
        t = r_laptimes.query(f'driverId == {driver_id} & lap == {lap_num}')['time']
        # if somehow we cant find a laptime, fall back to 0
        laptime = time_to_int(t.item()) if not t.empty else 0

      row[f'driverId{j+1}'] = driver_id
      row[f'driverStanding{j+1}'] = drv['d_standing']
      row[f'constructorStanding{j+1}'] = drv['c_standing']
      row[f'position{j+1}'] = p
      row[f'inPit{j+1}'] = pit
      row[f'status{j+1}'] = status
      row[f'laptime{j+1}'] = laptime
    rows.append(row)

  df = pd.DataFrame(rows)
  # Keep the fixed layout first; slots beyond 20 (if any) follow, as the
  # row-by-row append used to produce.
  extra_columns = [c for c in df.columns if c not in columns]
  df = df.reindex(columns=columns + extra_columns)
  df.to_csv(db_dir + f'races/{year}/race{i}.csv')

In [None]:
# Open relevant files
races = pd.read_csv(db_dir + 'races.csv')
results = pd.read_csv(db_dir + 'results.csv')

# Collect every driverId listed in results.csv for races from 2001 onwards
# (duplicates included; they are removed in a later cell).
races_newer = races.query('year  >= 2001')
rids = races_newer['raceId']
driver_ids = []
for i in tqdm(rids):
  r_results = results.query(f'raceId == {i}')
  driver_ids.extend(r_results['driverId'])
# Build the frame once; DataFrame.append was removed in pandas 2.0.
df = pd.DataFrame({'driverId': driver_ids})


HBox(children=(FloatProgress(value=0.0, max=395.0), HTML(value='')))




In [None]:
# One row per distinct driverId.
dddf = pd.DataFrame({'driverId':df['driverId'].unique()})

In [None]:
# Sort by driverId; reset_index() keeps the old index as an 'index' column.
dddf = dddf.sort_values(by=['driverId']).reset_index()

In [None]:
# Drop the 'index' column and display the result.
dddf = dddf.drop(columns=['index'])
dddf

In [None]:
# Persist the driver id list next to the other database CSVs.
dddf.to_csv(db_dir + f'drivers_short.csv')

In [None]:
# Dataset v4: rewrite every races_v3 CSV so that a pit stop is flagged on the
# lap *before* it happens, and rename the inPit<k> columns to pitting<k>.
years = list(range(2001, 2021))
for y in years:
  os.makedirs(db_dir + f'races/{y}', exist_ok=True)

  cur_year = os.listdir(db_dir + f'races_v3/{y}/')
  for r in cur_year:
    cur_race = pd.read_csv(db_dir + f'races_v3/{y}/{r}')
    renames = {}
    for j in range(20):
      old_col, new_col = f'inPit{j+1}', f'pitting{j+1}'
      # Work on a private copy of the column and write it back in one go.
      # The original `cur_race[col][i] = ...` chained assignment is not
      # guaranteed to modify the frame (and never does under copy-on-write).
      flags = cur_race[old_col].to_numpy(copy=True)
      for i in range(len(flags) - 1):
        if (flags[i+1] == 1):
          flags[i] = 1
          flags[i+1] = 0
      cur_race[old_col] = flags
      renames[old_col] = new_col
    cur_race = cur_race.rename(columns=renames)

    cur_race.to_csv(db_dir + f'races/{y}/{r}', index=False)
  print(y)
    


In [None]:
# Number of racing laps per exported race: each CSV has one extra row for lap 0.
years = list(range(2001, 2021))
numlaps = []
for y in years:
  season_dir = db_dir + f'races/{y}/'
  numlaps.extend(len(pd.read_csv(season_dir + r)) - 1 for r in os.listdir(season_dir))
  print(y)


2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [None]:
# Sort in place, then show the whole distribution and the longest race.
numlaps.sort()
print(numlaps)
print(numlaps[-1])

[31, 36, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 49, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 63, 64, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,

## Making a dataset

### Helper functions for getting information from ids

In [None]:
# Lookup tables used by the *_info helpers defined in this cell.
races = pd.read_csv(db_dir + 'races.csv')
circuits = pd.read_csv(db_dir + 'circuits.csv')
drivers = pd.read_csv(db_dir + 'drivers.csv')
# NOTE(review): this table is bound to the singular name `constructor`.
constructor = pd.read_csv(db_dir + 'constructors.csv')
status = pd.read_csv(db_dir + 'status.csv')

def race_info(raceId):
  """Look up a race in the module-level `races` table.

  Returns (year, round, circuitId), or (None, None, None) when no row has
  this raceId.
  """
  match = races.query(f'raceId  == {raceId}')
  if match.empty:
    return None, None, None
  return tuple(match[field].item() for field in ('year', 'round', 'circuitId'))

def circuit_info(circuitId):
  """Look up a circuit in the module-level `circuits` table.

  Returns (name, location, country), or (None, None, None) when no row has
  this circuitId.
  """
  match = circuits.query(f'circuitId  == {circuitId}')
  if match.empty:
    return None, None, None
  return tuple(match[field].item() for field in ('name', 'location', 'country'))

def driver_info(id):
  """Look up a driver in the module-level `drivers` table.

  Returns (number, code, forename, surname, dob, nationality), or a tuple of
  six None values when no row has this driverId.
  """
  fields = ('number', 'code', 'forename', 'surname', 'dob', 'nationality')
  match = drivers.query(f'driverId  == {id}')
  if match.empty:
    return None, None, None, None, None, None
  return tuple(match[field].item() for field in fields)

def constructor_info(id):
  """Look up a constructor in the module-level `constructor` table.

  Returns (name, nationality), or (None, None) when no row has this
  constructorId.
  """
  # The table is loaded under the name `constructor`; the original body read
  # the undefined names `constructors` and `_d` and raised NameError.
  match = constructor.query(f'constructorId  == {id}')
  if (match.empty):
    return None, None
  return match['name'].item(), match['nationality'].item()

def status_info(id):
  """Return the text of a statusId from the module-level `status` table,
  or None when the id is not present."""
  match = status.query(f'statusId == {id}')
  return None if match.empty else match['status'].item()

In [None]:
# race_info takes only the raceId; it reads the `races` table itself.
race_info(1053)

(2021, 2, 21)

### Dataset class

v2: expected output should exclude drivers, circuits, ... information that should not change. instead of having our model guess those, our model will now only predict laptimes, positions, etc  
v3: model now doesnt predict status, and we scale driver id and laptime  
v3.1: fixed the issue where i was scaling the position feature down  
v3.2: scale up position and pit status, scale laptime down less  
v3.3: does not scale pit status  
v3.4: does not scale position, scale pit status    
v4: only predicts pos and laptimes, only scale driver id and laptime  
v5: change input from inpit to pitting  
v5.1: predict  pitting  
v5.2: laptimes scale down 10 times instead of 100    
v6: returns a sequence of laps (all features) up to and including the current lap, instead of just the current one

In [None]:
# to make feature size consistent, we use laptimes of top 20 drivers
class RaceDataset(torch.utils.data.Dataset):
  """Lap-by-lap view over one race CSV at a time.

  The dataset holds a "cursor" (`year`, `round`) and exposes the laps of the
  race under the cursor. `ds[i]` returns the input laps 0..i and, for each of
  them, the following lap as the target. Move the cursor with `set_year`,
  `set_round` and `next_round`.

  Args:
    dir: folder (with trailing slash) containing one sub-folder per season,
      each holding one CSV per race.
  """

  # Positions, inside the 141-wide feature row, of the values the model
  # predicts for each of the 20 driver slots (offsets 4, 5 and 7 per slot).
  _LABEL_COLS = [7 * k + off for k in range(20) for off in (4, 5, 7)]

  def __init__(self, dir):
    self.dir = dir
    self.year = 2001
    self.round = 1

    self.cur_year = self._list_races()
    if not self.cur_year:
      raise FileNotFoundError(f'no race files found in {self.dir}{self.year}/')
    self.set_round(self.round)

  def _list_races(self):
    """Race files of the current season ordered by the number in their name.

    os.listdir gives no ordering guarantee and a plain string sort would put
    'race10.csv' before 'race2.csv', so the digits are compared numerically.
    A missing season folder yields an empty list.
    """
    season_dir = self.dir + f'{self.year}/'
    if not os.path.isdir(season_dir):
      return []

    def race_key(name):
      digits = ''.join(ch for ch in name if ch.isdigit())
      return (int(digits) if digits else float('inf'), name)

    return sorted(os.listdir(season_dir), key=race_key)

  def set_year(self, year):
    """Move the cursor to `year`; call `set_round` afterwards to load a race."""
    self.year = year
    self.cur_year = self._list_races()

  def set_round(self, round):
    """Load race number `round` (1-based) of the current season.

    A round beyond the end of the season loads the last race. For 2021, or a
    season without files, nothing is loaded and the previous race stays.
    """
    self.round = round
    if (self.year == 2021) or not self.cur_year:
      return # no data yet
    idx = min(self.round, len(self.cur_year)) - 1
    self.cur_race = pd.read_csv(self.dir + f'{self.year}/{self.cur_year[idx]}')

  def next_round(self):
    """Advance to the next race, rolling over to round 1 of the next season."""
    self.round += 1
    # `<=`: rounds are 1-based, so the last race of a season is round
    # len(cur_year). The original `<` skipped it.
    if (self.round <= len(self.cur_year)):
      self.set_round(self.round)
    else:
      self.round = 1
      self.set_year(self.year+1)
      self.set_round(self.round)

  def __len__(self):
    # -1 since the last lap will always only be a label
    return len(self.cur_race) - 1

  def __getitem__(self, i):
    """Return (inputs, targets) for laps 0..i of the current race.

    inputs:  float64 tensor (i+1, 141) — CSV row with the first column removed.
    targets: float64 tensor (i+1, 60)  — row j holds the three predicted values
             per driver slot taken from lap j+1.
    For i == 0 both are returned 1-D (141,) / (60,), as callers expect.
    NaNs are replaced by 0.
    """
    if i < 0 or i + 1 >= len(self.cur_race):
      raise IndexError(f'lap index {i} out of range')

    # Rows 0..i+1, without the leading index column and trimmed to 141 features.
    laps = torch.tensor(self.cur_race.iloc[:i + 2, 1:142].to_numpy(dtype='float64'))

    inputs = laps[:i + 1].clone()
    inputs[inputs != inputs] = 0  # replace NaN with 0
    # NOTE(review): these strides scale indices 0, 7, 14, ... by 1/100 and
    # 6, 13, 20, ... by 1/10. With the CSV layout (circuitId, then 7 fields per
    # driver) that does not line up with driverId/laptime as the version notes
    # describe. Kept unchanged because pos_df undoes exactly this scaling and
    # saved checkpoints were trained on it — confirm intent.
    inputs[:, 0:140:7] /= 100
    inputs[:, 6:140:7] /= 10

    # Target for step j is lap j+1. The original used lap i+1 for every step,
    # so all target rows were identical.
    targets = laps[1:i + 2][:, self._LABEL_COLS].clone()
    targets[targets != targets] = 0
    targets[:, 2::3] /= 10  # third value of every slot

    if i == 0:
      return (inputs[0], targets[0])
    return (inputs, targets)

In [None]:
# RaceDataset takes only the races folder; the stray second argument raised TypeError.
ds = RaceDataset(db_dir + 'races/')

In [None]:
# Advance the cursor by one race and show which season it is now in.
ds.next_round()
ds.year

2002

In [None]:
# First sample of the current race: lap-0 features and the targets taken from lap 1.
ds[0]

In [None]:
# Inputs (`cur`) and targets (`n`) for index 1.
cur, n = ds[1]

In [None]:
# Inspect the raw input tensor.
cur

tensor([[1.0000e-02, 3.0000e+01, 1.0000e+00, 1.0000e+00, 2.1000e+01, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 2.2000e+01, 4.0000e+00, 1.0000e+00, 2.1000e+01,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 5.7000e+01, 2.0000e+00, 2.0000e+00,
         2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 4.9000e+01, 9.0000e+00,
         6.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.3000e+01,
         5.0000e+00, 3.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         1.4000e+01, 3.0000e+00, 2.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.5000e+01, 1.0000e+01, 6.0000e+00, 2.1000e+01, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 3.5000e+01, 7.0000e+00, 5.0000e+00, 2.1000e+01,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 4.4000e+01, 2.0000e+01, 5.0000e+00,
         2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.0000e+00, 2.0000e+01,
         8.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 3.1000e+01,
         2.0000e+01, 3.0000e

In [None]:
# Pack the single sequence; its length must be the real number of steps in
# `cur` rather than a hard-coded 50.
cur_pack = nn.utils.rnn.pack_padded_sequence(cur.unsqueeze(0), [cur.size(0)], batch_first=True)

In [None]:
# Show the packed representation.
print(cur_pack)

PackedSequence(data=tensor([[1.0000e-02, 3.0000e+01, 1.0000e+00, 1.0000e+00, 2.1000e+01, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 2.2000e+01, 4.0000e+00, 1.0000e+00, 2.1000e+01,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 5.7000e+01, 2.0000e+00, 2.0000e+00,
         2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 4.9000e+01, 9.0000e+00,
         6.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.3000e+01,
         5.0000e+00, 3.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         1.4000e+01, 3.0000e+00, 2.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.5000e+01, 1.0000e+01, 6.0000e+00, 2.1000e+01, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 3.5000e+01, 7.0000e+00, 5.0000e+00, 2.1000e+01,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 4.4000e+01, 2.0000e+01, 5.0000e+00,
         2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.0000e+00, 2.0000e+01,
         8.0000e+00, 2.1000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 3.1000e+01,
        

### Helper functions for displaying input/output tensors in English

In [None]:
def show_positions(lap_in, out):
  """Print circuit and per-driver predictions in readable form.

  Both tensors are copied and every element is multiplied by 100 before
  reading. `lap_in` is read as a flat feature vector (element 0 = circuit id,
  elements 1, 8, 15, ... = driver ids). `out` is read four values per driver:
  offset 0 = position, offset 2 = status id, offset 3 = laptime.

  NOTE(review): RaceDataset targets carry three values per driver, not four —
  confirm this helper still matches the tensors it is given.
  """
  features = lap_in.detach().clone().apply_(lambda x: x*100)
  preds = out.detach().clone().apply_(lambda x: x*100)
  _name, _loc, _country = circuit_info(round(features[0].item()))
  print(f'Circuit: {_name}, {_loc}, {_country}')
  for slot, col in enumerate(range(1, 140, 7)):
    base = 4 * slot
    _num, _code, _fn, _ln, _, _ = driver_info(round(features[col].item()))
    _pos = preds[base].item()
    _time = preds[base + 3].item()
    _statusstr = status_info(round(preds[base + 2].item()))
    print(f'Driver: {_num}  {_fn} {_ln}')
    print(f'Position: {_pos}')
    print(f'Laptime: {_time}')
    print(f'Status: {_statusstr}')


In [None]:
# NOTE(review): show_positions reads flat tensors with four output values per
# driver, while ds[1] yields stacked laps with three — confirm this cell still runs.
show_positions(ds[1][0], ds[1][1])

In [None]:
def pos_df(lap_in, out):
  """Turn one lap of features plus one model output into a readable table.

  Args:
    lap_in: 1-D feature tensor for a single lap (at least 140 elements).
    out: 1-D tensor with three values per driver slot (position, pitting,
      laptime), at least 60 elements.

  Returns:
    (circuit name, location, country, DataFrame) where the frame has columns
    code, driver, position, pitting, laptime and is sorted by position, then
    laptime.
  """
  _lap = lap_in.detach().clone()
  _o = out.detach().clone()
  # Undo the scaling applied in RaceDataset.__getitem__ (same index strides).
  for i in range(0, 140, 7):
    _lap[i] = _lap[i] * 100
    _lap[i+6] = _lap[i+6] * 10
  for i in range(0, 60, 3):
    _o[i+2] = _o[i+2] * 10
  _name, _loc, _country = circuit_info(round(_lap[0].item()))

  # Collect the rows first and build the frame once; DataFrame.append was
  # removed in pandas 2.0.
  records = []
  for slot, i in enumerate(range(1, 140, 7)):
    _num, _code, _fn, _ln, _, _ = driver_info(round(_lap[i].item()))
    j = 3 * slot
    records.append({
        'code': f'{_code}',
        'driver': f'{_fn} {_ln}',
        'position': _o[j].item(),
        'pitting': _o[j+1].item(),
        'laptime': _o[j+2].item()
    })

  df = pd.DataFrame(records, columns=['code', 'driver', 'position', 'pitting', 'laptime'])
  df = df.sort_values(by=['position', 'laptime'])
  return _name, _loc, _country, df


In [None]:
# ds[1] returns stacked laps; pos_df works on a single lap, so pass the last step.
sample_in, sample_next = ds[1]
_, _, _, df = pos_df(sample_in[-1], sample_next[-1])

In [None]:
# Display the table built in the previous cell.
df

Unnamed: 0,driver,position,inpit,laptime,status
0,MSC Michael Schumacher,1.0,0.0,93.098,
2,\N Mika Häkkinen,2.0,0.0,93.047,
3,\N Heinz-Harald Frentzen,3.0,0.0,94.128,
1,BAR Rubens Barrichello,4.0,0.0,92.853,
5,COU David Coulthard,5.0,0.0,93.344,
6,TRU Jarno Trulli,6.0,0.0,95.085,
4,SCH Ralf Schumacher,7.0,0.0,98.018,
7,VIL Jacques Villeneuve,8.0,0.0,96.364,
9,HEI Nick Heidfeld,9.0,0.0,96.104,
8,\N Olivier Panis,10.0,0.0,95.906,


In [None]:
# Returns a tensor with the size of in but content of out
def out_to_in(in_, out_):
  """Write a model output back into a copy of the input feature vector.

  For each of the 20 driver slots the three predicted values are copied,
  unscaled, to offsets 4, 5 and 7 of that slot in the feature vector.

  Args:
    in_: features, either 1-D or 3-D with the lap at [0][0].
    out_: predictions (at least 60 values) with the same number of
      dimensions as `in_`.

  Returns:
    A new tensor (the input is not modified): 1-D for 1-D inputs, and the
    squeezed copy for 3-D inputs.

  Raises:
    ValueError: if the two tensors are not both 1-D or both 3-D.
  """
  _ret = in_.detach().clone()
  # Pick the shape explicitly. The original tried 3-D indexing and fell back
  # to 1-D in a bare `except`, which also swallowed unrelated errors.
  if _ret.dim() == 3 and out_.dim() == 3:
    dst, src = _ret[0][0], out_[0][0]  # dst is a view into _ret
  elif _ret.dim() == 1 and out_.dim() == 1:
    dst, src = _ret, out_
  else:
    raise ValueError(
        f'out_to_in expects both tensors 1-D or both 3-D, got {_ret.dim()}-D and {out_.dim()}-D')

  for i in range(20):
    dst[4 + 7*i : 6 + 7*i] = src[3*i : 3*i + 2]
    dst[7 + 7*i] = src[3*i + 2]

  return _ret.squeeze() if _ret.dim() == 3 else _ret


In [None]:
prev, n = ds[1]
# ds[1] returns stacked laps; out_to_in works on a single lap, so use the last step.
merged_lap = out_to_in(prev[-1], n[-1])
merged_lap

## Model

In [None]:
class RacePredictionModel(nn.Module):
    """Stacked LSTM followed by a linear read-out applied at every time step.

    Args:
        input_size: number of features per time step.
        output_size: number of values predicted per time step.
        lstm_hids: hidden size of each LSTM layer.
        lstm_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers.

    Inputs are batch-first. LSTM weights and the linear weight use Xavier
    uniform initialisation; LSTM biases start at zero.
    """

    def __init__(self, input_size, output_size, lstm_hids, lstm_layers, dropout):
        super().__init__()

        self.input_size, self.lstm_layers, self.lstm_hids = input_size, lstm_layers, lstm_hids

        self.lstm = nn.LSTM(input_size, lstm_hids, lstm_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(in_features=lstm_hids, out_features=output_size)

        # Same initialisation order as before: read-out first, then the LSTM.
        nn.init.xavier_uniform_(self.fc.weight.data)
        for name, params in self.lstm.named_parameters():
            if name.startswith('weight'):
                nn.init.xavier_uniform_(params)
            elif name.startswith('bias'):
                nn.init.constant_(params, 0.0)

    def zero_states(self):
        """Return fresh (hidden, cell) states of zeros for a batch of one."""
        shape = (self.lstm_layers, 1, self.lstm_hids)
        return (torch.zeros(*shape), torch.zeros(*shape))

    def forward(self, ins, prev_states=None):
        """Run the LSTM over `ins` and project every step; returns (outputs, states)."""
        hidden_seq, next_states = self.lstm(ins, prev_states)
        return self.fc(hidden_seq), next_states

### Helper functions for training #1/3

In [None]:

# Trains model on data from 2001 to 2019, 2020 is reserved for testing
# Method 1
# Training procedure:
#   for each race:
#     set zero states,
#     for each lap:
#       feed input from dataset,
#       calculate loss from output,
#       back propagate,
# Method 3(.1)
# Training procedure:
#   for each race:
#     set zero states,
#     for each lap: (just once, on the last lap)
#       feed input(a sequence of laps until the current lap) from dataset,
#       calculate loss from output,
#       back propagate,
def run_train(model, ds, crit, opt, sched):
    """Train for one epoch over the races from 2001 up to (not including) 2020.

    For every race the full lap sequence (`ds[len(ds) - 1]`) is fed in one
    forward pass starting from zero states, followed by one optimizer step.
    The scheduler is stepped once at the end of the epoch.

    Args:
        model: network exposing `zero_states()` and `forward(ins, states)`.
        ds: dataset with `set_year`, `set_round`, `next_round`, `year`, `round`.
        crit: loss function.
        opt: optimizer over `model`'s parameters.
        sched: learning-rate scheduler.

    Returns:
        Mean loss over the races that produced a non-NaN loss (NaN if none did).
    """
    model.train()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device
    total_loss, total_count = 0, 0

    ds.set_year(2001)
    ds.set_round(1)

    while (ds.year != 2020):
        states = tuple(s.to(device) for s in model.zero_states())
        i = len(ds) - 1
        opt.zero_grad()
        lap_in, lap_exp = ds[i]

        # fix shape and type
        # input size should be (batch, seq_len, input_size)
        # we are not using batches
        if (i == 0):
            lap_in = lap_in.unsqueeze(0)
            lap_exp = lap_exp.unsqueeze(0)
        lap_in = lap_in.unsqueeze(0).float().to(device)
        lap_exp = lap_exp.unsqueeze(0).float().to(device)

        lap_out, _ = model(lap_in, states)
        loss = crit(lap_out, lap_exp)

        # Check before back-propagating: stepping on a NaN loss would poison
        # the weights. Report the race and skip it instead of blocking on input().
        if (math.isnan(loss.item())):
            print('Loss is nan')
            print(f'Year: {ds.year}')
            print(f'round: {ds.round}')
            print(f'lap: {i}')
            print(lap_in)
            ds.next_round()
            continue

        loss.backward()
        opt.step()
        total_loss += loss.item()  # one sequence per step, no batching
        total_count += 1

        ds.next_round()

    sched.step()
    return total_loss / total_count if total_count else float('nan')

def run_test(model, ds, crit):
    """Evaluate on the first race of 2020.

    The full lap sequence of that race is fed in one forward pass from zero
    states, without tracking gradients.

    Returns:
        The loss for that race as a float.
    """
    model.eval()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device

    ds.set_year(2020)
    ds.set_round(1)

    states = tuple(s.to(device) for s in model.zero_states())

    i = len(ds) - 1
    lap_in, lap_exp = ds[i]
    if (i == 0):
        lap_in = lap_in.unsqueeze(0)
        lap_exp = lap_exp.unsqueeze(0)
    lap_in = lap_in.unsqueeze(0).float().to(device)
    lap_exp = lap_exp.unsqueeze(0).float().to(device)

    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        lap_out, _ = model(lap_in, states)
        loss = crit(lap_out, lap_exp)

    return loss.item()

def run_all(model, ds, crit, opt, sched, versionId, n_epochs=10,
            save_dir='/content/drive/My Drive/f1ml/sd/'):
    """Run `n_epochs` of run_train + run_test and save checkpoints.

    A checkpoint is written every fifth epoch (0, 5, 10, ...) and once more
    after the last epoch. File names are
    `<save_dir><versionId>-<epoch>-loss-<test loss>.pth`.

    Args:
        versionId: label embedded in the checkpoint file names.
        n_epochs: number of epochs; with 0 nothing is trained or saved.
        save_dir: folder (with trailing slash) for checkpoints.
    """
    test_loss = None
    for epoch in tqdm(range(n_epochs), desc='epochs', unit='ep'):
        train_loss = run_train(model, ds, crit, opt, sched)
        test_loss = run_test(model, ds, crit)
        tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f}  test loss {test_loss:.6f}')
        if epoch % 5 == 0:
            torch.save(model.state_dict(), f'{save_dir}{versionId}-{epoch}-loss-{test_loss:.2f}.pth')
    # Without any epoch there is no test loss to put in the file name.
    if test_loss is not None:
        torch.save(model.state_dict(), f'{save_dir}{versionId}-{n_epochs}-loss-{test_loss:.2f}.pth')


### Helper functions for training #2

In [None]:
# Trains model on data from 2001 to 2019, 2020 is reserved for testing
# Training procedure:
#   for each race:
#     set zero states,
#     feed first input from dataset,
#     calculate loss from output,
#     back propagate,
#     for each lap:
#       feed input from last output
#       loss
#       back prop
def run_train_2(model, ds, crit, opt, sched):
    """Train for one epoch, feeding the model its own previous prediction.

    For every race from 2001 up to (not including) 2020: start from zero
    states and the lap-0 features, then for each lap predict the next lap,
    take one optimizer step, and build the next input from the prediction
    with out_to_in. The scheduler is stepped once at the end of the epoch.

    Returns:
        Mean loss over all laps that produced a non-NaN loss (NaN if none did).
    """
    model.train()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device
    total_loss, total_count = 0, 0

    ds.set_year(2001)
    ds.set_round(1)

    while (ds.year != 2020):
        states = tuple(s.to(device) for s in model.zero_states())
        lap_in, _ = ds[0]
        for i in range(len(ds)):
            opt.zero_grad()
            _, lap_exp = ds[i]
            # The dataset may return one target row per lap up to i; this
            # method predicts a single lap, so keep only the newest row.
            if lap_exp.dim() > 1:
                lap_exp = lap_exp[-1]

            # fix shape and type: (batch, seq_len, features) with batch = seq_len = 1
            lap_in = lap_in.unsqueeze(0).unsqueeze(0).float().to(device)
            lap_exp = lap_exp.unsqueeze(0).unsqueeze(0).float().to(device)

            lap_out, states = model(lap_in, states)
            loss = crit(lap_out, lap_exp)
            # Carry the state values forward but cut the graph, so each lap
            # back-propagates through its own step only (no retain_graph needed).
            states = tuple(s.detach() for s in states)

            # Check before back-propagating; a NaN loss also means the carried
            # states are unusable, so give up on the rest of this race.
            if (math.isnan(loss.item())):
                print('Loss is nan')
                print(f'Year: {ds.year}')
                print(f'round: {ds.round}')
                print(f'lap: {i}')
                print(lap_in)
                break

            loss.backward()
            opt.step()
            total_loss += loss.item()  # one lap per step, no batching
            total_count += 1

            lap_in = out_to_in(lap_in, lap_out.detach().clone())

        ds.next_round()

    sched.step()
    return total_loss / total_count if total_count else float('nan')

def run_test_2(model, ds, crit):
    """Evaluate the self-feeding rollout on the first race of 2020.

    Starting from the lap-0 features and zero states, each prediction is
    written back into the input with out_to_in and fed in as the next lap.
    No gradients are tracked.

    Returns:
        Mean loss over the laps of that race (NaN if the race has none).
    """
    model.eval()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device
    total_loss, total_count = 0, 0

    ds.set_year(2020)
    ds.set_round(1)

    states = tuple(s.to(device) for s in model.zero_states())

    lap_in, _ = ds[0]
    with torch.no_grad():
        for i in range(len(ds)):
            _, lap_exp = ds[i]
            # Keep only the newest target row if the dataset returns a sequence.
            if lap_exp.dim() > 1:
                lap_exp = lap_exp[-1]
            lap_in = lap_in.unsqueeze(0).unsqueeze(0).float().to(device)
            lap_exp = lap_exp.unsqueeze(0).unsqueeze(0).float().to(device)
            lap_out, states = model(lap_in, states)
            loss = crit(lap_out, lap_exp)
            total_loss += loss.item()  # one lap per step, no batching
            total_count += 1

            lap_in = out_to_in(lap_in.squeeze().squeeze(), lap_out.squeeze().squeeze())

    return total_loss / total_count if total_count else float('nan')

def run_all_2(model, ds, crit, opt, sched, versionId, n_epochs=10,
              save_dir='/content/drive/My Drive/f1ml/sd/'):
    """Run `n_epochs` of run_train_2 + run_test_2 and save checkpoints.

    A checkpoint is written every fifth epoch (0, 5, 10, ...) and once more
    after the last epoch. File names are
    `<save_dir><versionId>-<epoch>-loss-<test loss>.pth`.

    Args:
        versionId: label embedded in the checkpoint file names.
        n_epochs: number of epochs; with 0 nothing is trained or saved.
        save_dir: folder (with trailing slash) for checkpoints.
    """
    test_loss = None
    for epoch in tqdm(range(n_epochs), desc='epochs', unit='ep'):
        train_loss = run_train_2(model, ds, crit, opt, sched)
        test_loss = run_test_2(model, ds, crit)
        tqdm.write(f'epoch {epoch}   train loss {train_loss:.6f}  test loss {test_loss:.6f}')
        if epoch % 5 == 0:
            torch.save(model.state_dict(), f'{save_dir}{versionId}-{epoch}-loss-{test_loss:.2f}.pth')
    # Without any epoch there is no test loss to put in the file name.
    if test_loss is not None:
        torch.save(model.state_dict(), f'{save_dir}{versionId}-{n_epochs}-loss-{test_loss:.2f}.pth')


### Training

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs end to end.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# 141 input features, 60 outputs, hidden size 141, 2 LSTM layers, dropout 0.2
model = RacePredictionModel(141, 60, 141, 2, 0.2)
model.to(device)
crit = nn.MSELoss().to(device)
opt = optim.Adam(model.parameters(), lr=0.001)
# Divide the learning rate by 10 every 3 epochs.
sched = optim.lr_scheduler.StepLR(opt, 3, gamma=0.1)
ds = RaceDataset(db_dir + 'races/')

In [None]:
# Train for 10 epochs; 25 is the version id embedded in the checkpoint file names.
run_all(model, ds, crit, opt, sched, 25, 10)

HBox(children=(FloatProgress(value=0.0, description='epochs', max=10.0, style=ProgressStyle(description_width=…

epoch 0   train loss 23.726949  test loss 20.335810
epoch 1   train loss 14.090378  test loss 20.304308
epoch 2   train loss 14.076927  test loss 20.288363
epoch 3   train loss 14.135900  test loss 19.980404
epoch 4   train loss 14.057867  test loss 19.841948
epoch 5   train loss 14.036112  test loss 19.773136
epoch 6   train loss 13.998912  test loss 19.770061
epoch 7   train loss 13.997894  test loss 19.767181
epoch 8   train loss 13.996841  test loss 19.764452
epoch 9   train loss 13.992361  test loss 19.764212



vId | training method | ds | ds cls| input size | hid size | layers | dropout | crit | opt | sched | epochs | train loss | test loss
--- | --- | --- | --- | --- | --- | --- |--- | --- | --- | --- | ---| --- | ---
| 1 | v3 | v1| 141 | 1024 | 4 | 0.1| MSELoss | Adam, lr=0.0005 | StepLR, ss=20, g=0.1| 6 | 0.40 | 2.9
| 1 | v3 | v1| 141 | 1024 | 4 | 0.1| MSELoss | Adam, lr=0.0005 | StepLR, ss=1, g=0.1| 5 | 0.9| 2.00 *
| 1 | v3 | v2| 141 | 1024 | 4 | 0.1| MSELoss | Adam, lr=0.0005 | StepLR, ss=1, g=0.1| 6| 0.07| 0.64
| 1 | v3 | v2| 141 | 512 | 2 | 0.1| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 10| 0.068| 0.61
| 1 | v3 | v2| 141 | 256 | 2 | 0.1| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 10| 0.068| 0.619
5 | 1 | v3 | v2| 141 | 1024 | 10 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 1 | 0.067| 0.708
6 | 1 | v3 | v2.1 no scaling| 141 | 1024 | 10 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 8 | 818| 441 **
7 | 1 | v3 | v2.1 no scaling| 141 | 2048 | 10 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6 | 817 | 437***
8 | 1 | v3 | v2.1 no scaling| 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 10 | 795 | 354
9 | 2 | v3 | v2.1 no scaling| 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6 | 818| 440 ****
10 | 1 | v3 | v2.2 some scaling*****| 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6| 41| 36
11 | 1 | v3 | v2.2 some scaling*****| 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=2, g=0.5| 6| 38| 37 \*\*\*\*\*\*
12 | 1 | v3 | v2.2 some scaling*****| 141 | 256 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6 | 41| 37
13 | 1 | v3 | v3 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6| 0.10| 0.064
14 | 1 | v3 | v3.1 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6| 9.1| 10.6
15 | 1 | v3 | v3.2 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6| 922 | 1078
16 | 1 | v3 | v3.3 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6| 922| 1077  
17 | 1 | v3 | v3.4 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6| 19.2|16.5   
18 | 1 | v3 | v4 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 10| 13.5 |16.5    
19 | 1 | v3 | v5 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 10| 13.5|16.0  
20 | 1 | v4 | v5.1 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 8| 9.04|10.66   
21 | 1 | v4 | v5.2 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 6| 18.94|15.84   
23 | 3 | v4 | v6 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 10 | 18.67|15.85   
24 | 3.1 | v4 | v6 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=1, g=0.1| 5 | 14.05|19.99  
25 | 3.1 | v4 | v6 | 141 | 141 | 2 | 0.2| MSELoss | Adam, lr=0.001 | StepLR, ss=3, g=0.1| 10 | 13.99| 19.76  



```
# *
epoch 0   train loss 0.466812  test loss 3.057176
epoch 1   train loss 0.777065  test loss 2.431262
epoch 2   train loss 0.891661  test loss 1.766107
epoch 3   train loss 1.023275  test loss 1.931356
epoch 4   train loss 0.919382  test loss 2.009557
```
`**`: performs similarly to the other models, in terms of how the predictions look  
`***`: took a long time to train  
`****`: same performance as the original training method  
`*****`: only scaling driverId and laptime  
`******`: positions dont change much  
NOTE: before 14, i was scaling position in output...  
15: the model learns not to change anything  
23,24: laptimes too small(~0), does not make sense

In [None]:
cpu = torch.device('cpu')
# map_location lets a checkpoint that was saved on the GPU load on whichever
# device is in use now.
model.load_state_dict(torch.load('/content/drive/My Drive/f1ml/sd/24-5-loss-19.99.pth', map_location=device))
#model.load_state_dict(torch.load('/content/drive/My Drive/f1ml/sd/11-5-loss-37.31.pth'))
model.eval()
ds.set_year(2020)
ds.set_round(22)
p, n = ds[0]
p = p.to(device)
hid_state, cell_state = model.zero_states()
states = (hid_state.to(device), cell_state.to(device))
# Inference only: without no_grad the carried LSTM state would keep the
# autograd graph of all 50 steps alive.
with torch.no_grad():
  out, s = model(p.unsqueeze(0).unsqueeze(0).float(), states)
  out = out.squeeze().squeeze()
  print(out)
  # Roll the model forward 50 laps, feeding each prediction back in.
  for i in range(50):
    out, s = model(out_to_in(p, out).unsqueeze(0).unsqueeze(0).float(), s)
    out = out.squeeze().squeeze()
    _, _, _, d = pos_df(p.to(cpu), out)
    print(d)
print(out)
out = out.detach().to(cpu)
d

tensor([ 4.2889e+00, -1.5449e-02,  1.8602e-02,  3.8363e+00,  8.2501e-03,
        -5.4131e-03,  4.2897e+00, -2.3380e-02,  1.0928e-02,  5.2610e+00,
        -6.3883e-03, -1.2196e-02,  6.8927e+00,  5.9450e-04,  6.7464e-03,
         7.8784e+00, -7.8406e-03, -2.5686e-02,  8.9446e+00,  6.0315e-03,
        -1.6726e-02,  9.8647e+00,  2.5332e-02, -3.0484e-03,  1.0591e+01,
         2.2855e-02, -2.7421e-02,  1.0796e+01,  1.1781e-02,  2.0113e-02,
         1.1954e+01,  1.5436e-02, -1.3688e-02,  1.2078e+01, -2.5207e-02,
        -3.0845e-03,  1.2763e+01, -2.1317e-03, -1.2411e-02,  1.2839e+01,
         3.5188e-02,  2.7600e-02,  1.3339e+01, -3.8156e-03,  1.0184e-02,
         1.3573e+01, -2.0644e-02,  2.1797e-02,  1.3987e+01,  2.7668e-02,
        -2.9144e-02,  1.3789e+01, -1.8079e-02, -1.2822e-02,  1.4043e+01,
         2.8903e-02, -1.1932e-02,  1.3997e+01, -2.4507e-02,  2.4919e-02],
       device='cuda:0', grad_fn=<SqueezeBackward0>)
   code              driver   position   pitting   laptime
1   BOT     

Unnamed: 0,code,driver,position,pitting,laptime
1,BOT,Valtteri Bottas,5.205608,-0.00025,0.001423
2,HAM,Lewis Hamilton,5.839981,0.000401,-0.002316
0,VER,Max Verstappen,5.897659,0.000225,-0.00329
3,NOR,Lando Norris,7.1868,-4e-06,0.00143
4,ALB,Alexander Albon,9.404008,-8.9e-05,-0.000312
5,SAI,Carlos Sainz,10.713583,0.00019,0.005204
6,KVY,Daniil Kvyat,12.171057,-2.7e-05,0.00241
7,STR,Lance Stroll,13.483019,-0.000388,0.000692
8,GAS,Pierre Gasly,14.444311,-0.000303,0.003693
9,OCO,Esteban Ocon,14.723085,-2.4e-05,-0.003977
