In [299]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from tqdm.notebook import tqdm, trange
import warnings
warnings.filterwarnings("ignore")

In [300]:
circuits = pd.read_csv('../data/circuits.csv')
constructor_results = pd.read_csv('../data/constructor_results.csv')
constructor_standings = pd.read_csv('../data/constructor_standings.csv')
constructors = pd.read_csv('../data/constructors.csv')
driver_standings = pd.read_csv('../data/driver_standings.csv')
drivers = pd.read_csv('../data/drivers.csv', parse_dates=[6])
lap_times = pd.read_csv('../data/lap_times.csv')
pit_stops = pd.read_csv('../data/pit_stops.csv')
qualifying = pd.read_csv('../data/qualifying.csv')
races = pd.read_csv('../data/races.csv', parse_dates=[5])
results = pd.read_csv('../data/results.csv')
seasons = pd.read_csv('../data/seasons.csv')
status = pd.read_csv('../data/status.csv')

In [301]:
result_with_dob = results.merge(drivers, left_on='driverId', right_on='driverId')
results_with_dob_date = result_with_dob.merge(races, on = 'raceId', how = 'left')

In [302]:
#results_with_dob_date['ageAtRace'] = pd.to_datetime(results_with_dob_date['date']) - pd.to_datetime(results_with_dob_date['dob'])
pd.to_datetime(results_with_dob_date['date'])
pd.to_datetime(results_with_dob_date['dob'])

0       1985-01-07
1       1985-01-07
2       1985-01-07
3       1985-01-07
4       1985-01-07
           ...    
24955   1995-06-29
24956   1995-06-29
24957   1995-09-23
24958   1996-06-25
24959   1996-06-25
Name: dob, Length: 24960, dtype: datetime64[ns]

In [303]:
#Calculate age at race in days (timedelta)
results_with_dob_date['ageAtRace'] = results_with_dob_date['date'] - results_with_dob_date['dob']

In [304]:
#Calculate how many days in f1 in days (timedelta)
date_of_debut = results_with_dob_date.groupby('driverId')['date'].min().to_frame()
date_of_debut = date_of_debut.merge(drivers, on = 'driverId', how = 'left').rename(columns={'date':'date_of_debut'})
date_of_debut = date_of_debut[['driverId','date_of_debut']]
date_of_debut['date_of_debut']
df = results_with_dob_date.merge(date_of_debut, on = 'driverId', how = 'left')
df['ageAtDebut'] = df['date_of_debut'] - df['dob']
df['yearsExperience'] = df['date'] - df['date_of_debut']


In [305]:
#Drop columns
df = df.drop(columns=['number_y', 'number_x', 'code', 'forename', 'surname', 'dob', 'url_x', 'time_y', 'url_y'])

In [306]:
#Rename Columns
df = df.rename(columns={"grid": "gridStart", "name": "circuitName", "date_of_debut" : "dateOfDebut", "points": "pointsGained"})

In [307]:
#Add is racing at home feature
df['racingAtHome'] = df.apply(lambda row: row.nationality in row.circuitName, axis=1)

In [308]:
#Convert ages from days to years
df['ageAtRace'] = df.apply(lambda row: row.ageAtRace / pd.Timedelta('365 days'), axis=1)
df['ageAtDebut'] = df.apply(lambda row: row.ageAtDebut / pd.Timedelta('365 days'), axis=1)
df['yearsExperience'] = df.apply(lambda row: row.yearsExperience / pd.Timedelta('365 days'), axis=1)

In [309]:
#Add starts in first 2 places on the grid
df['startsFrontRow'] = df['gridStart'] <= 2

In [310]:
#Merge with driver standings
driver_standings = driver_standings.rename(columns={"points": "driverStandingsPoints", "position": "driverStandingsPosition", "wins" : "driverStandingsWins"})
driver_standings = driver_standings.drop(columns=['positionText'])
df = df.merge(driver_standings, on = ['raceId','driverId'], how = 'left')

In [311]:
#Get driver standings information before race
df['driverStandingsPoints'] = df.groupby(['year','driverId'])['driverStandingsPoints'].shift(fill_value=0)
df['driverStandingsPosition'] = df.groupby(['year','driverId'])['driverStandingsPosition'].shift(fill_value=0)
df['driverStandingsWins'] = df.groupby(['year','driverId'])['driverStandingsWins'].shift(fill_value=0)
df['lastRaceRank'] = df.groupby(['year','driverId'])['rank'].shift()

In [312]:
#Merge with constructor standings
constructor_standings = constructor_standings.rename(columns={"points": "constructorStandingsPoints", "position": "constructorStandingsPosition", "wins" : "constructorStandingsWins"})
constructor_standings = constructor_standings.drop(columns=['positionText'])
df = df.merge(constructor_standings, on = ['raceId','constructorId'], how = 'left')

In [313]:
#Get constructor standings information before race
df['constructorStandingsPoints'] = df.groupby(['year','driverId'])['constructorStandingsPoints'].shift(fill_value=0)
df['constructorStandingsPosition'] = df.groupby(['year','driverId'])['constructorStandingsPosition'].shift(fill_value=0)
df['constructorStandingsWins'] = df.groupby(['year','driverId'])['constructorStandingsWins'].shift(fill_value=0)

In [314]:
#Get previous race grid and final race positions
df['previousRaceGridStart'] = df.groupby(['year','driverId'])['gridStart'].shift()
df['previousRacePosition'] = df.groupby(['year','driverId'])['positionText'].shift()

In [315]:
#Add status for each row
df = df.merge(status, on = ['statusId'], how = 'left')
#Add finished race feature
df['finished'] = (df['positionText']) != 'R'

In [316]:
#Get all historic features up until current row date

for date in tqdm(df.date.unique()):
    #Races won by driver
    races_won_by_driver = df[df.date < date].groupby('driverId')['positionText'].apply(lambda x: (x==1).sum()).reset_index(name='racesWon')
    races_won_by_driver['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['racesWon'] = 'to_delete'
    dfaux = dfaux.drop('racesWon', axis = 1)
    df.loc[df['date'] == date, 'racesWon'] = dfaux.merge(races_won_by_driver, on=['date','driverId'], how='left')['racesWon'].values
    #Number of races retired
    races_retired_by_driver = df[df.date < date].groupby('driverId')['positionText'].apply(lambda x: (x=='R').sum()).reset_index(name='racesRetired')
    races_retired_by_driver['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['racesRetired'] = 'to_delete'
    dfaux = dfaux.drop('racesRetired', axis = 1)
    df.loc[df['date'] == date, 'racesRetired'] = dfaux.merge(races_retired_by_driver, on=['date','driverId'], how='left')['racesRetired'].values
    #Number of races finished
    races_finished_by_driver = df[df.date < date].groupby('driverId')['positionText'].apply(lambda x: (x!='R').sum()).reset_index(name='racesFinished')
    races_finished_by_driver['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['racesFinished'] = 'to_delete'
    dfaux = dfaux.drop('racesFinished', axis = 1)
    df.loc[df['date'] == date, 'racesFinished'] = dfaux.merge(races_finished_by_driver, on=['date','driverId'], how='left')['racesFinished'].values
    #Number of pole positions
    pole_positions_by_driver = df[df.date < date].groupby('driverId')['gridStart'].apply(lambda x: (x=='1').sum()).reset_index(name='polePositions')
    pole_positions_by_driver['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['polePositions'] = 'to_delete'
    dfaux = dfaux.drop('polePositions', axis = 1)
    df.loc[df['date'] == date, 'polePositions'] = dfaux.merge(pole_positions_by_driver, on=['date','driverId'], how='left')['polePositions'].values
    #Constructors races won
    races_won_by_constructor = df[df.date < date].groupby('constructorId')['positionText'].apply(lambda x: (x=='1').sum()).reset_index(name='racesWonByConstructor')
    races_won_by_constructor['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['racesWonByConstructor'] = 'to_delete'
    dfaux = dfaux.drop('racesWonByConstructor', axis = 1)
    df.loc[df['date'] == date, 'racesWonByConstructor'] = dfaux.merge(races_won_by_constructor, on=['date','constructorId'], how='left')['racesWonByConstructor'].values
    #Constructor times retired
    races_retired_by_constructor = df[df.date < date].groupby('constructorId')['positionText'].apply(lambda x: (x=='R').sum()).reset_index(name='racesRetiredByConstructor')
    races_retired_by_constructor['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['racesRetiredByConstructor'] = 'to_delete'
    dfaux = dfaux.drop('racesRetiredByConstructor', axis = 1)
    df.loc[df['date'] == date, 'racesRetiredByConstructor'] = dfaux.merge(races_retired_by_constructor, on=['date','constructorId'], how='left')['racesRetiredByConstructor'].values
    #Correlation between grid and finishing positions Driver
    '''
    correlationGridFinishDriver = df[df.date < date].groupby('driverId').apply(lambda df: df['positionOrder'].corr(df['gridStart'])).reset_index(name='correlationGridFinishDriver')
    correlationGridFinishDriver['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['correlationGridFinishDriver'] = 'to_delete'
    dfaux = dfaux.drop('correlationGridFinishDriver', axis = 1)
    df.loc[df['date'] == date, 'correlationGridFinishDriver'] = dfaux.merge(correlationGridFinishDriver, on=['date','driverId'], how='left')['correlationGridFinishDriver'].values
    #Correlation between grid and finishing positions Circuit
    
    correlationGridFinishCircuit = df[df.date < date].groupby('circuitId').apply(lambda df: df['positionOrder'].corr(df['gridStart'])).reset_index(name='correlationGridFinishCircuit')
    correlationGridFinishCircuit['date'] = date
    dfaux = df[df['date'] == date].copy()
    dfaux['correlationGridFinishCircuit'] = 'to_delete'
    dfaux = dfaux.drop('correlationGridFinishCircuit', axis = 1)
    df.loc[df['date'] == date, 'correlationGridFinishCircuit'] = dfaux.merge(correlationGridFinishDriver, on=['date','circuitId'], how='left')['correlationGridFinishCircuit'].values
    '''




  0%|          | 0/1035 [00:00<?, ?it/s]

In [317]:
#Formating qualifying times
qualifying['q1'] = qualifying['q1'].str.replace('.',':')
qualifying['q2'] = qualifying['q2'].str.replace('.',':')
qualifying['q3'] = qualifying['q3'].str.replace('.',':')

qualifying[['q1', 'q2', 'q3']] = qualifying[['q1', 'q2', 'q3']].applymap(lambda x: pd.to_timedelta(x, errors='coerce'))

qualifying['bestLap'] = qualifying[['q1','q2','q3']].min(axis=1)

In [318]:
#Percent diference between drivers qualifying time and best time 
quali_aux = pd.merge(df[['raceId','driverId']], qualifying[['bestLap','raceId','driverId']], on = ['raceId','driverId'], how = 'left')
quali_aux_best = quali_aux.groupby(['raceId'], as_index=False)['bestLap'].min()
df = df.merge(quali_aux, on = ['raceId','driverId'], how = 'left')
df = df.merge(quali_aux_best, on = ['raceId'], how = 'left')
df['percentageOfBestQuali'] = (df['bestLap_x'] / df['bestLap_y'])*100
df = df.drop(columns=['bestLap_x', 'bestLap_y'])

In [319]:
qualifying.loc[qualifying['constructorId'] == 1].head()

Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3,bestLap
0,1,18,1,1,22,1,0 days 01:35:32,0 days 01:28:07,0 days 01:37:54,0 days 01:28:07
2,3,18,5,1,23,3,0 days 01:36:04,0 days 01:32:32,0 days 01:28:19,0 days 01:28:19
24,25,19,5,1,23,3,0 days 01:38:47,0 days 01:46:39,0 days 01:46:13,0 days 01:38:47
25,26,19,1,1,22,4,0 days 01:41:32,0 days 01:44:27,0 days 01:47:49,0 days 01:41:32
46,47,20,1,1,22,3,0 days 01:44:30,0 days 01:46:22,0 days 01:37:52,0 days 01:37:52


In [323]:
#df.loc[df['driverId'] == 2]
df.tail(20)

Unnamed: 0,resultId,raceId,driverId,constructorId,gridStart,position,positionText,positionOrder,pointsGained,laps,...,finished,racesWon,racesRetired,racesFinished,polePositions,racesWonByConstructor,racesRetiredByConstructor,bestLap_x,bestLap_y,percentageOfBestQuali
25134,24636,1031,849,3,20,11,11,11,0.0,71,...,True,,,,,114.0,386.0,0 days 01:17:37,0 days 01:03:15,122.714097
25135,24662,1032,849,3,18,17,17,17,0.0,69,...,True,0.0,0.0,1.0,0.0,114.0,387.0,0 days 01:33:39,0 days 01:19:14,118.195204
25136,24684,1033,849,3,15,19,19,19,0.0,65,...,True,0.0,0.0,2.0,0.0,114.0,387.0,0 days 01:17:45,0 days 01:15:06,103.528628
25137,24700,1034,849,3,18,15,15,15,0.0,52,...,True,0.0,0.0,3.0,0.0,114.0,387.0,0 days 01:38:45,0 days 01:25:15,115.835777
25138,24724,1035,849,3,18,19,19,19,0.0,51,...,True,0.0,0.0,4.0,0.0,114.0,387.0,0 days 01:35:10,0 days 01:27:11,109.156949
25139,24743,1036,849,3,19,18,18,18,0.0,64,...,True,0.0,0.0,5.0,0.0,114.0,387.0,0 days 01:26:52,0 days 01:16:13,113.973322
25140,24761,1037,849,3,19,16,16,16,0.0,44,...,True,0.0,0.0,6.0,0.0,114.0,387.0,0 days 01:46:18,0 days 01:42:14,103.977828
25141,24776,1038,849,3,20,11,11,11,0.0,53,...,True,0.0,0.0,7.0,0.0,114.0,388.0,0 days 01:32:57,0 days 01:20:31,115.441937
25142,24800,1039,849,3,19,\N,R,15,0.0,6,...,False,0.0,0.0,8.0,0.0,114.0,388.0,0 days 01:22:20,0 days 01:17:24,106.373816
25143,24821,1040,849,3,20,16,16,16,0.0,52,...,True,0.0,1.0,8.0,0.0,114.0,389.0,0 days 01:36:06,0 days 01:33:00,103.333333
