In [63]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): with no category/module arguments this silences every warning
# for the rest of the session, including pandas deprecation and
# chained-assignment warnings raised by later cells -- consider narrowing it.
warnings.filterwarnings("ignore")

In [64]:
# Loading data: one CSV per source table
races = pd.read_csv('../data/races.csv')
results = pd.read_csv('../data/results.csv')
driver_standings = pd.read_csv('../data/driver_standings.csv')
constructor_standings = pd.read_csv('../data/constructor_standings.csv')
weather = pd.read_csv('../data/weather.csv')
qualifying = pd.read_csv('../data/qualifying.csv')

# Preprocessing
# Expose the qualifying grid slot under the name `grid`, which the final merge uses as a join key.
qualifying = qualifying.rename(columns={"grid_position": "grid"})

# Discard the "after race" standings columns from both standings tables.
driver_standings = driver_standings.drop(
    columns=[
        'driver_points_after_race',
        'driver_wins_after_race',
        'driver_standings_pos_after_race',
    ]
)
constructor_standings = constructor_standings.drop(
    columns=[
        'constructor_points_after_race',
        'constructor_wins_after_race',
        'constructor_standings_pos_after_race',
    ]
)


In [65]:
#### MERGE

df = (
    races
    # Inner joins: attach weather to each race, then one row per race result
    .merge(weather, how="inner", on=["season", "round", "circuit_id"])
    .drop(columns=["lat", "long", "country", "weather"])
    .merge(results, how="inner", on=["season", "round", "circuit_id", "url"])
    .drop(columns=["url", "points", "status", "time"])
    # Left joins: keep every result row even when no standings row matches
    .merge(driver_standings, how="left", on=["season", "round", "driver"])
    .merge(constructor_standings, how="left", on=["season", "round", "constructor"])
    # Last inner join: qualifying rows are matched by grid slot within a race
    .merge(qualifying, how="inner", on=["season", "round", "grid"])
    .drop(columns=["driver_name", "car"])
)

In [66]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,season,round,circuit_id,date,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,...,constructor,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time
0,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,piquet,...,brabham,4,1,0.0,0.0,0.0,0.0,0.0,0.0,1:35.114
1,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,lauda,...,mclaren,9,2,0.0,0.0,0.0,0.0,0.0,0.0,1:36.054
2,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,laffite,...,williams,18,3,0.0,0.0,0.0,0.0,0.0,0.0,1:38.234
3,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,tambay,...,ferrari,3,4,0.0,0.0,0.0,0.0,0.0,0.0,1:34.758
4,1983,1,jacarepagua,1983-03-13,0,0,1,0,0,surer,...,arrows,20,5,0.0,0.0,0.0,0.0,0.0,0.0,1:38.468


In [67]:
# Import only the name that is used; a wildcard import would also pull the
# weekday constants (MO, TU, ...) into the notebook namespace.
from dateutil.relativedelta import relativedelta

# Calculate age of drivers: whole years elapsed between `date_of_birth` and `date`
df["date"] = pd.to_datetime(df["date"])
df["date_of_birth"] = pd.to_datetime(df["date_of_birth"])
df["driver_age"] = df.apply(
    lambda row: relativedelta(row["date"], row["date_of_birth"]).years,
    axis=1,
)

# Both raw date columns are removed once the age has been derived from them.
df = df.drop(columns=["date", "date_of_birth"])

In [68]:
# Fill/drop NaNs
standings_cols = [
    'driver_points', 'driver_wins', 'driver_standings_pos',
    'constructor_points', 'constructor_wins', 'constructor_standings_pos',
]

# Missing standings values (e.g. result rows with no match in the left-joined
# standings tables) become 0, then the columns are cast to integers.
# The fill is done on the frame and assigned back: `df[col].fillna(0, inplace=True)`
# is chained assignment and does not update `df` under pandas Copy-on-Write.
# `astype("int64")` truncates toward zero exactly like the previous `int(x)`.
df = (
    df
    .fillna(dict.fromkeys(standings_cols, 0))
    .astype(dict.fromkeys(standings_cols, "int64"))
)

# Any row that still has a missing value in some other column is discarded.
df = df.dropna()

In [69]:
# Convert to boolean

weather_flags = ('weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy')

# Each weather indicator is replaced by its element-wise truth value.
df = df.assign(**{flag: df[flag].map(bool) for flag in weather_flags})

In [70]:
# Calculate difference in qualifying times

def _lap_time_to_seconds(value):
    """Convert a lap time such as ``"1:35.114"`` into seconds.

    Colon-separated fields are read as base-60 digits, so ``"m:ss.fff"`` yields
    ``60 * m + ss.fff`` and a plain ``"ss.fff"`` value (no colon) is accepted as
    well. Missing or empty values, the ``'00.000'`` placeholder and a numeric 0
    all map to ``0.0`` so the caller can filter them out.

    Raises:
        ValueError: if a field is not numeric. The error is deliberately not
            swallowed; an unparsed column would only fail later, less clearly.
    """
    if pd.isna(value):
        return 0.0
    text = str(value).strip()
    if not text:
        return 0.0
    seconds = 0.0
    for field in text.split(":"):
        seconds = seconds * 60 + float(field)
    return seconds


df["qualifying_time"] = df["qualifying_time"].map(_lap_time_to_seconds)

# Rows without a usable time (parsed as 0) are removed, then each race is
# ordered by grid slot so the first row of a (season, round) group is the
# lowest grid value present.
df = df[df["qualifying_time"] != 0].sort_values(["season", "round", "grid"])

# Replace the absolute time with the gap to the first row of the same race.
# This equals the former diff -> cumsum -> fillna(0) sequence (up to
# floating-point rounding) without needing a temporary column.
reference_time = df.groupby(["season", "round"])["qualifying_time"].transform("first")
df["qualifying_time"] = df["qualifying_time"] - reference_time

In [71]:
# Dummy variables

# Minimum number of rows in which a category must occur for its indicator
# column to be kept. The dict order is also the order in which the indicator
# groups are generated.
MIN_ROWS_PER_DUMMY = {"circuit_id": 70, "nationality": 140, "constructor": 140}

columns_before = set(df.columns)
df = pd.get_dummies(df, columns=list(MIN_ROWS_PER_DUMMY))

# Only columns created by get_dummies are candidates for removal. A plain
# substring test such as `"constructor" in col` would also match numeric
# features like `constructor_points` and could drop them.
rare_dummies = [
    col
    for source, min_rows in MIN_ROWS_PER_DUMMY.items()
    for col in df.columns
    if col not in columns_before
    and col.startswith(source + "_")
    and df[col].sum() < min_rows
]

# Drop everything in one call instead of mutating the frame while looping over it.
df = df.drop(columns=rare_dummies)

In [72]:
# Display the (rows, columns) dimensions of the final feature table.
df.shape

(14272, 100)

In [73]:
# Write the final feature table to CSV; index=False leaves the row index out of the file.
df.to_csv("../data/f1_df_final.csv", index = False)