# F1 2022 Data Analysis
### (with a little data science)

This Jupyter notebook analyses driver statistics for the 2022 Formula 1 season. It currently uses the __Race Result__ data from the Formula 1 website (e.g. https://www.formula1.com/en/results.html/2022/races/1125/saudi-arabia/race-result.html).

The race results are saved in the data folder with a separate CSV file for each race.

__HAVE YOU READ THE README FILE? PLEASE DO BEFORE USING THIS JUPYTER NOTEBOOK!__

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
import re
from datetime import datetime
import time

In [2]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

In [3]:
cf.go_offline() # switch cufflinks to offline mode so it can be used without an online connection

In [27]:
# Combined table that collects the results of every race; it starts with no rows.
# The columns are declared directly under their final names
# (the timing column is called 'Time/Retired').
race_results = pd.DataFrame(
    columns=['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS', 'Race'],
    dtype=object,
)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race


In [28]:
# Inspect the column names, non-null counts and dtypes of the combined frame
race_results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   POS           0 non-null      object
 1   Driver        0 non-null      object
 2   Car           0 non-null      object
 3   Laps          0 non-null      object
 4   Time/Retired  0 non-null      object
 5   PTS           0 non-null      object
 6   Race          0 non-null      object
dtypes: object(7)
memory usage: 0.0+ bytes


In [56]:
# Export the combined results (row index included) to the compiled-data folder.
# NOTE(review): this cell sits above the cells that append each race to
# race_results, yet its execution count (56) shows it was run after them.
# Executed top-to-bottom on a fresh kernel it would export an empty frame —
# consider moving it below the last race section.
race_results.to_csv('data/compiled-data/race-results.csv')

# Bahrain Data

In [30]:
# Load the Bahrain race result from its CSV file and preview the first rows
bahrain_df = pd.read_csv("data/BAHRAIN.csv")
bahrain_df.head()

Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,1,16,Charles Leclerc,Ferrari,57,37:33.6,26
1,2,55,Carlos Sainz,Ferrari,57,+5.598s,18
2,3,44,Lewis Hamilton,Mercedes,57,+9.675s,15
3,4,63,George Russell,Mercedes,57,+11.211s,12
4,5,20,Kevin Magnussen,Haas Ferrari,57,+14.754s,10


In [31]:
# Append the Bahrain result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
bahrain_rows = (
    bahrain_df
    .rename(columns={'Pos': 'POS'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'No' is not kept
    .assign(Race='BAHRAIN')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'BAHRAIN'], bahrain_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
19,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
18,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
17,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
16,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
15,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


In [32]:
# Summary statistics for the numeric columns of the Bahrain result
bahrain_df.describe()

Unnamed: 0,No,Laps,PTS
count,20.0,20.0,20.0
mean,25.8,56.15,5.1
std,21.142623,2.942877,7.503683
min,1.0,44.0,0.0
25%,10.75,57.0,0.0
50%,21.0,57.0,0.5
75%,34.25,57.0,8.5
max,77.0,57.0,26.0


In [33]:
# Column dtypes and non-null counts for the Bahrain result
bahrain_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Pos           20 non-null     object
 1   No            20 non-null     int64 
 2   Driver        20 non-null     object
 3   Car           20 non-null     object
 4   Laps          20 non-null     int64 
 5   Time/Retired  20 non-null     object
 6   PTS           20 non-null     int64 
dtypes: int64(3), object(4)
memory usage: 1.2+ KB


# Saudi Arabia Data

In [34]:
# Load the Saudi Arabia race result from its CSV file and preview the first rows
# (note the mixed-case file name containing a space)
saudi_arabia_df = pd.read_csv('data/SAUDI Arabia.csv')
saudi_arabia_df.head()

Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,1,1,Max Verstappen,Red Bull Racing RBPT,50,24:19.3,25
1,2,16,Charles Leclerc,Ferrari,50,+0.549s,19
2,3,55,Carlos Sainz,Ferrari,50,+8.097s,15
3,4,11,Sergio Perez,Red Bull Racing RBPT,50,+10.800s,12
4,5,63,George Russell,Mercedes,50,+32.732s,10


In [35]:
# Append the Saudi Arabia result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
saudi_arabia_rows = (
    saudi_arabia_df
    .rename(columns={'Pos': 'POS'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'No' is not kept
    .assign(Race='SAUDI ARABIA')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'SAUDI ARABIA'], saudi_arabia_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
39,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
38,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
37,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
36,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
35,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


In [36]:
# Summary statistics for the numeric columns of the Saudi Arabia result
saudi_arabia_df.describe()

Unnamed: 0,No,Laps,PTS
count,20.0,20.0,20.0
mean,25.8,40.8,5.1
std,21.142623,16.637307,7.454423
min,1.0,0.0,0.0
25%,10.75,35.75,0.0
50%,21.0,50.0,0.5
75%,34.25,50.0,8.5
max,77.0,50.0,25.0


In [37]:
# Column dtypes and non-null counts for the Saudi Arabia result
saudi_arabia_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Pos           20 non-null     object
 1   No            20 non-null     int64 
 2   Driver        20 non-null     object
 3   Car           20 non-null     object
 4   Laps          20 non-null     int64 
 5   Time/Retired  20 non-null     object
 6   PTS           20 non-null     int64 
dtypes: int64(3), object(4)
memory usage: 1.2+ KB


# Australia Data

In [38]:
# Load the Australia race result from its CSV file and preview the first rows
australia_df = pd.read_csv('data/AUSTRALIA.csv')
australia_df.head()

Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,1,16,Charles Leclerc,Ferrari,58,27:46.5,26
1,2,11,Sergio Perez,Red Bull Racing RBPT,58,+20.524s,18
2,3,63,George Russell,Mercedes,58,+25.593s,15
3,4,44,Lewis Hamilton,Mercedes,58,+28.543s,12
4,5,4,Lando Norris,McLaren Mercedes,58,+53.303s,10


In [39]:
# Append the Australia result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
australia_rows = (
    australia_df
    .rename(columns={'Pos': 'POS'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'No' is not kept
    .assign(Race='AUSTRALIA')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'AUSTRALIA'], australia_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
59,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
58,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
57,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
56,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
55,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


In [40]:
# Summary statistics for the numeric columns of the Australia result
australia_df.describe()

Unnamed: 0,No,Laps,PTS
count,20.0,20.0,20.0
mean,24.7,52.1,5.1
std,21.64328,14.962761,7.503683
min,1.0,1.0,0.0
25%,9.0,57.0,0.0
50%,19.0,58.0,0.5
75%,34.25,58.0,8.5
max,77.0,58.0,26.0


In [41]:
# Column dtypes and non-null counts for the Australia result
australia_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Pos           20 non-null     object
 1   No            20 non-null     int64 
 2   Driver        20 non-null     object
 3   Car           20 non-null     object
 4   Laps          20 non-null     int64 
 5   Time/Retired  20 non-null     object
 6   PTS           20 non-null     int64 
dtypes: int64(3), object(4)
memory usage: 1.2+ KB


# Italy Data

In [42]:
# Load the Italy race result from its CSV file and preview the first rows.
# NOTE: this file's column headers are upper-case (POS, NO, DRIVER, CAR, LAPS, TIME/RETIRED).
italy_df = pd.read_csv('data/ITALY.csv')
italy_df.head()

Unnamed: 0,POS,NO,DRIVER,CAR,LAPS,TIME/RETIRED,PTS
0,1,1,Max Verstappen,RED BULL RACING RBPT,63,32:08.0,26
1,2,11,Sergio Perez,RED BULL RACING RBPT,63,+16.527s,18
2,3,4,Lando Norris,MCLAREN MERCEDES,63,+34.834s,15
3,4,63,George Russell,MERCEDES,63,+42.506s,12
4,5,77,Valtteri Bottas,ALFA ROMEO FERRARI,63,+43.181s,10


In [43]:
# Append the Italy result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
italy_rows = (
    italy_df
    # this file uses upper-case headers; map them onto the combined table's names
    .rename(columns={'DRIVER': 'Driver', 'CAR': 'Car', 'LAPS': 'Laps',
                     'TIME/RETIRED': 'Time/Retired'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'NO' is not kept
    .assign(Race='ITALY')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'ITALY'], italy_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
79,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
78,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
77,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
76,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
75,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


# Miami Data

In [44]:
# Load the Miami race result from its CSV file and preview the first rows.
# NOTE: this file's column headers are upper-case (POS, NO, DRIVER, CAR, LAPS, TIME/RETIRED).
miami_df = pd.read_csv('data/MIAMI.csv')
miami_df.head()

Unnamed: 0,POS,NO,DRIVER,CAR,LAPS,TIME/RETIRED,PTS
0,1,1,Max Verstappen,RED BULL RACING RBPT,57,34:24.3,26
1,2,16,Charles Leclerc,FERRARI,57,+3.786s,18
2,3,55,Carlos Sainz,FERRARI,57,+8.229s,15
3,4,11,Sergio Perez,RED BULL RACING RBPT,57,+10.638s,12
4,5,63,George Russell,MERCEDES,57,+18.582s,10


In [45]:
# Append the Miami result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
miami_rows = (
    miami_df
    # this file uses upper-case headers; map them onto the combined table's names
    .rename(columns={'DRIVER': 'Driver', 'CAR': 'Car', 'LAPS': 'Laps',
                     'TIME/RETIRED': 'Time/Retired'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'NO' is not kept
    .assign(Race='MIAMI')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'MIAMI'], miami_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
99,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
98,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
97,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
96,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
95,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


# Spain Data

In [46]:
# Load the Spain race result from its CSV file and preview the first rows.
# NOTE: this file's column headers are upper-case (POS, NO, DRIVER, CAR, LAPS, TIME/RETIRED).
spain_df = pd.read_csv('data/SPAIN.csv')
spain_df.head()

Unnamed: 0,POS,NO,DRIVER,CAR,LAPS,TIME/RETIRED,PTS
0,1,1,Max Verstappen,RED BULL RACING RBPT,66,37:20.5,25
1,2,11,Sergio Perez,RED BULL RACING RBPT,66,+13.072s,19
2,3,63,George Russell,MERCEDES,66,+32.927s,15
3,4,55,Carlos Sainz,FERRARI,66,+45.208s,12
4,5,44,Lewis Hamilton,MERCEDES,66,+54.534s,10


In [47]:
# Append the Spain result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
spain_rows = (
    spain_df
    # this file uses upper-case headers; map them onto the combined table's names
    .rename(columns={'DRIVER': 'Driver', 'CAR': 'Car', 'LAPS': 'Laps',
                     'TIME/RETIRED': 'Time/Retired'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'NO' is not kept
    .assign(Race='SPAIN')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'SPAIN'], spain_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
119,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
118,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
117,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
116,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
115,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


# Monaco Data

In [48]:
# Load the Monaco race result from its CSV file and preview the first rows.
# NOTE: this file's column headers are upper-case (POS, NO, DRIVER, CAR, LAPS, TIME/RETIRED).
monaco_df = pd.read_csv('data/MONACO.csv')
monaco_df.head()

Unnamed: 0,POS,NO,DRIVER,CAR,LAPS,TIME/RETIRED,PTS
0,1,11,Sergio Perez,RED BULL RACING RBPT,64,56:30.3,25
1,2,55,Carlos Sainz,FERRARI,64,+1.154s,18
2,3,1,Max Verstappen,RED BULL RACING RBPT,64,+1.491s,15
3,4,16,Charles Leclerc,FERRARI,64,+2.922s,12
4,5,63,George Russell,MERCEDES,64,+11.968s,10


In [49]:
# Append the Monaco result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
monaco_rows = (
    monaco_df
    # this file uses upper-case headers; map them onto the combined table's names
    .rename(columns={'DRIVER': 'Driver', 'CAR': 'Car', 'LAPS': 'Laps',
                     'TIME/RETIRED': 'Time/Retired'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'NO' is not kept
    .assign(Race='MONACO')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'MONACO'], monaco_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
139,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
138,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
137,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
136,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
135,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


# Azerbaijan Data

In [50]:
# Load the Azerbaijan race result from its CSV file and preview the first rows
azerbaijan_df = pd.read_csv('data/AZERBAIJAN.csv')
azerbaijan_df.head()

Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,1,1,Max Verstappen,Red Bull Racing RBPT,51,34:05.9,25
1,2,11,Sergio Perez,Red Bull Racing RBPT,51,+20.823s,19
2,3,63,George Russell,Mercedes,51,+45.995s,15
3,4,44,Lewis Hamilton,Mercedes,51,+71.679s,12
4,5,10,Pierre Gasly,AlphaTauri RBPT,51,+77.299s,10


In [51]:
# Append the Azerbaijan result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
# The race label is written without the trailing space the loop used
# ('AZERBAIJAN '), so it follows the same format as the other race labels.
azerbaijan_rows = (
    azerbaijan_df
    .rename(columns={'Pos': 'POS'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'No' is not kept
    .assign(Race='AZERBAIJAN')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'AZERBAIJAN'], azerbaijan_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
159,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
158,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
157,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
156,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
155,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


# Canada Data

In [52]:
# Load the Canada race result from its CSV file and preview the first rows
canada_df = pd.read_csv('data/CANADA.csv')
canada_df.head()

Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,1,1,Max Verstappen,Red Bull Racing RBPT,70,36:21.8,25
1,2,55,Carlos Sainz,Ferrari,70,+0.993s,19
2,3,44,Lewis Hamilton,Mercedes,70,+7.006s,15
3,4,63,George Russell,Mercedes,70,+12.313s,12
4,5,16,Charles Leclerc,Ferrari,70,+15.168s,10


In [53]:
# Append the Canada result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
# The race label is written without the trailing space the loop used
# ('CANADA '), so it follows the same format as the other race labels.
canada_rows = (
    canada_df
    .rename(columns={'Pos': 'POS'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'No' is not kept
    .assign(Race='CANADA')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'CANADA'], canada_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
179,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
178,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
177,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
176,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
175,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN


# Great Britain Data

In [54]:
# Load the Great Britain race result from its CSV file and preview the first rows
gb_df = pd.read_csv('data/GREATBRITAIN.csv')
gb_df.head()

Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,1,55,Carlos Sainz,Ferrari,52,17:50.3,25
1,2,11,Sergio Perez,Red Bull Racing RBPT,52,+3.779s,18
2,3,44,Lewis Hamilton,Mercedes,52,+6.225s,16
3,4,16,Charles Leclerc,Ferrari,52,+8.546s,12
4,5,14,Fernando Alonso,Alpine Renault,52,+9.571s,10


In [55]:
# Append the Great Britain result to the combined race_results table.
# Built in one vectorised step rather than a per-driver loop: the loop bound a
# local called `time`, shadowing the `time` module imported at the top of the
# notebook, and grew the frame one row at a time.
# The race label is written without the trailing space the loop used
# ('GREAT BRITAIN '), so it follows the same format as the other race labels.
gb_rows = (
    gb_df
    .rename(columns={'Pos': 'POS'})
    .loc[:, ['POS', 'Driver', 'Car', 'Laps', 'Time/Retired', 'PTS']]  # 'No' is not kept
    .assign(Race='GREAT BRITAIN')
    .astype(object)  # match the object-typed columns of race_results
)

# Discard rows left by a previous run of this cell so re-running cannot duplicate the race.
race_results = pd.concat(
    [race_results[race_results['Race'] != 'GREAT BRITAIN'], gb_rows],
    ignore_index=True,
)
# Preserve the descending row labels the loop produced (first row highest, last row 0).
race_results.index = range(len(race_results) - 1, -1, -1)

race_results.head()

Unnamed: 0,POS,Driver,Car,Laps,Time/Retired,PTS,Race
199,1,Charles Leclerc,Ferrari,57,37:33.6,26,BAHRAIN
198,2,Carlos Sainz,Ferrari,57,+5.598s,18,BAHRAIN
197,3,Lewis Hamilton,Mercedes,57,+9.675s,15,BAHRAIN
196,4,George Russell,Mercedes,57,+11.211s,12,BAHRAIN
195,5,Kevin Magnussen,Haas Ferrari,57,+14.754s,10,BAHRAIN
