# EXPLORATORY ANALYSIS OF THE DATASET combined_driver_standings.csv


In [2]:
import plotly.express as px
import pandas as pd

# Load the combined driver-standings table that every later cell works from.
data = pd.read_csv(filepath_or_buffer="data/combined_driver_standings.csv")


In [3]:
# First look: preview the first rows, then the column dtypes / non-null counts.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
data.info()

   driverStandingsId  raceId  driverId  points  position positionText  wins  \
0                  1      18         1    10.0         1            1     1   
1                  2      18         2     8.0         2            2     0   
2                  3      18         3     6.0         3            3     0   
3                  4      18         4     5.0         4            4     0   
4                  5      18         5     4.0         5            5     0   

    driverRef  year  
0    hamilton  2008  
1    heidfeld  2008  
2     rosberg  2008  
3      alonso  2008  
4  kovalainen  2008  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34595 entries, 0 to 34594
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   driverStandingsId  34595 non-null  int64  
 1   raceId             34595 non-null  int64  
 2   driverId           34595 non-null  int64  
 3   points             34595 non-null  float

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
summary_stats = data.describe()
print("\nStatistical description:", summary_stats, sep="\n")


Statistical description:
       driverStandingsId        raceId      driverId        points  \
count       34595.000000  34595.000000  34595.000000  34595.000000   
mean        42944.384188    580.120104    313.460760     14.114939   
std         21859.815278    289.290183    272.052171     37.235186   
min             1.000000      1.000000      1.000000      0.000000   
25%         19767.500000    352.000000     87.000000      0.000000   
50%         49910.000000    600.000000    222.000000      1.000000   
75%         59299.500000    803.000000    517.000000     10.000000   
max         72871.000000   1132.000000    860.000000    575.000000   

           position          wins          year  
count  34595.000000  34595.000000  34595.000000  
mean      19.778928      0.273074   1985.453592  
std       16.331054      1.024520     21.500652  
min        1.000000      0.000000   1950.000000  
25%        8.000000      0.000000   1968.000000  
50%       16.000000      0.000000   1985.00

In [5]:
# Count the null entries in each column to see whether any cleaning is needed.
missing_counts = data.isna().sum()
print("\nMissing values per column:", missing_counts, sep="\n")


Missing values per column:
driverStandingsId    0
raceId               0
driverId             0
points               0
position             0
positionText         0
wins                 0
driverRef            0
year                 0
dtype: int64


## Basic and exploratory visualizations:

In [6]:
# Points per driver
# Each row is a driver's championship standing after one race, so `points` is
# the running season total, not the points scored in that race. Summing every
# row would count early-season points once per remaining race. Instead, take
# the end-of-season total (the per-season maximum, since the running total
# never decreases) and only then add the seasons together.
season_totals = (
    data.groupby(['driverRef', 'year'])['points'].max().reset_index()
)
points_by_driver = (
    season_totals.groupby('driverRef')['points'].sum().reset_index()
)

# Sort descending
points_by_driver = points_by_driver.sort_values(by='points', ascending=False)

# Top 10 drivers
top_10_drivers = points_by_driver.head(10)

# Bar chart
fig = px.bar(top_10_drivers, x='points', y='driverRef', orientation='h',
             title='Total Points by Driver (Top 10)',
             labels={'points': 'Total Points', 'driverRef': 'Driver'},
             text='points')
fig.update_traces(texttemplate='%{text:.0f}', textposition='outside')
# A categorical y-axis is drawn bottom-up; order by bar length so the
# highest-scoring driver ends up at the top of the chart.
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()


In [7]:
# Bar chart: Wins by driver in 2022
# Keep only 2022 standings rows where the driver already has at least one win.
victories_2022 = data[(data['year'] == 2022) & (data['wins'] > 0)]
# `wins` in a standings row is the running count of wins so far that season,
# so the season total is the per-driver maximum; summing the rows would add
# the same win again for every later race.
victories_by_driver = (
    victories_2022.groupby('driverRef')['wins'].max().reset_index()
    .sort_values(by='wins', ascending=False)
)

fig = px.bar(victories_by_driver, x='driverRef', y='wins', color='driverRef',
             title='Wins by Driver in 2022',
             labels={'wins': 'Wins', 'driverRef': 'Driver'},
             text='wins')
fig.update_traces(texttemplate='%{text:.0f}', textposition='outside')
fig.show()
