# Clean Data
Jack Wilson
9/24/2025

# Import Modules

In [1]:
import pandas as pd

In [2]:
# Lift pandas' display limits so wide frames and long cell values (such as the
# race URLs) are shown in full instead of being truncated.
pd.set_option(
    'display.max_columns', None,
    "display.max_colwidth", None,
)

# Clean F1 Website [Race Results 2001-2017]

## Load Data

In [3]:
# Read the raw 2001-2017 race results and preview the first rows.
races_2001 = pd.read_csv(
    '../data/raw/races_results_raw_2001-2017.csv',
    encoding='utf-8',
)
races_2001.head()

Unnamed: 0,race_url,driverid,driver_name,position,points
0,https://www.formula1.com/en/results/2001/races/703/australia/race-result,1,Michael Schumacher,1,10.0
1,https://www.formula1.com/en/results/2001/races/703/australia/race-result,2,David Coulthard,2,6.0
2,https://www.formula1.com/en/results/2001/races/703/australia/race-result,3,Rubens Barrichello,3,4.0
3,https://www.formula1.com/en/results/2001/races/703/australia/race-result,4,Nick Heidfeld,4,3.0
4,https://www.formula1.com/en/results/2001/races/703/australia/race-result,5,Heinz-Harald Frentzen,5,2.0


## Separate Position and Status

In [5]:
# List every distinct raw position value, in order of first appearance.
pd.unique(races_2001['position'])

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', 'NC', '15', '16', '17', '18', '19', '20', '21', 'DQ',
       '22', '23', '24', 'EX'], dtype=object)

In [6]:
# Raw position codes that are kept verbatim as the row's status.
KEPT_STATUS_CODES = ("NC", "DQ", "DNS", "EX")


def split_position_and_status(race_urls, raw_positions):
    """Derive a numeric finishing position and a status for each result row.

    Rows are walked in order. A raw position that ``int()`` accepts is kept as
    the finishing position with status "CLAS". Any other value is given the
    next sequential position within its race (one more than the previous row's
    position, or 1 at the start of a race); its status is the raw code when it
    is one of NC/DQ/DNS/EX and "DNF" otherwise.

    Parameters
    ----------
    race_urls : iterable
        Race identifier for each row. A value that differs from the previous
        row's starts a new race, so rows of one race must be contiguous.
    raw_positions : iterable
        Raw position value for each row, aligned with ``race_urls``.

    Returns
    -------
    tuple of (list, list)
        Finishing positions and statuses, one entry per input row.
    """
    end_positions = []
    statuses = []
    next_position = 1
    current_race = None

    for race_url, raw_position in zip(race_urls, raw_positions):
        # The position counter restarts whenever the race changes.
        if race_url != current_race:
            current_race = race_url
            next_position = 1

        try:
            end_position = int(raw_position)
            status = "CLAS"
        except ValueError:
            # Not a number: slot the row in after the previous one and record
            # why it has no numeric result.
            end_position = next_position
            status = raw_position if raw_position in KEPT_STATUS_CODES else "DNF"

        end_positions.append(end_position)
        statuses.append(status)
        next_position = end_position + 1

    return end_positions, statuses


# Keep the original value under a new name. Renaming first, and not in place,
# keeps this cell re-runnable: once "position" is gone the rename is a no-op.
races_2001 = races_2001.rename(columns={"position": "raw_end_position"})

end_positions, statuses = split_position_and_status(
    races_2001["race_url"], races_2001["raw_end_position"]
)

# Assign back to dataframe
races_2001["end_position"] = end_positions
races_2001["position_status"] = statuses
races_2001.head()

Unnamed: 0,race_url,driverid,driver_name,raw_end_position,points,end_position,position_status
0,https://www.formula1.com/en/results/2001/races/703/australia/race-result,1,Michael Schumacher,1,10.0,1,CLAS
1,https://www.formula1.com/en/results/2001/races/703/australia/race-result,2,David Coulthard,2,6.0,2,CLAS
2,https://www.formula1.com/en/results/2001/races/703/australia/race-result,3,Rubens Barrichello,3,4.0,3,CLAS
3,https://www.formula1.com/en/results/2001/races/703/australia/race-result,4,Nick Heidfeld,4,3.0,4,CLAS
4,https://www.formula1.com/en/results/2001/races/703/australia/race-result,5,Heinz-Harald Frentzen,5,2.0,5,CLAS


## Consolidate Statuses

In [8]:
# Distinct status codes produced by the split above, before consolidation.
pd.unique(races_2001['position_status'])

array(['CLAS', 'NC', 'DQ', 'EX'], dtype=object)

In [9]:
# Fold the 'EX' status into 'DQ' so both are counted under a single code.
races_2001['position_status'] = races_2001['position_status'].replace({'EX': 'DQ'})

In [10]:
# Confirm that 'EX' no longer appears among the status codes.
pd.unique(races_2001['position_status'])

array(['CLAS', 'NC', 'DQ'], dtype=object)

## Aggregate Results

In [11]:
# Fix the dtypes used by the aggregation: end_position as int, points as float.
races_2001 = races_2001.astype({'end_position': int, 'points': float})

In [12]:
def summarize_driver_results(results):
    """Collapse per-race result rows into one summary row per driver.

    Every statistic, including the status rates, is computed in a single
    groupby on (driverid, driver_name). Each rate therefore belongs to the row
    it is stored in, with no reliance on two separately grouped results
    happening to share the same length and order.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain the columns driverid, driver_name, end_position, points
        and position_status.

    Returns
    -------
    pandas.DataFrame
        Columns: driverid, driver_name, avg_position, std_position,
        total_points, avg_points, std_points, CLAS_rate, NC_rate, DQ_rate.
        Numeric values are rounded to 2 decimals.
    """
    # Boolean indicator columns: their group mean is the share of rows with
    # that status.
    flagged = results.assign(
        is_clas=results['position_status'].eq('CLAS'),
        is_nc=results['position_status'].eq('NC'),
        is_dq=results['position_status'].eq('DQ'),
    )

    return (
        flagged
        .groupby(['driverid', 'driver_name'])
        .agg(
            avg_position=('end_position', 'mean'),
            std_position=('end_position', 'std'),
            total_points=('points', 'sum'),
            avg_points=('points', 'mean'),
            std_points=('points', 'std'),
            CLAS_rate=('is_clas', 'mean'),
            NC_rate=('is_nc', 'mean'),
            DQ_rate=('is_dq', 'mean'),
        )
        .round(2)
        .reset_index()
    )


# Aggregate the races_2001 dataframe, including CLAS, NC and DQ rates
races_2001_agg = summarize_driver_results(races_2001)

races_2001_agg.head()

Unnamed: 0,driverid,driver_name,avg_position,std_position,total_points,avg_points,std_points,CLAS_rate,NC_rate,DQ_rate
0,1,Michael Schumacher,7.0,6.85,888.0,5.45,4.24,0.84,0.16,0.0
1,2,David Coulthard,10.21,6.04,241.0,1.72,2.44,0.72,0.28,0.0
2,3,Rubens Barrichello,9.15,6.01,519.0,2.68,3.3,0.84,0.16,0.0
3,4,Nick Heidfeld,10.19,5.22,259.0,1.55,2.45,0.79,0.21,0.0
4,5,Heinz-Harald Frentzen,12.6,5.28,21.0,0.5,1.27,0.6,0.4,0.0


## Save Clean DataFrame

In [13]:
# Write the per-driver summary as UTF-8 CSV, leaving out the row index.
# NOTE(review): this relative path has no directory part, unlike the raw input
# read from ../data/raw/ -- confirm the intended output location.
races_2001_agg.to_csv(
    'races_results_clean_2001-2017.csv',
    index=False,
    encoding='utf-8',
)

# Clean F1 Website [Race Results 2018-2025]

In [None]:
# Read the raw 2018-2025 race results and preview the first rows.
races_2018 = pd.read_csv(
    '../data/raw/races_results_raw_2018-2025.csv',
    encoding='utf-8',
)
races_2018.head()