# Scrape the Most Recent Race Weekend's Practice Sessions
Jack Wilson
9/21/2025

# Import Modules

In [115]:
import pandas as pd
import time, random, re, os

from datetime import datetime, timedelta

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

In [26]:
# Show every column and never truncate cell contents when a DataFrame is displayed
for display_option in ('display.max_columns', "display.max_colwidth"):
    pd.set_option(display_option, None)

# Scrape Practice

In [None]:
from selenium.common.exceptions import WebDriverException

# Find the most recent season that has published race results and collect the
# result-page URL and round number of every race in it.
#
# Produces (for the later steps): `year`, `race_url`, `rounds`, `browser`.

# Oldest season to try before giving up
OLDEST_SEASON = 1950

# Start with the current year and step back one season at a time
year = datetime.now().year

# Establish URL and round list
race_url = []
rounds = []

# Establish web browser
browser = webdriver.Chrome()
browser.maximize_window()

try:
    while year >= OLDEST_SEASON:
        # Get URL
        url = 'https://www.formula1.com/en/results/' + str(year) + '/races'
        print(year)
        print(url)
        browser.get(url)
        time.sleep(random.uniform(1, 2))

        # Collect this season's links separately so a failed or empty season
        # never leaves partial entries behind
        season_links = []
        try:
            for table in browser.find_elements(By.TAG_NAME, 'table'):
                # Skip the header row
                for row in table.find_elements(By.TAG_NAME, 'tr')[1:]:
                    cells = row.find_elements(By.TAG_NAME, 'td')
                    if not cells:
                        continue
                    # find_elements returns an empty list instead of raising,
                    # so rows without a link are simply skipped
                    anchors = cells[0].find_elements(By.TAG_NAME, 'a')
                    if not anchors:
                        continue
                    href = anchors[0].get_attribute('href')
                    if href:
                        season_links.append(href)
        except WebDriverException as error:
            # Treat an unreadable page like a season without data
            print('Could not read the race list for', year, '-', error)
            season_links = []

        if season_links:
            # Data found: rounds are numbered 1..N in table order
            race_url = season_links
            rounds = list(range(1, len(race_url) + 1))
            break

        # No data for this season, so try the one before it
        year -= 1
except BaseException:
    # Do not leave a browser window/driver process behind on failure
    browser.quit()
    raise

if not race_url:
    browser.quit()
    raise RuntimeError(
        'No race results were found for any season back to ' + str(OLDEST_SEASON)
    )

# The final entry of each scraped list describes the most recent race
last_race_url, last_round = race_url[-1], rounds[-1]

# The country name is the second-to-last path segment of that race's URL
race_parts = last_race_url.split('/')
last_race_country = race_parts[-2]

# Practice sessions are numbered from 1
p = 1

# One empty list per output column
years = []
race = []
race_round = []
session = []
position = []
driver_name = []
team_name = []
lap_times = []
laps = []

from selenium.common.exceptions import WebDriverException

# Matches a time cell once the leading "+" and trailing "s" are removed:
# an absolute time such as "1:42.704" or a gap such as "0.310"
_LAP_TIME_PATTERN = re.compile(r"(?:(\d+):)?(\d+)\.(\d{3})")


def _parse_lap_time(text):
    """Convert a time/gap cell into a timedelta.

    Accepts "m:ss.mmm" (absolute time) and "+s.mmms" (gap to the leader).
    Returns None when the cell does not contain a time in either form.
    """
    match = _LAP_TIME_PATTERN.fullmatch(text.strip().strip('+s'))
    if match is None:
        return None
    minutes, seconds, milliseconds = match.groups(default='0')
    return timedelta(
        minutes=int(minutes),
        seconds=int(seconds),
        milliseconds=int(milliseconds),
    )


# Scrape practice sessions 1-3 of the most recent race and append one entry per
# result row to every column list, so the lists always stay the same length.
try:
    for p in range(1, 4):
        # Get the practice URL
        practice_url = last_race_url.replace('/race-result', '/practice/') + str(p)
        browser.get(practice_url)
        print(practice_url)
        time.sleep(random.uniform(1, 2))

        # Rows are buffered per session and only committed once the whole
        # session was read, so a failure cannot leave half-filled columns
        session_rows = []
        try:
            for table in browser.find_elements(By.TAG_NAME, 'table'):
                # The first data row holds the fastest (absolute) time; every
                # later row holds its gap to that time
                base_time = None
                is_first_row = True
                # Skip the header row
                for row in table.find_elements(By.TAG_NAME, 'tr')[1:]:
                    cells = row.find_elements(By.TAG_NAME, 'td')
                    if len(cells) < 6:
                        # Not a result row (columns 0-5 are read below)
                        continue

                    parsed = _parse_lap_time(cells[4].text)
                    if is_first_row:
                        base_time = parsed
                        lap = parsed
                        print('base: ', base_time)
                        is_first_row = False
                    elif parsed is None or base_time is None:
                        # No usable time for this row (or no base to add to)
                        lap = None
                    else:
                        print('gap: ', parsed)
                        # Add the time gap to the base time
                        lap = base_time + parsed
                        print('new appended time: ', lap)

                    session_rows.append(
                        (cells[0].text, cells[2].text, cells[3].text, lap, cells[5].text)
                    )
        except WebDriverException as error:
            # Skip only this session; the remaining sessions are still tried
            print('Skipping practice ' + str(p) + ': ', error)
            continue

        for position_text, driver_text, team_text, lap, laps_text in session_rows:
            # Constant data
            years.append(year)
            race.append(last_race_country)
            race_round.append(last_round)
            session.append('practice ' + str(p))
            # Table data
            position.append(position_text)
            driver_name.append(driver_text)
            team_name.append(team_text)
            # NaT keeps the column a timedelta column when a row has no time
            lap_times.append(lap if lap is not None else pd.NaT)
            laps.append(laps_text)
finally:
    # quit() ends the whole driver session (close() only closes the window),
    # and running it in finally releases the browser even after an error
    browser.quit()


2025
https://www.formula1.com/en/results/2025/races
https://www.formula1.com/en/results/2025/races/1269/azerbaijan/practice/1
base:  0:01:42.704000
gap:  0:00:00.310000
new appended time:  0:01:43.014000
gap:  0:00:00.552000
new appended time:  0:01:43.256000
gap:  0:00:00.553000
new appended time:  0:01:43.257000
gap:  0:00:00.859000
new appended time:  0:01:43.563000
gap:  0:00:01.034000
new appended time:  0:01:43.738000
gap:  0:00:01.086000
new appended time:  0:01:43.790000
gap:  0:00:01.155000
new appended time:  0:01:43.859000
gap:  0:00:01.199000
new appended time:  0:01:43.903000
gap:  0:00:01.271000
new appended time:  0:01:43.975000
gap:  0:00:01.281000
new appended time:  0:01:43.985000
gap:  0:00:01.282000
new appended time:  0:01:43.986000
gap:  0:00:01.383000
new appended time:  0:01:44.087000
gap:  0:00:01.383000
new appended time:  0:01:44.087000
gap:  0:00:01.435000
new appended time:  0:01:44.139000
gap:  0:00:01.447000
new appended time:  0:01:44.151000
gap:  0:00:0

In [97]:
# Assemble the scraped column lists into a single table, pairing each output
# column name with the list that holds its values
column_names = ['year', 'race', 'round', 'session', 'position',
                'driver_name', 'team_name', 'lap_time', 'laps']
column_values = [years, race, race_round, session, position,
                 driver_name, team_name, lap_times, laps]
practice_data = pd.DataFrame(dict(zip(column_names, column_values)))

In [None]:
# Express lap times as a plain number of seconds
lap_time_deltas = practice_data['lap_time']
practice_data['lap_time'] = lap_time_deltas.dt.total_seconds()

# Convert the scraped text columns to numeric types
for column, dtype in (('position', float), ('laps', int)):
    practice_data[column] = practice_data[column].astype(dtype)

In [121]:
# Build a short label such as 'p1-2-3' from the sessions that were actually scraped
practice_session_list = practice_data['session'].unique()
if len(practice_session_list) == 0:
    raise ValueError('No practice sessions were scraped, so no label can be built')

# Session names look like 'practice 2'; keep only the trailing number so the
# label names the real sessions (e.g. 'p1-3' when practice 2 is missing)
session_numbers = [str(name).split()[-1] for name in practice_session_list]
abr_practice_list = 'p' + '-'.join(session_numbers)
print(abr_practice_list)

p1-2-3


In [128]:
# Timestamp that makes the output file name unique
timestamp = datetime.now()
print(timestamp)

# Format explicitly instead of editing str(timestamp): str() leaves out the
# fractional part when the microsecond is 0, which would produce a file name
# without the trailing "-<microseconds>" field. %f always writes six digits.
pc_time = timestamp.strftime("%Y-%m-%d %H-%M-%S-%f")
print(pc_time)

2025-09-23 10:22:07.024760
2025-09-23 10-22-07-024760


In [134]:
# Save the scraped results to a timestamped CSV in the working directory.
# index=False keeps the DataFrame's row index out of the file, so reading the
# file back does not add an extra "Unnamed: 0" column.
practice_data.to_csv(
    f'{year}_{last_race_country}_{abr_practice_list}_recent {pc_time}.csv',
    index=False,
)

In [None]:
# Remove CSVs tagged with the keyword, then load the CSV whose file name
# carries the newest timestamp.
keyword = 'recent'

# List all files in the folder path
folder_path = os.getcwd()

# NOTE(review): every CSV with 'recent' in its name is deleted here before the
# newest file is chosen, including files saved under a "..._recent <timestamp>.csv"
# name - confirm that this is intended.
for f in os.listdir(folder_path):
    if f.endswith('.csv') and keyword in f:
        os.remove(os.path.join(folder_path, f))
        print('Deleted: ', f)
files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Regex to capture the datetime part at end of file name
pattern = re.compile(r"(\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}-\d+)")

file_dates = []
for f in files:
    match = pattern.search(f)
    if not match:
        continue
    try:
        dt = datetime.strptime(match.group(1), "%Y-%m-%d %H-%M-%S-%f")
    except ValueError:
        # The digits matched the pattern but do not form a valid timestamp
        continue
    file_dates.append((dt, f))

# Fail with a clear message instead of max()'s "empty sequence" error
if not file_dates:
    raise FileNotFoundError(
        f'No .csv file with a timestamp in its name was found in {folder_path}'
    )

# Pick most recent file
latest_file = max(file_dates, key=lambda x: x[0])[1]
print(latest_file)
recent_practice_data = pd.read_csv(os.path.join(folder_path, latest_file))

2025_azerbaijan_p1-2-3 2025-09-23 10-22-07-024760.csv


In [133]:
# Display the data that was read back from the most recent CSV
recent_practice_data

Unnamed: 0.1,Unnamed: 0,year,race,round,session,position,driver_name,team_name,lap_time,laps
0,0,2025,azerbaijan,17,practice 1,1.0,Lando Norris,McLaren,102.704,19
1,1,2025,azerbaijan,17,practice 1,2.0,Oscar Piastri,McLaren,103.014,14
2,2,2025,azerbaijan,17,practice 1,3.0,Charles Leclerc,Ferrari,103.256,17
3,3,2025,azerbaijan,17,practice 1,4.0,George Russell,Mercedes,103.257,16
4,4,2025,azerbaijan,17,practice 1,5.0,Alexander Albon,Williams,103.563,17
5,5,2025,azerbaijan,17,practice 1,6.0,Yuki Tsunoda,Red Bull Racing,103.738,16
6,6,2025,azerbaijan,17,practice 1,7.0,Max Verstappen,Red Bull Racing,103.79,15
7,7,2025,azerbaijan,17,practice 1,8.0,Carlos Sainz,Williams,103.859,17
8,8,2025,azerbaijan,17,practice 1,9.0,Liam Lawson,Racing Bulls,103.903,17
9,9,2025,azerbaijan,17,practice 1,10.0,Isack Hadjar,Racing Bulls,103.975,17
