In [3]:
""" Load and combine ATP match data from 2000-2024 """
import pandas as pd
import glob

# Last season to include. Files for later seasons (e.g. the partial 2025
# season) are skipped by inspecting the year in the file name, rather than by
# assuming the last sorted file is always the one to drop.
LAST_SEASON = 2024

file_pattern = "../data/raw/atp_matches_*.csv"


def _season_of(path):
    """Return the trailing year of a ``..._YYYY.csv`` path, or None if it has none."""
    token = path.rsplit('_', 1)[-1][:-len('.csv')]
    return int(token) if token.isdigit() else None


# Files whose name carries no parseable year are kept, so only seasons after
# LAST_SEASON are excluded.
all_files = [
    file for file in sorted(glob.glob(file_pattern))
    if (_season_of(file) or 0) <= LAST_SEASON
]
if not all_files:
    raise FileNotFoundError(f"No match files found for pattern {file_pattern!r}")

df_list = [pd.read_csv(file) for file in all_files]
# ignore_index avoids duplicate row labels (every yearly file restarts at 0).
df = pd.concat(df_list, ignore_index=True)

df['date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')

# 'winner_name' is selected twice on purpose: once to become player_A_name and
# once to be kept as winner_name. NOTE: this means player_A is always the match
# winner in the saved file.
df = df[[
    'date', 'tourney_name', 'surface', 'round', 'best_of',
    'winner_name', 'loser_name', 'winner_name', 'score',
    'winner_rank', 'loser_rank', 'winner_rank_points', 'loser_rank_points',
]].copy()
df.columns = [
    'date', 'tournament_name', 'surface', 'round', 'best_of',
    'player_A_name', 'player_B_name', 'winner_name', 'score',
    'player_A_rank', 'player_B_rank', 'player_A_rank_points', 'player_B_rank_points',
]
print(df.head())

df.to_csv('../data/processed/atp_matches_2000-2024.csv', index=False)

        date tournament_name surface round  best_of        player_A_name  \
0 2000-01-10        Auckland    Hard   R32        3           Tommy Haas   
1 2000-01-10        Auckland    Hard   R32        3        Juan Balcells   
2 2000-01-10        Auckland    Hard   R32        3       Alberto Martin   
3 2000-01-10        Auckland    Hard   R32        3  Juan Carlos Ferrero   
4 2000-01-10        Auckland    Hard   R32        3         Michael Sell   

         player_B_name          winner_name           score  player_A_rank  \
0         Jeff Tarango           Tommy Haas     7-5 4-6 7-5           11.0   
1     Franco Squillari        Juan Balcells         7-5 7-5          211.0   
2  Alberto Berasategui       Alberto Martin         6-3 6-1           48.0   
3        Roger Federer  Juan Carlos Ferrero         6-4 6-4           45.0   
4       Nicolas Escude         Michael Sell  0-6 7-6(7) 6-1          167.0   

   player_B_rank  player_A_rank_points  player_B_rank_points  
0          

In [75]:
""" Scrape 2025 ATP match data from Tennis Abstract before Wimbledon 2025"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_tennis_abstract_match(url, timeout=30):
    """
    Extract information from single tennis match

    The date, tournament and round come from the URL's file name, which is
    expected to look like
    ``20250104-M-Canberra_CH-F-Ethan_Quinn-Joao_Fonseca.html``.
    Winner, loser and score come from the page text around the first
    ``" d. "`` marker.

    Args:
        url: Address of a single match page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        dict with keys ``tourney_date``, ``tourney_name``, ``round``,
        ``winner_name``, ``loser_name`` and ``score``. The last three are
        None when the page text contains no ``" d. "`` marker; ``score`` is
        also None when no score-like text follows the loser's name.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (previously such pages were parsed silently into empty rows).
        ValueError: if the file name has fewer than four ``-`` separated fields.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Parse from URL: 20250104-M-Canberra_CH-F-Ethan_Quinn-Joao_Fonseca.html
    filename = url.split('/')[-1]
    fields = filename.replace('.html', '').split('-')

    # Named round_code rather than `round` to avoid shadowing the builtin.
    date, _, tourney_name, round_code, *_ = fields

    page_text = soup.get_text()
    winner = None
    loser = None
    score = None

    # Find winner, loser, score from page text
    if ' d. ' in page_text:
        pieces = page_text.split(' d. ')

        # Winner: last line of the text preceding " d. "
        winner = pieces[0].strip().split('\n')[-1].strip()

        # Loser and score: text following " d. ", cut at the first "Use"
        # (presumably the start of the page's boilerplate - TODO confirm)
        after = pieces[1].split('Use')[0].strip()

        # The score starts at the first digits-separator-digits run
        score_match = re.search(r'(\d+[-\(\)]\d+.*)', after)
        if score_match:
            score = score_match.group(1).strip()
            loser = after[:score_match.start()].strip()
        else:
            loser = after

    return {
        'tourney_date': date,
        'tourney_name': tourney_name.replace('_', ' '),
        'round': round_code,
        'winner_name': winner,
        'loser_name': loser,
        'score': score
    }

def get_all_2025_atp_matches(timeout=30):
    """
    Scrape the Tennis Abstract charting index page to find all 2025 ATP matches

    Args:
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        list of absolute match-page URLs, in the order the links appear on
        the index page. A link is kept when its file name starts with
        ``2025``, contains ``-M-`` and ends with ``.html``.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
    """
    from urllib.parse import urljoin

    index_url = "https://www.tennisabstract.com/charting/"

    response = requests.get(index_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all links to match pages
    match_urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        filename = href.split('/')[-1]
        # Look for 2025 ATP (M = Men's) matches. The year is checked at the
        # start of the file name (the YYYYMMDD date), not anywhere in the href.
        if filename.startswith('2025') and '-M-' in filename and filename.endswith('.html'):
            # urljoin resolves relative hrefs against the index page and
            # leaves already-absolute hrefs untouched.
            match_urls.append(urljoin(index_url, href))

    return match_urls

# Collect every 2025 ATP match URL listed on the charting index
all_match_urls = get_all_2025_atp_matches()

# Retain only matches dated before June 30, 2025. The leading token of each
# file name is the match date as YYYYMMDD (e.g. "20250104"), compared as an int.
match_urls = [
    match_url
    for match_url in all_match_urls
    if int(match_url.split('/')[-1].split('-')[0]) < 20250630
]

# Fetch and parse every remaining match page
matches_list = [scrape_tennis_abstract_match(match_url) for match_url in match_urls]

# Assemble the records into a table ordered by date
df_2025 = (
    pd.DataFrame(matches_list)
    .sort_values('tourney_date')
    .reset_index(drop=True)
)
print(f"\nScraped {len(df_2025)} matches successfully")
print(df_2025.head())

# Write the result to disk
df_2025.to_csv('../data/processed/atp_matches_2025.csv', index=False)


Scraped 431 matches successfully
  tourney_date tourney_name round      winner_name        loser_name  \
0     20250101     Brisbane   R16  Grigor Dimitrov  Aleksandar Vukic   
1     20250101    Hong Kong   R16      Arthur Fils       Zizou Bergs   
2     20250101   United Cup    RR     Tomas Machac    Hubert Hurkacz   
3     20250102    Hong Kong   R16  Lorenzo Musetti    Gabriel Diallo   
4     20250102    Hong Kong   R16  Fabian Marozsan     Andrey Rublev   

         score  
0   6-2 7-6(5)  
1   7-6(7) 6-4  
2  7-5 3-6 6-4  
3      6-4 6-3  
4  7-5 3-6 6-3  


In [76]:
""" Scrape Wimbledon 2025 matches from Tennis Abstract """

# Collect every 2025 ATP match URL listed on the charting index
all_match_urls = get_all_2025_atp_matches()

# Keep URLs whose file name mentions "Wimbledon" but not "Wimbledon_Juniors"
wimbledon_urls = [
    match_url
    for match_url in all_match_urls
    if 'Wimbledon' in match_url.split('/')[-1]
    and "Wimbledon_Juniors" not in match_url.split('/')[-1]
]

# Fetch and parse every selected match page
matches_list = [scrape_tennis_abstract_match(match_url) for match_url in wimbledon_urls]

# Assemble the records into a table ordered by date
df_wimbledon = (
    pd.DataFrame(matches_list)
    .sort_values('tourney_date')
    .reset_index(drop=True)
)
print(f"\nScraped {len(df_wimbledon)} Wimbledon matches successfully")
print(df_wimbledon.head())

# Write the result to disk
df_wimbledon.to_csv('../data/processed/wimbledon_2025.csv', index=False)


Scraped 33 Wimbledon matches successfully
  tourney_date tourney_name round         winner_name  \
0     20250630    Wimbledon  R128  Arthur Rinderknech   
1     20250630    Wimbledon  R128         Ethan Quinn   
2     20250630    Wimbledon  R128      Carlos Alcaraz   
3     20250630    Wimbledon  R128        Taylor Fritz   
4     20250630    Wimbledon  R128     Kamil Majchrzak   

                   loser_name                         score  
0            Alexander Zverev  7-6(3) 6-7(8) 6-3 6-7(5) 6-4  
1                Henry Searle           4-6 6-2 7-6(11) 6-2  
2               Fabio Fognini        7-5 6-7(5) 7-5 2-6 6-1  
3  Giovanni Mpetshi Perricard  6-7(6) 6-7(8) 6-4 7-6(6) 6-4  
4           Matteo Berrettini           4-6 6-2 6-4 5-7 6-3  
