# Tennis

TODOs:

* Scrape the ATP 100 list for years 2022-2012 (done)
* Join ATP list tables (done)
* Figure out how not to get banned from APTTour.com 
* Run statistical analysis

In [1]:
# ### SELENIUM ###
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from typing import List, Union
from bs4 import BeautifulSoup
import os
import pandas as pd
from requests import get as get_website
from tqdm.notebook import tqdm
from time import sleep
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

def get_height(url: str) -> Union[float, None]:
    """This method uses the player link to get the height

    Parameters
    ----------
    url : str
        url to the player description page
    driver : webdriver
        webdrive Selenium scrape

    Returns
    -------
    float or None
        returns the player height in cm
    """
    website_ex = get_website(url).content
    soup = BeautifulSoup(website_ex, "html.parser")
    find_res: Union[str, None] = soup.find("th", string="Height")
    if find_res:
        height_string: str = find_res.nextSibling.text
        meters_idx: int = height_string.find('m')
        height = height_string[meters_idx-5:meters_idx-1]
        return float(height) * 100
    else:
        return None

In [2]:

# Questa opzione fa sì che non vada ad aprirsi fisicamente il browser
# Ti consiglio di scommenterla solamente una volta cher hai fatto
# bene il codice e non ti serve più vedere.
# Se metti l'opzione ti aiuta a risparmiare un pochettino di tempo, ma non tanto
# chrome_options.add_argument("--headless")

driver = webdriver.Safari()

In [3]:
dates = pd.date_range(start='2022', end='today', freq='W-MON')
slee_time: int = 5
print(f'Number of weeks: {len(dates)}')

Number of weeks: 19


In [4]:
players_dfs: List[pd.DataFrame] = []
for date in tqdm(dates, desc='Current week progress'):
    url : str = f"https://www.atptour.com/en/rankings/singles?rankRange=0-100&rankDate={date}"
    print(f'Getting players for week {date}')
    driver.get(url)
    player_table = driver.find_element_by_class_name('mega-table')
    player_info_path = player_table.find_elements_by_xpath('//*[@id="player-rank-detail-ajax"]/tbody/tr/td/*[@class="player-cell-wrapper"]/a[1]')
    # NOTE: this list creation is called "list comprehension"
    player_names: List[str] = [player_name_path.text.strip() 
                            for player_name_path in player_info_path]
    player_links = [f"https://en.wikipedia.org/wiki/{player_name.replace(' ', '_')}" 
                    for player_name in player_names]
    players_df = pd.DataFrame({'Name': player_names, 
                               'Link': player_links})

    players_df['Height'] = players_df['Link'].progress_apply(get_height)
    players_dfs.append(players_df.copy())
    del players_df
    print(f'Sleeping for {slee_time}sec not to be flagged by ATP website...')
    for _ in tqdm(range(slee_time), desc='Sleep progress'):
        sleep(1)

Current week progress:   0%|          | 0/19 [00:00<?, ?it/s]

Getting players for week 2022-01-03 00:00:00


  0%|          | 0/100 [00:00<?, ?it/s]

Sleeping for 5sec not to be flagged by ATP website...


Sleep progress:   0%|          | 0/5 [00:00<?, ?it/s]

Getting players for week 2022-01-10 00:00:00


NoSuchElementException: Message: 


In [15]:
players_dfs_reindexed = [df.set_index('Name') for df in players_dfs]
pd.concat(players_dfs_reindexed, join='outer', axis=0).sort_index()

Unnamed: 0_level_0,Link,Height
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Adrian Mannarino,https://en.wikipedia.org/wiki/Adrian_Mannarino,180.0
Adrian Mannarino,https://en.wikipedia.org/wiki/Adrian_Mannarino,180.0
Albert Ramos-Vinolas,https://en.wikipedia.org/wiki/Albert_Ramos-Vin...,188.0
Albert Ramos-Vinolas,https://en.wikipedia.org/wiki/Albert_Ramos-Vin...,188.0
Alejandro Davidovich Fokina,https://en.wikipedia.org/wiki/Alejandro_Davido...,183.0
...,...,...
Tommy Paul,https://en.wikipedia.org/wiki/Tommy_Paul,163.0
Ugo Humbert,https://en.wikipedia.org/wiki/Ugo_Humbert,188.0
Ugo Humbert,https://en.wikipedia.org/wiki/Ugo_Humbert,188.0
Yoshihito Nishioka,https://en.wikipedia.org/wiki/Yoshihito_Nishioka,170.0
