# Normal Updates - Last Update: 2025-03-03

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import undetected_chromedriver as uc
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from geopy.geocoders import Nominatim
import time

## 1. Update ATP Ranking

### Single Date Update

In [2]:
# Load txt files
def load_txt(file):
    """Load a comma-separated text file into a DataFrame of strings.

    The first line supplies the column names and every following line is one
    row. Fields are split on every comma (no quoting support) and all values
    stay as strings.

    Parameters
    ----------
    file : str
        Path of the UTF-8 (optionally BOM-prefixed) text file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header field, one row per remaining line.
    """
    with open(file, 'r', encoding='utf-8-sig') as f:
        # Strip only the newline: slicing off the last character would eat real
        # data when the file does not end with a newline.
        attributes = f.readline().rstrip('\n').split(',')
        rows = [line.rstrip('\n').split(',') for line in f]
    return pd.DataFrame(rows, columns=attributes)

In [3]:
# The website's design may change over time!

def scrape_a_date_rank(date, previous):
    """Scrape the ATP singles top-100 ranking for one week and extend the cumulative file.

    Writes ``m/rank/dates/<date>.txt`` with one row per scraped player, then
    loads ``m/rank/atpranking_20000110_<previous>.txt``, adds a ``<date>``
    points column (0 for players absent that week, and 0 in every earlier
    week for players seen for the first time) and saves the result as
    ``m/rank/atpranking_20000110_<date>.txt`` (dashes removed from the dates
    in both cumulative file names).

    Parameters
    ----------
    date : str
        Ranking week to scrape, e.g. ``'2025-03-03'``.
    previous : str
        Week of the existing cumulative file to extend, same format.

    Raises
    ------
    RuntimeError
        If the ranking table is not present in the downloaded page.
    """
    prefix = 'https://www.atptour.com/en/rankings/singles?RankRange=0-100&DateWeek='  # subject to change
    img_pre = 'https://www.atptour.com/en/~/media/images/flags/'

    # Raw string: "\c" is not a valid escape sequence.
    chromedrive_path = r"D:\chromedriver.exe"
    driver = webdriver.Chrome(service=Service(chromedrive_path))
    try:
        driver.set_window_rect(x=10, y=10, width=1500, height=800)
        driver.get(prefix + date)
        # driver.refresh()   # Refreshing can be detected by blockers
        print('Start scraping:', date)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()

    table = soup.find('table', {'class': 'mega-table desktop-table non-live'})
    if table is None:
        raise RuntimeError('Ranking table not found for ' + date + ' (page layout changed or request blocked)')

    scraped = {attr: [] for attr in ['Date', 'Rank', 'Country', 'Image', 'Player', 'Points']}
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 8:  # only full data rows carry a player
            continue
        scraped['Rank'].append(tds[0].text.strip())

        # country, image, player name
        lis = tds[1].find_all('li')
        country_abbr = (lis[1].svg.use.get('href', None))[-3:]
        scraped['Country'].append(country_abbr.upper())
        scraped['Image'].append(img_pre + country_abbr + '.svg')
        scraped['Player'].append(lis[2].text.strip())

        scraped['Points'].append(tds[3].text.strip().replace(',', ''))

    # One date entry per scraped row, so the frame is valid even when the page
    # does not yield exactly 100 players.
    scraped['Date'] = [date] * len(scraped['Player'])

    # save the single-week file
    df = pd.DataFrame(scraped)
    df.to_csv('m/rank/dates/' + date + '.txt', index = False, sep = ',', encoding='utf-8-sig')
    print('Finished:', date)

    # open the cumulative ranking file
    dict_r = load_txt('m/rank/atpranking_20000110_'+previous.replace('-', '')+'.txt').to_dict('list')

    # Points of this week keyed by player; the first occurrence wins.
    points_by_player = {}
    for player, points in zip(scraped['Player'], scraped['Points']):
        points_by_player.setdefault(player, points)

    # points for players already in the file (0 when not ranked this week)
    dict_r[date] = [points_by_player.get(p, 0) for p in dict_r['Player']]

    # Every column except the identity columns and the new week is an earlier week.
    history_cols = [key for key in dict_r if key not in ('Player', 'Country', 'Country_Image', date)]

    # add players that appear for the first time
    known = set(dict_r['Player'])
    for i, p in enumerate(scraped['Player']):
        if p in known:
            continue
        known.add(p)
        dict_r['Player'].append(p)
        dict_r['Country'].append(scraped['Country'][i])
        dict_r['Country_Image'].append(scraped['Image'][i])
        dict_r[date].append(scraped['Points'][i])
        # a new player has no points in any earlier week
        for d in history_cols:
            dict_r[d].append(0)

    # save the extended cumulative file
    df_new = pd.DataFrame(dict_r)
    df_new.to_csv('m/rank/atpranking_20000110_'+date.replace('-', '')+'.txt', index = False, sep = ',', encoding='utf-8-sig')

In [4]:
# Scrape the 2025-03-03 ranking and extend the cumulative file that ends at 2025-02-24
scrape_a_date_rank('2025-03-03', '2025-02-24')

Start scraping: 2025-03-03
Finished: 2025-03-03


Remember to add the new dates to dates.txt

Remember to update rank file in Flourish

## 2. Update ATP Links According to New Ranks

In [5]:
# Define the function to update ATP profile links by ATP Top 100 ranks
def update_link_by_new_rank(dates):
    '''
    Add ATP profile links for top-100 players that are not yet in the basics file.

    For each week in ``dates`` the ranking page is loaded; every ranked player
    whose name (dashes replaced by spaces) is missing from
    ``m/basics/atpprofilebasics.txt`` gets a profile link appended to
    ``m/basics/atpprofilelinks.txt``. The links file is rewritten without
    duplicates or empty lines, keeping first-seen order.

    dates: a list of dates text

    Raises RuntimeError if the ranking table is missing from a page.
    '''
    prefix = 'https://www.atptour.com/en/rankings/singles?RankRange=0-100&DateWeek='
    head = 'https://www.atptour.com'

    # players that already have a profile row
    df = load_txt('m/basics/atpprofilebasics.txt')
    known_players = set(df['Player'])

    with open('m/basics/atpprofilelinks.txt', 'r', encoding='utf-8-sig') as f:
        # strip only the newline so a last line without one is not truncated
        links = [line.rstrip('\n') for line in f]

    # Raw string: "\c" is not a valid escape sequence.
    chromedrive_path = r"D:\chromedriver.exe"
    driver = webdriver.Chrome(service=Service(chromedrive_path))
    try:
        driver.set_window_rect(x=10, y=10, width=1500, height=800)

        for date in dates:
            driver.get(prefix + date)
            # driver.refresh()
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', {'class':'mega-table desktop-table non-live'})
            if table is None:
                raise RuntimeError('Ranking table not found for ' + date + ' (page layout changed or request blocked)')
            for tr in table.find_all('tr')[1:]:  # first row is the header
                tds = tr.find_all('td')
                if len(tds) != 8:
                    continue
                player_item = tds[1].find_all('li')[2]
                player = player_item.text.strip().replace('-', ' ')
                if player not in known_players:
                    # add new link
                    links.append(head + player_item.a.get('href'))
                    known_players.add(player)
                    print('Found:', player)
            print('Finish:', date)
    finally:
        # Always release the browser, even when a page fails to load or parse.
        driver.quit()

    # Rewrite the links file: unique, non-empty, in first-seen order so the
    # file does not get reshuffled on every run.
    written = set()
    with open('m/basics/atpprofilelinks.txt', 'w', encoding='utf-8-sig') as f:
        for link in links:
            if link != '' and link not in written:
                written.add(link)
                f.write(link)
                f.write('\n')

In [6]:
# Collect profile links for players ranked on 2025-03-03 that are not in the basics file yet
update_link_by_new_rank(["2025-03-03"])

Found: Jaime Faria
Finish: 2025-03-03


## 3. Update ATP Profile Information

In [7]:
# Update new basics based on new links in atprofilelinks
# Website design may change
#
# For every link in atpprofilelinks.txt that is not yet listed in
# atpprofilebasics.txt: scrape the profile page, geocode the birthplace and
# append one row. The basics file is rewritten at the end.

def _profile_span(soup, list_class, item_index):
    """Return the text of the second <span> of one <li> in a profile list, or '' if absent."""
    try:
        items = soup.find('ul', {'class': list_class}).find_all('li')
        return items[item_index].find_all('span')[1].text
    except (AttributeError, IndexError):
        return ''


def _text_in_parens(text, drop):
    """Return what follows the first '(' in `text`, minus the last `drop` characters, or ''."""
    parts = text.split('(')
    if len(parts) < 2:
        return ''
    return parts[1][:-drop]


def _geocode_birthplace(geolocator, birthplace, pause=1.0):
    """Geocode a birthplace, retrying with progressively simpler queries.

    Tries the full text, then "first,second" part, then each of the first
    three parts on its own. Returns (matched_address, lat, lon), or
    ('', '', '') when nothing matches.
    """
    # The scraped text uses ' - ' between parts (", " was replaced earlier),
    # so split on both separators.
    parts = [part.strip() for part in birthplace.replace(' - ', ',').split(',') if part.strip()]
    queries = [birthplace]
    if len(parts) >= 2:
        queries.append(parts[0] + ',' + parts[1])
    queries.extend(parts[:3])

    tried = set()
    for query in queries:
        if not query or query in tried:
            continue
        tried.add(query)
        try:
            location = geolocator.geocode(query)
        except Exception as exc:  # best effort: service/network errors fall through to the next query
            print('Geocoding failed for', repr(query), '-', exc)
            location = None
        # Nominatim's usage policy allows at most one request per second.
        time.sleep(pause)
        if location is not None:
            return location.address.replace(',', ' -'), location.latitude, location.longitude
    return '', '', ''


with open('m/basics/atpprofilelinks.txt', 'r', encoding='utf-8-sig') as f:
    # strip only the newline so a last line without one is not truncated
    links = [line.rstrip('\n') for line in f]

# Named `basics` rather than `dict` so the builtin is not shadowed for the rest of the kernel.
basics = load_txt('m/basics/atpprofilebasics.txt').to_dict('list')
known_links = set(basics['ATP Link'])

# Raw string: "\c" is not a valid escape sequence.
chromedrive_path = r"D:\chromedriver.exe"
s = Service(chromedrive_path)
driver = webdriver.Chrome(service=s)
geolocator = Nominatim(user_agent='tennis-vis')

try:
    driver.set_window_rect(x=10, y=10, width=100, height=200)

    for i, link in enumerate(links):
        if not link or link in known_links:
            continue

        driver.get(link)
        # driver.refresh()
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # No name on the page: stop the run, as the original cell did.
        name_tag = soup.find('div', {'class': 'player_name'})
        if name_tag is None:
            print(link, 'not found')
            break
        player = name_tag.text.strip().replace('-', ' ')

        birthday = _text_in_parens(_profile_span(soup, 'pd_left', 0), 1).replace('.', '-')
        birthplace = _profile_span(soup, 'pd_right', 1).replace(', ', ' - ')
        plays = _profile_span(soup, 'pd_right', 2).replace(', ', ' - ')
        turned_pro = _profile_span(soup, 'pd_left', 3)
        weight = _text_in_parens(_profile_span(soup, 'pd_left', 1), 3)
        height = _text_in_parens(_profile_span(soup, 'pd_left', 2), 3)
        print('Finished:', i+1)

        # birthplace details: matched region, lat, lon
        matched, lat, lon = _geocode_birthplace(geolocator, birthplace)
        print(matched, lat, lon)

        # Append the whole row at once so every column stays the same length.
        basics['Player'].append(player)
        basics['ATP Link'].append(link)
        basics['Birthday'].append(birthday)
        basics['Birthplace'].append(birthplace)
        basics['Plays'].append(plays)
        basics['Turned Pro'].append(turned_pro)
        basics['Weight (kg)'].append(weight)
        basics['Height (cm)'].append(height)
        basics['Birthplace_matched'].append(matched)
        basics['Birthplace_lat'].append(lat)
        basics['Birthplace_lon'].append(lon)
        known_links.add(link)
finally:
    # Always release the browser, even when a page fails to load.
    driver.quit()

df = pd.DataFrame(basics)
df.to_csv('m/basics/atpprofilebasics.txt', index = False, sep = ',', encoding='utf-8-sig')

Finished: 2961
Lisboa - Portugal 38.7077507 -9.1365919


## 4. Update ATP TOP 100 Distribution

In [8]:
# Define the function to update Geo distribution of ATP Top 100 players
def update_geo_atp100(date,num):
    """Build the geo-distribution file of top-100 players for every listed week.

    Reads the first ``num`` entries of ``m/rank/dates.txt`` and processes them
    from index ``num - 1`` down to index 0. For each week, every player in
    ``m/rank/dates/<week>.txt`` that also has a row in
    ``m/basics/atpprofilebasics.txt`` (names compared with '-' replaced by
    ' ') yields one output row with rank, points, birthplace and coordinates.
    ``Start`` is the week itself and ``End`` is the entry one index lower in
    dates.txt ('' for the last processed week). Players whose rank, points or
    coordinates cannot be parsed as numbers are skipped.

    The result is saved to ``m/rank/atp100geo20000110_<date>.txt`` (dashes
    removed from ``date``).

    Parameters
    ----------
    date : str
        Date used in the output file name, e.g. ``'2025-03-03'``.
    num : int
        Number of leading rows of dates.txt to process (the notebook passes
        the row number of "2000-01-10").
    """
    df_basics = load_txt('m/basics/atpprofilebasics.txt')

    # First basics row per player name, replacing a full scan for every ranked player.
    basics_row = {}
    for j, name in enumerate(df_basics['Player']):
        basics_row.setdefault(name, j)
    birthplaces = df_basics['Birthplace'].tolist()
    lats = df_basics['Birthplace_lat'].tolist()
    lons = df_basics['Birthplace_lon'].tolist()

    # get dates (strip only the newline so a last line without one is not truncated)
    with open('m/rank/dates.txt', 'r') as f:
        dates = [line.rstrip('\n') for line in f]

    rows = {attr: [] for attr in ['Start', 'End', 'Player', 'Rank', 'Points', 'Birthplace', 'Lat', 'Lon']}

    week = None
    for k in range(num):
        # `week` instead of `date`, so the `date` argument keeps naming the output file.
        week = dates[num - k - 1]
        end = dates[num - k - 2] if k != (num - 1) else ''
        df_rank = load_txt('m/rank/dates/'+week+'.txt')

        for p, rank_txt, points_txt in zip(df_rank['Player'], df_rank['Rank'], df_rank['Points']):
            player = p.replace('-', ' ')
            j = basics_row.get(player)
            if j is None:
                continue
            try:
                rank = int(rank_txt)
                points = int(points_txt)
                birthplace_lat = float(lats[j])
                birthplace_lon = float(lons[j])
            except (TypeError, ValueError):
                # missing or non-numeric rank/points/coordinates: skip this player
                continue
            rows['Player'].append(player)
            rows['Rank'].append(rank)
            rows['Points'].append(points)
            rows['Birthplace'].append(birthplaces[j])
            rows['Lat'].append(birthplace_lat)
            rows['Lon'].append(birthplace_lon)
            rows['Start'].append(week)
            rows['End'].append(end)

        print('Finished:', week)

    if week is not None and week != date:
        print('Note: last processed week is', week, 'but the output file is named after', date)

    df = pd.DataFrame(rows)

    # Save csv
    df.to_csv('m/rank/atp100geo20000110_'+date.replace("-","")+'.txt', index = False, sep = ',', encoding = 'utf-8-sig')

In [9]:
# num = row number of "2000-01-10" in m/rank/dates.txt (that many leading rows are processed)
update_geo_atp100("2025-03-03", 1205)

Finished: 2000-01-10
Finished: 2000-01-17
Finished: 2000-01-24
Finished: 2000-01-31
Finished: 2000-02-07
Finished: 2000-02-14
Finished: 2000-02-21
Finished: 2000-02-28
Finished: 2000-03-06
Finished: 2000-03-13
Finished: 2000-03-20
Finished: 2000-03-27
Finished: 2000-04-03
Finished: 2000-04-10
Finished: 2000-04-17
Finished: 2000-04-24
Finished: 2000-05-01
Finished: 2000-05-08
Finished: 2000-05-15
Finished: 2000-05-22
Finished: 2000-05-29
Finished: 2000-06-05
Finished: 2000-06-12
Finished: 2000-06-19
Finished: 2000-06-26
Finished: 2000-07-03
Finished: 2000-07-10
Finished: 2000-07-17
Finished: 2000-07-24
Finished: 2000-07-31
Finished: 2000-08-07
Finished: 2000-08-14
Finished: 2000-08-21
Finished: 2000-08-28
Finished: 2000-09-04
Finished: 2000-09-11
Finished: 2000-09-18
Finished: 2000-09-25
Finished: 2000-10-02
Finished: 2000-10-09
Finished: 2000-10-16
Finished: 2000-10-23
Finished: 2000-10-30
Finished: 2000-11-06
Finished: 2000-11-13
Finished: 2000-11-20
Finished: 2000-11-27
Finished: 200

Remember to update geo file in Flourish

## 5. Update All Matches Player by Player

The website has started blocking automated requests, but the pages for the most important players can still be scraped.

In [9]:
# define functions to get all players' names and scrape all matches data
def get_names(sex = ''):
    """Return the player names stored one per line in a player-list file.

    Parameters
    ----------
    sex : str
        'm' reads ``mp.txt``, 'w' reads ``wp.txt``; any other value reads
        ``mwplayerlist_processed.txt``.

    Returns
    -------
    list of str
        One entry per line of the file, without the line terminator.
    """
    if sex == 'm':
        path = 'mp.txt'
    elif sex == 'w':
        path = 'wp.txt'
    else:
        path = 'mwplayerlist_processed.txt'

    with open(path, 'r', encoding='utf-8-sig') as f:
        # Strip only the newline: slicing off the last character would shorten
        # the final name when the file does not end with a newline.
        return [line.rstrip('\n') for line in f]

# define a function to scrape for a single man player's matches
def scrape_matches(name, driver):
    try:
        # set attributes
        attributes_m = ['Date', 'Tournament', 'Sets', 'Surface', 
                        'Rd', 'Rk', 'vRk', 'W', 'tRk', 'vtRk', 
                        'WP', 'LP', 'Score', 'DR', 'A%', 'DF%',
                        '1stIn', '1st%', '2nd%', 'BPSvd',
                        'TPW', 'RPW', 'vA%', 'v1st%', 'v2nd%',
                        'BPCnv', 'TP', 'Aces', 'DFs', 'SP', 
                        '1SP', '2SP', 'vA', 'Time']

        attributes_w = ['Date', 'Tournament', 'Sets', 'Surface', 
                        'Rd', 'Rk', 'vRk', 'W', 'tRk', 'vtRk', 
                        'WP', 'LP', 'Score', 'DR', 'A%', 'DF%',
                        '1stIn', '1st%', '2nd%', 'BPSvd',
                        'TPW', 'RPW', 'vA%', 'v1st%', 'v2nd%',
                        'BPCnv', 'Time']

        # get cleaned name
        sex = name[1]
        name_cleaned = name[4:].replace(' ', '')

        # set url prefix and types of webpages
        if sex == 'M':
            url = "https://www.tennisabstract.com/cgi-bin/player-classic.cgi?p=" 
        #   types = ["&f=ACareerqq", "&f=ACareerqqr1", "&f=ACareerqqw1"]
        else:
            url = "https://www.tennisabstract.com/cgi-bin/wplayer-classic.cgi?p="

        # Start of Scraping
        print('Start Scraping:', name[4:])
        
        # Man and woman pages are different
        if sex == 'M':

            # For man players
            # Set dictionary
            dic = {}
            for attr in attributes_m:
                dic[attr] = []

            # For serve page
            driver.get(url + name_cleaned.replace(' ', '') + "&f=ACareerqq")
            driver.refresh()
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            matches = soup.find('table', {'id': 'matches'}).find_all('tr')
            # driver.quit()

            if matches != []:
                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # date, contain special character
                        date = tds[0].text.split('‑')
                        dic[attributes_m[0]].append('-'.join(date))
                        # tournament
                        dic[attributes_m[1]].append(tds[1].text)
                        # sets
                        dic[attributes_m[2]].append(5 if tds[1].text in ['Wimbledon', 'Roland Garros', 'US Open', 'Australian Open'] else 3)
                        # surface
                        dic[attributes_m[3]].append(tds[2].text)
                        # round
                        dic[attributes_m[4]].append(tds[3].text)
                        # rank
                        dic[attributes_m[5]].append(tds[4].text)
                        # opponent rank
                        dic[attributes_m[6]].append(tds[5].text)
                        # win?, tournament rank, opponent tournament rank, winner, loser
                        if spans[1].text in name:
                            # win?
                            dic[attributes_m[7]].append(1)
                            # winner
                            dic[attributes_m[10]].append(name[4:])
                            # loser
                            dic[attributes_m[11]].append(spans[6].text)
                            # tRk
                            if spans[0].text != '':
                                dic[attributes_m[8]].append(spans[0].text[1:-1])
                            else:
                                dic[attributes_m[8]].append('')
                            # vtRk
                            if spans[5].text != '':
                                dic[attributes_m[9]].append(spans[5].text[1:-1])
                            else:
                                dic[attributes_m[9]].append('')
                        else:
                            # win?
                            dic[attributes_m[7]].append(0)
                            # winner
                            dic[attributes_m[10]].append(spans[1].text)
                            # loser
                            dic[attributes_m[11]].append(name[4:])
                            # tRk
                            if spans[6].text != '':
                                dic[attributes_m[8]].append(spans[6].text[1:-1])
                            else:
                                dic[attributes_m[8]].append('')
                            # vtRk
                            if spans[0].text != '':
                                dic[attributes_m[9]].append(spans[0].text[1:-1])
                            else:
                                dic[attributes_m[9]].append('')
                        # score
                        dic[attributes_m[12]].append(tds[7].text)
                        # DR
                        dic[attributes_m[13]].append(tds[9].text)
                        # A%
                        dic[attributes_m[14]].append(tds[10].text)
                        # DF%
                        dic[attributes_m[15]].append(tds[11].text)
                        # 1stIn
                        dic[attributes_m[16]].append(tds[12].text)
                        # 1st%
                        dic[attributes_m[17]].append(tds[13].text)
                        # 2rd%
                        dic[attributes_m[18]].append(tds[14].text)
                        # BPSvd
                        dic[attributes_m[19]].append(tds[15].text)

                print('Serve Page:', len(dic['Date']))

                # For return page
                show_return = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//span[text()="Return"]')))
                show_return.click()
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                matches = soup.find('table', {'id': 'matches'}).find_all('tr')
          #     driver = webdriver.Chrome(service=s)
          #     driver.get(url + name_cleaned.replace(' ', '') + types[1])
          #     soup = BeautifulSoup(driver.page_source, 'html.parser')
          #     matches = soup.find('table', {'id': 'matches'}).find_all('tr')
          #     driver.quit()

                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # TPW
                        dic[attributes_m[20]].append(tds[10].text)
                        # RPW
                        dic[attributes_m[21]].append(tds[11].text)
                        # vA%
                        dic[attributes_m[22]].append(tds[12].text)
                        # v1st%
                        dic[attributes_m[23]].append(tds[13].text)
                        # v2rd%
                        dic[attributes_m[24]].append(tds[14].text)
                        # BPCvt
                        dic[attributes_m[25]].append(tds[15].text)
                
                print('Return Page:', len(dic['TPW']))

                # For raw page
                show_raw = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//span[text()="Raw"]')))
                show_raw.click()
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                matches = soup.find('table', {'id': 'matches'}).find_all('tr')
          #     driver = webdriver.Chrome(service=s)
          #     driver.get(url + name_cleaned.replace(' ', '') + types[2])
          #     soup = BeautifulSoup(driver.page_source, 'html.parser')
          #     matches = soup.find('table', {'id': 'matches'}).find_all('tr')
          #     driver.quit()

                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # TP
                        dic[attributes_m[26]].append(tds[9].text)
                        # Aces
                        dic[attributes_m[27]].append(tds[10].text)
                        # DFs
                        dic[attributes_m[28]].append(tds[11].text)
                        # SP
                        dic[attributes_m[29]].append(tds[12].text)
                        # 1SP
                        dic[attributes_m[30]].append(tds[13].text)
                        # 2SP
                        dic[attributes_m[31]].append(tds[14].text)
                        # vA
                        dic[attributes_m[32]].append(tds[15].text)
                        # Time
                        dic[attributes_m[33]].append(tds[16].text)

                print('Raw Page:', len(dic['TP']))

            else:
                # Serve page
                driver.get(url + name_cleaned.replace(' ', ''))
                driver.refresh()
                # Show career
                try:
                    show_career = driver.find_element(By.ID, 'careerclick')
                except:
                    return None
                show_career.click()
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                matches = soup.find('table', {'id': 'matches'}).find_all('tr')

                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # date, contain special character
                        date = tds[0].text.split('‑')
                        dic[attributes_m[0]].append('-'.join(date))
                        # tournament
                        dic[attributes_m[1]].append(tds[1].text)
                        # sets
                        dic[attributes_m[2]].append(5 if tds[1].text in ['Wimbledon', 'Roland Garros', 'US Open', 'Australian Open'] else 3)
                        # surface
                        dic[attributes_m[3]].append(tds[2].text)
                        # round
                        dic[attributes_m[4]].append(tds[3].text)
                        # rank
                        dic[attributes_m[5]].append(tds[4].text)
                        # opponent rank
                        dic[attributes_m[6]].append(tds[5].text)
                        # win?, tournament rank, opponent tournament rank, winner, loser
                        if spans[1].text in name:
                            # win?
                            dic[attributes_m[7]].append(1)
                            # winner
                            dic[attributes_m[10]].append(name[4:])
                            # loser
                            dic[attributes_m[11]].append(spans[6].text)
                            # tRk
                            if spans[0].text != '':
                                dic[attributes_m[8]].append(spans[0].text[1:-1])
                            else:
                                dic[attributes_m[8]].append('')
                            # vtRk
                            if spans[5].text != '':
                                dic[attributes_m[9]].append(spans[5].text[1:-1])
                            else:
                                dic[attributes_m[9]].append('')
                        else:
                            # win?
                            dic[attributes_m[7]].append(0)
                            # winner
                            dic[attributes_m[10]].append(spans[1].text)
                            # loser
                            dic[attributes_m[11]].append(name[4:])
                            # tRk
                            if spans[6].text != '':
                                dic[attributes_m[8]].append(spans[6].text[1:-1])
                            else:
                                dic[attributes_m[8]].append('')
                            # vtRk
                            if spans[0].text != '':
                                dic[attributes_m[9]].append(spans[0].text[1:-1])
                            else:
                                dic[attributes_m[9]].append('')
                        # score
                        dic[attributes_m[12]].append(tds[7].text)
                        # DR
                        dic[attributes_m[13]].append(tds[9].text)
                        # A%
                        dic[attributes_m[14]].append(tds[10].text)
                        # DF%
                        dic[attributes_m[15]].append(tds[11].text)
                        # 1stIn
                        dic[attributes_m[16]].append(tds[12].text)
                        # 1st%
                        dic[attributes_m[17]].append(tds[13].text)
                        # 2rd%
                        dic[attributes_m[18]].append(tds[14].text)
                        # BPSvd
                        dic[attributes_m[19]].append(tds[15].text)

                print('Serve Page:', len(dic['Date']))

                # Return page
                show_return = driver.find_element(By.XPATH, '//span[text()="Return"]')
                show_return.click()
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                matches = soup.find('table', {'id': 'matches'}).find_all('tr')

                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # TPW
                        dic[attributes_m[20]].append(tds[10].text)
                        # RPW
                        dic[attributes_m[21]].append(tds[11].text)
                        # vA%
                        dic[attributes_m[22]].append(tds[12].text)
                        # v1st%
                        dic[attributes_m[23]].append(tds[13].text)
                        # v2rd%
                        dic[attributes_m[24]].append(tds[14].text)
                        # BPCvt
                        dic[attributes_m[25]].append(tds[15].text)
                
                print('Return Page:', len(dic['TPW']))

                # raw page
                show_raw = driver.find_element(By.XPATH, '//span[text()="Raw"]')
                show_raw.click()
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                matches = soup.find('table', {'id': 'matches'}).find_all('tr')
              # driver.quit()

                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # TP
                        dic[attributes_m[26]].append(tds[9].text)
                        # Aces
                        dic[attributes_m[27]].append(tds[10].text)
                        # DFs
                        dic[attributes_m[28]].append(tds[11].text)
                        # SP
                        dic[attributes_m[29]].append(tds[12].text)
                        # 1SP
                        dic[attributes_m[30]].append(tds[13].text)
                        # 2SP
                        dic[attributes_m[31]].append(tds[14].text)
                        # vA
                        dic[attributes_m[32]].append(tds[15].text)
                        # Time
                        dic[attributes_m[33]].append(tds[16].text)

                print('Raw Page:', len(dic['TP']))

        else:

            # For woman players
            # set dictionary
            dic = {}
            for attr in attributes_w:
                dic[attr] = []

            # For serve page
            driver.get(url + name_cleaned.replace(' ', '') + '&f=ACareerqq')
            driver.refresh()
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            matches = soup.find('table', {'id': 'matches'}).find_all('tr')

            # Check whether it is empty
            if matches != []:
                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # date, contain special character
                        date = tds[0].text.split('‑')
                        dic[attributes_w[0]].append('-'.join(date))
                        # tournament
                        dic[attributes_w[1]].append(tds[1].text)
                        # sets
                        dic[attributes_w[2]].append(5 if tds[1].text in ['Wimbledon', 'Roland Garros', 'US Open', 'Australian Open'] else 3)
                        # surface
                        dic[attributes_w[3]].append(tds[2].text)
                        # round
                        dic[attributes_w[4]].append(tds[3].text)
                        # rank
                        dic[attributes_w[5]].append(tds[4].text)
                        # opponent rank
                        dic[attributes_w[6]].append(tds[5].text)
                        # win?, tournament rank, opponent tournament rank, winner, loser
                        if spans[1].text in name:
                            # win?
                            dic[attributes_w[7]].append(1)
                            # winner
                            dic[attributes_w[10]].append(name[4:])
                            # loser
                            dic[attributes_w[11]].append(spans[6].text)
                            # tRk
                            if spans[0].text != '':
                                dic[attributes_w[8]].append(spans[0].text[1:-1])
                            else:
                                dic[attributes_w[8]].append('')
                            # vtRk
                            if spans[5].text != '':
                                dic[attributes_w[9]].append(spans[5].text[1:-1])
                            else:
                                dic[attributes_w[9]].append('')
                        else:
                            # win?
                            dic[attributes_w[7]].append(0)
                            # winner
                            dic[attributes_w[10]].append(spans[1].text)
                            # loser
                            dic[attributes_w[11]].append(name[4:])
                            # tRk
                            if spans[6].text != '':
                                dic[attributes_w[8]].append(spans[6].text[1:-1])
                            else:
                                dic[attributes_w[8]].append('')
                            # vtRk
                            if spans[0].text != '':
                                dic[attributes_w[9]].append(spans[0].text[1:-1])
                            else:
                                dic[attributes_w[9]].append('')
                        # score
                        dic[attributes_w[12]].append(tds[7].text.replace(" (ch)", ""))
                        # DR
                        dic[attributes_w[13]].append(tds[8].text)
                        # A%
                        dic[attributes_w[14]].append(tds[9].text)
                        # DF%
                        dic[attributes_w[15]].append(tds[10].text)
                        # 1stIn
                        dic[attributes_w[16]].append(tds[11].text)
                        # 1st%
                        dic[attributes_w[17]].append(tds[12].text)
                        # 2rd%
                        dic[attributes_w[18]].append(tds[13].text)
                        # BPSvd
                        dic[attributes_w[19]].append(tds[14].text)

                print('Serve Page:', len(dic['Date']))

                # For return page
                page_return = driver.find_element(By.XPATH, '//span[text()="Show Return Stats"]')
                page_return.click()
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                matches = soup.find('table', {'id': 'matches'}).find_all('tr')
            #   driver.quit()

                for match in matches[2:]:
                    tds = match.find_all('td')
                    # first check if it is ongoing match
                    spans = tds[6].find_all('span')
                    if spans[3].text != 'vs':
                        # TPW
                        dic[attributes_w[20]].append(tds[9].text)
                        # RPW
                        dic[attributes_w[21]].append(tds[10].text)
                        # vA%
                        dic[attributes_w[22]].append(tds[11].text)
                        # v1st%
                        dic[attributes_w[23]].append(tds[12].text)
                        # v2rd%
                        dic[attributes_w[24]].append(tds[13].text)
                        # BPCvt
                        dic[attributes_w[25]].append(tds[14].text)
                        # Time
                        dic[attributes_w[26]].append(tds[15].text)

                print('Return Page:', len(dic['TPW']))

        # save to text, separated by gender
        df = pd.DataFrame(dic)
        df.to_csv(sex.lower() + '/matches/txt/' + name[4:] + '.txt', index = False, sep = ',', encoding='utf-8-sig')

    except (IndexError, AttributeError) as e:
        pass
    
# define a loop function to scrape players from a given index
def scrape_from(idx, sex, driver):
    """Scrape match data for every player from position ``idx`` onward.

    Parameters
    ----------
    idx : int
        Zero-based position in the list returned by ``get_names(sex)`` at
        which to start; earlier players are not touched.
    sex : str
        Passed unchanged to ``get_names`` to select the player list.
    driver
        Browser driver object handed to ``scrape_matches`` for each player.

    Notes
    -----
    A failure on one player is reported and the loop moves on to the next
    player, so a single bad page does not abort the whole run.
    """
    names = get_names(sex)
    total = len(names)

    for i in range(idx, total):
        try:
            # Scrape the player matches
            scrape_matches(names[i], driver)
        except Exception as e:
            # Catch `Exception`, not a bare `except:`, so that Ctrl-C / a
            # kernel interrupt (KeyboardInterrupt) still stops the run, and
            # report the failure instead of hiding it.
            print('Failed Scraping', names[i], '-', type(e).__name__ + ':', e)
            # A time.sleep(...) can be added here to back off after a failure.
            continue

        # Print out progress
        print('Finish Scraping', i + 1, 'players.')
        print(total - (i + 1), 'players remaining.')
        print('\n')
        # A time.sleep(...) can be added here to throttle requests.
        
# load txt files
def load_txt(name):
    """Load one player's saved match file into a DataFrame of strings.

    The file path is built as
    ``name[1].lower() + '/matches/txt/' + name[4:] + '.txt'``, i.e. the
    second character of ``name`` selects the top-level folder and everything
    from the fifth character on is the file stem
    (presumably names look like ``'(M) Player Name'`` -- confirm with caller).

    The first line of the file supplies the column names; every following
    line becomes one row. Fields are split on plain commas (no quoting
    support) and every value is kept as a string.

    NOTE: this definition shadows the earlier ``load_txt(file)`` defined in
    the ranking section of the notebook; after this cell runs, that version
    is no longer reachable under this name.
    """
    path = name[1].lower() + '/matches/txt/' + name[4:] + '.txt'
    with open(path, 'r', encoding='utf-8-sig') as f:
        # Strip only the line terminator. Slicing with [:-1] would cut a real
        # character off the last field when the file lacks a final newline.
        attributes = f.readline().rstrip('\n').split(',')
        rows = [line.rstrip('\n').split(',') for line in f]
    return pd.DataFrame(rows, columns=attributes)

In [4]:
# UPDATE MATCHES DATA
# This cell launches a browser and scrapes player match pages; it is
# long-running and writes one txt file per player.
#
# Alternative (disabled): plain Selenium chromedriver with a small visible window.
# chromedrive_path = "D:\chromedriver.exe"
# s = Service(chromedrive_path)
# driver = webdriver.Chrome(service=s)
# driver.set_window_rect(x=10, y=10, width=100, height=200)

# Active setup: undetected-chromedriver, run headless (no visible window).
options = uc.ChromeOptions()
options.add_argument("--headless=new")
driver = uc.Chrome(options=options)

# Start at zero-based index 553 of the men's ('m') player list, so the first
# progress line reads "Finish Scraping 554 players."
# NOTE(review): 553 looks like a resume point from an earlier interrupted run -- confirm before re-running.
# NOTE(review): the driver is not closed in this cell; call driver.quit() once it is no longer needed.
scrape_from(553, 'm', driver)

Start Scraping: Raul Ramirez
Serve Page: 822
Return Page: 822
Raw Page: 822
Finish Scraping 554 players.
30943 players remaining.


Start Scraping: Wishaya Trongcharoenchaikul
Serve Page: 464
Return Page: 464
Raw Page: 464
Finish Scraping 555 players.
30942 players remaining.


Start Scraping: Stefan Koubek
Finish Scraping 556 players.
30941 players remaining.


Start Scraping: Dan Added
Finish Scraping 557 players.
30940 players remaining.


Start Scraping: Onny Parun
Finish Scraping 558 players.
30939 players remaining.


Start Scraping: Yunseong Chung
Finish Scraping 559 players.
30938 players remaining.


Start Scraping: Dennis Novikov
Finish Scraping 560 players.
30937 players remaining.


Start Scraping: Younes El Aynaoui
Finish Scraping 561 players.
30936 players remaining.


Start Scraping: Ronald Agenor
Finish Scraping 562 players.
30935 players remaining.


Start Scraping: Alex Corretja
Finish Scraping 563 players.
30934 players remaining.


Start Scraping: Soon Woo Kwon
Fini