# Tennis Player Analysis

## 1. Import Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

## 2. Process Names

In [26]:
# load name list
# source: https://www.tennisabstract.com/mwplayerlist.js?_=1668453288068
# (saved locally as mwplayerlist.txt); the raw text is kept in `names`
# for the processing cell that follows
with open("mwplayerlist.txt", 'r') as f:
    names = f.read()
# no explicit f.close(): the `with` block already closes the file

In [27]:
# process name list and save to a new text file
# The slice drops the first 17 and last 5 characters of the raw text
# (presumably the JavaScript wrapper `var playerlist=["` ... `"];` around
# the list -- TODO confirm against the downloaded file), then the remainder
# is split on the `", "` separator between entries.
names = names[17:-5].split("\", \"")
with open("mwplayerlist_processed.txt", "w") as f:
    # one name per line; every line, including the last, ends with '\n'
    f.writelines(name + '\n' for name in names)
# no explicit f.close(): the `with` block already closes the file

## 3. Scrape All Matches Player by Player

In [191]:
# headers = {
#   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
#}

In [16]:
# define a function to get all players' names
def get_names(sex = ''):
    """Return the list of player names stored one per line in a text file.

    Parameters
    ----------
    sex : str, optional
        'm' reads 'mp.txt', 'w' reads 'wp.txt'; any other value (including
        the default '') reads 'mwplayerlist_processed.txt'.

    Returns
    -------
    list of str
        One entry per line of the file, without the line terminator.

    Raises
    ------
    OSError
        If the selected file cannot be opened.
    """
    if sex == 'm':
        path = 'mp.txt'
    elif sex == 'w':
        path = 'wp.txt'
    else:
        path = 'mwplayerlist_processed.txt'

    with open(path, 'r') as f:
        # rstrip('\n') instead of line[:-1]: a final line that has no
        # trailing newline must not lose its last real character
        return [line.rstrip('\n') for line in f]

# define a function to scrape a single player's matches (men or women)

# Output columns, grouped by the page view they are read from.
_MATCH_INFO = ('Date', 'Tournament', 'Sets', 'Surface',
               'Rd', 'Rk', 'vRk', 'W', 'tRk', 'vtRk',
               'WP', 'LP', 'Score')
_SERVE_STATS = ('DR', 'A%', 'DF%', '1stIn', '1st%', '2nd%', 'BPSvd')
_RETURN_STATS = ('TPW', 'RPW', 'vA%', 'v1st%', 'v2nd%', 'BPCnv')
_RAW_STATS = ('TP', 'Aces', 'DFs', 'SP', '1SP', '2SP', 'vA', 'Time')

# Full column order of the saved files.
_ATTRIBUTES_M = _MATCH_INFO + _SERVE_STATS + _RETURN_STATS + _RAW_STATS
_ATTRIBUTES_W = _MATCH_INFO + _SERVE_STATS + _RETURN_STATS + ('Time',)

# Tournaments recorded as 5 sets for men; every other match is recorded as 3.
_FIVE_SET_TOURNAMENTS = ('Wimbledon', 'Roland Garros', 'US Open', 'Australian Open')


def _match_rows(driver):
    """Parse the current page and return every <tr> of the table with id 'matches'."""
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # soup.find returns None when the table is absent; the resulting
    # AttributeError is handled by scrape_matches
    return soup.find('table', {'id': 'matches'}).find_all('tr')


def _completed_matches(rows):
    """Yield (tds, spans) for each row after the first two whose 4th span is not 'vs'."""
    # the first two rows are skipped (presumably header rows -- TODO confirm)
    for row in rows[2:]:
        tds = row.find_all('td')
        spans = tds[6].find_all('span')
        # 'vs' is treated as an ongoing match and left out
        if spans[3].text != 'vs':
            yield tds, spans


def _append_cells(dic, tds, columns, first_col):
    """Append the text of consecutive cells, starting at first_col, to the given columns."""
    for offset, column in enumerate(columns):
        dic[column].append(tds[first_col + offset].text)


def _append_match_info(dic, tds, spans, name, sets):
    """Append date, tournament, ranks, result, seeds, winner/loser and score of one row."""
    player = name[4:]
    # the page's date contains a special hyphen character; store a plain '-'
    dic['Date'].append(tds[0].text.replace('‑', '-'))
    dic['Tournament'].append(tds[1].text)
    dic['Sets'].append(sets)
    dic['Surface'].append(tds[2].text)
    dic['Rd'].append(tds[3].text)
    dic['Rk'].append(tds[4].text)
    dic['vRk'].append(tds[5].text)

    if spans[1].text in name:
        # the first listed name belongs to this player -> recorded as a win
        won, winner, loser = 1, player, spans[6].text
        seed, opponent_seed = spans[0].text, spans[5].text
    else:
        won, winner, loser = 0, spans[1].text, player
        # NOTE(review): the win branch reads the second seed from spans[5] and
        # the second name from spans[6]; reading the player's own seed from
        # spans[6] here looks inconsistent -- kept as in the original, confirm
        # against the page structure.
        seed, opponent_seed = spans[6].text, spans[0].text

    dic['W'].append(won)
    # [1:-1] drops the first and last character (presumably surrounding
    # brackets); an empty string stays empty
    dic['tRk'].append(seed[1:-1])
    dic['vtRk'].append(opponent_seed[1:-1])
    dic['WP'].append(winner)
    dic['LP'].append(loser)
    dic['Score'].append(tds[7].text)


def _click_tab(driver, label):
    """Wait up to 10 s for the <span> whose text equals label to be clickable, then click it."""
    locator = (By.XPATH, '//span[text()="' + label + '"]')
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable(locator)).click()


def scrape_matches(name, driver):
    """Scrape one player's match table and save it as a comma-separated text file.

    Parameters
    ----------
    name : str
        Player entry such as '(M) Rafael Nadal'. name[1] selects the site
        section ('M' -> men's pages, anything else -> women's pages) and
        name[4:] is the player's display name.
    driver : selenium WebDriver
        An already started browser session; it is reused and not closed here.

    Returns
    -------
    None
        The result is written to '<sex>/matches/txt/<player>.txt', where
        <sex> is name[1] in lower case. Nothing is written when the page
        cannot be parsed or, for men, when the fallback page has no element
        with id 'careerclick'.

    Notes
    -----
    IndexError and AttributeError raised while parsing are caught so that a
    single malformed page does not stop a long scraping run; the skipped
    player is reported on stdout instead of being dropped silently. Other
    exceptions (Selenium errors, KeyboardInterrupt, pandas errors) propagate.
    """
    try:
        sex = name[1]
        player = name[4:]
        name_cleaned = player.replace(' ', '')
        is_man = sex == 'M'

        # men and women are served by different scripts with different columns
        if is_man:
            url = "https://www.tennisabstract.com/cgi-bin/player-classic.cgi?p="
            attributes = _ATTRIBUTES_M
        else:
            url = "https://www.tennisabstract.com/cgi-bin/wplayer-classic.cgi?p="
            attributes = _ATTRIBUTES_W

        # Start of Scraping
        print('Start Scraping:', player)
        dic = {attr: [] for attr in attributes}

        # serve view
        driver.get(url + name_cleaned + '&f=ACareerqq')
        driver.refresh()
        rows = _match_rows(driver)

        if is_man and not rows:
            # The filtered URL produced an empty table: load the plain player
            # page and switch to the career view by clicking 'careerclick'.
            driver.get(url + name_cleaned)
            driver.refresh()
            try:
                show_career = driver.find_element(By.ID, 'careerclick')
            except Exception:
                # `except Exception` rather than a bare `except`, so that
                # Ctrl-C (KeyboardInterrupt) still stops the run
                return None
            show_career.click()
            rows = _match_rows(driver)

        # on the men's table the serve stats start one cell later
        # (cell 8 is skipped there)
        first_stat_col = 9 if is_man else 8
        for tds, spans in _completed_matches(rows):
            if is_man and tds[1].text in _FIVE_SET_TOURNAMENTS:
                sets = 5
            else:
                sets = 3
            _append_match_info(dic, tds, spans, name, sets)
            _append_cells(dic, tds, _SERVE_STATS, first_stat_col)
        print('Serve Page:', len(dic['Date']))

        if is_man:
            # return view
            _click_tab(driver, 'Return')
            for tds, _ in _completed_matches(_match_rows(driver)):
                _append_cells(dic, tds, _RETURN_STATS, 10)
            print('Return Page:', len(dic['TPW']))

            # raw view
            _click_tab(driver, 'Raw')
            for tds, _ in _completed_matches(_match_rows(driver)):
                _append_cells(dic, tds, _RAW_STATS, 9)
            print('Raw Page:', len(dic['TP']))
        else:
            # return view; on the women's table 'Time' is read from the cell
            # right after the return stats
            _click_tab(driver, 'Show Return Stats')
            for tds, _ in _completed_matches(_match_rows(driver)):
                _append_cells(dic, tds, _RETURN_STATS + ('Time',), 9)
            print('Return Page:', len(dic['TPW']))

        # save to text, separated by gender
        df = pd.DataFrame(dic)
        df.to_csv(sex.lower() + '/matches/txt/' + player + '.txt', index = False, sep = ',')

    except (IndexError, AttributeError) as e:
        # best effort: keep the run going, but make the skipped player visible
        print('Skipped', name, '-', repr(e))
    
# define a loop function to scrape players from a given index
def scrape_from(idx, sex, driver):
    """Scrape every player returned by get_names(sex), starting at position idx.

    Parameters
    ----------
    idx : int
        Position in the name list at which to start (0-based).
    sex : str
        Passed unchanged to get_names to select the name list.
    driver : selenium WebDriver
        Browser session handed on to scrape_matches for each player.

    After each player, the number of list positions processed so far and the
    number still remaining are printed.
    """
    players = get_names(sex)
    total = len(players)
    for position in range(idx, total):
        scrape_matches(players[position], driver)
        done = position + 1
        print('Finish Scraping', done, 'players.')
        print(total - done, 'players remaining.')
        print('\n')
        
# load txt files
def load_txt(name):
    """Load a player's saved match file into a DataFrame of strings.

    Parameters
    ----------
    name : str
        Player entry such as '(M) Rafael Nadal'; the file read is
        '<name[1] lower-cased>/matches/txt/<name[4:]>.txt'.

    Returns
    -------
    pandas.DataFrame
        One row per data line. Columns are taken from the first line of the
        file and every value is kept as a string.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    import csv  # local import: only this function needs the csv module

    path = name[1].lower() + '/matches/txt/' + name[4:] + '.txt'
    # The file is produced by DataFrame.to_csv, which writes UTF-8 by default
    # and quotes fields that contain the separator. csv.reader undoes that
    # quoting, which a plain split(',') cannot do. newline='' is what the csv
    # module expects for files it parses.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        attributes = next(reader, [])
        # blank lines parse to an empty list; skip them
        rows = [row for row in reader if row]
    return pd.DataFrame(rows, columns = attributes)

In [4]:
# separate women and men
# names look like '(M) ...' / '(W) ...', so the character at index 1 is the marker;
# entries with any other marker are left out of both lists
names = get_names()
mp = [name for name in names if name[1] == 'M']
wp = [name for name in names if name[1] == 'W']

# one name per line; the `with` block closes each file, so no f.close() is needed
with open('mp.txt', 'w') as f:
    f.writelines(name + '\n' for name in mp)

with open('wp.txt', 'w') as f:
    f.writelines(name + '\n' for name in wp)

In [17]:
# scraping begins
# set up chromedriver
# raw string: "\c" is not a valid escape sequence, so a plain literal triggers
# an invalid-escape warning on recent Python versions
chromedrive_path = r"D:\chromedriver.exe"
s = Service(chromedrive_path)
driver = webdriver.Chrome(service=s)
driver.set_window_rect(x=10, y=10, width=100, height=200)
# resume with the men's list at position 2030; the driver is left open so that
# later cells can reuse it (call driver.quit() when finished)
scrape_from(2030, 'm', driver)

Start Scraping: James Lemke
Finish Scraping 2031 players.
24557 players remaining.


Start Scraping: Ervin Eleskovic
Serve Page: 59
Return Page: 59
Raw Page: 59
Finish Scraping 2032 players.
24556 players remaining.


Start Scraping: Bogdan Leonte
Serve Page: 1
Return Page: 1
Raw Page: 1
Finish Scraping 2033 players.
24555 players remaining.


Start Scraping: Hector Ruiz Cadenas
Serve Page: 48
Return Page: 48
Raw Page: 48
Finish Scraping 2034 players.
24554 players remaining.


Start Scraping: John Paul Fruttero
Serve Page: 308
Return Page: 308
Raw Page: 308
Finish Scraping 2035 players.
24553 players remaining.


Start Scraping: Kirill Ivanov Smolensky
Serve Page: 75
Return Page: 75
Raw Page: 75
Finish Scraping 2036 players.
24552 players remaining.


Start Scraping: Colin Dowdeswell
Serve Page: 316
Return Page: 316
Raw Page: 316
Finish Scraping 2037 players.
24551 players remaining.


Start Scraping: Juan Sebastian Cabal
Serve Page: 313
Return Page: 313
Raw Page: 313
Finish Scraping 

KeyboardInterrupt: 

In [None]:
# scrape_matches takes the browser session as its second argument;
# reuse the driver created in the set-up cell
scrape_matches('(M) Rafael Nadal', driver)