In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### **Testing**

In [5]:
url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42511&category=472&C472FOC=&p=1&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [8]:
response = requests.get(url, headers=headers)

In [10]:
if response.status_code == 200: # Check if the request was successful. If the status code is 200, it means successful.
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup.prettify())  # Print formatted HTML to see the content
else:
    print("Failed to retrieve the page.")

<!DOCTYPE html>
<html>
 <head id="hdBase">
  <meta charset="utf-8"/>
  <title>
   BWF - BWF World Rankings - Overview
  </title>
  <meta content="tournamentsoftware.com: Software to run tournaments, leagues and ladders for tennis, badminton, padel, squash and other racket sports" name="description"/>
  <meta content="tennis, tournament, planner, tennis, amateur tennis, software, tournamentplanner, register, sport, www.tournamentsoftware.com, www.tournament.com, www.tennistournament.com, www.tennistoernooi.com, register, squash, badminton, entry, tournament, planner, tournooi, tournament, volleybal, volleyball, association, planning, poule, draw, tennis software, match, poule, match overview, teamsport, team, official, teamsportplanner, hockey, basketball, competition, league" name="keywords"/>
  <link href="//static.tournamentsoftware.com/content/images/themes/bwf/app/favicon.ico?v=20170126155446" rel="shortcut icon" type="image/icon"/>
  <link href="../content/images/themes/bwf/app/ma

In [12]:
ranking_table = soup.find('table', class_='ruler') # Find the table containing players data
print(ranking_table)

<table class="ruler" style="min-width:762px">
<caption>
		Men's Singles
	</caption><tr>
</tr><tr>
<th class="orderby rank" colspan="2"><a class="down" href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=1&amp;desc=1">Rank</a></th><th class="orderby"> </th><th class="orderby left"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=10">Country</a></th><th class="orderby left"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=10">Player</a></th><th class="orderby left"> </th><th class="orderby">Member ID</th><th class="orderby right rankingpoints"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=2">Points</a></th><th class="orderby right"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=7">Tournaments</a></th><th class="orderby"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=13">Confederation</a></th><th class="orderby"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C

In [14]:
table_columns = ranking_table.find_all('th') # Find the all table columns which are under 'th' tag
table_columns

[<th class="orderby rank" colspan="2"><a class="down" href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=1&amp;desc=1">Rank</a></th>,
 <th class="orderby"> </th>,
 <th class="orderby left"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=10">Country</a></th>,
 <th class="orderby left"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=10">Player</a></th>,
 <th class="orderby left"> </th>,
 <th class="orderby">Member ID</th>,
 <th class="orderby right rankingpoints"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=2">Points</a></th>,
 <th class="orderby right"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=7">Tournaments</a></th>,
 <th class="orderby"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=13">Confederation</a></th>,
 <th class="orderby"><a href="?id=42511&amp;category=472&amp;ps=100&amp;C472FOC=&amp;order=12">Country</a></th>]

In [16]:
# This filters out empty strings: table_columns = [title for title in table_titles if title] (Don't need this in this project)
columns = [column.text.strip() for column in table_columns] # Convert all columns into a list for a DataFrame creation later
print(columns)

['Rank', '', 'Country', 'Player', '', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']


In [107]:
row_data = ranking_table.find_all('tr') # Find all values in each row which are under the 'td' tag
#row_test = [row.text.strip() for row in row_data[2]] 
#print(row_test)
individual_row_data = [data.text.strip() for data in row_data] # Convert all values into list
individual_row_data[2]
#individual_row_data # Check if the amount of values in each row matches the table column which is 10 from the list above

'1\xa0\xa0CHN[CHN] SHI Yu Qi5794510181518AsiaChina'

In [194]:
# Turns out there are 11 values in each row. To match these data correctly with our table columns, we need to add another column in the table with the correct position.
columns.insert(1, 'x1') # Add a column called 'x1'. Don't worry about the vague name since it will store no data then we will drop it later with other unnecessary columns 
columns[2] = 'x2' # Rename the empty column as 'x2' which will be dropped later.
columns[3] = 'Country Code' # Rename to differentiate with the Country column.
columns[5] = 'x3' # Rename the empty column as 'x3' which will be dropped later.
print(columns)

['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']


In [196]:
df = pd.DataFrame(columns = columns) # Create DataFrame
df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country


In [115]:
all_rows = ranking_table.find_all('tr') # Find all rows on the page which are under the 'tr' tag
test_row = [row.text.strip() for row in all_rows[2]] # Convert all columns into a list for a DataFrame creation later
print(test_row)

['', '1', '', '', 'CHN', '[CHN] SHI Yu Qi', '', '57945', '101815', '18', 'Asia', 'China', '']


In [200]:
for row in all_rows[2:102]: # Create a loop to get all row data and add it into the DataFrame df. Range from 2:102 to get rid of the headers & footers not being part of the data we need, and to avoid error of mismatched columns.
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]
    length = len(df)
    df.loc[length] = individual_row_data
df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHN,[CHN] SHI Yu Qi,,57945,101815,18,Asia,China
1,2,,,DEN,[DEN] Viktor AXELSEN,,25831,93790,14,Europe,Denmark
2,3,,,DEN,[DEN] Anders ANTONSEN,,91554,84241,22,Europe,Denmark
3,4,,,JPN,[JPN] Kodai NARAOKA,,62063,84197,22,Asia,Japan
4,5,,,MAS,[MAS] LEE Zii Jia,,81561,83716,22,Asia,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...
95,96,,,IND,[IND] Raghu MARISWAMY,,62932,19070,18,Asia,India
96,97,,,PER,[PER] Adriano VIALE,,55942,19018,15,Pan America,Peru
97,98,,,GER,[GER] Kai SCHAEFER,,20808,18741,19,Europe,Germany
98,99,,,FRA,[FRA] Enogat ROY,,85118,18630,16,Europe,France


In [202]:
# Dropping the columns 'x1', 'x2', 'x3'
df = df.drop(columns=['x1', 'x2', 'x3'])
df

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country
0,1,CHN,[CHN] SHI Yu Qi,57945,101815,18,Asia,China
1,2,DEN,[DEN] Viktor AXELSEN,25831,93790,14,Europe,Denmark
2,3,DEN,[DEN] Anders ANTONSEN,91554,84241,22,Europe,Denmark
3,4,JPN,[JPN] Kodai NARAOKA,62063,84197,22,Asia,Japan
4,5,MAS,[MAS] LEE Zii Jia,81561,83716,22,Asia,Malaysia
...,...,...,...,...,...,...,...,...
95,96,IND,[IND] Raghu MARISWAMY,62932,19070,18,Asia,India
96,97,PER,[PER] Adriano VIALE,55942,19018,15,Pan America,Peru
97,98,GER,[GER] Kai SCHAEFER,20808,18741,19,Europe,Germany
98,99,FRA,[FRA] Enogat ROY,85118,18630,16,Europe,France


In [83]:
# Export the result as csv file
# df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\Player.csv', index = False)

### **Men's Singles**

In [82]:
# Men's Singles URL, filtered by 100 results on each page.
ms_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42511&category=472&C472FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [84]:
# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results.
# We need this info to work with the last page which doesn't contain 100 results as the previous ones.
ms_page_number = 19
ms_results = 1831

In [86]:
# From the Test above, now we know all the columns needed for the DataFrame. Create a list of column names.
list = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']

In [88]:
# Create DataFrame
ms_df = pd.DataFrame(columns = list)
ms_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country


In [90]:
# Create a loop to add data to all pages in DataFrame 
# Loop through pages 1 to 19
for ms_page in range(1, ms_page_number +1):
    if ms_page < ms_page_number: # From pages 1-18 with 100 results on each page
        # Format the URL with the current page number
        ms_url = ms_base_url.format(ms_page)
        # Make a request to the URL
        ms_response = requests.get(ms_url, headers=headers)
        ms_soup = BeautifulSoup(ms_response.text, 'html.parser')
        ms_ranking_table = ms_soup.find('table', class_='ruler')
        ms_all_rows = ms_ranking_table.find_all('tr')[2:102]
        for ms_row in ms_all_rows:
            ms_row_data = ms_row.find_all('td')
            ms_individual_row_data = [ms_data.text.strip() for ms_data in ms_row_data]
            ms_length = len(ms_df)
            if ms_length < ms_results:
                ms_df.loc[ms_length] = ms_individual_row_data
            else:
                break
            
    else: # Page 19 with less than 100 results on this page
        # Format the URL with the current page number
        ms_url = ms_base_url.format(ms_page)
        # Make a request to the URL
        ms_response = requests.get(ms_url, headers=headers)
        ms_soup = BeautifulSoup(ms_response.text, 'html.parser')
        ms_ranking_table = ms_soup.find('table', class_='ruler')
        ms_all_rows = ms_ranking_table.find_all('tr')[2:(ms_results-((ms_page_number-1)*100) +2)]
        for ms_row in ms_all_rows:
            ms_row_data = ms_row.find_all('td')
            ms_individual_row_data = [ms_data.text.strip() for ms_data in ms_row_data]
            ms_length = len(ms_df)
            if ms_length < ms_results:
                ms_df.loc[ms_length] = ms_individual_row_data
            else:
                break

ms_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHN,[CHN] SHI Yu Qi,,57945,101815,18,Asia,China
1,2,,,DEN,[DEN] Viktor AXELSEN,,25831,93790,14,Europe,Denmark
2,3,,,DEN,[DEN] Anders ANTONSEN,,91554,84241,22,Europe,Denmark
3,4,,,JPN,[JPN] Kodai NARAOKA,,62063,84197,22,Asia,Japan
4,5,,,MAS,[MAS] LEE Zii Jia,,81561,83716,22,Asia,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...
1826,1767,,,UGA,[UGA] Hassan Jamil MAKINDA,,33916,20,1,Africa,Uganda
1827,1767,,,UGA,[UGA] Latif NGOBI,,29626,20,1,Africa,Uganda
1828,1767,,,UGA,[UGA] Abaasi BBAALE,,89938,20,1,Africa,Uganda
1829,1767,,,UGA,[UGA] Preet Nikulkumar PATEL,,34706,20,1,Africa,Uganda


In [91]:
# Dropping the columns 'x1', 'x2', 'x3'
ms_df = ms_df.drop(columns=['x1', 'x2', 'x3'])

In [92]:
# Add 'Category' column
ms_df['Category'] = 'MS'

In [93]:
ms_df

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHN,[CHN] SHI Yu Qi,57945,101815,18,Asia,China,MS
1,2,DEN,[DEN] Viktor AXELSEN,25831,93790,14,Europe,Denmark,MS
2,3,DEN,[DEN] Anders ANTONSEN,91554,84241,22,Europe,Denmark,MS
3,4,JPN,[JPN] Kodai NARAOKA,62063,84197,22,Asia,Japan,MS
4,5,MAS,[MAS] LEE Zii Jia,81561,83716,22,Asia,Malaysia,MS
...,...,...,...,...,...,...,...,...,...
1826,1767,UGA,[UGA] Hassan Jamil MAKINDA,33916,20,1,Africa,Uganda,MS
1827,1767,UGA,[UGA] Latif NGOBI,29626,20,1,Africa,Uganda,MS
1828,1767,UGA,[UGA] Abaasi BBAALE,89938,20,1,Africa,Uganda,MS
1829,1767,UGA,[UGA] Preet Nikulkumar PATEL,34706,20,1,Africa,Uganda,MS


In [94]:
#ms_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\MS Player.csv', index = False)

### **Women's Singles**

In [96]:
# Women's Singles URL, filtered by 100 results on each page.
ws_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42633&category=473&C473FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [97]:
# After checking with the filter by 100 results on each page.
# We need this info to work with the last page which doesn't contain 100 results as the previous ones.
ws_page_number = 13
ws_results = 1224

In [98]:
# From the Test above, now we know all the columns needed for the DataFrame. Create a list of column names.
list = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']

In [99]:
# Create DataFrame
ws_df = pd.DataFrame(columns = list)
ws_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country


In [100]:
# Create a loop to add data to all pages in DataFrame 
# Loop through pages 1 to 19
for ws_page in range(1, ws_page_number +1):
    if ws_page < ws_page_number: # From pages 1-18 with 100 results on each page
        # Format the URL with the current page number
        ws_url = ws_base_url.format(ws_page)
        # Make a request to the URL
        ws_response = requests.get(ws_url, headers=headers)
        ws_soup = BeautifulSoup(ws_response.text, 'html.parser')
        ws_ranking_table = ws_soup.find('table', class_='ruler')
        ws_all_rows = ws_ranking_table.find_all('tr')[2:102]
        for ws_row in ws_all_rows:
            ws_row_data = ws_row.find_all('td')
            ws_individual_row_data = [ws_data.text.strip() for ws_data in ws_row_data]
            ws_length = len(ws_df)
            if ws_length < ws_results:
                ws_df.loc[ws_length] = ws_individual_row_data
            else:
                break
            
    else: # Page 19 with less than 100 results on this page
        # Format the URL with the current page number
        ws_url = ws_base_url.format(ws_page)
        # Make a request to the URL
        ws_response = requests.get(ws_url, headers=headers)
        ws_soup = BeautifulSoup(ws_response.text, 'html.parser')
        ws_ranking_table = ws_soup.find('table', class_='ruler')
        ws_all_rows = ws_ranking_table.find_all('tr')[2:(ws_results-((ws_page_number-1)*100) +2)]
        for ws_row in ws_all_rows:
            ws_row_data = ws_row.find_all('td')
            ws_individual_row_data = [ws_data.text.strip() for ws_data in ws_row_data]
            ws_length = len(ws_df)
            if ws_length < ws_results:
                ws_df.loc[ws_length] = ws_individual_row_data
            else:
                break

ws_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,2,CHN,[CHN] CHEN Yu Fei,,78778,101682,14,Asia,China
1,2,,,KOR,[KOR] AN Se Young,,87442,100337,12,Asia,Korea
2,3,,,CHN,[CHN] WANG Zhi Yi,,61854,90895,19,Asia,China
3,4,,,ESP,[ESP] Carolina MARIN,,18228,87736,14,Europe,Spain
4,5,,,TPE,[TPE] TAI Tzu Ying,,61427,87111,17,Asia,Chinese Taipei
...,...,...,...,...,...,...,...,...,...,...,...
1219,1194,,,DEN,[DEN] Mathilde SLOTSAGER,,76549,20,1,Europe,Denmark
1220,1194,,,EST,[EST] Karolina PINTŠUK,,87914,20,1,Europe,Estonia
1221,1194,,,FRA,[FRA] Manon HEITZMANN,,99557,20,1,Europe,France
1222,1194,,,DEN,[DEN] Julie BRINCH,,98323,20,1,Europe,Denmark


In [101]:
# Dropping the columns 'x1', 'x2', 'x3'
ws_df = ws_df.drop(columns=['x1', 'x2', 'x3'])

In [102]:
# Add 'Category' column
ws_df['Category'] = 'WS'

In [103]:
ws_df

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHN,[CHN] CHEN Yu Fei,78778,101682,14,Asia,China,WS
1,2,KOR,[KOR] AN Se Young,87442,100337,12,Asia,Korea,WS
2,3,CHN,[CHN] WANG Zhi Yi,61854,90895,19,Asia,China,WS
3,4,ESP,[ESP] Carolina MARIN,18228,87736,14,Europe,Spain,WS
4,5,TPE,[TPE] TAI Tzu Ying,61427,87111,17,Asia,Chinese Taipei,WS
...,...,...,...,...,...,...,...,...,...
1219,1194,DEN,[DEN] Mathilde SLOTSAGER,76549,20,1,Europe,Denmark,WS
1220,1194,EST,[EST] Karolina PINTŠUK,87914,20,1,Europe,Estonia,WS
1221,1194,FRA,[FRA] Manon HEITZMANN,99557,20,1,Europe,France,WS
1222,1194,DEN,[DEN] Julie BRINCH,98323,20,1,Europe,Denmark,WS


In [104]:
#ws_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\WS Player.csv', index = False)

### **Men's Doubles**

In [106]:
# Men's Doubles URL, filtered by 100 results on each page.
md_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42633&category=474&C474FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [107]:
# After checking with the filter by 100 results on each page.
# We need this info to work with the last page which doesn't contain 100 results as the previous ones.
md_page_number = 15
md_results = 1429

In [108]:
# From the Test above, now we know all the columns needed for the DataFrame. Create a list of column names.
list = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']

In [109]:
# Create DataFrame
md_df = pd.DataFrame(columns = list)
md_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country


In [110]:
# Create a loop to add data to all pages in DataFrame 
# Loop through pages 1 to 19
for md_page in range(1, md_page_number +1):
    if md_page < md_page_number: # From pages 1-18 with 100 results on each page
        # Format the URL with the current page number
        md_url = md_base_url.format(md_page)
        # Make a request to the URL
        md_response = requests.get(md_url, headers=headers)
        md_soup = BeautifulSoup(md_response.text, 'html.parser')
        md_ranking_table = md_soup.find('table', class_='ruler')
        md_all_rows = md_ranking_table.find_all('tr')[2:102]
        for md_row in md_all_rows:
            md_row_data = md_row.find_all('td')
            md_individual_row_data = [md_data.text.strip() for md_data in md_row_data]
            md_length = len(md_df)
            if md_length < md_results:
                md_df.loc[md_length] = md_individual_row_data
            else:
                break
            
    else: # Page 19 with less than 100 results on this page
        # Format the URL with the current page number
        md_url = md_base_url.format(md_page)
        # Make a request to the URL
        md_response = requests.get(md_url, headers=headers)
        md_soup = BeautifulSoup(md_response.text, 'html.parser')
        md_ranking_table = md_soup.find('table', class_='ruler')
        md_all_rows = md_ranking_table.find_all('tr')[2:(md_results-((md_page_number-1)*100) +2)]
        for md_row in md_all_rows:
            md_row_data = md_row.find_all('td')
            md_individual_row_data = [md_data.text.strip() for md_data in md_row_data]
            md_length = len(md_df)
            if md_length < md_results:
                md_df.loc[md_length] = md_individual_row_data
            else:
                break

md_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHNCHN,[CHN] LIANG Wei Keng[CHN] WANG Chang,,9053155414,96798,14,Asia,China
1,2,,,DENDEN,[DEN] Kim ASTRUP[DEN] Anders Skaarup RASMUSSEN,,9298044414,91753,20,Europe,Denmark
2,3,,,KORKOR,[KOR] KANG Min Hyuk[KOR] SEO Seung Jae,,7726266513,90415,20,Asia,Korea
3,4,,,MASMAS,[MAS] Aaron CHIA[MAS] SOH Wooi Yik,,5620399389,84315,16,Asia,Malaysia
4,5,,,CHNCHN,[CHN] HE Ji Ting[CHN] REN Xiang Yu,,5967384812,83961,23,Asia,China
...,...,...,...,...,...,...,...,...,...,...,...
1424,1415,,,ESPESP,[ESP] Fernando CIVICO[ESP] Ricardo RETTIG,,9632969882,20,1,Europe,Spain
1425,1426,,,BELBEL,[BEL] Elias BRACKE[BEL] Senne HOUTHOOFD,,6725155858,15,1,Europe,Belgium
1426,1427,,,BELBEL,[BEL] Charles FOUYN[BEL] Baptiste ROLIN,,9970396561,2,1,Europe,Belgium
1427,1427,,,FIJFIJ,[FIJ] Jared CHUNG[FIJ] Robert LOO,,8267255618,2,1,Oceania,Fiji


In [111]:
# Dropping the columns 'x1', 'x2', 'x3'
md_df = md_df.drop(columns=['x1', 'x2', 'x3'])

In [112]:
# Add 'Category' column
md_df['Category'] = 'MD'

In [113]:
md_df

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHNCHN,[CHN] LIANG Wei Keng[CHN] WANG Chang,9053155414,96798,14,Asia,China,MD
1,2,DENDEN,[DEN] Kim ASTRUP[DEN] Anders Skaarup RASMUSSEN,9298044414,91753,20,Europe,Denmark,MD
2,3,KORKOR,[KOR] KANG Min Hyuk[KOR] SEO Seung Jae,7726266513,90415,20,Asia,Korea,MD
3,4,MASMAS,[MAS] Aaron CHIA[MAS] SOH Wooi Yik,5620399389,84315,16,Asia,Malaysia,MD
4,5,CHNCHN,[CHN] HE Ji Ting[CHN] REN Xiang Yu,5967384812,83961,23,Asia,China,MD
...,...,...,...,...,...,...,...,...,...
1424,1415,ESPESP,[ESP] Fernando CIVICO[ESP] Ricardo RETTIG,9632969882,20,1,Europe,Spain,MD
1425,1426,BELBEL,[BEL] Elias BRACKE[BEL] Senne HOUTHOOFD,6725155858,15,1,Europe,Belgium,MD
1426,1427,BELBEL,[BEL] Charles FOUYN[BEL] Baptiste ROLIN,9970396561,2,1,Europe,Belgium,MD
1427,1427,FIJFIJ,[FIJ] Jared CHUNG[FIJ] Robert LOO,8267255618,2,1,Oceania,Fiji,MD


In [114]:
#md_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\MD Player.csv', index = False)

### **Women's Doubles**

In [116]:
# Women's Doubles URL, filtered by 100 results on each page.
wd_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42633&category=475&C475FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [117]:
# After checking with the filter by 100 results on each page.
# We need this info to work with the last page which doesn't contain 100 results as the previous ones.
wd_page_number = 11
wd_results = 1029

In [118]:
# From the Test above, now we know all the columns needed for the DataFrame. Create a list of column names.
list = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']

In [119]:
# Create DataFrame
wd_df = pd.DataFrame(columns = list)
wd_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country


In [120]:
# Create a loop to add data to all pages in DataFrame 
# Loop through pages 1 to 19
for wd_page in range(1, wd_page_number +1):
    if wd_page < wd_page_number: # From pages 1-18 with 100 results on each page
        # Format the URL with the current page number
        wd_url = wd_base_url.format(wd_page)
        # Make a request to the URL
        wd_response = requests.get(wd_url, headers=headers)
        wd_soup = BeautifulSoup(wd_response.text, 'html.parser')
        wd_ranking_table = wd_soup.find('table', class_='ruler')
        wd_all_rows = wd_ranking_table.find_all('tr')[2:102]
        for wd_row in wd_all_rows:
            wd_row_data = wd_row.find_all('td')
            wd_individual_row_data = [wd_data.text.strip() for wd_data in wd_row_data]
            wd_length = len(wd_df)
            if wd_length < wd_results:
                wd_df.loc[wd_length] = wd_individual_row_data
            else:
                break
            
    else: # Page 19 with less than 100 results on this page
        # Format the URL with the current page number
        wd_url = wd_base_url.format(wd_page)
        # Make a request to the URL
        wd_response = requests.get(wd_url, headers=headers)
        wd_soup = BeautifulSoup(wd_response.text, 'html.parser')
        wd_ranking_table = wd_soup.find('table', class_='ruler')
        wd_all_rows = wd_ranking_table.find_all('tr')[2:(wd_results-((wd_page_number-1)*100) +2)]
        for wd_row in wd_all_rows:
            wd_row_data = wd_row.find_all('td')
            wd_individual_row_data = [wd_data.text.strip() for wd_data in wd_row_data]
            wd_length = len(wd_df)
            if wd_length < wd_results:
                wd_df.loc[wd_length] = wd_individual_row_data
            else:
                break

wd_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,1,CHNCHN,[CHN] CHEN Qing Chen[CHN] JIA Yi Fan,,9412565144,104406,13,Asia,China
1,2,,,CHNCHN,[CHN] LIU Sheng Shu[CHN] TAN Ning,,8159959880,99396,21,Asia,China
2,3,,,KORKOR,[KOR] BAEK Ha Na[KOR] LEE So Hee,,5670691292,98116,15,Asia,Korea
3,4,,,JPNJPN,[JPN] Nami MATSUYAMA[JPN] Chiharu SHIDA,,6521268282,91175,18,Asia,Japan
4,5,,,CHNCHN,[CHN] ZHANG Shu Xian[CHN] ZHENG Yu,,8591485331,81491,14,Asia,China
...,...,...,...,...,...,...,...,...,...,...,...
1024,1023,,,SVKSVK,[SVK] Johanka IVANOVICOVA[SVK] Olivia KADLECOVA,,7760583887,40,1,Europe,Slovakia
1025,1026,,,TURTUR,[TUR] Yasemen BEKTAS[TUR] Cansu ERCETIN,,8662191818,2,1,Europe,Türkiye
1026,1026,,,SUISUI,[SUI] Milena SCHNIDER[SUI] Jenjira STADELMANN,,9752959566,2,1,Europe,Switzerland
1027,1026,,,PYFPYF,[PYF] Heirautea CURET[PYF] Esther TAU,,9190687972,2,1,Oceania,Tahiti


In [121]:
# Dropping the columns 'x1', 'x2', 'x3'
wd_df = wd_df.drop(columns=['x1', 'x2', 'x3'])

In [122]:
# Add 'Category' column
wd_df['Category'] = 'WD'

In [123]:
wd_df

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHNCHN,[CHN] CHEN Qing Chen[CHN] JIA Yi Fan,9412565144,104406,13,Asia,China,WD
1,2,CHNCHN,[CHN] LIU Sheng Shu[CHN] TAN Ning,8159959880,99396,21,Asia,China,WD
2,3,KORKOR,[KOR] BAEK Ha Na[KOR] LEE So Hee,5670691292,98116,15,Asia,Korea,WD
3,4,JPNJPN,[JPN] Nami MATSUYAMA[JPN] Chiharu SHIDA,6521268282,91175,18,Asia,Japan,WD
4,5,CHNCHN,[CHN] ZHANG Shu Xian[CHN] ZHENG Yu,8591485331,81491,14,Asia,China,WD
...,...,...,...,...,...,...,...,...,...
1024,1023,SVKSVK,[SVK] Johanka IVANOVICOVA[SVK] Olivia KADLECOVA,7760583887,40,1,Europe,Slovakia,WD
1025,1026,TURTUR,[TUR] Yasemen BEKTAS[TUR] Cansu ERCETIN,8662191818,2,1,Europe,Türkiye,WD
1026,1026,SUISUI,[SUI] Milena SCHNIDER[SUI] Jenjira STADELMANN,9752959566,2,1,Europe,Switzerland,WD
1027,1026,PYFPYF,[PYF] Heirautea CURET[PYF] Esther TAU,9190687972,2,1,Oceania,Tahiti,WD


In [124]:
#wd_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\WD Player.csv', index = False)

### **Mixed Doubles**

In [126]:
# Mixed Doubles URL, filtered by 100 results on each page.
xd_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42633&category=476&C476FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [127]:
# After checking with the filter by 100 results on each page.
# We need this info to work with the last page which doesn't contain 100 results as the previous ones.
xd_page_number = 15
xd_results = 1402

In [128]:
# From the Test above, now we know all the columns needed for the DataFrame. Create a list of column names.
list = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']

In [129]:
# Create DataFrame
xd_df = pd.DataFrame(columns = list)
xd_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country


In [130]:
# Create a loop to add data to all pages in DataFrame 
# Loop through pages 1 to 19
for xd_page in range(1, xd_page_number +1):
    if xd_page < xd_page_number: # From pages 1-18 with 100 results on each page
        # Format the URL with the current page number
        xd_url = xd_base_url.format(xd_page)
        # Make a request to the URL
        xd_response = requests.get(xd_url, headers=headers)
        xd_soup = BeautifulSoup(xd_response.text, 'html.parser')
        xd_ranking_table = xd_soup.find('table', class_='ruler')
        xd_all_rows = xd_ranking_table.find_all('tr')[2:102]
        for xd_row in xd_all_rows:
            xd_row_data = xd_row.find_all('td')
            xd_individual_row_data = [xd_data.text.strip() for xd_data in xd_row_data]
            xd_length = len(xd_df)
            if xd_length < xd_results:
                xd_df.loc[xd_length] = xd_individual_row_data
            else:
                break
            
    else: # Page 19 with less than 100 results on this page
        # Format the URL with the current page number
        xd_url = xd_base_url.format(xd_page)
        # Make a request to the URL
        xd_response = requests.get(xd_url, headers=headers)
        xd_soup = BeautifulSoup(xd_response.text, 'html.parser')
        xd_ranking_table = xd_soup.find('table', class_='ruler')
        xd_all_rows = xd_ranking_table.find_all('tr')[2:(xd_results-((xd_page_number-1)*100) +2)]
        for xd_row in xd_all_rows:
            xd_row_data = xd_row.find_all('td')
            xd_individual_row_data = [xd_data.text.strip() for xd_data in xd_row_data]
            xd_length = len(xd_df)
            if xd_length < xd_results:
                xd_df.loc[xd_length] = xd_individual_row_data
            else:
                break

xd_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHNCHN,[CHN] ZHENG Si Wei[CHN] HUANG Ya Qiong,,8304663168,106850,13,Asia,China
1,2,,,CHNCHN,[CHN] FENG Yan Zhe[CHN] HUANG Dong Ping,,6526789426,99100,16,Asia,China
2,3,,,CHNCHN,[CHN] JIANG Zhen Bang[CHN] WEI Ya Xin,,6302957246,94050,19,Asia,China
3,4,,,KORKOR,[KOR] SEO Seung Jae[KOR] CHAE Yu Jung,,6651393074,83220,14,Asia,Korea
4,5,,,JPNJPN,[JPN] Yuta WATANABE[JPN] Arisa IGARASHI,,5824079558,80970,14,Asia,Japan
...,...,...,...,...,...,...,...,...,...,...,...
1397,1376,,,ISRMLT,[ISR] Mark ARONCHIK[MLT] Emily ABELA,,6982091376,20,1,Europe,IsraelMalta
1398,1376,,,ESTEST,[EST] Dennis KUMAR[EST] Mia SAKARIAS,,8564580515,20,1,Europe,Estonia
1399,1376,,,DENDEN,[DEN] Victor Roed SKOV[DEN] LæRke Wiktoria WENDEL,,7843769090,20,1,Europe,Denmark
1400,1376,,,MLTMLT,[MLT] Jeremy Mark GATT[MLT] Martina CLARK,,3638768347,20,1,Europe,Malta


In [131]:
# Dropping the columns 'x1', 'x2', 'x3'
xd_df = xd_df.drop(columns=['x1', 'x2', 'x3'])

In [132]:
# Add 'Category' column
xd_df['Category'] = 'XD'

In [133]:
xd_df

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHNCHN,[CHN] ZHENG Si Wei[CHN] HUANG Ya Qiong,8304663168,106850,13,Asia,China,XD
1,2,CHNCHN,[CHN] FENG Yan Zhe[CHN] HUANG Dong Ping,6526789426,99100,16,Asia,China,XD
2,3,CHNCHN,[CHN] JIANG Zhen Bang[CHN] WEI Ya Xin,6302957246,94050,19,Asia,China,XD
3,4,KORKOR,[KOR] SEO Seung Jae[KOR] CHAE Yu Jung,6651393074,83220,14,Asia,Korea,XD
4,5,JPNJPN,[JPN] Yuta WATANABE[JPN] Arisa IGARASHI,5824079558,80970,14,Asia,Japan,XD
...,...,...,...,...,...,...,...,...,...
1397,1376,ISRMLT,[ISR] Mark ARONCHIK[MLT] Emily ABELA,6982091376,20,1,Europe,IsraelMalta,XD
1398,1376,ESTEST,[EST] Dennis KUMAR[EST] Mia SAKARIAS,8564580515,20,1,Europe,Estonia,XD
1399,1376,DENDEN,[DEN] Victor Roed SKOV[DEN] LæRke Wiktoria WENDEL,7843769090,20,1,Europe,Denmark,XD
1400,1376,MLTMLT,[MLT] Jeremy Mark GATT[MLT] Martina CLARK,3638768347,20,1,Europe,Malta,XD


In [134]:
#xd_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\XD Player.csv', index = False)

In [176]:
# List of DataFrames to concatenate
dfs = [ms_df, ws_df, md_df, wd_df, xd_df]

# Concatenate the DataFrames
all_players = pd.concat(dfs, ignore_index=True)

# Display the resulting DataFrame
all_players

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHN,[CHN] SHI Yu Qi,57945,101815,18,Asia,China,MS
1,2,DEN,[DEN] Viktor AXELSEN,25831,93790,14,Europe,Denmark,MS
2,3,DEN,[DEN] Anders ANTONSEN,91554,84241,22,Europe,Denmark,MS
3,4,JPN,[JPN] Kodai NARAOKA,62063,84197,22,Asia,Japan,MS
4,5,MAS,[MAS] LEE Zii Jia,81561,83716,22,Asia,Malaysia,MS
...,...,...,...,...,...,...,...,...,...
6910,1376,ISRMLT,[ISR] Mark ARONCHIK[MLT] Emily ABELA,6982091376,20,1,Europe,IsraelMalta,XD
6911,1376,ESTEST,[EST] Dennis KUMAR[EST] Mia SAKARIAS,8564580515,20,1,Europe,Estonia,XD
6912,1376,DENDEN,[DEN] Victor Roed SKOV[DEN] LæRke Wiktoria WENDEL,7843769090,20,1,Europe,Denmark,XD
6913,1376,MLTMLT,[MLT] Jeremy Mark GATT[MLT] Martina CLARK,3638768347,20,1,Europe,Malta,XD


In [178]:
all_players.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\All_Players.csv', index = False)

In [218]:
df1 = pd.read_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\All_Players.csv')
df1

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHN,[CHN] SHI Yu Qi,57945,101815,18,Asia,China,MS
1,2,DEN,[DEN] Viktor AXELSEN,25831,93790,14,Europe,Denmark,MS
2,3,DEN,[DEN] Anders ANTONSEN,91554,84241,22,Europe,Denmark,MS
3,4,JPN,[JPN] Kodai NARAOKA,62063,84197,22,Asia,Japan,MS
4,5,MAS,[MAS] LEE Zii Jia,81561,83716,22,Asia,Malaysia,MS
...,...,...,...,...,...,...,...,...,...
6910,1376,ISRMLT,[ISR] Mark ARONCHIK[MLT] Emily ABELA,6982091376,20,1,Europe,IsraelMalta,XD
6911,1376,ESTEST,[EST] Dennis KUMAR[EST] Mia SAKARIAS,8564580515,20,1,Europe,Estonia,XD
6912,1376,DENDEN,[DEN] Victor Roed SKOV[DEN] LæRke Wiktoria WENDEL,7843769090,20,1,Europe,Denmark,XD
6913,1376,MLTMLT,[MLT] Jeremy Mark GATT[MLT] Martina CLARK,3638768347,20,1,Europe,Malta,XD


In [220]:
# Function to format the 6-character values
def format_category(value):
    if len(value) == 6:
        return value[:3] + '/' + value[3:]
    return value

# Apply the function to the 'Country Code' column
df1['Country Code'] = df1['Country Code'].apply(format_category)

# Display the resulting DataFrame
df1

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHN,[CHN] SHI Yu Qi,57945,101815,18,Asia,China,MS
1,2,DEN,[DEN] Viktor AXELSEN,25831,93790,14,Europe,Denmark,MS
2,3,DEN,[DEN] Anders ANTONSEN,91554,84241,22,Europe,Denmark,MS
3,4,JPN,[JPN] Kodai NARAOKA,62063,84197,22,Asia,Japan,MS
4,5,MAS,[MAS] LEE Zii Jia,81561,83716,22,Asia,Malaysia,MS
...,...,...,...,...,...,...,...,...,...
6910,1376,ISR/MLT,[ISR] Mark ARONCHIK[MLT] Emily ABELA,6982091376,20,1,Europe,IsraelMalta,XD
6911,1376,EST/EST,[EST] Dennis KUMAR[EST] Mia SAKARIAS,8564580515,20,1,Europe,Estonia,XD
6912,1376,DEN/DEN,[DEN] Victor Roed SKOV[DEN] LæRke Wiktoria WENDEL,7843769090,20,1,Europe,Denmark,XD
6913,1376,MLT/MLT,[MLT] Jeremy Mark GATT[MLT] Martina CLARK,3638768347,20,1,Europe,Malta,XD


In [212]:
import re

In [222]:
# Function to process the 'Player' column
def format_player(value):
    # Add '/' before the second '[yyy]'
    value = re.sub(r'(\[\w{3}\])([^[]+)(\[\w{3}\])', r'\1\2/\3', value)
    # Remove all occurrences of '[xxx]'
    value = re.sub(r'\[\w{3}\]', '', value)
    # Remove any leading/trailing whitespace
    return value.strip()

# Apply the function to the 'Player' column
df1['Player'] = df1['Player'].apply(format_player)

# Display the resulting DataFrame
df1

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHN,SHI Yu Qi,57945,101815,18,Asia,China,MS
1,2,DEN,Viktor AXELSEN,25831,93790,14,Europe,Denmark,MS
2,3,DEN,Anders ANTONSEN,91554,84241,22,Europe,Denmark,MS
3,4,JPN,Kodai NARAOKA,62063,84197,22,Asia,Japan,MS
4,5,MAS,LEE Zii Jia,81561,83716,22,Asia,Malaysia,MS
...,...,...,...,...,...,...,...,...,...
6910,1376,ISR/MLT,Mark ARONCHIK/ Emily ABELA,6982091376,20,1,Europe,IsraelMalta,XD
6911,1376,EST/EST,Dennis KUMAR/ Mia SAKARIAS,8564580515,20,1,Europe,Estonia,XD
6912,1376,DEN/DEN,Victor Roed SKOV/ LæRke Wiktoria WENDEL,7843769090,20,1,Europe,Denmark,XD
6913,1376,MLT/MLT,Jeremy Mark GATT/ Martina CLARK,3638768347,20,1,Europe,Malta,XD


In [228]:
df1['Country']

0                 China
1               Denmark
2               Denmark
3                 Japan
4              Malaysia
             ...       
6910        IsraelMalta
6911            Estonia
6912            Denmark
6913              Malta
6914    ScotlandEstonia
Name: Country, Length: 6915, dtype: object

In [232]:
country_list = df1['Country'].tolist()
country_list

['China',
 'Denmark',
 'Denmark',
 'Japan',
 'Malaysia',
 'China',
 'Indonesia',
 'Thailand',
 'Chinese Taipei',
 'Indonesia',
 'Singapore',
 'Chinese Taipei',
 'Japan',
 'Japan',
 'China',
 'Hong Kong China',
 'China',
 'India',
 'France',
 'India',
 'Hong Kong China',
 'France',
 'China',
 'Chinese Taipei',
 'Chinese Taipei',
 'Canada',
 'France',
 'Denmark',
 'Malaysia',
 'Japan',
 'Indonesia',
 'Malaysia',
 'Japan',
 'Chinese Taipei',
 'India',
 'Thailand',
 'Chinese Taipei',
 'India',
 'Korea',
 'Singapore',
 'India',
 'Ireland',
 'Denmark',
 'India',
 'Indonesia',
 'Hong Kong China',
 'Denmark',
 'France',
 'Belgium',
 'Japan',
 'Malaysia',
 'Netherlands',
 'Brazil',
 'Japan',
 'Guatemala',
 'China',
 'Indonesia',
 'India',
 'Malaysia',
 'Finland',
 'Vietnam',
 'Czechia',
 'China',
 'El Salvador',
 'Azerbaijan',
 'Finland',
 'Thailand',
 'Spain',
 'Malaysia',
 'India',
 'Israel',
 'Brazil',
 'Vietnam',
 'Chinese Taipei',
 'Kazakhstan',
 'India',
 'India',
 'Italy',
 'Sri Lanka',


In [234]:
# Function to separate concatenated country names
def separate_countries(value):
    for country in country_list:
        if value.startswith(country):
            rest = value[len(country):]
            if rest and any(rest.startswith(other_country) for other_country in country_list):
                return country + '/' + rest
    return value

# Apply the function to the 'Country' column
df1['Country'] = df1['Country'].apply(separate_countries)

# Display the resulting DataFrame
df1

Unnamed: 0,Rank,Country Code,Player,Member ID,Points,Tournaments,Confederation,Country,Category
0,1,CHN,SHI Yu Qi,57945,101815,18,Asia,China,MS
1,2,DEN,Viktor AXELSEN,25831,93790,14,Europe,Denmark,MS
2,3,DEN,Anders ANTONSEN,91554,84241,22,Europe,Denmark,MS
3,4,JPN,Kodai NARAOKA,62063,84197,22,Asia,Japan,MS
4,5,MAS,LEE Zii Jia,81561,83716,22,Asia,Malaysia,MS
...,...,...,...,...,...,...,...,...,...
6910,1376,ISR/MLT,Mark ARONCHIK/ Emily ABELA,6982091376,20,1,Europe,Israel/Malta,XD
6911,1376,EST/EST,Dennis KUMAR/ Mia SAKARIAS,8564580515,20,1,Europe,Estonia,XD
6912,1376,DEN/DEN,Victor Roed SKOV/ LæRke Wiktoria WENDEL,7843769090,20,1,Europe,Denmark,XD
6913,1376,MLT/MLT,Jeremy Mark GATT/ Martina CLARK,3638768347,20,1,Europe,Malta,XD


In [236]:
df1.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\Player list.csv', index = False)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Men's Singles URL, Filtered by 100 results on each page.
ms_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42511&category=472&C472FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results.
# We need this trip to work with the last page which doesn't contain 100 results as the previous ones.
ms_page_number = 1
ms_results = 1831

# Create DataFrame
list_of_columns = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']
ms_df = pd.DataFrame(columns=list_of_columns)

# Create a loop to add data to all pages in DataFrame
# Loop through pages 1 to 19
for ms_page in range(1, ms_page_number + 20):
    if ms_page < 19:  # Pages 1-18 with 100 results on each page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:102]  # Skip the header rows

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = [data.text.strip() if data.find('a') is None else data.find('a')['href'] for data in row_data]
            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data
            
            if len(ms_df) >= ms_results:
                break
    else:  # Page 19 with less than 100 results on this page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:(ms_results - ((ms_page_number - 1) * 100) + 2)]

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = [data.text.strip() if data.find('a') is None else data.find('a')['href'] for data in row_data]
            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data

            if len(ms_df) >= ms_results:
                break

# Display the resulting DataFrame
print(ms_df)


      Rank x1 x2 Country Code                               Player  \
0        1                CHN   player.aspx?id=42511&player=990139   
1        2                DEN   player.aspx?id=42511&player=147387   
2        3                DEN   player.aspx?id=42511&player=396249   
3        4                JPN  player.aspx?id=42511&player=1561759   
4        5                MAS   player.aspx?id=42511&player=318753   
...    ... .. ..          ...                                  ...   
1826  1767                UGA  player.aspx?id=42511&player=7270299   
1827  1767                UGA  player.aspx?id=42511&player=7798125   
1828  1767                UGA  player.aspx?id=42511&player=6354531   
1829  1767                UGA  player.aspx?id=42511&player=7272031   
1830  1767                UGA  player.aspx?id=42511&player=7274677   

                                                     x3 Member ID  Points  \
0     /player-profile/38CBD70E-A643-4170-9E6A-288634...     57945  101815   
1    

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Men's Singles URL, Filtered by 100 results on each page.
ms_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42511&category=472&C472FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results.
# We need this trip to work with the last page which doesn't contain 100 results as the previous ones.
ms_page_number = 1
ms_results = 1831

# Create DataFrame
list_of_columns = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']
ms_df = pd.DataFrame(columns=list_of_columns)

# Create a loop to add data to all pages in DataFrame
# Loop through pages 1 to 19
for ms_page in range(1, ms_page_number + 20):
    if ms_page < 19:  # Pages 1-18 with 100 results on each page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:102]  # Skip the header rows

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    url = data.find('a')['href'] if data.find('a') else ''
                    individual_row_data.append(url)
                else:
                    individual_row_data.append(data.text.strip())

            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data
            
            if len(ms_df) >= ms_results:
                break
    else:  # Page 19 with less than 100 results on this page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:(ms_results - ((ms_page_number - 1) * 100) + 2)]

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    url = data.find('a')['href'] if data.find('a') else ''
                    individual_row_data.append(url)
                else:
                    individual_row_data.append(data.text.strip())

            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data

            if len(ms_df) >= ms_results:
                break

# Display the resulting DataFrame
ms_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHN,[CHN] SHI Yu Qi,/player-profile/38CBD70E-A643-4170-9E6A-288634...,57945,101815,18,Asia,China
1,2,,,DEN,[DEN] Viktor AXELSEN,/player-profile/3BF0C3EA-7BFB-4CC9-A955-3712D8...,25831,93790,14,Europe,Denmark
2,3,,,DEN,[DEN] Anders ANTONSEN,/player-profile/473960FD-334B-476F-9821-8ED939...,91554,84241,22,Europe,Denmark
3,4,,,JPN,[JPN] Kodai NARAOKA,/player-profile/7804BE61-F954-4C46-9E72-FED84B...,62063,84197,22,Asia,Japan
4,5,,,MAS,[MAS] LEE Zii Jia,/player-profile/21E9EAF0-5C86-416F-A942-C3ECA0...,81561,83716,22,Asia,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...
1826,1767,,,UGA,[UGA] Hassan Jamil MAKINDA,/player-profile/7D272D8F-8011-4BB0-AA9E-9C79FE...,33916,20,1,Africa,Uganda
1827,1767,,,UGA,[UGA] Latif NGOBI,/player-profile/A3785297-69E0-4845-BAB5-0235E6...,29626,20,1,Africa,Uganda
1828,1767,,,UGA,[UGA] Abaasi BBAALE,/player-profile/19CD2BAB-6CE5-4ACC-834D-C8C677...,89938,20,1,Africa,Uganda
1829,1767,,,UGA,[UGA] Preet Nikulkumar PATEL,/player-profile/41C69713-E359-41AC-BC17-3CF2E5...,34706,20,1,Africa,Uganda


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Men's Singles URL, Filtered by 100 results on each page.
ms_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42511&category=472&C472FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results.
# We need this trip to work with the last page which doesn't contain 100 results as the previous ones.
ms_page_number = 1
ms_results = 1831

# Create DataFrame
list_of_columns = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']
ms_df = pd.DataFrame(columns=list_of_columns)

# Create a loop to add data to all pages in DataFrame
# Loop through pages 1 to 19
for ms_page in range(1, ms_page_number + 20):
    if ms_page < 19:  # Pages 1-18 with 100 results on each page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:102]  # Skip the header rows

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    url = data.find('a')['href'] if data.find('a') else ''
                    complete_url = 'https://bwf.tournamentsoftware.com' + url if url else ''
                    individual_row_data.append(complete_url)
                else:
                    individual_row_data.append(data.text.strip())

            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data
            
            if len(ms_df) >= ms_results:
                break
    else:  # Page 19 with less than 100 results on this page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:(ms_results - ((ms_page_number - 1) * 100) + 2)]

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    url = data.find('a')['href'] if data.find('a') else ''
                    complete_url = 'https://bwf.tournamentsoftware.com' + url if url else ''
                    individual_row_data.append(complete_url)
                else:
                    individual_row_data.append(data.text.strip())

            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data

            if len(ms_df) >= ms_results:
                break

# Display the resulting DataFrame
ms_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHN,[CHN] SHI Yu Qi,https://bwf.tournamentsoftware.com/player-prof...,57945,101815,18,Asia,China
1,2,,,DEN,[DEN] Viktor AXELSEN,https://bwf.tournamentsoftware.com/player-prof...,25831,93790,14,Europe,Denmark
2,3,,,DEN,[DEN] Anders ANTONSEN,https://bwf.tournamentsoftware.com/player-prof...,91554,84241,22,Europe,Denmark
3,4,,,JPN,[JPN] Kodai NARAOKA,https://bwf.tournamentsoftware.com/player-prof...,62063,84197,22,Asia,Japan
4,5,,,MAS,[MAS] LEE Zii Jia,https://bwf.tournamentsoftware.com/player-prof...,81561,83716,22,Asia,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...
1826,1767,,,UGA,[UGA] Hassan Jamil MAKINDA,https://bwf.tournamentsoftware.com/player-prof...,33916,20,1,Africa,Uganda
1827,1767,,,UGA,[UGA] Latif NGOBI,https://bwf.tournamentsoftware.com/player-prof...,29626,20,1,Africa,Uganda
1828,1767,,,UGA,[UGA] Abaasi BBAALE,https://bwf.tournamentsoftware.com/player-prof...,89938,20,1,Africa,Uganda
1829,1767,,,UGA,[UGA] Preet Nikulkumar PATEL,https://bwf.tournamentsoftware.com/player-prof...,34706,20,1,Africa,Uganda


In [7]:
#ms_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\MS.csv', index = False)

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Men's DOuble URL, Filtered by 100 results on each page.
md_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42633&category=474&C474FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results.
# We need this trip to work with the last page which doesn't contain 100 results as the previous ones.
md_page_number = 15
md_results = 1429

# Create DataFrame
list_of_columns = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']
md_df = pd.DataFrame(columns=list_of_columns)

# Create a loop to add data to all pages in DataFrame
# Loop through pages 1 to 19
for md_page in range(1, md_page_number + 1):
    if md_page < 19:  # Pages 1-18 with 100 results on each page
        md_url = md_base_url.format(md_page)
        response = requests.get(md_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:102]  # Skip the header rows

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    url = data.find_all('a')['href'] if data.find_all('a') else ''
                    complete_url = 'https://bwf.tournamentsoftware.com' + url if url else ''
                    individual_row_data.append(complete_url)
                else:
                    individual_row_data.append(data.text.strip())

            md_df_length = len(md_df)
            md_df.loc[md_df_length] = individual_row_data
            
            if len(md_df) >= md_results:
                break
    else:  # Page 19 with less than 100 results on this page
        md_url = md_base_url.format(md_page)
        response = requests.get(md_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:(md_results - ((md_page_number - 1) * 100) + 2)]

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    url = data.find_all('a')['href'] if data.find_all('a') else ''
                    complete_url = 'https://bwf.tournamentsoftware.com' + url if url else ''
                    individual_row_data.append(complete_url)
                else:
                    individual_row_data.append(data.text.strip())

            md_df_length = len(md_df)
            md_df.loc[md_df_length] = individual_row_data

            if len(md_df) >= md_results:
                break

# Display the resulting DataFrame
md_df

TypeError: list indices must be integers or slices, not str

In [11]:
#md_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\MD.csv', index = False)

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Men's Singles URL, Filtered by 100 results on each page.
md_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42633&category=474&C474FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results.
# We need this trip to work with the last page which doesn't contain 100 results as the previous ones.
md_page_number = 15
md_results = 1429

# Create DataFrame
list_of_columns = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']
md_df = pd.DataFrame(columns=list_of_columns)

# Create a loop to add data to all pages in DataFrame
# Loop through pages 1 to 19
for md_page in range(1, md_page_number + 1):
    if md_page < 19:  # Pages 1-18 with 100 results on each page
        md_url = md_base_url.format(md_page)
        response = requests.get(md_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:102]  # Skip the header rows

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    urls = [a['href'] for a in data.find_all('a', href=True)]
                    complete_urls = ', '.join(['https://bwf.tournamentsoftware.com' + url for url in urls])
                    individual_row_data.append(complete_urls)
                else:
                    individual_row_data.append(data.text.strip())

            md_df_length = len(md_df)
            md_df.loc[md_df_length] = individual_row_data
            
            if len(md_df) >= md_results:
                break
    else:  # Page 19 with less than 100 results on this page
        md_url = md_base_url.format(md_page)
        response = requests.get(md_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:(md_results - ((md_page_number - 1) * 100) + 2)]

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    urls = [a['href'] for a in data.find_all('a', href=True)]
                    complete_urls = ', '.join(['https://bwf.tournamentsoftware.com' + url for url in urls])
                    individual_row_data.append(complete_urls)
                else:
                    individual_row_data.append(data.text.strip())

            md_df_length = len(md_df)
            md_df.loc[md_df_length] = individual_row_data

            if len(md_df) >= md_results:
                break

# Display the resulting DataFrame
md_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHNCHN,[CHN] LIANG Wei Keng[CHN] WANG Chang,https://bwf.tournamentsoftware.com/player-prof...,9053155414,96798,14,Asia,China
1,2,,,DENDEN,[DEN] Kim ASTRUP[DEN] Anders Skaarup RASMUSSEN,https://bwf.tournamentsoftware.com/player-prof...,9298044414,91753,20,Europe,Denmark
2,3,,,KORKOR,[KOR] KANG Min Hyuk[KOR] SEO Seung Jae,https://bwf.tournamentsoftware.com/player-prof...,7726266513,90415,20,Asia,Korea
3,4,,,MASMAS,[MAS] Aaron CHIA[MAS] SOH Wooi Yik,https://bwf.tournamentsoftware.com/player-prof...,5620399389,84315,16,Asia,Malaysia
4,5,,,CHNCHN,[CHN] HE Ji Ting[CHN] REN Xiang Yu,https://bwf.tournamentsoftware.com/player-prof...,5967384812,83961,23,Asia,China
...,...,...,...,...,...,...,...,...,...,...,...
1424,1415,,,ESPESP,[ESP] Fernando CIVICO[ESP] Ricardo RETTIG,https://bwf.tournamentsoftware.com/player-prof...,9632969882,20,1,Europe,Spain
1425,1426,,,BELBEL,[BEL] Elias BRACKE[BEL] Senne HOUTHOOFD,https://bwf.tournamentsoftware.com/player-prof...,6725155858,15,1,Europe,Belgium
1426,1427,,,BELBEL,[BEL] Charles FOUYN[BEL] Baptiste ROLIN,https://bwf.tournamentsoftware.com/player-prof...,9970396561,2,1,Europe,Belgium
1427,1427,,,FIJFIJ,[FIJ] Jared CHUNG[FIJ] Robert LOO,https://bwf.tournamentsoftware.com/player-prof...,8267255618,2,1,Oceania,Fiji


In [20]:
md_df.to_csv(r'C:\Users\Minh.Nguyen\OneDrive - Stats Perform\_badminton - WORLD - BWF\MD.csv', index = False)

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Men's Singles URL, Filtered by 100 results on each page.
ms_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42511&category=472&C472FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results.
# We need this trip to work with the last page which doesn't contain 100 results as the previous ones.
ms_page_number = 1
ms_results = 1831

# Create DataFrame
list_of_columns = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']
md_df = pd.DataFrame(columns=list_of_columns)

# Create a loop to add data to all pages in DataFrame
# Loop through pages 1 to 19
for md_page in range(1, md_page_number + 20):
    if md_page < 19:  # Pages 1-18 with 100 results on each page
        md_url = md_base_url.format(md_page)
        response = requests.get(md_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:102]  # Skip the header rows

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    urls = [a['href'] for a in data.find_all('a', href=True)]
                    complete_urls = ', '.join(['https://bwf.tournamentsoftware.com' + url for url in urls])
                    individual_row_data.append(complete_urls)
                elif idx == 6:  # The 7th column is 'Member ID'
                    texts = [text for text in data.stripped_strings]
                    concatenated_texts = '/'.join(texts)
                    individual_row_data.append(concatenated_texts)
                else:
                    individual_row_data.append(data.text.strip())

            md_df_length = len(md_df)
            md_df.loc[md_df_length] = individual_row_data
            
            if len(md_df) >= md_results:
                break
    else:  # Page 19 with less than 100 results on this page
        md_url = md_base_url.format(md_page)
        response = requests.get(md_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:(md_results - ((md_page_number - 1) * 100) + 2)]

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    urls = [a['href'] for a in data.find_all('a', href=True)]
                    complete_urls = ', '.join(['https://bwf.tournamentsoftware.com' + url for url in urls])
                    individual_row_data.append(complete_urls)
                elif idx == 6:  # The 7th column is 'Member ID'
                    texts = [text for text in data.stripped_strings]
                    concatenated_texts = '/'.join(texts)
                    individual_row_data.append(concatenated_texts)
                else:
                    individual_row_data.append(data.text.strip())

            md_df_length = len(md_df)
            md_df.loc[md_df_length] = individual_row_data

            if len(md_df) >= md_results:
                break

# Display the resulting DataFrame
md_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHNCHN,[CHN] LIANG Wei Keng[CHN] WANG Chang,https://bwf.tournamentsoftware.com/player-prof...,90531/55414,96798,14,Asia,China
1,2,,,DENDEN,[DEN] Kim ASTRUP[DEN] Anders Skaarup RASMUSSEN,https://bwf.tournamentsoftware.com/player-prof...,92980/44414,91753,20,Europe,Denmark
2,3,,,KORKOR,[KOR] KANG Min Hyuk[KOR] SEO Seung Jae,https://bwf.tournamentsoftware.com/player-prof...,77262/66513,90415,20,Asia,Korea
3,4,,,MASMAS,[MAS] Aaron CHIA[MAS] SOH Wooi Yik,https://bwf.tournamentsoftware.com/player-prof...,56203/99389,84315,16,Asia,Malaysia
4,5,,,CHNCHN,[CHN] HE Ji Ting[CHN] REN Xiang Yu,https://bwf.tournamentsoftware.com/player-prof...,59673/84812,83961,23,Asia,China
...,...,...,...,...,...,...,...,...,...,...,...
1424,1415,,,ESPESP,[ESP] Fernando CIVICO[ESP] Ricardo RETTIG,https://bwf.tournamentsoftware.com/player-prof...,96329/69882,20,1,Europe,Spain
1425,1426,,,BELBEL,[BEL] Elias BRACKE[BEL] Senne HOUTHOOFD,https://bwf.tournamentsoftware.com/player-prof...,67251/55858,15,1,Europe,Belgium
1426,1427,,,BELBEL,[BEL] Charles FOUYN[BEL] Baptiste ROLIN,https://bwf.tournamentsoftware.com/player-prof...,99703/96561,2,1,Europe,Belgium
1427,1427,,,FIJFIJ,[FIJ] Jared CHUNG[FIJ] Robert LOO,https://bwf.tournamentsoftware.com/player-prof...,82672/55618,2,1,Oceania,Fiji


In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Men's Singles URL, Filtered by 100 results on each page.
ms_base_url = 'https://bwf.tournamentsoftware.com/ranking/category.aspx?id=42511&category=472&C472FOC=&p={}&ps=100'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# After checking with the filter by 100 results on each page. There are a total of 19 pages and 1831 results. 
# We need this information to work with the last page which contains less than 100 results compared to the previous pages.
ms_page_number = 19
ms_results = 1831

# Create DataFrame
list_of_columns = ['Rank', 'x1', 'x2', 'Country Code', 'Player', 'x3', 'Member ID', 'Points', 'Tournaments', 'Confederation', 'Country']
ms_df = pd.DataFrame(columns=list_of_columns)

# Create a loop to add data to all pages in DataFrame
# Loop through pages 1 to 19
for ms_page in range(1, ms_page_number + 1):
    if ms_page < 19:  # Pages 1-18 with 100 results on each page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:102]  # Skip the header rows

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    urls = [a['href'] for a in data.find_all('a', href=True)]
                    complete_urls = ', '.join(['https://bwf.tournamentsoftware.com' + url for url in urls])
                    individual_row_data.append(complete_urls)
                elif idx == 6:  # The 7th column is 'Member ID'
                    texts = [text for text in data.stripped_strings]
                    concatenated_texts = '/'.join(texts)
                    individual_row_data.append(concatenated_texts)
                else:
                    individual_row_data.append(data.text.strip())

            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data
            
            if len(ms_df) >= ms_results:
                break
    else:  # Page 19 with less than 100 results on this page
        ms_url = ms_base_url.format(ms_page)
        response = requests.get(ms_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        ranking_table = soup.find('table', class_='ruler')
        all_rows = ranking_table.find_all('tr')[2:(ms_results - ((ms_page_number - 1) * 100) + 2)]

        for row in all_rows:
            row_data = row.find_all('td')
            individual_row_data = []

            for idx, data in enumerate(row_data):
                if idx == 5:  # The 6th column is 'x3'
                    urls = [a['href'] for a in data.find_all('a', href=True)]
                    complete_urls = ', '.join(['https://bwf.tournamentsoftware.com' + url for url in urls])
                    individual_row_data.append(complete_urls)
                elif idx == 6:  # The 7th column is 'Member ID'
                    texts = [text for text in data.stripped_strings]
                    concatenated_texts = '/'.join(texts)
                    individual_row_data.append(concatenated_texts)
                else:
                    individual_row_data.append(data.text.strip())

            ms_df_length = len(ms_df)
            ms_df.loc[ms_df_length] = individual_row_data

            if len(ms_df) >= ms_results:
                break

# Display the resulting DataFrame
ms_df

Unnamed: 0,Rank,x1,x2,Country Code,Player,x3,Member ID,Points,Tournaments,Confederation,Country
0,1,,,CHN,[CHN] SHI Yu Qi,https://bwf.tournamentsoftware.com/player-prof...,57945,101815,18,Asia,China
1,2,,,DEN,[DEN] Viktor AXELSEN,https://bwf.tournamentsoftware.com/player-prof...,25831,93790,14,Europe,Denmark
2,3,,,DEN,[DEN] Anders ANTONSEN,https://bwf.tournamentsoftware.com/player-prof...,91554,84241,22,Europe,Denmark
3,4,,,JPN,[JPN] Kodai NARAOKA,https://bwf.tournamentsoftware.com/player-prof...,62063,84197,22,Asia,Japan
4,5,,,MAS,[MAS] LEE Zii Jia,https://bwf.tournamentsoftware.com/player-prof...,81561,83716,22,Asia,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...
1826,1767,,,UGA,[UGA] Hassan Jamil MAKINDA,https://bwf.tournamentsoftware.com/player-prof...,33916,20,1,Africa,Uganda
1827,1767,,,UGA,[UGA] Latif NGOBI,https://bwf.tournamentsoftware.com/player-prof...,29626,20,1,Africa,Uganda
1828,1767,,,UGA,[UGA] Abaasi BBAALE,https://bwf.tournamentsoftware.com/player-prof...,89938,20,1,Africa,Uganda
1829,1767,,,UGA,[UGA] Preet Nikulkumar PATEL,https://bwf.tournamentsoftware.com/player-prof...,34706,20,1,Africa,Uganda
