In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import numpy as np
from time import sleep

In [2]:
driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.granfondoguide.com/Events/ProCyclingCalendar2022'
driver.get(url)



Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [C:\Users\adema\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [3]:
# initialize storages
date, event, location, route = ([] for i in range(4))
soup = BeautifulSoup(driver.page_source, 'html.parser')
tbody = soup.find('tbody')

# get table data according to their columns and indices
# tr for the titles and td for the data with in them
def get_table_data(col,i):
    for title in tbody.find_all('tr'):
        data = title.find_all('td')[i].text.strip()
        col.append(data)
    return col

In [4]:
# Uncomment the following function calls

get_table_data(date, 0)
get_table_data(event, 1)
get_table_data(location, 2)
get_table_data(route, 3)

['Routes',
 '9 stages',
 '21 stages',
 '219km',
 '7 stages',
 '21 stages',
 '250 km',
 '7 stages',
 '200km',
 '206km',
 '205 km',
 '255 km',
 '6 stages',
 '7 stages',
 '5 stages',
 '6 stages',
 '5 stages',
 '7 stages',
 '196 km',
 '201km',
 '176 km',
 '7 stages',
 '7 stages',
 '291 km',
 '7 stages',
 '203 km',
 '243 km',
 '200km',
 '255km',
 '6 stages',
 '208 km',
 '250 km',
 '8 stages',
 '205km',
 '258 km',
 '5 stages',
 '196km',
 '229 km',
 '250 km',
 '6 stages',
 '217 km']

In [5]:
# Date column
df = pd.DataFrame(date, columns = {'Date'})

# The function returns a dataframe having their indices removed
# which will help us integrate the columns all together

def define_df(dataframe, title):
    dtf= pd.DataFrame(dataframe, columns = {title})
    dtf.reset_index(drop = True, inplace=True)
    return dtf

df_event = define_df(date, 'Event')
df_loc = define_df(location, 'Location')
df_route  = define_df(route, 'Routes')

# Combine and concatenate the columns data
all_together = [df, df_event, df_loc, df_route]
dff = pd.concat(all_together, axis=1)
# remove the first row
dff.drop(index=df.index[0], axis=0, inplace=True)

#Export to csv file
#dff.to_csv('df.csv')

dff

Unnamed: 0,Date,Event,Location,Routes
1,Jun 12 - Jun 19,Jun 12 - Jun 19,"Baar, Switzerland",9 stages
2,Jul 01 - Jul 24,Jul 01 - Jul 24,"Copenhagen, Denmark",21 stages
3,Jul 30,Jul 30,"San-Sebastian, Spain",219km
4,Aug 08 - Aug 14,Aug 08 - Aug 14,"Radzymin, Poland",7 stages
5,Aug 19 - Sep 11,Aug 19 - Sep 11,"Utrecht, Netherlands",21 stages
6,Aug 21,Aug 21,"Hamburg, Germany",250 km
7,Aug 29 - Sep 04,Aug 29 - Sep 04,"Breda, Netherlands",7 stages
8,Sep 09,Sep 09,"Quebec City, Canada",200km
9,Sep 11,Sep 11,"Montreal, Canada",206km
10,Sep 14,Sep 14,"Beaufays, Belgium",205 km


In [6]:
# filtering out countries that have different stages, the rest countries are in Kilometers which don't have stages
# Let's add each url (link) before and filter them out afterwards to make the process easier
dff[dff.Routes.str.contains('stages')]

Unnamed: 0,Date,Event,Location,Routes
1,Jun 12 - Jun 19,Jun 12 - Jun 19,"Baar, Switzerland",9 stages
2,Jul 01 - Jul 24,Jul 01 - Jul 24,"Copenhagen, Denmark",21 stages
4,Aug 08 - Aug 14,Aug 08 - Aug 14,"Radzymin, Poland",7 stages
5,Aug 19 - Sep 11,Aug 19 - Sep 11,"Utrecht, Netherlands",21 stages
7,Aug 29 - Sep 04,Aug 29 - Sep 04,"Breda, Netherlands",7 stages
12,Jan 20 - Jan 28,Jan 20 - Jan 28,"Adelaide, Australia",6 stages
13,Jan 29 - Feb 05,Jan 29 - Feb 05,"San Juan, Argentina",7 stages
14,Feb 01 - Feb 05,Feb 01 - Feb 05,"Castelló de la Plana, Spain",5 stages
15,Feb 07 - Feb 12,Feb 07 - Feb 12,"Al Sawadi Beach, Oman",6 stages
16,Feb 15 - Feb 19,Feb 15 - Feb 19,"Loule, Portugal",5 stages


#### Finding links

In [7]:
store_links = []
for tr in tbody.find_all('td', {'class' : 'colTitle'}):
    for a in tr.find_all('a', href = True):
        store_links.append(a['href'])
        print(a['href'])

https://www.granfondoguide.com/Events/Index/6027/2022-tour-of-suisse
https://www.granfondoguide.com/Events/Index/7004/2022-tour-de-france
https://www.granfondoguide.com/Events/Index/6002/2022-clásica-san-sebastián
https://www.granfondoguide.com/Events/Index/6011/2022-tour-of-poland
https://www.granfondoguide.com/Events/Index/6004/2022-la-vuelta
https://www.granfondoguide.com/Events/Index/6013/2022-euroeyes-cyclassics-hamburg
https://www.granfondoguide.com/Events/Index/6003/2022-benelux-tour
https://www.granfondoguide.com/Events/Index/6005/2022-grand-prix-cycliste-de-quebec
https://www.granfondoguide.com/Events/Index/6006/2022-grand-prix-cycliste-de-montreal
https://www.granfondoguide.com/Events/Index/6930/2022-grand-prix-de-wallonie
https://www.granfondoguide.com/Events/Index/6007/2022-il-lombardia
https://www.granfondoguide.com/Events/Index/5933/santos-festival-of-cycling
https://www.granfondoguide.com/Events/Index/6532/2023-vuelta-a-san-juan
https://www.granfondoguide.com/Events/Inde

In [8]:
# As stated earlier, let's integrate the links in csv format along with the previous data
df_url = pd.DataFrame(store_links, columns={'URLs'})
all_together = [df, df_event, df_loc, df_route, df_url]
dff = pd.concat(all_together, axis=1)
dff

Unnamed: 0,Date,Event,Location,Routes,URLs
0,Date,Date,Location,Routes,https://www.granfondoguide.com/Events/Index/60...
1,Jun 12 - Jun 19,Jun 12 - Jun 19,"Baar, Switzerland",9 stages,https://www.granfondoguide.com/Events/Index/70...
2,Jul 01 - Jul 24,Jul 01 - Jul 24,"Copenhagen, Denmark",21 stages,https://www.granfondoguide.com/Events/Index/60...
3,Jul 30,Jul 30,"San-Sebastian, Spain",219km,https://www.granfondoguide.com/Events/Index/60...
4,Aug 08 - Aug 14,Aug 08 - Aug 14,"Radzymin, Poland",7 stages,https://www.granfondoguide.com/Events/Index/60...
5,Aug 19 - Sep 11,Aug 19 - Sep 11,"Utrecht, Netherlands",21 stages,https://www.granfondoguide.com/Events/Index/60...
6,Aug 21,Aug 21,"Hamburg, Germany",250 km,https://www.granfondoguide.com/Events/Index/60...
7,Aug 29 - Sep 04,Aug 29 - Sep 04,"Breda, Netherlands",7 stages,https://www.granfondoguide.com/Events/Index/60...
8,Sep 09,Sep 09,"Quebec City, Canada",200km,https://www.granfondoguide.com/Events/Index/60...
9,Sep 11,Sep 11,"Montreal, Canada",206km,https://www.granfondoguide.com/Events/Index/69...


In [9]:
dff[dff.Routes.str.contains('stages')]

Unnamed: 0,Date,Event,Location,Routes,URLs
1,Jun 12 - Jun 19,Jun 12 - Jun 19,"Baar, Switzerland",9 stages,https://www.granfondoguide.com/Events/Index/70...
2,Jul 01 - Jul 24,Jul 01 - Jul 24,"Copenhagen, Denmark",21 stages,https://www.granfondoguide.com/Events/Index/60...
4,Aug 08 - Aug 14,Aug 08 - Aug 14,"Radzymin, Poland",7 stages,https://www.granfondoguide.com/Events/Index/60...
5,Aug 19 - Sep 11,Aug 19 - Sep 11,"Utrecht, Netherlands",21 stages,https://www.granfondoguide.com/Events/Index/60...
7,Aug 29 - Sep 04,Aug 29 - Sep 04,"Breda, Netherlands",7 stages,https://www.granfondoguide.com/Events/Index/60...
12,Jan 20 - Jan 28,Jan 20 - Jan 28,"Adelaide, Australia",6 stages,https://www.granfondoguide.com/Events/Index/65...
13,Jan 29 - Feb 05,Jan 29 - Feb 05,"San Juan, Argentina",7 stages,https://www.granfondoguide.com/Events/Index/93...
14,Feb 01 - Feb 05,Feb 01 - Feb 05,"Castelló de la Plana, Spain",5 stages,https://www.granfondoguide.com/Events/Index/65...
15,Feb 07 - Feb 12,Feb 07 - Feb 12,"Al Sawadi Beach, Oman",6 stages,https://www.granfondoguide.com/Events/Index/93...
16,Feb 15 - Feb 19,Feb 15 - Feb 19,"Loule, Portugal",5 stages,https://www.granfondoguide.com/Events/Index/79...


In [122]:
# Links related only to countries that have stages
pd.set_option('max_colwidth', 100) # if not set, otherwise doesn't show the full link and when you click it won't work
stage_urls = []
stage_links = dff[dff.Routes.str.contains('stages')].URLs
stage_urls.append(str(stage_links))
stage_links

1                       https://www.granfondoguide.com/Events/Index/7004/2022-tour-de-france
2                https://www.granfondoguide.com/Events/Index/6002/2022-clásica-san-sebastián
4                            https://www.granfondoguide.com/Events/Index/6004/2022-la-vuelta
5          https://www.granfondoguide.com/Events/Index/6013/2022-euroeyes-cyclassics-hamburg
7        https://www.granfondoguide.com/Events/Index/6005/2022-grand-prix-cycliste-de-quebec
12                   https://www.granfondoguide.com/Events/Index/6532/2023-vuelta-a-san-juan
13    https://www.granfondoguide.com/Events/Index/9305/v2023-volta-a-la-comunitat-valenciana
14                        https://www.granfondoguide.com/Events/Index/6569/2023-tour-of-oman
15                    https://www.granfondoguide.com/Events/Index/9306/2023-volta-ao-algarve
16                            https://www.granfondoguide.com/Events/Index/7990/2023-uae-tour
17               https://www.granfondoguide.com/Events/Index/5937/2023