Libs

In [18]:
from io import StringIO
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Global variables

In [19]:
# Browser options shared by every Selenium session started in this notebook.
chrome_options = ChromeOptions()
# "--headless=new" runs Chrome in headless mode, i.e. without a visible window.
chrome_options.add_argument("--headless=new")

# Empty frame that only declares the expected attendance column names.
# NOTE(review): "Capacity" is listed twice and this frame is not referenced by
# any other cell visible here - confirm whether it is still needed.
main_df = pd.DataFrame(
  columns=[
    "Stadium", "Capacity", "Spectators", "Average", "Matches",
    "sold out", "Capacity", "Club", "Year",
  ]
)

# Every season (2007 through 2023, inclusive) whose data should be extracted.
seasons = list(range(2007, 2024))
# Competition codes used as the last segment of the league URLs.
leagues_prefixs = ["SA1", "SA2L"]
# Accumulator: scrapingHTML appends one DataFrame per scraped season.
dfs = []

Main function

In [20]:
# This function extracts the attendance data by looping over the "seasons" list.

def extract_attendences():
  """Scrape the attendance table of every season and save them as a single CSV.

  For each entry of the global ``seasons`` list a headless Chrome session is
  opened on the matching attendance page and handed to ``scrapingHTML``, which
  appends the parsed table to the global ``dfs`` list. Once every season has
  been processed the collected frames are concatenated and written to
  ``./RAW/ArabicSoccer/Attendances/Arabic_Attendances_2007_2023.csv``.

  Returns:
    None. The result is the CSV file written to disk.
  """
  # Start from an empty accumulator so that re-running this cell does not
  # write the frames collected by a previous run into the CSV a second time.
  dfs.clear()

  for i, season in enumerate(seasons):
    url = f'https://www.transfermarkt.com/saudi-pro-league/besucherzahlen/wettbewerb/SA1/saison_id/{season}/plus/1'
    # Start the Chrome driver to access the Selenium functionality.
    driver = webdriver.Chrome(options=chrome_options)
    try:
      driver.get(url)
      # scrapingHTML turns the table found on the page into a DataFrame and
      # stores it in "dfs"; it expects the index of the season, not the year.
      scrapingHTML(driver, i)
    finally:
      # Make sure the browser is shut down even when loading or scraping the
      # page raises. scrapingHTML also quits the driver; quitting a second
      # time is expected to be harmless.
      driver.quit()
    print(f'Season {season} - Scraping done!')

  # After the loop, join the DataFrames and send them to the RAW layer as CSV.
  output_path = Path("./RAW/ArabicSoccer/Attendances/Arabic_Attendances_2007_2023.csv")
  # Create the target folder up front; to_csv does not create missing folders.
  output_path.parent.mkdir(parents=True, exist_ok=True)
  df_og = pd.concat(dfs)
  df_og.to_csv(output_path, encoding='utf-8', index=False)

NoSuchElementExceptionHandler function

In [21]:
# This function was created to avoid code repetition.
# Its purpose is to dismiss the cookie pop-up when it is shown and to carry on
# silently when it is not.
def NoSuchElementExceptionHandler(driver, ID, XPATH, timeout=10):
  """Click the consent button inside the cookie iframe, if the iframe appears.

  Args:
    driver: Selenium WebDriver with the target page already loaded.
    ID: ``id`` attribute of the iframe that hosts the cookie pop-up.
    XPATH: XPath, evaluated inside that iframe, of the button to click.
    timeout: Maximum number of seconds to wait for the iframe and, once
      inside it, for the button. Defaults to 10.

  Returns:
    None. A pop-up that never shows up is not treated as an error.
  """
  wait = WebDriverWait(driver, timeout)
  try:
    # Pass the locator (not an already-resolved element) so the wait can keep
    # polling until the iframe shows up or the timeout expires.
    wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, ID)))
    wait.until(EC.element_to_be_clickable((By.XPATH, XPATH))).click()
  except (NoSuchElementException, TimeoutException):
    # The cookie pop-up (or its button) did not appear; nothing to dismiss.
    pass
  finally:
    # Always go back to the main document so that the caller's subsequent
    # find_element calls search the page and not the consent iframe.
    driver.switch_to.default_content()

Function to get data with Selenium and BS4

In [22]:
def scrapingHTML(driver, i):
  """Read the table of the loaded page into a DataFrame and store it in ``dfs``.

  Args:
    driver: Selenium WebDriver with the attendance page already loaded. The
      driver is always quit by this function, whether it succeeds or fails.
    i: Index into the global ``seasons`` list; the matching season is written
      to the "Year" column.

  Returns:
    The BeautifulSoup ``<table>`` element that was parsed.

  Raises:
    NoSuchElementException: If the table is not present on the page.
  """
  try:
    # Dismiss the cookie pop-up when it is shown.
    NoSuchElementExceptionHandler(driver, "sp_message_iframe_851946", "//*[@id='notice']/div[3]/div[3]/button")
    # After closing the pop-up, look the table up by its XPath and take its
    # HTML source through the "outerHTML" attribute.
    table = driver.find_element(By.XPATH, '//*[@id="yw1"]/table')
    data = table.get_attribute('outerHTML')
  finally:
    # Close the browser even when a lookup fails, so that no Chrome process is
    # left running. The markup is already a plain string at this point.
    driver.quit()

  # Parse the markup and pick the table tag.
  soup = BeautifulSoup(data, 'html.parser')
  html = soup.find(name='table')

  # read_html returns a list of DataFrames; the first one is the table. The
  # markup is wrapped in StringIO because passing literal HTML as a plain
  # string to read_html is deprecated in recent pandas releases.
  df = pd.read_html(StringIO(str(html)))[0]
  df["Club"] = ""
  # Create a column with the respective season.
  df["Year"] = seasons[i]

  # Insert the DataFrame into the shared list.
  dfs.append(df)
  return html

Extract Teams info

In [23]:
def teams_list_extract(league):
  """Scrape the table of each league's start page and save it as a CSV.

  Args:
    league: Iterable of competition codes (e.g. ``["SA1", "SA2L"]``). One page
      is scraped and one ``{code}_teams_2023.csv`` file is written per code.

  Returns:
    None. The results are the CSV files written under
    ``./RAW/ArabicSoccer/Teams``.

  Raises:
    NoSuchElementException: If the table is not present on a page.
  """
  output_dir = Path("./RAW/ArabicSoccer/Teams")
  # Create the target folder up front; to_csv does not create missing folders.
  output_dir.mkdir(parents=True, exist_ok=True)

  # Loop over the league prefixes.
  for prefix in league:
    url = f"https://www.transfermarkt.us/saudi-pro-league/startseite/wettbewerb/{prefix}"
    driver = webdriver.Chrome(options=chrome_options)
    try:
      driver.get(url)
      # Dismiss the cookie pop-up when it is shown.
      NoSuchElementExceptionHandler(driver, "sp_message_iframe_851946", "//*[@id='notice']/div[3]/div[3]/button")
      table = driver.find_element(By.XPATH, '//*[@id="yw1"]/table')
      data = table.get_attribute('outerHTML')
    finally:
      # Shut the browser down on success and on failure alike, so that no
      # Chrome process is left behind for any league.
      driver.quit()

    soup = BeautifulSoup(data, 'html.parser')
    html = soup.find(name='table')

    # StringIO: passing literal HTML as a plain string to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(html)))[0]
    # Create a column with the respective league code.
    df["League"] = prefix

    # Save the DataFrame as CSV in the RAW layer.
    df.to_csv(output_dir / f"{prefix}_teams_2023.csv", encoding='utf-8', index=False)

Extract All Champions - SPL

In [24]:
def extract_champions(league):
  """Scrape the champions table of each league and save it as a CSV.

  Args:
    league: Iterable of competition codes (e.g. ``["SA1", "SA2L"]``). One page
      is scraped and one ``{code}_Champions.csv`` file is written per code.

  Returns:
    None. The results are the CSV files written under
    ``./RAW/ArabicSoccer/Champions``.

  Raises:
    NoSuchElementException: If the table is not present on a page.
  """
  # League name by competition code. Keying on the code instead of the loop
  # position keeps the label right whatever the order of the input list is.
  league_names = {
    "SA1": "Saudi Pro League",
    "SA2L": "Yelo League",
  }

  output_dir = Path("./RAW/ArabicSoccer/Champions")
  # Create the target folder up front; to_csv does not create missing folders.
  output_dir.mkdir(parents=True, exist_ok=True)

  # Loop over the league prefixes.
  for prefix in league:
    url = f"https://www.transfermarkt.us/saudi-professional-league/erfolge/wettbewerb/{prefix}"
    driver = webdriver.Chrome(options=chrome_options)
    try:
      driver.get(url)
      # Dismiss the cookie pop-up when it is shown.
      NoSuchElementExceptionHandler(driver, "sp_message_iframe_851946", "//*[@id='notice']/div[3]/div[3]/button")
      table = driver.find_element(By.XPATH, '//*[@id="yw1"]/table')
      data = table.get_attribute('outerHTML')
    finally:
      # Shut the browser down on success and on failure alike, so that no
      # Chrome process is left behind for any league.
      driver.quit()

    soup = BeautifulSoup(data, 'html.parser')
    html = soup.find(name='table')
    # StringIO: passing literal HTML as a plain string to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(html)))[0]

    # Fill the "League" column with the league name; a code without a known
    # name is stored as-is instead of being given another league's name.
    df["League"] = league_names.get(prefix, prefix)

    # Save the DataFrame as CSV in the RAW layer.
    df.to_csv(output_dir / f"{prefix}_Champions.csv", encoding='utf-8', index=False)

Running the extraction functions

In [25]:
# Run every extraction; each call writes its CSV output under ./RAW/ArabicSoccer/.
# Champions table of each league in "leagues_prefixs".
extract_champions(leagues_prefixs)
# Attendance tables of every season in "seasons"; prints one line per season.
extract_attendences()
# Start-page table of each league in "leagues_prefixs".
teams_list_extract(leagues_prefixs)

Season 2007 - Scraping done!
Season 2008 - Scraping done!
Season 2009 - Scraping done!
Season 2010 - Scraping done!
Season 2011 - Scraping done!
Season 2012 - Scraping done!
Season 2013 - Scraping done!
Season 2014 - Scraping done!
Season 2015 - Scraping done!
Season 2016 - Scraping done!
Season 2017 - Scraping done!
Season 2018 - Scraping done!
Season 2019 - Scraping done!
Season 2020 - Scraping done!
Season 2021 - Scraping done!
Season 2022 - Scraping done!
Season 2023 - Scraping done!
