In [24]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
import os
from pathlib import Path
import pandas as pd
import urllib.request
from tqdm import tqdm

## Read the URLs with csv-files

In [15]:
with open('urls.txt') as file:
    odds_urls = [line.rstrip() for line in file]

odds_urls

['https://www.football-data.co.uk/englandm.php',
 'https://www.football-data.co.uk/scotlandm.php',
 'https://www.football-data.co.uk/germanym.php',
 'https://www.football-data.co.uk/italym.php',
 'https://www.football-data.co.uk/spainm.php',
 'https://www.football-data.co.uk/francem.php',
 'https://www.football-data.co.uk/netherlandsm.php',
 'https://www.football-data.co.uk/belgiumm.php',
 'https://www.football-data.co.uk/portugalm.php',
 'https://www.football-data.co.uk/turkeym.php',
 'https://www.football-data.co.uk/greecem.php',
 'https://www.football-data.co.uk/Argentina.php',
 'https://www.football-data.co.uk/Austria.php',
 'https://www.football-data.co.uk/Brazil.php',
 'https://www.football-data.co.uk/China.php',
 'https://www.football-data.co.uk/Denmark.php',
 'https://www.football-data.co.uk/Finland.php',
 'https://www.football-data.co.uk/Ireland.php',
 'https://www.football-data.co.uk/Japan.php',
 'https://www.football-data.co.uk/Mexico.php',
 'https://www.football-data.co.uk/

## Boot up the Selenium Driver

In [16]:
options = Options()
options.headless = True
options.add_argument('--window-size=2560,1400')
options.add_argument('log-level=1')
DRIVER_PATH = Path('./chromedriver/chromedriver.exe').absolute()
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

## Get all CSV-files

In [25]:
# Set BASEPATH for storing the csv-files
BASE_PATH = './raw_data/'

# Iterate over the odds URLS
for odds_url in odds_urls:
  
  # Create an empty DF
  df = pd.DataFrame()

  # Get country name from the URL
  country = odds_url.split('/')[-1].split('.')[0].lower()

  # Keep trying until scraping completes :)
  page_scraped_finished = False
  while(not page_scraped_finished):

    # Read the page with the driver
    print('Reading URL:', odds_url)
    driver.get(odds_url)

    # Get all links from the page (odds_url)
    links = driver.find_elements_by_xpath("//a[@href]")

    # Iterate over the links on the page
    for link in tqdm(links):

      # Get the URL of the link as text
      try:
        link_url = link.get_attribute("href")
      except StaleElementReferenceException:
        print('StaleElementReferenceException, trying again :)')

      # Check if the URL of the link contains .csv or if it's some random link
      if '.csv' in link_url:
        
        # TODO: Maybe we need this metadata for saving DFs (?)
        # # Extract some metadata from the URL (year and league)
        # meta_data = link_url.split('/')[-2:]
        # year = meta_data[0]
        # league = meta_data[1].replace('.csv', '')

        # Open the URL safely with urllib to avoid errors with Pandas
        with urllib.request.urlopen(link_url) as resp:
          new_df = pd.read_csv(resp, encoding='cp1252', on_bad_lines='skip')

        # Concat the new matches to our big DF
        df = pd.concat([df, new_df])

    page_scraped_finished = True

  # Create the path if it does not exist
  if not os.path.exists(BASE_PATH):
      os.makedirs(BASE_PATH)

  # Reset indices
  df = df.reset_index(drop=True)

  # Save DF
  df.to_csv(BASE_PATH + '/' + country + '.csv')


Reading URL: https://www.football-data.co.uk/englandm.php


100%|██████████| 285/285 [01:19<00:00,  3.60it/s]


Reading URL: https://www.football-data.co.uk/scotlandm.php


100%|██████████| 257/257 [00:50<00:00,  5.14it/s]


Reading URL: https://www.football-data.co.uk/germanym.php


100%|██████████| 207/207 [00:29<00:00,  7.10it/s]


Reading URL: https://www.football-data.co.uk/italym.php


100%|██████████| 203/203 [00:28<00:00,  7.25it/s]


Reading URL: https://www.football-data.co.uk/spainm.php


100%|██████████| 204/204 [00:29<00:00,  7.01it/s]


Reading URL: https://www.football-data.co.uk/francem.php


100%|██████████| 204/204 [00:28<00:00,  7.20it/s]


Reading URL: https://www.football-data.co.uk/netherlandsm.php


100%|██████████| 177/177 [00:14<00:00, 12.25it/s]


Reading URL: https://www.football-data.co.uk/belgiumm.php


100%|██████████| 175/175 [00:13<00:00, 12.85it/s]


Reading URL: https://www.football-data.co.uk/portugalm.php


100%|██████████| 176/176 [00:13<00:00, 12.59it/s]


Reading URL: https://www.football-data.co.uk/turkeym.php


100%|██████████| 176/176 [00:14<00:00, 12.27it/s]


Reading URL: https://www.football-data.co.uk/greecem.php


100%|██████████| 176/176 [00:13<00:00, 12.86it/s]


Reading URL: https://www.football-data.co.uk/Argentina.php


100%|██████████| 149/149 [00:02<00:00, 56.33it/s]


Reading URL: https://www.football-data.co.uk/Austria.php


100%|██████████| 149/149 [00:02<00:00, 61.78it/s]


Reading URL: https://www.football-data.co.uk/Brazil.php


100%|██████████| 149/149 [00:02<00:00, 54.04it/s]


Reading URL: https://www.football-data.co.uk/China.php


100%|██████████| 149/149 [00:02<00:00, 55.85it/s]


Reading URL: https://www.football-data.co.uk/Denmark.php


100%|██████████| 149/149 [00:02<00:00, 56.30it/s]


Reading URL: https://www.football-data.co.uk/Finland.php


100%|██████████| 149/149 [00:02<00:00, 55.22it/s]


Reading URL: https://www.football-data.co.uk/Ireland.php


100%|██████████| 149/149 [00:02<00:00, 53.62it/s]


Reading URL: https://www.football-data.co.uk/Japan.php


100%|██████████| 149/149 [00:02<00:00, 53.29it/s]


Reading URL: https://www.football-data.co.uk/Mexico.php


100%|██████████| 149/149 [00:02<00:00, 51.96it/s]


Reading URL: https://www.football-data.co.uk/Norway.php


100%|██████████| 149/149 [00:02<00:00, 56.72it/s]


Reading URL: https://www.football-data.co.uk/Poland.php


100%|██████████| 149/149 [00:02<00:00, 57.21it/s]


Reading URL: https://www.football-data.co.uk/Romania.php


100%|██████████| 149/149 [00:02<00:00, 54.32it/s]


Reading URL: https://www.football-data.co.uk/Russia.php


100%|██████████| 149/149 [00:02<00:00, 52.33it/s]


Reading URL: https://www.football-data.co.uk/Sweden.php


100%|██████████| 149/149 [00:02<00:00, 55.55it/s]


Reading URL: https://www.football-data.co.uk/Switzerland.php


100%|██████████| 149/149 [00:02<00:00, 56.18it/s]


Reading URL: https://www.football-data.co.uk/USA.php


100%|██████████| 149/149 [00:02<00:00, 52.60it/s]
