# Data scraping

This notebook scrapes data from websites.

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re

## football-data.co.uk

In [10]:
# Base address of the site; page and file paths are appended to it after a '/'.
URL = "https://www.football-data.co.uk"
# Directory (relative to the notebook's working directory) where downloaded
# csv files are written.
TARGET_DIR = "./../../data/soccer/football-data.co.uk/vrac"
# Page, relative to URL, that lists the per-country result pages.
URL_MAIN_PAGE = "data.php"

In [None]:
def extract_country_links_football_data_co_uk():
  """
  Fetch the football-data.co.uk main data page and list the country pages.

  Only the third <td valign="top"> cell of the page is inspected. A link is
  kept when its href ends with '.php' and the last word of its text is
  "Results".

  Returns:
      list: A list of str containing the country .php name, in page order

  Raises:
      requests.RequestException: if the request fails, times out, or the
          server answers with an error status.
      IndexError: if the page has fewer than three <td valign="top"> cells.
  """
  response = requests.get(f"{URL}/{URL_MAIN_PAGE}", timeout=30)
  # Fail loudly rather than parsing an HTTP error page as if it were the data page.
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')

  top_cells = soup.find_all('td', valign='top')
  if len(top_cells) < 3:
    raise IndexError(
      f"expected at least 3 <td valign='top'> cells on {URL}/{URL_MAIN_PAGE}, "
      f"found {len(top_cells)}"
    )
  target_cell = top_cells[2]

  country_links = []
  for link in target_cell.find_all('a', href=True):
    href = link.get('href')
    if not href or not href.endswith('.php'):
      continue
    # A link without any text (e.g. wrapping only an image) yields no words;
    # skip it instead of indexing into an empty list.
    words = link.text.split()
    if words and words[-1] == "Results":
      country_links.append(href)

  return country_links


In [11]:
def download_country_football_data_co_uk(page):
  """
  Download all csv files from a country page on football-data.co.uk

  Every link on the page whose href ends with '.csv' is fetched and written
  to TARGET_DIR. The local file name is the last two '/'-separated segments
  of the csv address joined by '-', so files that share a base name but live
  in different folders on the site do not overwrite each other.

  Args:
      page (str): Page name relative to URL, e.g. 'englandm.php'.

  Raises:
      requests.RequestException: if a request fails, times out, or the
          server answers with an error status.
  """
  response = requests.get(f"{URL}/{page}", timeout=30)
  # Stop here on an HTTP error instead of scanning an error page for links.
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')

  csv_links = []
  for link in soup.find_all('a'):
    href = link.get('href')
    if href and href.endswith('.csv'):
      csv_links.append(f"{URL}/{href}")

  # The target directory may not exist yet on a fresh checkout.
  os.makedirs(TARGET_DIR, exist_ok=True)

  for link in csv_links:
    csv_response = requests.get(link, timeout=30)
    # Never store an HTML error page under a .csv name.
    csv_response.raise_for_status()
    segments = link.split('/')
    file_name = segments[-2] + '-' + segments[-1]
    with open(os.path.join(TARGET_DIR, file_name), 'wb') as f:
      f.write(csv_response.content)

In [52]:
# Collect the country result pages listed on the main data page and display them.
pages = extract_country_links_football_data_co_uk()
print(pages)

['englandm.php', 'scotlandm.php', 'germanym.php', 'italym.php', 'spainm.php', 'francem.php', 'netherlandsm.php', 'belgiumm.php', 'portugalm.php', 'turkeym.php', 'greecem.php', 'Argentina.php', 'Austria.php', 'Brazil.php', 'China.php', 'Denmark.php', 'Finland.php', 'Ireland.php', 'Japan.php', 'Mexico.php', 'Norway.php', 'Poland.php', 'Romania.php', 'Russia.php', 'Sweden.php', 'Switzerland.php', 'USA.php']


In [53]:
# Download the csv files of every country page collected above.
for page in pages:
  download_country_football_data_co_uk(page)