# Web Scraper
**Goal:** In this notebook we build a simple web scraper that retrieves the Eredivisie match results from the web and stores them in a CSV file for use in our analysis. The website contains all Eredivisie results since the 1956-1957 season.

**Future:** 
- See whether there is more information to be found that could benefit the model and scrape this data.
- Create a web scraper for upcoming matches (these are not available on the current site).

**Version:** 1.0

In [None]:
# Import packages
import re
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## First website

In [2]:
def extract_eredevisie_results(url: str,
                            form_data: dict,
                            table_index: int = 4,
                            timeout: float = 30.0) -> pd.DataFrame:
    """
    Function to extract the results of the Eredivisie from the website and parse these into a DataFrame.

    Args:
        - url (str): Address of the results page; `form_data` is sent to it with a POST request.
        - form_data (dict): Form fields submitted in the body of the POST request.
        - table_index (int): Position of the results table among all <table> elements
          on the page. Defaults to 4, i.e. the 5th table.
        - timeout (float): Number of seconds to wait for the server before giving up.

    Returns:
        - pd.DataFrame: DataFrame containing the results of the Eredivisie. The column
          names are taken from the <th> cells of the first table row; every following
          row that contains <td> cells becomes one DataFrame row of stripped strings.

    Raises:
        - requests.HTTPError: If the server responds with an error status code.
        - IndexError: If the page has no table at `table_index` or that table has no rows.
    """
    # Get the table from the website. Fail on HTTP errors right away so that an
    # error page is never parsed as if it were the results page.
    response = requests.post(url, data=form_data, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    all_tables = soup.find_all('table')
    try:
        target_table = all_tables[table_index]
    except IndexError:
        raise IndexError(
            f"No table at index {table_index}: the page at {url} contains "
            f"{len(all_tables)} table(s). The page layout may have changed."
        ) from None

    # Collect the rows once; the first row holds the headers, the rest the data
    rows = target_table.find_all('tr')
    if not rows:
        raise IndexError(f"The table at index {table_index} does not contain any rows.")
    header_row, *body_rows = rows

    # Extract the headers
    headers = [head.text.strip() for head in header_row.find_all('th')]

    # Extract the data
    data = []
    for row in body_rows:
        cols = row.find_all('td')
        # Rows without data cells (e.g. spacer or repeated header rows) would
        # otherwise end up as rows filled with missing values
        if not cols:
            continue
        data.append([col.text.strip() for col in cols])

    # Create a DataFrame from the extracted data
    return pd.DataFrame(data, columns=headers)

In [3]:
# Page that serves the Eredivisie match results
url = 'https://www.eredivisiestats.nl/wedstrijden.php'

# Form fields submitted with the POST request
# (presumably: sort by date, then by home club, both ascending - verify against the site's form)
form_data = dict(
    sorteer1='datum ASC',
    sorteer2='thuisclub ASC',
    submit='OK',
    onderling='',
)

# Download the page and parse the results table into a DataFrame
df_eredivisie_results = extract_eredevisie_results(url=url, form_data=form_data)

# Preview the first 10 rows of the DataFrame
df_eredivisie_results.head(10)

Unnamed: 0,Seizoen,Datum,Thuisclub,Uitclub,Thuisscore,Uitscore
0,1956-1957,1956-09-02,Ajax,NAC,1,0
1,1956-1957,1956-09-02,BVV,Elinkwijk,1,2
2,1956-1957,1956-09-02,DOS,Sparta,2,3
3,1956-1957,1956-09-02,Fortuna `54,Eindhoven,4,1
4,1956-1957,1956-09-02,NOAD,BVC Amsterdam,1,3
5,1956-1957,1956-09-02,PSV,MVV,1,3
6,1956-1957,1956-09-02,SC Enschede,Rapid JC,5,2
7,1956-1957,1956-09-02,VVV,GVAV,1,0
8,1956-1957,1956-09-02,Willem II,Feijenoord,3,3
9,1956-1957,1956-09-09,BVC Amsterdam,Willem II,0,6


In [4]:
# Write the scraped results to disk as CSV, leaving out the DataFrame index
results_csv_path = '../Files/eredivisie_results.csv'
df_eredivisie_results.to_csv(results_csv_path, index=False)

## Second website
More data can be found at: https://www.football-data.co.uk/netherlandsm.php 

In [None]:
def scrape_football_data(url: str,
                        base_url: str,
                        timeout: float = 30.0) -> pd.DataFrame:
    """
    Function to scrape football data from all CSV files linked on the given page.

    Every link on the page whose target ends in '.csv' is downloaded and the
    files are stacked into one DataFrame. Files that cannot be downloaded or
    parsed are skipped with a warning.

    Args:
        url (str): The page that lists the links to the CSV files.
        base_url (str): The base URL of the website to scrape; each link target
            found on the page is appended to it to form the download address.
        timeout (float): Number of seconds to wait for the page before giving up.

    Returns:
        pd.DataFrame: DataFrame containing the scraped data, restricted to the
            basic match info, match statistics and Bet365 odds columns.

    Raises:
        requests.HTTPError: If the page request returns an error status code.
        ValueError: If the page has no CSV links or none of the files could be loaded.
        KeyError: If one of the expected columns is missing from the combined data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect all links that point to a CSV file
    csv_links = soup.find_all('a', href=re.compile(r'\.csv$', re.IGNORECASE))
    if not csv_links:
        raise ValueError(f"No CSV links found on {url}")

    # Download every file first and concatenate once at the end; growing a
    # DataFrame inside the loop copies all previously loaded rows each time
    frames = []
    for link in csv_links:
        csv_url = base_url + link.get('href')

        try:
            frames.append(pd.read_csv(csv_url))
        except (OSError, ValueError) as exc:
            # OSError covers download failures, ValueError covers parsing and
            # decoding failures. One bad file should not abort the whole
            # scrape, but it should not go unnoticed either.
            warnings.warn(f"Skipping {csv_url}: {exc}", RuntimeWarning, stacklevel=2)

    if not frames:
        raise ValueError(f"None of the {len(csv_links)} CSV files linked on {url} could be loaded")

    df_full = pd.concat(frames, ignore_index=True)

    columns = [
        # Basic match info
        'Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam',
        'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',

        # Match statistics
        'HS', 'AS', 'HST', 'AST',  # Shots
        'HC', 'AC',  # Corners
        'HF', 'AF',  # Fouls
        'HY', 'AY', 'HR', 'AR',  # Cards

        # Betting odds (1X2 market)
        'B365H', 'B365D', 'B365A',  # Bet365
    ]

    # Keep only the columns in list
    return df_full[columns]

In [57]:
# Page listing the CSV files, and the site root the CSV links are appended to
url = 'https://www.football-data.co.uk/netherlandsm.php'
base_url = 'https://www.football-data.co.uk/'

# Download all linked CSV files and combine them into one DataFrame
df_full = scrape_football_data(url=url, base_url=base_url)

# Write the combined data to disk as CSV, leaving out the DataFrame index
football_data_csv_path = '../Files/eredivisie_football_data.csv'
df_full.to_csv(football_data_csv_path, index=False)