In [None]:
import os  # Importing the os module for file operations
import csv  # Importing the csv module for CSV file handling

import pandas as pd  # Importing pandas for data manipulation
import requests  # Importing requests for HTTP requests
from bs4 import BeautifulSoup  # Importing BeautifulSoup for HTML parsing


In [None]:
# Configuration for the head-to-head scraper.
#
# Every value can be overridden through an environment variable so the
# notebook runs on machines other than the original author's and so the API
# credential does not have to live in the notebook. The original literals are
# kept as defaults so existing runs behave exactly as before.

# Path to the CSV file containing head-to-head links (override: H2H_URLS_CSV)
head_to_head_links = os.environ.get('H2H_URLS_CSV') or '/Users/joathcarrera/Desktop/CSE115A/Soccer-Match-Predictor/H2H_Stats/Urls/urls.csv'

# Directory where matchday data will be saved (override: H2H_OUTPUT_DIR)
output_directory = os.environ.get('H2H_OUTPUT_DIR') or '/Users/joathcarrera/Desktop/CSE115A/Soccer-Match-Predictor/H2H_Stats/Matchdays/'

# API key for ScraperAPI (override: SCRAPERAPI_KEY).
# NOTE(review): the fallback literal is a credential committed to the notebook.
# It is kept only for backward compatibility; rotate the key, set
# SCRAPERAPI_KEY in the environment, and then delete the literal.
api_key = os.environ.get('SCRAPERAPI_KEY') or 'b3349e3c5ce9f2853b2b8cee2c0052a7'

# Base URL for ScraperAPI
api_url = 'https://api.scraperapi.com/'


In [None]:
# Scrape the scorebox of every head-to-head page listed in the URL CSV and save
# each one as a CSV file inside a per-matchday directory
# (<output_directory>/Matchday_<n>/<page name>.csv).

# Number of consecutive URLs grouped into one Matchday_<n> directory
MATCHES_PER_MATCHDAY = 9

# Seconds to wait for ScraperAPI before giving up on a URL. Without a timeout
# requests.get() can block indefinitely on a stalled connection.
# NOTE(review): 70s follows ScraperAPI's published guidance - confirm for your plan.
REQUEST_TIMEOUT_SECONDS = 70

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Read every URL up front so the file is closed before the slow network loop.
# newline='' is the mode the csv module documents for files it parses.
with open(head_to_head_links, 'r', newline='') as f:
    reader = csv.reader(f)

    # Skip the header row; the default keeps an empty file from raising StopIteration
    next(reader, None)

    # csv.reader yields [] for blank lines. Dropping them avoids an IndexError
    # on row[0] and keeps them from shifting the matchday grouping.
    urls = [row[0].strip() for row in reader if row and row[0].strip()]

# Iterate over each URL in file order
for idx, url in enumerate(urls):

    # Derive the matchday from the URL's position rather than from a running
    # counter: a counter that is only advanced at the end of the loop body is
    # skipped whenever a request fails, which would push every later fixture
    # into the wrong directory. Directories are created only for groups that
    # actually contain a URL, so no empty trailing Matchday_<n> is left behind.
    matchday_count = idx // MATCHES_PER_MATCHDAY + 1
    matchday_dir = os.path.join(output_directory, f'Matchday_{matchday_count}')
    os.makedirs(matchday_dir, exist_ok=True)

    # Create the payload for the API request
    payload = {'api_key': api_key, 'url': url}

    # Make the HTTP request to ScraperAPI. A network failure on one URL is
    # reported and skipped so it cannot abort the whole scrape.
    try:
        response = requests.get(api_url, params=payload, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Request failed for URL: {url} ({exc})")
        continue

    # Skip pages that were not fetched successfully, but say so - a silent
    # skip makes missing output files impossible to diagnose.
    if response.status_code != 200:
        print(f"Unexpected status {response.status_code} for URL: {url}")
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    scorebox = soup.find('div', class_='scorebox')

    if scorebox is None:
        print(f"No scorebox found for URL: {url}")
        continue

    # One row per direct child <div> of the scorebox; each row holds the
    # stripped text of every <div> nested inside that child.
    teams_data = [
        [item.get_text(strip=True) for item in div.find_all('div')]
        for div in scorebox.find_all('div', recursive=False)
    ]

    # any() is False both when there are no rows and when every row is empty;
    # the latter would otherwise make pandas raise on a column-count mismatch.
    if not any(teams_data):
        print(f"No team data found for URL: {url}")
        continue

    # Pad shorter rows with '' so all rows have the same length
    max_columns = max(len(team_data) for team_data in teams_data)
    padded_rows = [
        team_data + [''] * (max_columns - len(team_data))
        for team_data in teams_data
    ]

    # Define the columns for the DataFrame: 'Team' followed by Column1..Column<n-1>
    columns = ['Team'] + [f'Column{i}' for i in range(1, max_columns)]

    df = pd.DataFrame(padded_rows, columns=columns)

    # Generate filename from the last path segment of the URL
    filename = os.path.basename(url).replace('History', '').replace('.html', '') + '.csv'
    filepath = os.path.join(matchday_dir, filename)

    # Save the DataFrame to a CSV file
    df.to_csv(filepath, index=False)