In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re # For cleaning text

def get_soup(url):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        raises a ``requests`` exception (including the HTTP error raised by
        ``raise_for_status``). The failure is printed, not re-raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

        # Parse the raw bytes instead of response.text: when the server sends
        # text/* without a charset, requests decodes the body as ISO-8859-1,
        # which garbles accented names on UTF-8 pages. Given bytes,
        # BeautifulSoup works out the encoding itself. The header's charset is
        # only forwarded when the server actually declared one.
        content_type = response.headers.get('Content-Type', '')
        declared_encoding = response.encoding if 'charset' in content_type.lower() else None
        return BeautifulSoup(response.content, 'lxml', from_encoding=declared_encoding)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

def extract_match_data(soup):
    """
    Collect fixtures (league, date, time, home and away team) from parsed HTML.
    *** CSS Selectors MUST be adapted for the target website ***

    Args:
        soup: Parsed document exposing ``select``; the elements it returns
            must expose ``select``/``select_one``, and matched tags ``.text``.

    Returns:
        A list of dicts with the keys 'League', 'Date', 'Time', 'Home_Team'
        and 'Away_Team'. A fixture is skipped when either team name is
        missing or blank; a missing or blank date/time becomes "N/A" and a
        missing or blank league heading becomes "Unknown League".
    """
    def _clean_text(tag, default="N/A"):
        """Return the tag's stripped text, or `default` if the tag is absent or blank."""
        if tag is None:
            return default
        # A tag that exists but holds only whitespace carries no information,
        # so it is treated exactly like a missing tag.
        return tag.text.strip() or default

    matches = []
    # --- THESE SELECTORS ARE EXAMPLES - INSPECT YOUR TARGET SITE'S HTML ---
    # One container per league; fall back to a table layout when the
    # div-based layout matches nothing.
    league_sections = (
        soup.select('div.league-section-class')  # Placeholder class
        or soup.select('table.fixtures-table')   # Alternative structure
    )

    for section in league_sections:
        # League name is looked up in a heading inside the section.
        league_name = _clean_text(
            section.select_one('h2.league-name-class'),  # Placeholder
            "Unknown League",
        )

        # Individual fixtures inside the section, again with a table fallback.
        match_items = (
            section.select('div.match-item-class')  # Placeholder
            or section.select('tr.match-row')       # Alternative structure
        )

        for item in match_items:
            home_team = _clean_text(item.select_one('span.home-team-class'))    # Placeholder
            away_team = _clean_text(item.select_one('span.away-team-class'))    # Placeholder
            match_time = _clean_text(item.select_one('span.match-time-class'))  # Placeholder
            match_date = _clean_text(item.select_one('span.match-date-class'))  # Placeholder

            # Basic validation: a fixture without both team names is unusable.
            if home_team == "N/A" or away_team == "N/A":
                continue

            matches.append({
                'League': league_name,
                'Date': match_date,
                'Time': match_time,
                'Home_Team': home_team,
                'Away_Team': away_team
            })
    return matches

def naive_prediction(home_team, away_team):
    """
    Pick a match outcome at random -- a placeholder 'prediction' only.
    *** NO REAL ANALYSIS IS INVOLVED ***

    Returns one of "<home_team> Win", "<away_team> Win" or "Draw".
    """
    # Each outcome label is paired with its selection weight. The home side is
    # favoured slightly: a common naive heuristic with no predictive value.
    weighted_outcomes = (
        (f"{home_team} Win", 0.45),
        (f"{away_team} Win", 0.30),
        ("Draw", 0.25),
    )
    labels = [label for label, _ in weighted_outcomes]
    odds = [weight for _, weight in weighted_outcomes]
    (pick,) = random.choices(labels, weights=odds, k=1)
    return pick

# --- Main Execution ---

# ** IMPORTANT: point this at a URL you have permission to scrape. **
# ** Read the site's robots.txt and Terms of Service beforehand! **
# ** The value below is only a placeholder. **
# A possible target (inspect its markup first) is a news site's fixtures page.
target_url = 'YOUR_TARGET_URL_HERE' # e.g., 'https://www.bbc.com/sport/football/scores-fixtures' (structure will differ)

if target_url != 'YOUR_TARGET_URL_HERE':
    print(f"Scraping fixtures from: {target_url}")
    soup = get_soup(target_url)

    if not soup:
        print("Could not retrieve or parse the website.")
    else:
        match_list = extract_match_data(soup)

        if not match_list:
            print("No match data could be extracted. Check the CSS selectors or website structure.")
        else:
            df = pd.DataFrame(match_list)

            # One naive pick per fixture, generated in table order.
            df['Prediction (Naive)'] = [
                naive_prediction(home, away)
                for home, away in zip(df['Home_Team'], df['Away_Team'])
            ]

            print("\n--- Extracted Matches and Naive Predictions ---")
            print(df.to_string()) # to_string() renders every row and column

            # Optional CSV export:
            # df.to_csv('football_fixtures.csv', index=False)
            # print("\nData saved to football_fixtures.csv")
else:
    # Placeholder still in place: nothing to scrape yet.
    print("Please replace 'YOUR_TARGET_URL_HERE' with an actual URL you intend to scrape.")

Please replace 'YOUR_TARGET_URL_HERE' with an actual URL you intend to scrape.


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re # For cleaning text

def get_soup(url):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        raises a ``requests`` exception (including the HTTP error raised by
        ``raise_for_status``). The failure is printed, not re-raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

        # Parse the raw bytes instead of response.text: when the server sends
        # text/* without a charset, requests decodes the body as ISO-8859-1,
        # which garbles accented names on UTF-8 pages. Given bytes,
        # BeautifulSoup works out the encoding itself. The header's charset is
        # only forwarded when the server actually declared one.
        content_type = response.headers.get('Content-Type', '')
        declared_encoding = response.encoding if 'charset' in content_type.lower() else None
        return BeautifulSoup(response.content, 'lxml', from_encoding=declared_encoding)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

def extract_match_data(soup):
    """
    Collect fixtures (league, date, time, home and away team) from parsed HTML.
    *** CSS Selectors MUST be adapted for the target website ***

    Args:
        soup: Parsed document exposing ``select``; the elements it returns
            must expose ``select``/``select_one``, and matched tags ``.text``.

    Returns:
        A list of dicts with the keys 'League', 'Date', 'Time', 'Home_Team'
        and 'Away_Team'. A fixture is skipped when either team name is
        missing or blank; a missing or blank date/time becomes "N/A" and a
        missing or blank league heading becomes "Unknown League".
    """
    def _clean_text(tag, default="N/A"):
        """Return the tag's stripped text, or `default` if the tag is absent or blank."""
        if tag is None:
            return default
        # A tag that exists but holds only whitespace carries no information,
        # so it is treated exactly like a missing tag.
        return tag.text.strip() or default

    matches = []
    # --- THESE SELECTORS ARE EXAMPLES - INSPECT YOUR TARGET SITE'S HTML ---
    # One container per league; fall back to a table layout when the
    # div-based layout matches nothing.
    league_sections = (
        soup.select('div.league-section-class')  # Placeholder class
        or soup.select('table.fixtures-table')   # Alternative structure
    )

    for section in league_sections:
        # League name is looked up in a heading inside the section.
        league_name = _clean_text(
            section.select_one('h2.league-name-class'),  # Placeholder
            "Unknown League",
        )

        # Individual fixtures inside the section, again with a table fallback.
        match_items = (
            section.select('div.match-item-class')  # Placeholder
            or section.select('tr.match-row')       # Alternative structure
        )

        for item in match_items:
            home_team = _clean_text(item.select_one('span.home-team-class'))    # Placeholder
            away_team = _clean_text(item.select_one('span.away-team-class'))    # Placeholder
            match_time = _clean_text(item.select_one('span.match-time-class'))  # Placeholder
            match_date = _clean_text(item.select_one('span.match-date-class'))  # Placeholder

            # Basic validation: a fixture without both team names is unusable.
            if home_team == "N/A" or away_team == "N/A":
                continue

            matches.append({
                'League': league_name,
                'Date': match_date,
                'Time': match_time,
                'Home_Team': home_team,
                'Away_Team': away_team
            })
    return matches

def naive_prediction(home_team, away_team):
    """
    Pick a match outcome at random -- a placeholder 'prediction' only.
    *** NO REAL ANALYSIS IS INVOLVED ***

    Returns one of "<home_team> Win", "<away_team> Win" or "Draw".
    """
    # Each outcome label is paired with its selection weight. The home side is
    # favoured slightly: a common naive heuristic with no predictive value.
    weighted_outcomes = (
        (f"{home_team} Win", 0.45),
        (f"{away_team} Win", 0.30),
        ("Draw", 0.25),
    )
    labels = [label for label, _ in weighted_outcomes]
    odds = [weight for _, weight in weighted_outcomes]
    (pick,) = random.choices(labels, weights=odds, k=1)
    return pick

# --- Main Execution ---

# ** IMPORTANT: point this at a URL you have permission to scrape. **
# ** Read the site's robots.txt and Terms of Service beforehand! **
# ** The value below is only a placeholder. **
# A possible target (inspect its markup first) is a news site's fixtures page.
target_url = 'YOUR_TARGET_URL_HERE' # e.g., 'https://www.bbc.com/sport/football/scores-fixtures' (structure will differ)

if target_url != 'YOUR_TARGET_URL_HERE':
    print(f"Scraping fixtures from: {target_url}")
    soup = get_soup(target_url)

    if not soup:
        print("Could not retrieve or parse the website.")
    else:
        match_list = extract_match_data(soup)

        if not match_list:
            print("No match data could be extracted. Check the CSS selectors or website structure.")
        else:
            df = pd.DataFrame(match_list)

            # One naive pick per fixture, generated in table order.
            df['Prediction (Naive)'] = [
                naive_prediction(home, away)
                for home, away in zip(df['Home_Team'], df['Away_Team'])
            ]

            print("\n--- Extracted Matches and Naive Predictions ---")
            print(df.to_string()) # to_string() renders every row and column

            # Optional CSV export:
            # df.to_csv('football_fixtures.csv', index=False)
            # print("\nData saved to football_fixtures.csv")
else:
    # Placeholder still in place: nothing to scrape yet.
    print("Please replace 'YOUR_TARGET_URL_HERE' with an actual URL you intend to scrape.")

Please replace 'YOUR_TARGET_URL_HERE' with an actual URL you intend to scrape.
