We start by scraping FotMob for the URLs of the match pages from which we will grab the commentary.

In [None]:
# Import necessary modules
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time
import os

# Function to initialize Selenium WebDriver
def initialize_driver(chromedriver_path=None):
    """Create a headless Chrome WebDriver.

    Args:
        chromedriver_path: Optional path to the ChromeDriver executable. A
            leading ``~`` is expanded to the user's home directory. When
            omitted, the path this notebook was originally written against
            is used, so existing zero-argument calls behave as before.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    if chromedriver_path is None:
        chromedriver_path = "/Users/jamesngugi/Downloads/chromedriver-mac-arm64/chromedriver"

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode to avoid UI
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # expanduser is a no-op for absolute paths but lets callers pass "~/..."
    service = Service(os.path.expanduser(chromedriver_path))
    return webdriver.Chrome(service=service, options=options)

# Function to extract links from a specific URL
def extract_links(url):
    """Return the hrefs of every anchor inside the current carousel slide.

    A new driver is created for the call and is always quit before
    returning, including when loading or querying the page raises.

    Args:
        url: Address of the page to open.

    Returns:
        A list of non-empty ``href`` values, in the order the anchors are
        found. Empty if no matching slide or anchor exists.

    Raises:
        Any exception raised by Selenium while loading or querying the page
        propagates to the caller after the driver has been quit.
    """
    driver = initialize_driver()
    links = []
    try:
        driver.get(url)
        time.sleep(2)  # Give some time for the page to load

        # The target div carries three classes at once. By.CLASS_NAME is
        # documented for a single class name, so the compound match is
        # spelled out as an explicit CSS selector instead.
        slides = driver.find_elements(
            By.CSS_SELECTOR, '.slick-slide.slick-active.slick-current'
        )
        for slide in slides:
            for anchor in slide.find_elements(By.TAG_NAME, 'a'):
                # Read the attribute once; skip anchors without an href.
                href = anchor.get_attribute('href')
                if href:
                    links.append(href)
    finally:
        # Quit the driver once done
        driver.quit()
    return links

# Function to extract links from multiple URLs
def extract_links_from_urls(urls):
    """Collect the links found on each page in ``urls`` into one flat list.

    A progress line is printed before each page is visited. Results keep
    the order of ``urls`` and, within a page, the order returned by
    ``extract_links``.
    """
    collected = []
    for page_url in urls:
        print(f"Extracting links from: {page_url}")
        collected += extract_links(page_url)
    return collected

# Entry point: build one page URL per round number (1 through 38) and
# gather the links found on every one of those pages.
if __name__ == "__main__":
    base_url = "https://www.fotmob.com/leagues/47/matches/premier-league?season=2023-2024&group=by-round&round="
    urls = [base_url + str(round_number) for round_number in range(1, 39)]
    links_list = extract_links_from_urls(urls)
    

# Make sure to run `pip install selenium` before running this script.


We are going to remove a game that was abandoned because a player collapsed, and keep the replayed game of the exact same match.

In [None]:
# Link of the fixture to drop from the scraped list. list.remove raises
# ValueError if the link is not present (for example when this cell is run
# a second time).
abandoned_match_url = "https://www.fotmob.com/matches/luton-town-vs-afc-bournemouth/2ea97q#4193691"
links_list.remove(abandoned_match_url)

In [13]:
# Add the exact tab we want to each URL.
# Links that already end with the suffix are left alone, so re-running this
# cell does not produce "...:tab=ticker:tab=ticker".
ticker_suffix = ":tab=ticker"
links_list = [
    link if link.endswith(ticker_suffix) else link + ticker_suffix
    for link in links_list
]

# Sanity check: number of links and what the first one looks like.
print(len(links_list))
print(links_list[0])

380
https://www.fotmob.com/matches/burnley-vs-manchester-city/2ai7j8#4193450:tab=ticker


### Scraping to get the actual commentary

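Now we scrape to get the actual commentary. The links come from the list "links_list" (the cell below is set up to process only the first link; assign the whole list to "urls" to scrape every match).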

In [33]:
import sys
import logging
import time  # For adding delays if necessary
import pandas as pd  # For creating the DataFrame
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re  # For extracting quoted text from divs

# Configure logging to display INFO and higher-level messages
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# List of URLs to process
# NOTE(review): only the first link is taken here; use `urls = links_list`
# to process every match -- confirm which is intended.
urls = [links_list[0]]  # Ensure this list is defined with your target URLs

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration (Windows)
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model (Linux)
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
chrome_options.add_argument("--window-size=1920,1080")  # Ensure content is fully loaded in headless mode

# Add arguments to make headless mode less detectable
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option('useAutomationExtension', False)

# Experimental options are stored one value per option name, so a second
# "excludeSwitches" call would replace the first. Both switches therefore go
# in a single list: "enable-automation" (less detectable) and
# "enable-logging" (suppress unnecessary logging from Selenium).
chrome_options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)

# Location of the ChromeDriver binary on this machine (update as needed)
webdriver_service = Service('/Users/jamesngugi/Downloads/chromedriver-mac-arm64/chromedriver')

# Start the browser. If start-up fails, log the reason and stop with exit
# status 1, since nothing below can run without a driver.
try:
    driver = webdriver.Chrome(options=chrome_options, service=webdriver_service)
    logging.info("WebDriver initialized successfully.")
except Exception as init_error:
    logging.error(f"Error initializing WebDriver: {init_error}")
    sys.exit(1)

# Function to process each URL
def process_url(url):
    """Scrape one match page and return its teams and commentary text.

    Works through the module-level ``driver`` in five steps: load the page,
    parse the team names out of the ``<title>`` element, make sure the
    toggle button's ``aria-checked`` is not "true", wait for ticker content
    to be present, then collect the ticker text.

    Args:
        url: Address of the match page to load.

    Returns:
        ``(home_team, away_team, commentary)`` on success, where
        ``commentary`` is every non-empty ticker text joined by single
        spaces in the reverse of the order found on the page.
        ``(None, None, None)`` if any step raises; the error is logged and
        not re-raised.
    """
    failure = (None, None, None)

    # Step 1: load the page
    logging.info(f"Processing URL: {url}")
    try:
        driver.get(url)
        logging.info(f"Page loaded: {url}")
    except Exception as load_error:
        logging.error(f"Error loading URL {url}: {load_error}")
        return failure

    # Step 2: team names come from the page title
    try:
        page_title = driver.find_element(By.TAG_NAME, 'title').get_attribute('innerText')
        logging.info(f"Extracted title: {page_title}")

        # The part before the first " - " is expected to read
        # "<home> vs <away>"; any other shape fails the unpacking below.
        matchup = page_title.split(" - ")[0]
        home_team, away_team = matchup.split(" vs ")
        logging.info(f"Home team: {home_team}, Away team: {away_team}")
    except Exception as title_error:
        logging.error(f"Error extracting title or parsing teams from {url}: {title_error}")
        return failure

    # Every wait below gives up after 10 seconds
    page_wait = WebDriverWait(driver, 10)

    # Step 3: click the toggle only when it currently reads "true"
    try:
        toggle = page_wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'css-19d3bb6-Toggle')))
        if toggle.get_attribute("aria-checked") != "true":
            logging.info(f"Toggle button already set to false on {url}")
        else:
            toggle.click()
            logging.info(f"Clicked toggle button to set to false on {url}")
    except Exception as toggle_error:
        logging.error(f"Error interacting with toggle button on {url}: {toggle_error}")
        return failure

    # Step 4: make sure at least one text-only ticker element is present
    try:
        page_wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'css-1wqh8j4-LiveTickerTextOnly')))
        logging.info(f"Content loaded after toggling on {url}")
    except Exception as content_error:
        logging.error(f"Error waiting for content on {url}: {content_error}")
        return failure

    # Step 5: gather the ticker text
    try:
        # One query for all relevant div and p elements, so they come back
        # in the order they appear on the page
        ticker_nodes = driver.find_elements(By.XPATH, """
            //div[contains(@class, 'LiveTickerTextOnly') or contains(@class, 'LiveTickerItemContent')]
            | //p[contains(@class, 'LiveTickerTextOnly')]
        """)

        snippets = []
        for node in ticker_nodes:
            if 'css-ttt5nx-LiveTickerItemContent' in node.get_attribute('class'):
                # Item-content nodes: keep only the element's own text
                # nodes, leaving out text inside nested child elements
                snippet = driver.execute_script("""
                    var element = arguments[0];
                    var text = '';
                    for (var i = 0; i < element.childNodes.length; i++) {
                        if (element.childNodes[i].nodeType === Node.TEXT_NODE) {
                            text += element.childNodes[i].textContent.trim() + ' ';
                        }
                    }
                    return text.trim();
                """, node)
            else:
                # Every other match: take the full visible text
                snippet = node.text.strip()
            if snippet:
                snippets.append(snippet)

        # Join in the reverse of page order (presumably the page lists the
        # newest entry first, so the result reads chronologically -- verify)
        commentary = ' '.join(snippets[::-1])
        logging.info(f"Extracted text from {url}")
        return home_team, away_team, commentary
    except Exception as extract_error:
        logging.error(f"Error extracting text on {url}: {extract_error}")
        return failure

# List to store data for DataFrame
data = []

# Process each URL. The try/finally makes sure the browser is shut down even
# when the loop is interrupted part-way (e.g. a KeyboardInterrupt during a
# long run); otherwise the headless Chrome process would be left running.
try:
    for url in urls:
        home_team, away_team, text = process_url(url)
        # process_url returns (None, None, None) on failure; empty strings
        # are treated as failures too.
        if home_team and away_team and text:
            data.append({'home': home_team, 'away': away_team, 'url': url, 'commentary': text})
        else:
            logging.warning(f"No valid data extracted from {url}")

        # Optional: to be polite to the server, sleep briefly between
        # pages here, e.g. time.sleep(1).
finally:
    # Close the WebDriver
    driver.quit()
    logging.info("WebDriver closed.")

# Create a DataFrame from the collected data
df = pd.DataFrame(data, columns=['home', 'away', 'url', 'commentary'])

# Add an indexing column 'game' (1-based, in scrape order)
df.insert(0, 'game', range(1, len(df) + 1))

# Output the DataFrame
print(df.to_string())
logging.info("DataFrame created and printed.")


2024-12-03 20:12:11,261 - INFO - WebDriver initialized successfully.
2024-12-03 20:12:11,263 - INFO - Processing URL: https://www.fotmob.com/matches/burnley-vs-manchester-city/2ai7j8#4193450:tab=ticker
2024-12-03 20:12:12,384 - INFO - Page loaded: https://www.fotmob.com/matches/burnley-vs-manchester-city/2ai7j8#4193450:tab=ticker
2024-12-03 20:12:12,391 - INFO - Extracted title: Burnley vs Manchester City - live score, predicted lineups and H2H stats
2024-12-03 20:12:12,392 - INFO - Home team: Burnley, Away team: Manchester City
2024-12-03 20:12:12,405 - INFO - Toggle button already set to false on https://www.fotmob.com/matches/burnley-vs-manchester-city/2ai7j8#4193450:tab=ticker
2024-12-03 20:12:12,410 - INFO - Content loaded after toggling on https://www.fotmob.com/matches/burnley-vs-manchester-city/2ai7j8#4193450:tab=ticker
2024-12-03 20:12:12,689 - INFO - Extracted text from https://www.fotmob.com/matches/burnley-vs-manchester-city/2ai7j8#4193450:tab=ticker
2024-12-03 20:12:12,774

   game     home             away                                                                                  url                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [34]:
# df.to_csv('test_output.tsv', sep='\t', index=False)
# Print the full commentary text of the row with index label 0, i.e. the
# first match stored in df.
print(df.loc[0, 'commentary'])

Hello and welcome to live coverage of the Premier League clash between Burnley and Manchester City at Turf Moor. The Premier League is back for a new season, and it gets underway with an intriguing match-up between a newly promoted side and the reigning champions. Burnley could have hardly been given a tougher opening fixture, but they should come into it with confidence after cruising to the Championship title last time out in their first season back in the second tier. Led by Vincent Kompany, a Citizens' legend from his playing career, Burnley reached 101 points and secured promotion with seven games to play, and they will be hoping to get off the mark with a positive result on home soil. City, meanwhile, claimed a historic treble last term, finally winning their first Champions League title while also taking Premier League and FA Cup glory. It has been a transfer window of change with key players of recent years such as Riyad Mahrez and Ilkay Gundogan leaving the club, but they stil