In [None]:
from selenium import webdriver #core component used to control a web browser programmatically
from selenium.webdriver.common.by import By #Provides a convenient way to locate elements on a web page
from selenium.webdriver.chrome.service import Service #class introduced to encapsulate and manage the ChromeDriver process
from selenium.webdriver.support.ui import WebDriverWait #Selenium class used to implement explicit waits
from selenium.webdriver.chrome.options import Options #class in Selenium used to customize and configure the behavior of the Chrome browser
from selenium.webdriver.support import expected_conditions as EC #set of predefined conditions to wait for
import time
import pandas as pd 
from datetime import datetime

ModuleNotFoundError: No module named 'regex'

In [None]:
# enter the cricbuzz full commentary urls of the matches whose 
urls = [
    "https://www.cricbuzz.com/cricket-full-commentary/114960/kkr-vs-rcb-1st-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115012/csk-vs-rcb-8th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115048/rcb-vs-gt-14th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115095/mi-vs-rcb-20th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115111/rcb-vs-dc-24th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115138/rr-vs-rcb-28th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115174/rcb-vs-pbks-34th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115230/rcb-vs-rr-42nd-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115257/dc-vs-rcb-46th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/115302/rcb-vs-csk-52nd-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/118865/rcb-vs-srh-65th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/118898/lsg-vs-rcb-70th-match-indian-premier-league-2025",
    "https://www.cricbuzz.com/cricket-full-commentary/118907/pbks-vs-rcb-qualifier-1-indian-premier-league-2025",
    "https://www.cricbuzz.com/live-cricket-full-commentary/118928/rcb-vs-pbks-final-indian-premier-league-2025"
]

Code to iterate over multiple urls 

In [22]:
def scrape_match_commentary(url):
    """Scrape commentary for a single match URL"""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=chrome_options)
    commentary_data = []
    
    try:
        print(f"\nProcessing match: {url}")
        driver.get(url)
        time.sleep(5)  # Give more time for initial load
        
        # Get match details from URL
        match_id = url.split('/')[-2]
        teams = url.split('/')[-1].split('-vs-')[:2]
        match_info = f"{teams[0].upper()} vs {teams[1].upper()}"
        
        # Wait for and click RCB innings tab
        try:
            print("Looking for RCB innings tab...")
            rcb_button = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'RCB')]"))
            )
            print("Clicking RCB innings tab...")
            driver.execute_script("arguments[0].click();", rcb_button)
            time.sleep(3)
        except Exception as e:
            print(f"Error finding/clicking RCB tab: {str(e)}")
            return []

        # Scroll to load full commentary
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(10):  # Scroll multiple times
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Get commentary using both methods
        commentary_lines = []
        
        # Method 1: Using CSS selectors
        selectors = [".cb-com-ln", ".cb-commentary-row", ".cb-col.cb-col-100"]
        for selector in selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                for element in elements:
                    text = element.text.strip()
                    if text and re.match(r'^\d+\.[1-6]\s+\w+.*to\s+\w+', text):
                        parts = text.split(' ', 1)
                        if len(parts) >= 2 and '.' in parts[0]:
                            if not is_over_summary(parts[1]):
                                commentary_lines.append({
                                    'match_id': match_id,
                                    'teams': match_info,
                                    'over': parts[0],
                                    'commentary': parts[1],
                                    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                })
            except Exception as e:
                print(f"Error with selector {selector}: {str(e)}")

        # Method 2: Using XPath for specific patterns
        try:
            pattern_elements = driver.find_elements(
                By.XPATH, 
                "//*[contains(text(), 'to ') and (contains(text(), 'run') or contains(text(), 'wicket') or contains(text(), 'boundary'))]"
            )
            
            for element in pattern_elements:
                text = element.text.strip()
                if len(text) > 20 and re.match(r'^\d+\.[1-6]\s+\w+.*to\s+\w+', text):
                    parts = text.split(' ', 1)
                    if len(parts) >= 2 and '.' in parts[0] and not is_over_summary(parts[1]):
                        commentary_lines.append({
                            'match_id': match_id,
                            'teams': match_info,
                            'over': parts[0],
                            'commentary': parts[1],
                            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        })
        except Exception as e:
            print(f"Error with pattern matching: {str(e)}")

        # Remove duplicates while preserving order
        seen = set()
        unique_commentary = []
        for item in commentary_lines:
            key = (item['over'], item['commentary'])
            if key not in seen:
                seen.add(key)
                unique_commentary.append(item)

        print(f"Found {len(unique_commentary)} unique commentary entries for {match_info}")
        return unique_commentary

    except Exception as e:
        print(f"Major error processing {url}: {str(e)}")
        return []

    finally:
        driver.quit()

def main():
    all_commentary = []
    failed_urls = []
    
    print("Starting commentary scraping for all matches...")
    
    for i, url in enumerate(urls, 1):
        print(f"\nProcessing match {i}/{len(urls)}")
        match_comments = scrape_match_commentary(url)
        
        if match_comments:
            all_commentary.extend(match_comments)
            print(f"Successfully scraped {len(match_comments)} comments from match {i}")
        else:
            failed_urls.append(url)
            print(f"Failed to scrape comments from match {i}")
            
        time.sleep(3)  # Pause between matches to avoid overloading

    # Create DataFrame and save to CSV
    if all_commentary:
        df = pd.DataFrame(all_commentary)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"rcb_all_matches_commentary_{timestamp}.csv"
        df.to_csv(filename, index=False, encoding='utf-8')
        
        print(f"\nScraped {len(all_commentary)} total comments across {len(urls) - len(failed_urls)} matches")
        print(f"Data saved to: {filename}")
        
        if failed_urls:
            print("\nFailed to scrape these URLs:")
            for url in failed_urls:
                print(url)
    else:
        print("\nFailed to scrape any commentary from any match")

if __name__ == "__main__":
    main()

Starting commentary scraping for all matches...

Processing match 1/14



Processing match: https://www.cricbuzz.com/cricket-full-commentary/114960/kkr-vs-rcb-1st-match-indian-premier-league-2025
Looking for RCB innings tab...
Clicking RCB innings tab...
Found 96 unique commentary entries for KKR vs RCB-1ST-MATCH-INDIAN-PREMIER-LEAGUE-2025
Successfully scraped 96 comments from match 1

Processing match 2/14

Processing match: https://www.cricbuzz.com/cricket-full-commentary/115012/csk-vs-rcb-8th-match-indian-premier-league-2025
Looking for RCB innings tab...
Clicking RCB innings tab...
Found 121 unique commentary entries for CSK vs RCB-8TH-MATCH-INDIAN-PREMIER-LEAGUE-2025
Successfully scraped 121 comments from match 2

Processing match 3/14

Processing match: https://www.cricbuzz.com/cricket-full-commentary/115048/rcb-vs-gt-14th-match-indian-premier-league-2025
Looking for RCB innings tab...
Clicking RCB innings tab...
Found 115 unique commentary entries for RCB vs GT-14TH-MATCH-INDIAN-PREMIER-LEAGUE-2025
Successfully scraped 115 comments from match 3

Proc