In [2]:
import time
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
# Cricbuzz full-commentary pages for RCB's IPL 2025 matches.
# The second fixture against KKR is left out because it was washed out by rain.
# Note that the final is served under "live-cricket-full-commentary" rather
# than "cricket-full-commentary", so each entry carries its own path prefix.
CRICBUZZ_BASE = "https://www.cricbuzz.com"
match_paths = (
    "cricket-full-commentary/114960/kkr-vs-rcb-1st-match-indian-premier-league-2025",
    "cricket-full-commentary/115012/csk-vs-rcb-8th-match-indian-premier-league-2025",
    "cricket-full-commentary/115048/rcb-vs-gt-14th-match-indian-premier-league-2025",
    "cricket-full-commentary/115095/mi-vs-rcb-20th-match-indian-premier-league-2025",
    "cricket-full-commentary/115111/rcb-vs-dc-24th-match-indian-premier-league-2025",
    "cricket-full-commentary/115138/rr-vs-rcb-28th-match-indian-premier-league-2025",
    "cricket-full-commentary/115174/rcb-vs-pbks-34th-match-indian-premier-league-2025",
    "cricket-full-commentary/115230/rcb-vs-rr-42nd-match-indian-premier-league-2025",
    "cricket-full-commentary/115257/dc-vs-rcb-46th-match-indian-premier-league-2025",
    "cricket-full-commentary/115302/rcb-vs-csk-52nd-match-indian-premier-league-2025",
    "cricket-full-commentary/118865/rcb-vs-srh-65th-match-indian-premier-league-2025",
    "cricket-full-commentary/118898/lsg-vs-rcb-70th-match-indian-premier-league-2025",
    "cricket-full-commentary/118907/pbks-vs-rcb-qualifier-1-indian-premier-league-2025",
    "live-cricket-full-commentary/118928/rcb-vs-pbks-final-indian-premier-league-2025",
)
urls = [f"{CRICBUZZ_BASE}/{path}" for path in match_paths]


In [5]:
# Create function to scrape commentary
def _parse_match_url(url):
    """Return ``(match_id, teams_label)`` parsed from a commentary URL.

    The URL is expected to end in ``/<match_id>/<home>-vs-<away>-<rest-of-slug>``.
    Only the first dash-separated token after ``-vs-`` is used as the away
    team, so ``kkr-vs-rcb-1st-match-...`` yields ``"KKR vs RCB"``. If the slug
    contains no ``-vs-`` the upper-cased slug is returned as the label instead
    of raising.
    """
    parts = url.rstrip('/').split('/')
    slug = parts[-1]
    match_id = parts[-2] if len(parts) >= 2 else ''

    home, separator, remainder = slug.partition('-vs-')
    if not separator:
        return match_id, slug.upper()

    away = remainder.split('-', 1)[0]
    return match_id, f"{home.upper()} vs {away.upper()}"


def scrape_match_commentary(url):
    """Scrape the "RCB Inns" commentary from one match page.

    Opens ``url`` in a new Chrome session, clicks the "RCB Inns" tab, scrolls
    the page to trigger further loading, and reads the over label and
    commentary text out of every candidate block. The browser is always closed
    before returning.

    Parameters
    ----------
    url : str
        Full-commentary page URL whose last two path segments are the match ID
        and a ``<team>-vs-<team>-...`` slug.

    Returns
    -------
    list[dict]
        One dict per non-empty commentary entry with keys ``Match_ID``,
        ``Teams``, ``Over``, ``Commentary`` and ``Timestamp`` (the local time
        at which the row was recorded).

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the "RCB Inns" tab or the first commentary paragraph does not
        appear within 15 seconds.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(options=options)

    comments_data = []
    try:
        print(f"\nProcessing match: {url}")
        driver.get(url)

        # Click "RCB Inns" tab
        rcb_tab = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'RCB Inns')]"))
        )
        rcb_tab.click()
        time.sleep(2)

        # Wait for commentary block to appear
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "p.cb-com-ln"))
        )

        # Scroll to load full commentary
        for _ in range(20):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1.5)

        # Collect all commentary entries with overs
        commentary_blocks = driver.find_elements(By.CSS_SELECTOR, "div.cb-col-8")

        # Parse the URL once, outside the per-block try, so a malformed URL
        # can no longer be swallowed and silently drop every row.
        match_id, teams_label = _parse_match_url(url)

        skipped = 0
        for block in commentary_blocks:
            try:
                over = block.find_element(By.CSS_SELECTOR, "div.cb-col-27").text.strip()
                comment = block.find_element(By.CSS_SELECTOR, "p.cb-com-ln").text.strip()
            except (NoSuchElementException, StaleElementReferenceException):
                # Not every candidate block holds both an over label and a
                # commentary paragraph; skip those, but let any other error
                # surface instead of hiding it.
                skipped += 1
                continue

            if comment:
                comments_data.append({
                    'Match_ID': match_id,
                    'Teams': teams_label,
                    'Over': over,
                    'Commentary': comment,
                    'Timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })

        # Make a selector mismatch visible rather than returning [] silently.
        if commentary_blocks and not comments_data:
            print(
                f"Warning: {len(commentary_blocks)} candidate blocks found but no "
                f"commentary extracted ({skipped} lacked the expected elements); "
                "the page layout or CSS selectors may have changed."
            )

    finally:
        driver.quit()

    return comments_data

# Accumulates the rows from every match; a single flat list is turned into a
# DataFrame once at the end.
all_commentaries = []

# Scrape each match in turn. A timeout or browser failure on one page is
# reported and skipped so the rows already gathered from other matches are
# not lost.
for url in urls:
    try:
        match_comments = scrape_match_commentary(url)
    except WebDriverException as exc:
        print(f"Skipping match after browser error: {type(exc).__name__}: {exc.msg}")
        continue
    all_commentaries.extend(match_comments)
    print(f"Collected {len(match_comments)} comments from match")

# Create DataFrame with all commentaries
df_all = pd.DataFrame(all_commentaries)

# Output file name carries the run time so repeated runs do not overwrite
# each other.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"rcb_all_matches_commentary_{timestamp}.csv"

if df_all.empty:
    # Do not leave an empty CSV behind (or claim it was saved) when every
    # match returned nothing.
    print("\nNo commentary was collected; CSV not written.")
else:
    df_all.to_csv(filename, index=False, encoding='utf-8')

    print(f"\nAll commentaries saved to {filename}")
    print(f"Total comments collected: {len(df_all)}")
    print("\nFirst few entries:")
    print(df_all.head())


Processing match: https://www.cricbuzz.com/cricket-full-commentary/114960/kkr-vs-rcb-1st-match-indian-premier-league-2025
Collected 0 comments from match

Processing match: https://www.cricbuzz.com/cricket-full-commentary/115012/csk-vs-rcb-8th-match-indian-premier-league-2025
Collected 0 comments from match

Processing match: https://www.cricbuzz.com/cricket-full-commentary/115048/rcb-vs-gt-14th-match-indian-premier-league-2025
Collected 0 comments from match

Processing match: https://www.cricbuzz.com/cricket-full-commentary/115095/mi-vs-rcb-20th-match-indian-premier-league-2025
Collected 0 comments from match

Processing match: https://www.cricbuzz.com/cricket-full-commentary/115111/rcb-vs-dc-24th-match-indian-premier-league-2025
Collected 0 comments from match

Processing match: https://www.cricbuzz.com/cricket-full-commentary/115138/rr-vs-rcb-28th-match-indian-premier-league-2025
Collected 0 comments from match

Processing match: https://www.cricbuzz.com/cricket-full-commentary/11