In [24]:
!pip install selenium pandas webdriver-manager


Defaulting to user installation because normal site-packages is not writeable


In [25]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [26]:
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run without opening browser
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


In [27]:
series_url = "https://www.espncricinfo.com/series/indian-premier-league-2022-1298423/match-schedule-fixtures-and-results"
driver.get(series_url)
time.sleep(5)

match_links = []
matches = driver.find_elements(By.XPATH, "//a[contains(@href, '/live-cricket-score')]")

for match in matches:
    link = match.get_attribute("href")
    if link and link not in match_links:
        match_links.append(link)

print(f"Total Matches Found: {len(match_links)}")


Total Matches Found: 75


In [None]:
all_data = []

for match_url in match_links:
    driver.get(match_url)
    time.sleep(5)

    try:
        # Extract match details
        match_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
        match_info = driver.find_element(By.CLASS_NAME, "ds-text-tight-m").text.split(",")

        match_date = match_info[0].strip() if len(match_info) > 0 else "N/A"
        match_venue = match_info[1].strip() if len(match_info) > 1 else "N/A"

        scores = driver.find_elements(By.CLASS_NAME, "ds-text-compact-s.ds-font-bold")
        team1_score = scores[0].text.strip() if len(scores) > 0 else "N/A"
        team2_score = scores[1].text.strip() if len(scores) > 1 else "N/A"

        match_winner = driver.find_element(By.XPATH, "//p[contains(text(), 'won')]").text.strip()

        # Navigate to Ball-by-Ball Commentary
        commentary_url = match_url.replace("live-cricket-score", "ball-by-ball-commentary")
        driver.get(commentary_url)
        time.sleep(5)

        balls = driver.find_elements(By.XPATH, "//div[contains(@class, 'ds-text-tight-m')]")

        for ball in balls:
            try:
                over_element = ball.find_element(By.XPATH, ".//span[contains(@class, 'ds-text-tight-s')]")
                over = over_element.text.strip() if over_element else "N/A"

                commentary_text = ball.text.strip()
                commentary_parts = commentary_text.split(", ")

                bowler_batter = commentary_parts[0].split(" to ")
                bowler = bowler_batter[0].strip() if len(bowler_batter) == 2 else "Unknown"
                batter = bowler_batter[1].strip() if len(bowler_batter) == 2 else "Unknown"

                ball_type = "Normal"
                shot_type = "None"
                runs = "0"
                speed = "N/A"

                for part in commentary_parts:
                    if "wide" in part.lower():
                        ball_type = "Wide"
                    elif "no ball" in part.lower():
                        ball_type = "No Ball"
                    elif "out" in part.lower():
                        ball_type = "Wicket"
                    elif "dot ball" in part.lower():
                        ball_type = "Dot Ball"
                    elif "four" in part.lower():
                        shot_type = "Boundary"
                        runs = "4"
                    elif "six" in part.lower():
                        shot_type = "Six"
                        runs = "6"
                    elif "single" in part.lower():
                        shot_type = "Single"
                        runs = "1"
                    elif "double" in part.lower():
                        shot_type = "Double"
                        runs = "2"

                    if "km/h" in part:
                        speed = part.strip()

                    if part.strip().isdigit():
                        runs = part.strip()

                all_data.append([match_name, match_winner, match_date, match_venue, team1_score, team2_score,
                                 over, bowler, batter, ball_type, shot_type, speed, runs])
            
            except Exception as e:
                print(f"Error: {e}")

    except Exception as e:
        print(f"Skipping match due to error: {e}")

driver.quit()


Skipping match due to error: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//p[contains(text(), 'won')]"}
  (Session info: chrome=133.0.6943.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x01250283+25139]
	(No symbol) [0x011DB234]
	(No symbol) [0x010B04A3]
	(No symbol) [0x010F8338]
	(No symbol) [0x010F868B]
	(No symbol) [0x01140F62]
	(No symbol) [0x0111CF84]
	(No symbol) [0x0113E6E1]
	(No symbol) [0x0111CD36]
	(No symbol) [0x010EBD29]
	(No symbol) [0x010ED064]
	GetHandleVerifier [0x0155B143+3215603]
	GetHandleVerifier [0x015722BA+3310186]
	GetHandleVerifier [0x0156C4D2+3286146]
	GetHandleVerifier [0x012E9C80+654384]
	(No symbol) [0x011E45BD]
	(No symbol) [0x011E14A8]
	(No symbol) [0x011E1647]
	(No symbol) [0x011D3D20]
	BaseThreadInitThunk [0x75125D49+25]
	RtlInitializeExceptionChain [0x7733CDEB+107]
	RtlGetAppContai

In [15]:
def main():
    driver = setup_driver()
    series_dict = {
        "IPL 2021": "https://www.espncricinfo.com/series/indian-premier-league-2021-1249214/match-schedule-fixtures-and-results",
        "IPL 2022": "https://www.espncricinfo.com/series/indian-premier-league-2022-1298423/match-schedule-fixtures-and-results",
        "IPL 2023": "https://www.espncricinfo.com/series/indian-premier-league-2023-1359714/match-schedule-fixtures-and-results"
    }
    
    all_data = []
    
    for series, url in series_dict.items():
        match_links = get_match_links(driver, url)
        
        for match_url in match_links:
            print(f"Scraping: {match_url}")
            commentary_data = scrape_commentary(driver, match_url, series, series[-4:])
            all_data.extend(commentary_data)
    
    driver.quit()
    
    df = pd.DataFrame(all_data)
    df

In [16]:
import pandas as pd
from IPython.display import display

# Create DataFrame with structured data
df = pd.DataFrame(all_data, columns=[
    "Match Name", "Match Won By", "Match Date", "Match Venue",
    "Team 1 Score", "Team 2 Score", "Over", "Bowler", "Batter",
    "Ball Type", "Shot Type", "Speed (km/h)", "Runs", "Series Name", "Series Year"
])

# Set display options for better visualization
pd.set_option("display.max_rows", 50)  # Adjust based on your data size
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", 1000)  # Adjust column width

# Display the table
display(df)




ValueError: 15 columns passed, passed data had 16 columns

In [25]:
df.to_csv(r'C:\Users\loges\OneDrive\Documents\Day6 output\Scrape all Match.csv')

In [6]:
import pandas as pd
from IPython.display import display

# Sample Data - Replace this with actual scraped data
all_data = [
    ["Match 1", "Team A", "2022-03-26", "Stadium X", "180/5", "175/7", 1.1, "Bowler 1", "Batter 1", "Legal", "Cover Drive", 140, 4],
    ["Match 1", "Team A", "2022-03-26", "Stadium X", "180/5", "175/7", 1.2, "Bowler 1", "Batter 1", "Legal", "Pull Shot", 142, 6],
    ["Match 2", "Team B", "2022-03-27", "Stadium Y", "200/3", "198/9", 2.1, "Bowler 2", "Batter 2", "Wide", "N/A", 0, 1],
    ["Match 2", "Team B", "2022-03-27", "Stadium Y", "200/3", "198/9", 2.2, "Bowler 2", "Batter 2", "Legal", "Straight Drive", 138, 4],
]

# Define column names
columns = ["Match Name", "Match Won By", "Match Date", "Match Venue",
           "Team 1 Score", "Team 2 Score", "Over", "Bowler", "Batter",
           "Ball Type", "Shot Type", "Speed (km/h)", "Runs"]

# Convert data into a Pandas DataFrame
df = pd.DataFrame(all_data, columns=columns)

# Ensure all rows are displayed
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns

# Display the entire DataFrame
display(df)



Unnamed: 0,Match Name,Match Won By,Match Date,Match Venue,Team 1 Score,Team 2 Score,Over,Bowler,Batter,Ball Type,Shot Type,Speed (km/h),Runs
0,Match 1,Team A,2022-03-26,Stadium X,180/5,175/7,1.1,Bowler 1,Batter 1,Legal,Cover Drive,140,4
1,Match 1,Team A,2022-03-26,Stadium X,180/5,175/7,1.2,Bowler 1,Batter 1,Legal,Pull Shot,142,6
2,Match 2,Team B,2022-03-27,Stadium Y,200/3,198/9,2.1,Bowler 2,Batter 2,Wide,,0,1
3,Match 2,Team B,2022-03-27,Stadium Y,200/3,198/9,2.2,Bowler 2,Batter 2,Legal,Straight Drive,138,4


In [7]:
df.to_csv(r'C:\Users\loges\OneDrive\Documents\Day6 output\Scrape all Match.csv')

In [9]:
df

Unnamed: 0,Match Name,Match Won By,Match Date,Match Venue,Team 1 Score,Team 2 Score,Over,Bowler,Batter,Ball Type,Shot Type,Speed (km/h),Runs
0,Match 1,Team A,2022-03-26,Stadium X,180/5,175/7,1.1,Bowler 1,Batter 1,Legal,Cover Drive,140,4
1,Match 1,Team A,2022-03-26,Stadium X,180/5,175/7,1.2,Bowler 1,Batter 1,Legal,Pull Shot,142,6
2,Match 2,Team B,2022-03-27,Stadium Y,200/3,198/9,2.1,Bowler 2,Batter 2,Wide,,0,1
3,Match 2,Team B,2022-03-27,Stadium Y,200/3,198/9,2.2,Bowler 2,Batter 2,Legal,Straight Drive,138,4


In [12]:
import pandas as pd

# Ensure all_data is properly initialized
all_data = []  

# Sample data (Replace this with actual scraped data)
all_data.append(["IPL 2021", "2021", "MI vs CSK", "Wankhede Stadium", "2021-04-10",
                 "MI", "180/5", "CSK", "175/7", "1.1", "Bumrah", "Dhoni",
                 "Legal", "Cover Drive", "140", 4])

all_data.append(["IPL 2022", "2022", "RCB vs KKR", "Chinnaswamy", "2022-04-12",
                 "RCB", "160/7", "KKR", "155/8", "3.2", "Ferguson", "Kohli",
                 "Legal", "Pull Shot", "145", 6])

# Define column names
columns = ["Series Name", "Series Year", "Match Name", "Match Venue", "Match Date",
           "Team 1", "Team 1 Score", "Team 2", "Team 2 Score",
           "Over", "Bowler", "Batter", "Ball Type", "Shot Type", "Speed (km/h)", "Runs"]

# Convert extracted data into a Pandas DataFrame
df = pd.DataFrame(all_data, columns=columns)

# Ensure all rows and columns are displayed
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.expand_frame_repr", False)  # Prevents column truncation

# Print all rows of the DataFrame
print(df.to_string(index=False))




Series Name Series Year Match Name      Match Venue Match Date Team 1 Team 1 Score Team 2 Team 2 Score Over   Bowler Batter Ball Type   Shot Type Speed (km/h)  Runs
   IPL 2021        2021  MI vs CSK Wankhede Stadium 2021-04-10     MI        180/5    CSK        175/7  1.1   Bumrah  Dhoni     Legal Cover Drive          140     4
   IPL 2022        2022 RCB vs KKR      Chinnaswamy 2022-04-12    RCB        160/7    KKR        155/8  3.2 Ferguson  Kohli     Legal   Pull Shot          145     6


In [13]:
df

Unnamed: 0,Series Name,Series Year,Match Name,Match Venue,Match Date,Team 1,Team 1 Score,Team 2,Team 2 Score,Over,Bowler,Batter,Ball Type,Shot Type,Speed (km/h),Runs
0,IPL 2021,2021,MI vs CSK,Wankhede Stadium,2021-04-10,MI,180/5,CSK,175/7,1.1,Bumrah,Dhoni,Legal,Cover Drive,140,4
1,IPL 2022,2022,RCB vs KKR,Chinnaswamy,2022-04-12,RCB,160/7,KKR,155/8,3.2,Ferguson,Kohli,Legal,Pull Shot,145,6


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

# Initialize WebDriver
def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode (optional)
    driver = webdriver.Chrome(options=options)
    return driver

# Get match links from series page
def get_match_links(driver, series_url):
    driver.get(series_url)
    time.sleep(3)  # Allow page to load
    match_links = []
    