In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
import os
import time


# Run Chrome in headless mode so that no browser window pops up for each
# page load (a browser process is still started, just without a GUI).
options = webdriver.ChromeOptions()
options.add_argument("headless")

# Site root, and the match-list page of the series that the scraper starts from.
base_url = "https://www.cricbuzz.com"
parent_url = base_url + "/cricket-series/6732/icc-cricket-world-cup-2023/matches"

# Column name -> list of values; the summary functions below append one
# value per column for every processed match.
Summary_data = dict()
# Declared alongside Summary_data; nothing in the code shown here fills it.
Scorecard_data = dict()


def exception(data):
    """Patch a scorecard dictionary so that it holds 23 rows instead of 22.

    Every bowling column is replaced by a fresh list of 23 NaN values, the
    team at row index 11 is overwritten with "Pakistan" and one extra
    "South Africa" entry is appended to the team column.

    NOTE(review): used for a single hard-coded match index by the caller;
    presumably one side listed 12 players in that match -- confirm.

    Args:
        data: Scorecard dictionary; must contain a "Team" list with at
            least 12 entries. It is modified in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    bowling_columns = (
        "Overs",
        "Maiden",
        "Runs_Given",
        "Wickets",
        "No-Balls",
        "Wides",
        "Economy",
    )
    for column in bowling_columns:
        # A separate list per column so later item assignment stays independent.
        data[column] = [np.nan] * 23

    teams = data["Team"]
    teams[11] = "Pakistan"
    teams.append("South Africa")
    return data


def get_index(data, word):
    """Find the row index of a player name in ``data["Players"]``.

    Matching is attempted in this order, returning on the first hit:

    1. an entry equal to ``word``;
    2. an entry equal to the first whitespace-separated token of ``word``;
    3. an entry equal to the second token of ``word``;
    4. the first entry that contains ``word`` as a substring.

    Args:
        data: Dictionary with a "Players" list of name strings.
        word: Name to look up.

    Returns:
        The index of the matching entry. When nothing matches, 0 is
        returned, so callers cannot tell "not found" apart from "first row".
    """
    players = data["Players"]

    # Only a failed list lookup (ValueError) should trigger the next
    # strategy; anything else is a genuine error and must propagate.
    try:
        return players.index(word)
    except ValueError:
        pass

    for token in word.split()[:2]:
        try:
            return players.index(token)
        except ValueError:
            continue

    for position, player in enumerate(players):
        if word in player:
            return position

    return 0


def get_soup(u, timeout=30):
    """Download a page and parse it with BeautifulSoup's ``html.parser``.

    Args:
        u: URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall the
            whole scrape on a single unresponsive connection.

    Returns:
        A ``BeautifulSoup`` object built from the response text. The HTTP
        status code is not checked.
    """
    response = requests.get(u, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")


def get_type(n):
    """Translate a 1-based match number into its stage label.

    Numbers up to and including 45 are "Group", 46 and 47 are "Knockout"
    and 48 is "Final". Any other number yields ``None``.
    """
    if n <= 45:
        return "Group"
    late_stages = {46: "Knockout", 47: "Knockout", 48: "Final"}
    return late_stages.get(n)


def _parse_score(text):
    """Parse a ``<runs>-<wickets> (<overs> ...`` string into (runs, wickets, overs)."""
    halves = text.split("-")
    runs = int(halves[0])
    tail = halves[1].split()
    wickets = int(tail[0])
    overs = float(tail[1].lstrip("("))
    return runs, wickets, overs


def _find_captain(squad):
    """Return the name in front of the first ``(c`` marker of a comma-separated squad."""
    for entry in squad.split(","):
        if "(c" in entry:
            return entry.split("(c")[0].strip()
    # Same exception type the original list-indexing produced.
    raise IndexError("no captain marker '(c' found in squad text")


def get_details(Soup):
    """Extract the match facts from a parsed scorecard page.

    The match-info rows are read by fixed position, so the function depends
    on the page keeping its rows in the expected order.

    Args:
        Soup: Parsed scorecard page (BeautifulSoup object).

    Returns:
        A 24-tuple, in this order: date, time, ground, city, umpire1,
        umpire2, umpire3, referee, toss winner, toss decision text,
        team-1 playing squad, team-1 captain, team-1 bench, team-2 playing
        squad, team-2 captain, team-2 bench, team-1 score, team-1 overs,
        team-1 wickets, team-2 score, team-2 overs, team-2 wickets, winner,
        result margin.

    Raises:
        AttributeError: If the info container or the result banner is not
            present in the page.
        IndexError: If an expected row, separator or captain marker is missing.
        ValueError: If a score field is not numeric.
    """
    details = Soup.find("div", class_="cb-col cb-col-100 cb-font-13").find_all(
        "div", class_="cb-col cb-col-73"
    )
    date = details[1].text.strip()
    # Named start_time so the imported ``time`` module is not shadowed.
    start_time = details[3].text.strip()

    venue_parts = details[4].text.split(",")
    ground = venue_parts[0].strip()
    city = venue_parts[1].strip()

    umpire_parts = details[5].text.split(",")
    umpire1 = umpire_parts[0].strip()
    umpire2 = umpire_parts[1].strip()
    umpire3 = details[6].text.strip()
    referee = details[7].text.strip()

    toss_text = details[2].text
    toss_w = toss_text.split(" won ")[0].strip()
    toss_d = toss_text.split(" and ")[1].strip()

    t1_p = details[8].text.strip()
    t1_c = _find_captain(t1_p)
    t1_b = details[9].text.strip()
    t2_p = details[11].text.strip()
    t2_b = details[12].text.strip()
    t2_c = _find_captain(t2_p)

    scores = Soup.find_all("span", class_="pull-right")
    t1_s, t1_w, t1_o = _parse_score(scores[0].text)
    t2_s, t2_w, t2_o = _parse_score(scores[1].text)

    outcome = Soup.find(
        "div", class_="cb-col cb-scrcrd-status cb-col-100 cb-text-complete ng-scope"
    ).text.split(" won ")
    winner = outcome[0].strip()
    result = outcome[1].strip()
    # Drop the literal "by " prefix. str.lstrip("by ") would instead strip
    # any run of the characters 'b', 'y' and ' ' and could eat into the margin.
    if result.startswith("by "):
        result = result[len("by "):].lstrip()

    return (
        date,
        start_time,
        ground,
        city,
        umpire1,
        umpire2,
        umpire3,
        referee,
        toss_w,
        toss_d,
        t1_p,
        t1_c,
        t1_b,
        t2_p,
        t2_c,
        t2_b,
        t1_s,
        t1_o,
        t1_w,
        t2_s,
        t2_o,
        t2_w,
        winner,
        result,
    )


def updating_details(Soup):
    """Append the match facts of one scorecard page to ``Summary_data``.

    The values come from ``get_details``; each one is added to the end of
    the list stored under its column name, creating the list when the
    column does not exist yet.

    Args:
        Soup: Parsed scorecard page, passed straight to ``get_details``.
    """
    (
        date,
        start_time,
        ground,
        city,
        umpire1,
        umpire2,
        umpire3,
        referee,
        toss_w,
        toss_d,
        t1_p,
        t1_c,
        t1_b,
        t2_p,
        t2_c,
        t2_b,
        t1_s,
        t1_o,
        t1_w,
        t2_s,
        t2_o,
        t2_w,
        winner,
        result,
    ) = get_details(Soup)

    # Insertion order here is the order in which new columns get created.
    row = {
        "Date": date,
        "Time": start_time,
        "Venue": ground,
        "City": city,
        "Umpire-1": umpire1,
        "Umpire-2": umpire2,
        "Third-Umpire": umpire3,
        "Match-Referee": referee,
        "Toss-Winner": toss_w,
        "Toss-Decision": toss_d,
        "Team-1_Captain": t1_c,
        "Team-1_Playing": t1_p,
        "Team-1_Bench": t1_b,
        "Team-2_Captain": t2_c,
        "Team-2_Playing": t2_p,
        "Team-2_Bench": t2_b,
        "Team-1_Score": t1_s,
        "Team-1_Wickets-Gone": t1_w,
        "Team-1_Overs-Played": t1_o,
        "Team-2_Score": t2_s,
        "Team-2_Wickets-Gone": t2_w,
        "Team-2_Overs-Played": t2_o,
        "Winner-Team": winner,
        "Win-By": result,
    }
    for column, value in row.items():
        Summary_data[column] = Summary_data.get(column, []) + [value]


def get_summary(SouP, POM, number):
    """Add one match to ``Summary_data``.

    Columns are appended in this order: match number and type, the two team
    names taken from the page's ``<h1>`` heading, the detail columns added
    by ``updating_details``, and finally the player of the match.

    Args:
        SouP: Parsed scorecard page.
        POM: Player-of-the-match value to record.
        number: Zero-based match index; stored as ``number + 1``.
    """

    def record(column, value):
        # Append value to the column's list, creating the list on first use.
        Summary_data[column] = Summary_data.get(column, []) + [value]

    record("Match-No", number + 1)
    record("Match-Type", get_type(number + 1))

    heading_parts = SouP.find("h1").text.split(" vs ")
    team1 = heading_parts[0]
    team2 = heading_parts[1].split(",")[0]
    record("Team-1", team1)
    record("Team-2", team2)

    updating_details(SouP)

    record("Player-Of-The-Match", POM)


def get_score_details(Divs):
    """Read the batting and bowling tables of one innings.

    Args:
        Divs: The direct child ``div`` elements of one innings container.
            Element 0 is used as the batting block and element 3 as the
            bowling block.

    Returns:
        A ``(batting, bowling)`` pair of dictionaries mapping a column name
        to a list of stripped cell texts. Names found as links in the last
        ``div`` of the batting block are added to the batting table with
        NaN in every other column when the counted number of batters is
        below 11.
    """
    bat_fields = ("batter", "Description", "R", "B", "4", "6", "SR")
    bowl_fields = ("bowler", "O", "M", "R", "W", "NB", "WD", "Econ")
    batting = {field: [] for field in bat_fields}
    bowling = {field: [] for field in bowl_fields}

    batting_block = Divs[0]
    name_cells = batting_block.find_all("div", class_="cb-col cb-col-33")[1:]
    players = len(name_cells) - 1
    score_rows = batting_block.find_all("div", class_="cb-col cb-col-100 cb-scrd-itms")

    # One batting row per counted name cell; columns are read by position.
    for row in score_rows[: players + 1]:
        cells = row.find_all("div", recursive=True)
        for position, field in enumerate(bat_fields):
            batting[field].append(cells[position].text.strip())

    if players < 11:
        # Remaining names come from the links inside the block's last div.
        trailing_div = batting_block.find_all("div")[-1]
        extra_names = [link.text.strip() for link in trailing_div.find_all("a")]
        for extra in extra_names:
            batting["batter"].append(extra)
            for field in bat_fields[1:]:
                batting[field].append(np.nan)

    # The first child of the bowling block is skipped; the rest are bowler rows.
    bowler_rows = Divs[3].find_all("div", recursive=False)[1:]
    for row in bowler_rows:
        cells = row.find_all("div", recursive=False)
        for position, field in enumerate(bowl_fields):
            bowling[field].append(cells[position].text.strip())

    return batting, bowling


def get_scorecard(SouP, IND):
    """Build the combined player scorecard of a match as a column dictionary.

    Batting rows of the first innings come first, followed by those of the
    second innings. Bowling figures are then written into the row of the
    matching player (located with ``get_index``): the bowling table found
    in the second innings is applied after the first innings' batters are
    added, and the one found in the first innings after the second's.

    Args:
        SouP: Parsed scorecard page containing ``innings_1`` and
            ``innings_2`` containers.
        IND: Zero-based match index; index 25 gets the special 23-row
            layout produced by ``exception``.

    Returns:
        Dictionary of column name -> list, suitable for ``pandas.DataFrame``.
    """
    innings = [
        SouP.find("div", id=innings_id).find_all("div", recursive=False)
        for innings_id in ("innings_1", "innings_2")
    ]

    # Team names are the text in front of the word "Innings" in each header.
    first_team = innings[0][0].div.text.split("Innings")[0].strip()
    second_team = innings[1][0].div.text.split("Innings")[0].strip()

    all_Scorecard = {
        "Team": [first_team] * 11 + [second_team] * 11,
        "Players": [],
        "Description": [],
        "Runs_Scored": [],
        "Balls_Played": [],
        "Fours": [],
        "Sixes": [],
        "Strike-Rate": [],
        "Overs": [np.nan] * 22,
        "Maiden": [np.nan] * 22,
        "Runs_Given": [np.nan] * 22,
        "Wickets": [np.nan] * 22,
        "No-Balls": [np.nan] * 22,
        "Wides": [np.nan] * 22,
        "Economy": [np.nan] * 22,
    }
    if IND == 25:
        all_Scorecard = exception(all_Scorecard)

    first_bat, second_bowl = get_score_details(innings[0])
    second_bat, first_bowl = get_score_details(innings[1])

    # (scorecard column, source column) pairs for each table.
    batting_columns = (
        ("Players", "batter"),
        ("Description", "Description"),
        ("Runs_Scored", "R"),
        ("Balls_Played", "B"),
        ("Fours", "4"),
        ("Sixes", "6"),
        ("Strike-Rate", "SR"),
    )
    bowling_columns = (
        ("Overs", "O"),
        ("Maiden", "M"),
        ("Runs_Given", "R"),
        ("Wickets", "W"),
        ("No-Balls", "NB"),
        ("Wides", "WD"),
        ("Economy", "Econ"),
    )

    for bat, bowl in ((first_bat, first_bowl), (second_bat, second_bowl)):
        for column, source in batting_columns:
            all_Scorecard[column].extend(bat[source])

        for row, name in enumerate(bowl["bowler"]):
            target = get_index(all_Scorecard, name)
            for column, source in bowling_columns:
                all_Scorecard[column][target] = bowl[source][row]

    return all_Scorecard

def get_commentary(url):
    """Collect ball-by-ball commentary text from a commentary page.

    A headless Chrome session opens ``url`` and clicks the elements at
    positions 1 and 2 of the ``cb-nav-pill-1`` tab list (presumably the two
    innings tabs -- confirm against the page). After each click the rendered
    page is parsed and every over-number/commentary pair becomes one line
    ``"<ball> <commentary>\\n"``.

    Within each tab the lines are emitted in reverse of page order; the
    text of the first clicked tab precedes that of the second.

    Args:
        url: Address of the commentary page.

    Returns:
        The concatenated commentary text (empty when nothing was found).

    Raises:
        IndexError: If fewer than three ``cb-nav-pill-1`` elements exist.
    """
    lines_by_tab = {1: [], 2: []}
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        tabs = driver.find_elements(By.CLASS_NAME, "cb-nav-pill-1")
        for index in (1, 2):
            tab = tabs[index]

            # Wait for the tab to be clickable
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "cb-nav-pill-1"))
            )
            tab.click()

            # Fixed pause so the tab content can load; adjust to page speed.
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            sections = soup.find_all("div", {"class": "cb-col cb-col-100 ng-scope"})
            if not sections:
                print(f"Commentary section not found for tab {index + 1}")
                continue

            for section in sections:
                balls = section.find_all(
                    "div", class_="cb-mat-mnu-wrp cb-ovr-num ng-binding"
                )
                ball_commentaries = section.find_all(
                    "p", class_="cb-com-ln ng-binding ng-scope cb-col cb-col-90"
                )
                for ball, commentary in zip(balls, ball_commentaries):
                    lines_by_tab[index].append(
                        ball.text.strip() + " " + commentary.text.strip() + "\n"
                    )
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Chrome process running.
        driver.quit()

    # Lines are collected in page order and reversed once here, instead of
    # prepending to a growing string for every ball.
    text1 = "".join(reversed(lines_by_tab[1]))
    text2 = "".join(reversed(lines_by_tab[2]))
    return text1 + text2

In [3]:
# Fetch the series page that lists all matches. The timeout keeps an
# unresponsive server from blocking the run forever.
resp = requests.get(parent_url, timeout=30)
soup = BeautifulSoup(resp.text, "html.parser")

# first finding urls of all matches
urls = soup.find_all("a", class_="cb-text-complete")
urls = [base_url + url.attrs["href"] for url in urls]

# Every output file is written below this directory.
output_root = "Cricket-World-Cup-2023"

for ind, url in enumerate(urls):  # we need index
    # Getting soup
    soup = get_soup(url)

    # First Getting POM because its on first page.
    # A page without that link records NaN instead of aborting the whole run.
    pom_link = soup.find("a", class_="cb-link-undrln")
    pom = pom_link.text if pom_link is not None else np.nan

    # Now operations for url of scorecard page
    url = soup.find_all("a", class_="cb-nav-tab")[1]
    url = base_url + url.attrs["href"]
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")
    finally:
        # Close the browser for this match; otherwise one Chrome process
        # per match stays alive until the kernel exits.
        driver.quit()
    print(url)

    # Now getting all required data for summary dataset.
    get_summary(soup, pom, ind)

    # Now getting all required data for Scorecard datasets.
    scorecard = get_scorecard(soup, ind)

    # Now getting Commentary
    url = soup.find_all("a", class_="cb-nav-tab")[4]
    url = base_url + url.attrs["href"]
    commentary = get_commentary(url)
    df_score = pd.DataFrame(scorecard)

    match_name = f"Match-{ind + 1}"
    match_dir = f"{output_root}/{match_name}"
    # exist_ok makes re-runs safe without swallowing unrelated errors.
    os.makedirs(match_dir, exist_ok=True)

    df_score.to_csv(f"{match_dir}/{match_name} Scorecard.csv", index=False)
    # Explicit UTF-8 so non-ASCII commentary does not depend on the
    # platform's default encoding.
    with open(
        f"{match_dir}/{match_name} Commentary.txt", "w", encoding="utf-8"
    ) as file:
        file.write(commentary)

df = pd.DataFrame(Summary_data)
# The root directory may not exist yet when no match was processed.
os.makedirs(output_root, exist_ok=True)
df.to_csv(f"{output_root}/Summary.csv", index=False)

https://www.cricbuzz.com/live-cricket-scorecard/75413/eng-vs-nz-1st-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75420/pak-vs-ned-2nd-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75427/ban-vs-afg-3rd-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75434/rsa-vs-sl-4th-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75437/ind-vs-aus-5th-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75444/nz-vs-ned-6th-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75451/eng-vs-ban-7th-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75462/pak-vs-sl-8th-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75458/ind-vs-afg-9th-match-icc-cricket-world-cup-2023
https://www.cricbuzz.com/live-cricket-scorecard/75465/aus-vs-rsa-10th-match-icc-cricke

AttributeError: 'NoneType' object has no attribute 'text'