In [None]:
import requests
from bs4 import BeautifulSoup

# ------- 1.a Interaction Code ------ #
url = 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Defined before the request so that later cells can always iterate over
# `links`, even when the request fails or the table cannot be located.
links = []

# The timeout stops the cell from hanging forever on an unresponsive server;
# requests raises an exception instead once the limit is exceeded.
response = requests.get(url, headers=headers, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Proceed with parsing the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # ------- 1.b Parser Code ------------ #
    # select() returns a (possibly empty) list, so test it before indexing.
    table_bodies = soup.select('#main-container > div.ds-relative > div > div.ds-flex.ds-space-x-5 > div.ds-grow > div:nth-child(2) > div > div:nth-child(1) > div.ds-overflow-x-auto.ds-scrollbar-hide > table > tbody')

    if table_bodies:
        table_rows = table_bodies[0].find_all('tr')

        for row in table_rows:
            tds = row.find_all('td')
            # The link is read from the seventh cell of each row; rows that are
            # too short or carry no link there are skipped rather than raising.
            anchor = tds[6].find('a') if len(tds) > 6 else None
            if anchor is None or not anchor.get('href'):
                continue
            row_url = "https://www.espncricinfo.com" + anchor['href']
            links.append(row_url)
    else:
        print("Could not locate the match results table; the page layout may have changed.")

    # This 'links' list now contains all the URLs of match summaries.
else:
    print("Failed to retrieve data from the website. Status code:", response.status_code)

In [None]:
# Module-level accumulators: parse_batting_rows / parse_bowling_rows append
# one dict per scraped scorecard row to these two independent lists.
batting_summaries, bowling_summaries = [], []

def parse_batting_rows(rows, team, match_info):
    """Append one record per batter row in ``rows`` to ``batting_summaries``.

    Args:
        rows: iterable of table-row elements exposing ``find_all('td')``.
        team: value stored under ``"teamInnings"`` for every record.
        match_info: value stored under ``"match"`` for every record.

    Only rows with at least 8 ``<td>`` cells are treated as batter rows.
    ``battingPos`` is the 1-based rank among those accepted rows, so rows
    rejected by the filter do not inflate or skip positions.

    Returns:
        None. Records are appended to the module-level ``batting_summaries``.
    """
    batting_pos = 0
    for row in rows:
        tds = row.find_all('td')
        if len(tds) < 8:  # Not a batter row; skip it.
            continue

        # Count accepted rows only, so positions stay consecutive.
        batting_pos += 1

        # Prefer the link text for the name; fall back to the whole cell when
        # the name cell carries no <a> element instead of raising.
        name_link = tds[0].find('a')
        name_source = name_link if name_link is not None else tds[0]

        batting_summaries.append({
            "match": match_info,
            "teamInnings": team,
            "battingPos": batting_pos,
            "batsmanName": name_source.text.strip(),
            "dismissal": tds[1].text.strip(),
            "runs": tds[2].text.strip(),
            "balls": tds[3].text.strip(),
            # tds[4] is intentionally not captured (presumably minutes batted -- confirm).
            "4s": tds[5].text.strip(),
            "6s": tds[6].text.strip(),
            "SR": tds[7].text.strip()
        })


def parse_bowling_rows(rows, bowling_team, match_info):
    """Append one record per bowler row in ``rows`` to ``bowling_summaries``.

    Args:
        rows: iterable of table-row elements exposing ``find_all('td')``.
        bowling_team: value stored under ``"bowlingTeam"`` for every record.
        match_info: value stored under ``"match"`` for every record.

    Rows with fewer than 11 ``<td>`` cells are ignored. The first cell's
    link text supplies the bowler name; cells 1-10 supply the statistics.

    Returns:
        None. Records are appended to the module-level ``bowling_summaries``.
    """
    # Output keys for cells 1..10, in column order.
    stat_keys = (
        "overs", "maiden", "runs", "wickets", "economy",
        "0s", "4s", "6s", "wides", "noBalls",
    )

    for bowler_row in rows:
        cells = bowler_row.find_all('td')
        if len(cells) < 11:  # Not a bowler row; skip it.
            continue

        record = {
            "match": match_info,
            "bowlingTeam": bowling_team,
            "bowlerName": cells[0].find('a').text.strip(),
        }
        for key, cell in zip(stat_keys, cells[1:11]):
            record[key] = cell.text.strip()

        bowling_summaries.append(record)



for match_url in links:
    # ------- 2.a Interaction Code ------ #
    # The timeout stops one unresponsive page from stalling the whole loop.
    response = requests.get(match_url, headers=headers, timeout=30)

    # Check if the request was successful (status code 200)
    if response.status_code != 200:
        print("Failed to retrieve data from the website. Status code:", response.status_code)
        continue

    # Proceed with parsing the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # ------- 2.b Parser Code ------------ #
    match_details = soup.find_all('div', class_='ds-bg-fill-canvas')
    teams = match_details[0].find_all('span', class_='ds-text-tight-xs') if match_details else []
    table_div = soup.find_all('div', class_='ds-p-0')

    # Indices 1 and 2 of `table_div` are read as the two innings below; skip
    # pages that lack them (or the two team labels) instead of raising IndexError.
    if len(teams) < 2 or len(table_div) < 3:
        print("Skipping match with unexpected page layout:", match_url)
        continue

    team1 = teams[0].text.replace(" Innings", "")
    team2 = teams[1].text.replace(" Innings", "")
    match_info = f"{team1} Vs {team2}"

    # Within each innings block the first table is read as the batting card
    # and the second as the bowling card.
    first_innings_tables = table_div[1].find_all('table')
    second_innings_tables = table_div[2].find_all('table')
    if len(first_innings_tables) < 2 or len(second_innings_tables) < 2:
        print("Skipping match with incomplete scorecard:", match_url)
        continue

    # Extracting Batting Summary
    first_innings_rows = first_innings_tables[0].select('tbody > tr')
    second_innings_rows = second_innings_tables[0].select('tbody > tr')

    # Extracting Bowling Summary
    first_innings_bowlers = first_innings_tables[1].select('tbody > tr')
    second_innings_bowlers = second_innings_tables[1].select('tbody > tr')

    # Batting: team1 is paired with the first innings, team2 with the second.
    parse_batting_rows(first_innings_rows, team1, match_info)
    parse_batting_rows(second_innings_rows, team2, match_info)

    # The 'batting_summaries' list now contains all the batting data for the matches.

    # Bowling: the bowlers listed in an innings belong to the side that is NOT
    # batting, so the first innings is bowled by team2 and the second by team1.
    parse_bowling_rows(first_innings_bowlers, team2, match_info)
    parse_bowling_rows(second_innings_bowlers, team1, match_info)

    # The 'bowling_summaries' list now contains all the bowling data for the matches.





In [None]:
# Report how many records were collected for each summary type.
batting_total = len(batting_summaries)
bowling_total = len(bowling_summaries)
print("batting_summaries total:", batting_total)
print("bowling summaries total:", bowling_total)

# Show the first record of each list, each preceded by a blank line.
# Indexing [0] assumes the list is non-empty.
for summaries in (batting_summaries, bowling_summaries):
    print()
    print(summaries[0])


batting_summaries total: 699
bowling summaries total: 500

{'match': 'Pakistan Vs England', 'teamInnings': 'Pakistan', 'battingPos': 1, 'batsmanName': 'Mohammad Rizwan\xa0†', 'dismissal': 'b Curran', 'runs': '15', 'balls': '14', '4s': '0', '6s': '1', 'SR': '107.14'}

{'match': 'Pakistan Vs England', 'bowlingTeam': 'Pakistan', 'bowlerName': 'Ben Stokes', 'overs': '4', 'maiden': '0', 'runs': '32', 'wickets': '1', 'economy': '8.00', '0s': '6', '4s': '1', '6s': '0', 'wides': '2', 'noBalls': '1'}


In [None]:
import json

# Each summary list is written to its own JSON file in the working directory,
# batting first and bowling second, with a confirmation line after each write.
for file_path, summaries in (
    ('t20_wc_batting_summary.json', batting_summaries),
    ('t20_wc_bowling_summary.json', bowling_summaries),
):
    # Writing the list to a JSON file
    with open(file_path, 'w') as json_file:
        json.dump(summaries, json_file, indent=4)

    print(f"List has been stored in {file_path}")


List has been stored in t20_wc_batting_summary.json
List has been stored in t20_wc_bowling_summary.json
