**Importing the necessary libraries**

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date

**Browser-like request headers (to avoid access-permission errors)**

In [2]:
# Browser-like HTTP request headers, kept in one place for the scraping cells.
# The long values are split across adjacent string literals purely for
# readability; the resulting strings are unchanged.
headers = dict([
    (
        "Accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9",
    ),
    ("Accept-Encoding", "gzip, deflate"),
    ("Accept-Language", "en-GB,en-US;q=0.9,en;q=0.8"),
    ("Dnt", "1"),
    ("Upgrade-Insecure-Requests", "1"),
    (
        "User-Agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.97 Safari/537.36",
    ),
])

# 1: Match Summary

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Download the season's match-results page.
# `headers` (defined in the cell above) is sent with the request - presumably
# the purpose of that cell, which was previously defined but never used.
# The timeout bounds the wait, and raise_for_status() stops the cell on an HTTP
# error instead of silently parsing an error page.
url = 'https://www.espncricinfo.com/records/tournament/team-match-results/indian-premier-league-2023-15129'
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Step 2: Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Step 3: Find the table rows and extract the data from the cells
allRows = soup.select('table > tbody > tr')
matchSummary = []

for element in allRows:
    tds = element.find_all('td')
    scorecard_anchor = tds[6].find('a') if len(tds) >= 7 else None
    if scorecard_anchor is None or not scorecard_anchor.get('href'):
        # Not a complete result row (too few cells, or no scorecard link):
        # skip it rather than crash on a missing cell/anchor.
        continue

    matchSummary.append({
        'team1': tds[0].text.strip(),
        'team2': tds[1].text.strip(),
        'winner': tds[2].text.strip(),
        'margin': tds[3].text.strip(),
        'ground': tds[4].text.strip(),
        'matchDate': tds[5].text.strip(),
        'scorecard_link': "https://www.espncricinfo.com" + scorecard_anchor['href']
    })

# Step 4: Keep only matches with a result and order the teams by innings:
#   - won by wickets -> the winner chased, so the winner is team2
#   - won by runs    -> the winner defended, so the winner is team1
newMatchSummary = []

for match in matchSummary:
    margin = match['margin']
    if margin == '-':
        # No margin recorded for this match: excluded, as before
        continue

    winner = match['winner']
    loser = match['team1'] if match['team1'] != winner else match['team2']

    # 'wicket' also matches 'wickets', and 'run' also matches 'runs'
    if 'wicket' in margin:
        finT1, finT2 = loser, winner
    elif 'run' in margin:
        finT1, finT2 = winner, loser
    else:
        # The innings order cannot be derived from this margin. Previously the
        # values left over from the previous match were reused here (or a
        # NameError was raised on the first match); report and skip instead.
        print(f"Skipping {match['team1']} vs {match['team2']}: unrecognised margin {margin!r}")
        continue

    newMatchSummary.append({
        'team1': finT1,
        'team2': finT2,
        'winner': winner,
        'margin': margin,
        'matchDate': match['matchDate'],
        'scorecard_link': match['scorecard_link']
    })

# Step 5: Convert the data into a pandas DataFrame
df_match_summary = pd.DataFrame(newMatchSummary)

In [4]:
# Sanity check of the scrape: print the frame's (rows, columns) shape,
# then display the first rows.
print(df_match_summary.shape)
df_match_summary.head()

(73, 6)


Unnamed: 0,team1,team2,winner,margin,matchDate,scorecard_link
0,Super Kings,Titans,Titans,5 wickets,"Mar 31, 2023",https://www.espncricinfo.com/series/indian-pre...
1,Punjab Kings,KKR,Punjab Kings,7 runs,"Apr 1, 2023",https://www.espncricinfo.com/series/indian-pre...
2,Super Giants,Capitals,Super Giants,50 runs,"Apr 1, 2023",https://www.espncricinfo.com/series/indian-pre...
3,Royals,Sunrisers,Royals,72 runs,"Apr 2, 2023",https://www.espncricinfo.com/series/indian-pre...
4,Mumbai,RCB,RCB,8 wickets,"Apr 2, 2023",https://www.espncricinfo.com/series/indian-pre...


In [5]:
# Display the last rows; the final row's matchDate is a date range ("May 28-29, 2023").
df_match_summary.tail()

Unnamed: 0,team1,team2,winner,margin,matchDate,scorecard_link
68,RCB,Titans,Titans,6 wickets,"May 21, 2023",https://www.espncricinfo.com/series/indian-pre...
69,Super Kings,Titans,Super Kings,15 runs,"May 23, 2023",https://www.espncricinfo.com/series/indian-pre...
70,Mumbai,Super Giants,Mumbai,81 runs,"May 24, 2023",https://www.espncricinfo.com/series/indian-pre...
71,Titans,Mumbai,Titans,62 runs,"May 26, 2023",https://www.espncricinfo.com/series/indian-pre...
72,Titans,Super Kings,Super Kings,5 wickets,"May 28-29, 2023",https://www.espncricinfo.com/series/indian-pre...


In [6]:
# The last row's matchDate is a range ("May 28-29, 2023"); collapse it to a single date.
final_date_fix = {'May 28-29, 2023': 'May 29, 2023'}
df_match_summary['matchDate'] = df_match_summary['matchDate'].replace(final_date_fix)

In [8]:
# Save the match summary to 'match_summary.csv' in the working directory,
# without writing the DataFrame index as a column.
df_match_summary.to_csv('match_summary.csv', index = False)

### Batting Summary

In [9]:
def _batting_records(innings_table, match_info, batting_team, match_date):
    """Build one record per batter from a single innings' batting table.

    Args:
        innings_table: BeautifulSoup tag of one `table.ci-scorecard-table`.
        match_info: "<team1> Vs <team2>" label stored in the "match" field.
        batting_team: team name stored in the "teamInnings" field.
        match_date: value stored in the "matchDate" field.

    Returns:
        A list of dicts, in table order. "battingPos" is the 1-based position
        among the rows that have at least 8 cells (rows with fewer cells are
        ignored, as before).
    """
    batter_rows = [tr for tr in innings_table.select('tbody > tr') if len(tr.select('td')) >= 8]

    records = []
    for position, batter_row in enumerate(batter_rows, start=1):
        tds = batter_row.find_all('td')

        # Dismissal (handled specially): when present, the text sits in a
        # nested <span><span>; otherwise fall back to the cell's own text.
        # The inner lookup is guarded so a single <span> no longer raises.
        outer_span = tds[1].find('span')
        inner_span = outer_span.find('span') if outer_span is not None else None
        if inner_span is not None:
            dismissal = inner_span.text
        else:
            dismissal = tds[1].text.strip()

        # Runs are normally wrapped in <strong>; fall back to the cell text
        runs_tag = tds[2].find('strong')
        runs = runs_tag.text if runs_tag is not None else tds[2].text

        records.append({
            "match": match_info,
            "teamInnings": batting_team,
            "battingPos": position,
            # NOTE(review): only plain spaces are removed, so role markers such
            # as "(c)" / "†" remain in some names (see the saved output below)
            # - presumably cleaned downstream; confirm.
            "batsmanName": tds[0].find('a').find_all('span')[1].text.replace(' ', ''),
            "dismissal": dismissal,
            "runs": runs,
            "balls": tds[3].text,
            "4s": tds[5].text,
            "6s": tds[6].text,
            "SR": tds[7].text,
            "matchDate": match_date
        })
    return records


batting_summary = []

# Step 1: Iterate through each match in the dataframe
for _, match in df_match_summary.iterrows():
    team1 = match['team1']
    team2 = match['team2']
    match_info = f"{team1} Vs {team2}"
    scorecard_link = match['scorecard_link']

    # Step 2: Download the scorecard. Send the shared `headers`, bound the wait
    # with a timeout, and fail on an HTTP error rather than parse an error page.
    response = requests.get(scorecard_link, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.select('div > table.ci-scorecard-table')
    if len(tables) < 2:
        # Both innings are required; report and skip instead of an IndexError
        print(f"Skipping {match_info}: expected 2 batting tables, found {len(tables)}")
        continue

    # Steps 3 and 4: first innings is batted by team1, second innings by team2
    batting_summary.extend(_batting_records(tables[0], match_info, team1, match['matchDate']))
    batting_summary.extend(_batting_records(tables[1], match_info, team2, match['matchDate']))


# Step 5: Create the final DataFrame
batting_summary_df = pd.DataFrame(batting_summary)

In [10]:
# Sanity check: print the (rows, columns) shape, then display the first five batting records.
print(batting_summary_df.shape)

batting_summary_df.head(5)

(1173, 11)


Unnamed: 0,match,teamInnings,battingPos,batsmanName,dismissal,runs,balls,4s,6s,SR,matchDate
0,Super Kings Vs Titans,Super Kings,1,DevonConway,b Mohammed Shami,1,6,0,0,16.66,"Mar 31, 2023"
1,Super Kings Vs Titans,Super Kings,2,RuturajGaikwad,c Shubman Gill b Joseph,92,50,4,9,184.0,"Mar 31, 2023"
2,Super Kings Vs Titans,Super Kings,3,MoeenAli,c †Saha b Rashid Khan,23,17,4,1,135.29,"Mar 31, 2023"
3,Super Kings Vs Titans,Super Kings,4,BenStokes,c †Saha b Rashid Khan,7,6,1,0,116.66,"Mar 31, 2023"
4,Super Kings Vs Titans,Super Kings,5,AmbatiRayudu,b Little,12,12,0,1,100.0,"Mar 31, 2023"


In [11]:
# Display the last five batting records (the end of the last scraped match).
batting_summary_df.tail(5)

Unnamed: 0,match,teamInnings,battingPos,batsmanName,dismissal,runs,balls,4s,6s,SR,matchDate
1168,Titans Vs Super Kings,Super Kings,3,ShivamDube,not out,32,21,0,2,152.38,"May 29, 2023"
1169,Titans Vs Super Kings,Super Kings,4,AjinkyaRahane,c Shankar b Sharma,27,13,2,2,207.69,"May 29, 2023"
1170,Titans Vs Super Kings,Super Kings,5,AmbatiRayudu,c & b Sharma,19,8,1,2,237.5,"May 29, 2023"
1171,Titans Vs Super Kings,Super Kings,6,MSDhoni (c)†,c Miller b Sharma,0,1,0,0,0.0,"May 29, 2023"
1172,Titans Vs Super Kings,Super Kings,7,RavindraJadeja,not out,15,6,1,1,250.0,"May 29, 2023"


In [12]:
# Save the batting summary to 'batting_summary.csv' in the working directory,
# without writing the DataFrame index as a column.

batting_summary_df.to_csv('batting_summary.csv', index = False)

### Bowling Summary

In [13]:
def _bowling_records(bowling_table, match_info, bowling_team, match_date):
    """Build one record per bowler from a single innings' bowling table.

    Args:
        bowling_table: BeautifulSoup tag of one `table.ds-table`.
        match_info: "<team1> Vs <team2>" label stored in the "match" field.
        bowling_team: team name stored in the "bowlingTeam" field.
        match_date: value stored in the "matchDate" field.

    Returns:
        A list of dicts, in table order; rows with fewer than 11 cells are
        ignored, as before.
    """
    records = []
    for bowler_row in bowling_table.select('tbody > tr'):
        tds = bowler_row.find_all('td')
        if len(tds) < 11:
            continue
        records.append({
            "match": match_info,
            "bowlingTeam": bowling_team,
            "bowlerName": tds[0].find('a').find('span').text.replace(' ', ''),
            "overs": tds[1].text,
            "maiden": tds[2].text,
            "runs": tds[3].text,
            "wickets": tds[4].text,
            "economy": tds[5].text,
            "0s": tds[6].text,
            "4s": tds[7].text,
            "6s": tds[8].text,
            "wides": tds[9].text,
            "noBalls": tds[10].text,
            "matchDate": match_date
        })
    return records


bowling_summary = []

# Step 1: Iterate through each match in the dataframe
for _, match in df_match_summary.iterrows():
    team1 = match['team1']
    team2 = match['team2']
    match_info = f"{team1} Vs {team2}"
    scorecard_link = match['scorecard_link']

    # Step 2: Download the scorecard. Send the shared `headers`, bound the wait
    # with a timeout, and fail on an HTTP error rather than parse an error page.
    response = requests.get(scorecard_link, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Among all `table.ds-table` elements, index 1 and index 3 are used as the
    # first- and second-innings bowling tables.
    tables = soup.select('div > table.ds-table')
    if len(tables) < 4:
        # Report and skip instead of an IndexError when the page layout differs
        print(f"Skipping {match_info}: expected at least 4 tables, found {len(tables)}")
        continue

    # Step 3: first innings - team1 bats, so team2 is the bowling team
    bowling_summary.extend(_bowling_records(tables[1], match_info, team2, match['matchDate']))

    # Step 4: second innings - team2 bats, so team1 is the bowling team
    bowling_summary.extend(_bowling_records(tables[3], match_info, team1, match['matchDate']))


# Step 5: Create the final DataFrame
bowling_summary_df = pd.DataFrame(bowling_summary)

In [14]:
# Sanity check: print the (rows, columns) shape, then display the first five bowling records.
print(bowling_summary_df.shape)
bowling_summary_df.head(5)

(892, 14)


Unnamed: 0,match,bowlingTeam,bowlerName,overs,maiden,runs,wickets,economy,0s,4s,6s,wides,noBalls,matchDate
0,Super Kings Vs Titans,Titans,MohammedShami,4,0,29,2,7.25,13,2,2,0,1,"Mar 31, 2023"
1,Super Kings Vs Titans,Titans,HardikPandya,3,0,28,0,9.33,6,2,2,0,0,"Mar 31, 2023"
2,Super Kings Vs Titans,Titans,JoshLittle,4,0,41,1,10.25,10,4,3,0,0,"Mar 31, 2023"
3,Super Kings Vs Titans,Titans,RashidKhan,4,0,26,2,6.5,10,2,1,0,0,"Mar 31, 2023"
4,Super Kings Vs Titans,Titans,AlzarriJoseph,4,0,33,2,8.25,8,0,3,0,0,"Mar 31, 2023"


In [15]:
# Print the shape again, then display the last six bowling records.
print(bowling_summary_df.shape)
bowling_summary_df.tail(6)

(892, 14)


Unnamed: 0,match,bowlingTeam,bowlerName,overs,maiden,runs,wickets,economy,0s,4s,6s,wides,noBalls,matchDate
886,Titans Vs Super Kings,Titans,MohammedShami,3,0,29,0,9.66,5,4,0,0,0,"May 29, 2023"
887,Titans Vs Super Kings,Titans,HardikPandya,1,0,14,0,14.0,1,1,1,1,0,"May 29, 2023"
888,Titans Vs Super Kings,Titans,RashidKhan,3,0,44,0,14.66,2,4,3,0,0,"May 29, 2023"
889,Titans Vs Super Kings,Titans,NoorAhmad,3,0,17,2,5.66,6,0,0,3,0,"May 29, 2023"
890,Titans Vs Super Kings,Titans,JoshLittle,2,0,30,0,15.0,1,0,3,0,0,"May 29, 2023"
891,Titans Vs Super Kings,Titans,MohitSharma,3,0,36,3,12.0,4,2,3,0,0,"May 29, 2023"


In [16]:
# Save the bowling summary to 'bowling_summary.csv' in the working directory,
# without writing the DataFrame index as a column.

bowling_summary_df.to_csv('bowling_summary.csv', index = False)

### Players Links and Data

In [17]:
### Part 1: Collect every player's profile link from the scorecards

def _player_links(table, min_cells, team, nested_name):
    """Collect name/team/profile-link records from one scorecard table.

    Args:
        table: BeautifulSoup tag of a batting or bowling table.
        min_cells: rows with fewer <td> cells than this are ignored.
        team: team name stored with every player found in this table.
        nested_name: True for batting tables, where the name is read from a
            <span> nested inside the link's first <span>; False for bowling
            tables, where the link's first <span> is read directly.

    Returns:
        A list of {"name", "team", "link"} dicts, in table order.
    """
    links = []
    for player_row in table.select('tbody > tr'):
        tds = player_row.find_all('td')
        if len(tds) < min_cells:
            continue
        anchor = tds[0].find('a')
        name_span = anchor.find('span')
        if nested_name:
            name_span = name_span.find('span')
        links.append({
            # NOTE(review): only plain spaces are removed, so role markers such
            # as "†" remain in some names (see the saved output below) -
            # presumably cleaned downstream; confirm.
            "name": name_span.text.replace(' ', ''),
            "team": team,
            "link": "https://www.espncricinfo.com" + anchor['href']
        })
    return links


players_links = []

# Step 1: Iterate through each match in the dataframe
for _, match in df_match_summary.iterrows():
    team1 = match['team1']
    team2 = match['team2']
    scorecard_link = match['scorecard_link']

    # Step 2: Download the scorecard. Send the shared `headers`, bound the wait
    # with a timeout, and fail on an HTTP error rather than parse an error page.
    response = requests.get(scorecard_link, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    batting_tables = soup.select('div > table.ci-scorecard-table')
    all_tables = soup.select('div > table.ds-table')
    if len(batting_tables) < 2 or len(all_tables) < 4:
        # Report and skip instead of an IndexError when the page layout differs
        print(f"Skipping {team1} Vs {team2}: scorecard tables not found as expected")
        continue

    # Step 3: Batting players (team1 bats in the first innings, team2 in the second)
    players_links.extend(_player_links(batting_tables[0], 8, team1, nested_name=True))
    players_links.extend(_player_links(batting_tables[1], 8, team2, nested_name=True))

    # Step 4: Bowling players (team2 bowls in the first innings, team1 in the second);
    # indexes 1 and 3 of the `table.ds-table` list are used as the bowling tables.
    players_links.extend(_player_links(all_tables[1], 11, team2, nested_name=False))
    players_links.extend(_player_links(all_tables[3], 11, team1, nested_name=False))


# Step 5: Create the final DataFrame
players_links_df = pd.DataFrame(players_links)

In [18]:
# Keep only the first row seen for each player name; later repeats are dropped.
is_repeat_name = players_links_df.duplicated(subset=['name'])
players_links_df = players_links_df[~is_repeat_name]

In [19]:
# Sanity check after de-duplication: print the (rows, columns) shape,
# then display the first five players.
print(players_links_df.shape)
players_links_df.head(5)

(328, 3)


Unnamed: 0,name,team,link
0,DevonConway,Super Kings,https://www.espncricinfo.com/cricketers/devon-...
1,RuturajGaikwad,Super Kings,https://www.espncricinfo.com/cricketers/rutura...
2,MoeenAli,Super Kings,https://www.espncricinfo.com/cricketers/moeen-...
3,BenStokes,Super Kings,https://www.espncricinfo.com/cricketers/ben-st...
4,AmbatiRayudu,Super Kings,https://www.espncricinfo.com/cricketers/ambati...


In [20]:
# Print the shape again, then display the last five players. The index keeps
# its pre-deduplication labels, which is why it is not contiguous.
print(players_links_df.shape)
players_links_df.tail(5)

(328, 3)


Unnamed: 0,name,team,link
1973,DarshanNalkande,Titans,https://www.espncricinfo.com/cricketers/darsha...
1989,ChrisJordan,Mumbai,https://www.espncricinfo.com/cricketers/chris-...
2001,MohsinKhan,Super Giants,https://www.espncricinfo.com/cricketers/mohsin...
2024,VishnuVinod †,Mumbai,https://www.espncricinfo.com/cricketers/vishnu...
2028,KumarKartikeya,Mumbai,https://www.espncricinfo.com/cricketers/kumar-...


In [21]:
# Save the de-duplicated player links as an intermediate file
# ('intermediate_players_links.csv'), without the DataFrame index column.

players_links_df.to_csv('intermediate_players_links.csv', index = False)

### Let's get actual Players Data

In [22]:
# Reload the player links from the intermediate CSV written above - presumably
# so the profile scrape can be run without repeating the scorecard scrape.
players_links_df = pd.read_csv('intermediate_players_links.csv')

In [23]:
# Quick check that the reloaded frame has the expected name/team/link columns.
players_links_df.head(2)

Unnamed: 0,name,team,link
0,DevonConway,Super Kings,https://www.espncricinfo.com/cricketers/devon-...
1,RuturajGaikwad,Super Kings,https://www.espncricinfo.com/cricketers/rutura...


In [24]:
def _profile_field(div_elements, label, default="Not Available"):
    """Return the value shown under `label` on a player profile page.

    Looks for the first element in `div_elements` whose <p> text equals
    `label` and returns the stripped text of that element's first <span>.
    Elements without a <p> are skipped: previously they raised inside the
    search, and the broad `except` then reported `default` even when the
    field was present further down the list.

    Args:
        div_elements: iterable of BeautifulSoup tags (the profile grid cells).
        label: heading text to look for, e.g. 'Batting Style'.
        default: value returned when the field or its <span> is missing.

    Returns:
        The field's text, or `default`.
    """
    for div in div_elements:
        heading = div.find('p')
        if heading is None or heading.text.strip() != label:
            continue
        value = div.find('span')
        return value.text.strip() if value is not None else default
    return default


# One record per player, in the order of players_links_df
player_data_list = []

for _, player in players_links_df.iterrows():
    # Download the profile page with the shared `headers` and a timeout.
    # This cell is best-effort per player, so a failed request is reported
    # and the player's fields fall back to "Not Available" instead of
    # aborting the whole run.
    try:
        response = requests.get(player['link'], headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Could not fetch profile for {player['name']}: {err}")
        div_elements = []
    else:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Candidate elements holding the profile details
        div_elements = soup.select('div.ds-grid > div')

    player_data_list.append({
        'name': player['name'],
        'team': player['team'],
        'battingStyle': _profile_field(div_elements, 'Batting Style'),
        'bowlingStyle': _profile_field(div_elements, 'Bowling Style'),
        'playingRole': _profile_field(div_elements, 'Playing Role')
    })


# Create a new dataframe from the player data list
player_info_df = pd.DataFrame(player_data_list)

In [26]:
# Sanity check: print the (rows, columns) shape, then display the first five player profiles.
print(player_info_df.shape)
player_info_df.head(5)

(328, 5)


Unnamed: 0,name,team,battingStyle,bowlingStyle,playingRole
0,DevonConway,Super Kings,Left hand Bat,Right arm Medium,Wicketkeeper Batter
1,RuturajGaikwad,Super Kings,Right hand Bat,Right arm Offbreak,Batter
2,MoeenAli,Super Kings,Left hand Bat,Right arm Offbreak,Batting Allrounder
3,BenStokes,Super Kings,Left hand Bat,Right arm Fast medium,Allrounder
4,AmbatiRayudu,Super Kings,Right hand Bat,Right arm Offbreak,Middle order Batter


In [27]:
# Display the last five player profiles; missing fields show as "Not Available".
player_info_df.tail(5)

Unnamed: 0,name,team,battingStyle,bowlingStyle,playingRole
323,DarshanNalkande,Titans,Right hand Bat,Right arm Fast medium,Bowler
324,ChrisJordan,Mumbai,Right hand Bat,Right arm Fast medium,Bowler
325,MohsinKhan,Super Giants,Left hand Bat,Left arm Medium fast,Bowler
326,VishnuVinod †,Mumbai,Right hand Bat,Not Available,Wicketkeeper Batter
327,KumarKartikeya,Mumbai,Right hand Bat,"Slow Left arm Orthodox, Left arm Wrist spin",Bowler


In [28]:
# Save the player profiles to 'players_data.csv' in the working directory,
# without writing the DataFrame index as a column.
player_info_df.to_csv('players_data.csv', index = False)