#Web Scrapping

In [103]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

#Getting html page
url = "https://www.howstat.com/Cricket/Statistics/Series/SeriesListMenu.asp#IPLs" 
data = requests.get(url) 
soup = BeautifulSoup(data.text,'html.parser') 

#Getting all seasons links
season_links = []
for a_tag in soup.find_all('a', href=True):
    href = a_tag['href']
    if '/IPL/SeriesMatches' in href:
        season_links.append(href)

#Sparsing links to form absolute links 
season_links_abs = [f"https://www.howstat.com/Cricket/Statistics/{season_link.split('../')[-1]}" for season_link in season_links]
season_year_map = {
    '2008': '2008',
    '2009': '2009',
    '2010': '2010',
    '2011': '2011',
    '2012': '2012',
    '2013': '2013',
    '2014': '2014',
    '2015': '2015',
    '2016': '2016',
    '2017': '2017',
    '2018': '2018',
    '2019': '2019',
    '2020': '2020',
    '2021': '2021',
    '2022': '2022',
    '2023': '2023',
}
#Getting 2008 html page
all_matches_data = []
for season_url in season_links_abs[1:]:
    data_matches = requests.get(season_url)
    soup_01 = BeautifulSoup(data_matches.text, 'html.parser')
    season_year = next((year for year in season_year_map if year in season_url), None)
    matches_data = []
    table = soup_01.find('table',class_='TableLined')
    rows = table.find_all('tr')[1:]  # Skipping the header row

    if table:
        rows = table.find_all('tr')[1:]
        for row in rows:
            cols = row.find_all('td')

            if len(cols) >= 6:  # Ensure the row has enough data
                match = cols[0].get_text(strip=True)
                date = cols[1].get_text(strip=True)
                teams = cols[2].get_text(strip=True)
                ground = cols[3].get_text(strip=True)
                result = cols[4].get_text(strip=True)
                scorecard_link_tag = cols[5].find('a', href=True)
                scorecard_link = scorecard_link_tag['href'] if scorecard_link_tag else None
            
                # Append the match data to the list
                matches_data.append([match, date, teams, ground, result, scorecard_link,season_year])
    all_matches_data.extend(matches_data)

df = pd.DataFrame(all_matches_data, columns=['Match', 'Date', 'Teams', 'Ground', 'Result','Scorecard','Season'])

df['ID'] = range(1, len(df)+1)

df = df[['ID', 'Match', 'Date', 'Teams', 'Ground', 'Result', 'Scorecard', 'Season']]

df.to_csv('ipl_matches.csv', index=False)

#Cleaning


In [104]:
import pandas as pd
file = 'ipl_matches.csv'
df = pd.read_csv(file)

df[['Winner', 'Margin']] = df['Result'].str.extract(r'(.+) won by (.+)', expand=True)
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Day_of_Week'] = df['Date'].dt.day_name()
df['Month'] = df['Date'].dt.month_name()
df['Year'] = df['Date'].dt.year

df[['Team1', 'Team2']] = df['Teams'].str.split(' v ', expand=True)

df.to_csv(file, index=False)

#Scraping Scorecard Data

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "https://www.howstat.com/Cricket/Statistics/IPL/"

file = "ipl_matches.csv"
df = pd.read_csv(file)

def get_scorecard_data(scorecard_url):
    full_url = base_url+scorecard_url
    response = requests.get(full_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    batting_data=[]
    bowling_data=[]

    batting_section_1 = soup.find('table', class_='ScorecardMain')
    if batting_section_1:
        batting_row = batting_section_1.find_all('tr')[2:]
        for row in batting_row:
            cols = row.find_all('td')
            if len(cols) >= 8:
                player_name = cols[0].get_text(strip=True)
                dismissal = cols[1].get_text(strip=True)
                runs = cols[2].get_text(strip=True)
                balls = cols[3].get_text(strip=True)
                fours = cols[4].get_text(strip=True)
                sixes = cols[5].get_text(strip=True)
                sr = cols[6].get_text(strip=True)
                percentage = cols[7].get_text(strip=True)

                bowler = None
                if 'b' in dismissal:
                    bowler = dismissal.split('b')[1].strip()
                
                batting_data.append({
                    'Player': player_name,
                    'Runs': runs,
                    'Balls': balls,
                    '4s': fours,
                    '6s': sixes,
                    'SR': sr,
                    'Bowler': bowler,
                    'MatchID': None,  
                    'Team': "Team 1"  
                })

    bowling_section_1 = soup.find_all('table', class_='ScorecardBowling')
    if bowling_section_1:
        bowling_table = bowling_section_1[0]
        bowling_rows = bowling_table.find_all('tr')[2:]
        for row in bowling_rows:
            cols = row.find_all('td')
            if len(cols) >= 7:
                player_name = cols[0].get_text(strip=True)
                overs = cols[1].get_text(strip=True)
                runs = cols[2].get_text(strip=True)
                wickets = cols[3].get_text(strip=True)
                economy_rate = cols[4].get_text(strip=True)
                percentage_wickets = cols[6].get_text(strip=True)
                
                bowling_data.append({
                    'Player': player_name,
                    'Overs': overs,
                    'Runs': runs,
                    'Wickets': wickets,
                    'ER': economy_rate,
                    'MatchID': None, 
                    'Team': "Team 2"  
                })
    batting_section_2 = soup.find_all('table', class_='ScorecardMain')[1]  # Access the second table (Team 2)
    if batting_section_2:
        batting_rows_2 = batting_section_2.find_all('tr')[2:]  # Skipping the header row
        for row in batting_rows_2:
            cols = row.find_all('td')
            if len(cols) >= 8:
                player_name = cols[0].get_text(strip=True)
                dismissal = cols[1].get_text(strip=True)
                runs = cols[2].get_text(strip=True)
                balls = cols[3].get_text(strip=True)
                fours = cols[4].get_text(strip=True)
                sixes = cols[5].get_text(strip=True)
                sr = cols[6].get_text(strip=True)
                percentage = cols[7].get_text(strip=True)

                bowler = None
                if 'b' in dismissal:
                    bowler = dismissal.split('b')[1].strip()

                # Append batting data for Team 2
                batting_data.append({
                    'Player': player_name,
                    'Runs': runs,
                    'Balls': balls,
                    '4s': fours,
                    '6s': sixes,
                    'SR': sr,
                    'Bowler': bowler,
                    'MatchID': None,  
                    'Team': "Team 2"  
                })

    bowling_section_2 = soup.find_all('table', class_='ScorecardBowling')[1]  # Access the second table (Team 2)
    if bowling_section_2:
        bowling_rows_2 = bowling_section_2.find_all('tr')[2:]  # Skipping the header row
        for row in bowling_rows_2:
            cols = row.find_all('td')
            if len(cols) >= 7:
                player_name = cols[0].get_text(strip=True)
                overs = cols[1].get_text(strip=True)
                runs = cols[2].get_text(strip=True)
                wickets = cols[3].get_text(strip=True)
                economy_rate = cols[4].get_text(strip=True)
                percentage_wickets = cols[6].get_text(strip=True)

                # Append bowling data for Team 2
                bowling_data.append({
                    'Player': player_name,
                    'Overs': overs,
                    'Runs': runs,
                    'Wickets': wickets,
                    'ER': economy_rate,
                    'MatchID': None, 
                    'Team': "Team 1"  
                })

    return batting_data, bowling_data

all_batting_data = []
all_bowling_data = []

for index, row in df.iterrows():
    scorecard_url = row['Scorecard']
    if scorecard_url:  # Only process if the scorecard link exists
        batting_data, bowling_data = get_scorecard_data(scorecard_url)
        
        # Fill in match details for batting data
        for player_data in batting_data:
            player_data['MatchID'] = row['ID']
            player_data['Match'] = row['Match']
            player_data['Date'] = row['Date']
            player_data['Teams'] = row['Teams']
            all_batting_data.append(player_data)
        
        # Fill in match details for bowling data
        for player_data in bowling_data:
            player_data['MatchID'] = row['ID']
            player_data['Match'] = row['Match']
            player_data['Date'] = row['Date']
            player_data['Teams'] = row['Teams']
            all_bowling_data.append(player_data)
    


# Convert the collected batting and bowling data into DataFrames
batting_df = pd.DataFrame(all_batting_data)
bowling_df = pd.DataFrame(all_bowling_data)

batting_df.to_csv('ipl_batting_data.csv', index=False)
bowling_df.to_csv('ipl_bowling_data.csv', index=False)

Unnamed: 0,Player,Overs,Runs,Wickets,ER,MatchID,Team,Match,Date,Teams
0,Zaheer Khan,4.0,0,38,1,1,Team 2,1,2008-04-18,Royal Challengers Bengaluru v Kolkata Knight R...
1,Ashley Noffke,4.0,0,40,1,1,Team 2,1,2008-04-18,Royal Challengers Bengaluru v Kolkata Knight R...
2,Jacques Kallis,4.0,0,48,1,1,Team 2,1,2008-04-18,Royal Challengers Bengaluru v Kolkata Knight R...
3,Sunil Joshi,3.0,0,26,0,1,Team 2,1,2008-04-18,Royal Challengers Bengaluru v Kolkata Knight R...
4,Cameron White,1.0,0,24,0,1,Team 2,1,2008-04-18,Royal Challengers Bengaluru v Kolkata Knight R...
...,...,...,...,...,...,...,...,...,...,...
5501,Deepak Chahar,2.0,0,23,0,1122,Team 2,16,2025-04-04,Lucknow Super Giants v Mumbai Indians
5502,Ashwani Kumar,3.0,0,39,1,1122,Team 2,16,2025-04-04,Lucknow Super Giants v Mumbai Indians
5503,Mitchell Santner,4.0,0,46,0,1122,Team 2,16,2025-04-04,Lucknow Super Giants v Mumbai Indians
5504,Vignesh Puthur,4.0,0,31,1,1122,Team 2,16,2025-04-04,Lucknow Super Giants v Mumbai Indians
