## References:

1. https://www.espncricinfo.com
2. https://medium.com/swlh/web-scraping-cricinfo-data-c134fce79a33

In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
class IPLDataScrapper:
    """Scrape one IPL season's scorecards from ESPNcricinfo and save them to disk.

    ``scrape`` visits every match linked from the season results page and
    writes, per match, a directory containing ``meta_data.json``,
    ``batsman_df.csv`` and ``bowler_df.csv``.

    Args:
        year: Season to scrape; must lie between 2010 and 2020 inclusive.
        data_dir: Root directory under which ``<year>/<match name>/`` folders
            are created.

    Raises:
        ValueError: If ``year`` is outside the supported range.
    """

    def __init__(self, year, data_dir='Data'):
        if not 2010 <= year <= 2020:
            raise ValueError("Scrapper is defined only for the seasons from 2010 to 2020")

        self.domain = 'https://www.espncricinfo.com'
        self.data_dir = data_dir
        self.year = year
        # The 2014 and 2015 seasons are addressed with a different URL slug.
        if 2014 <= year <= 2015:
            slug = 'pepsi-indian-premier-league'
        else:
            slug = 'indian-premier-league'
        self.season_url = '{}/scores/series/8048/season/{}/{}?view=results'.format(
            self.domain, year, slug)

    @staticmethod
    def _combine_innings(inning_frames, columns):
        """Stack per-inning frames; return an empty frame with ``columns`` if there are none."""
        if not inning_frames:
            return pd.DataFrame(columns=columns)
        return pd.concat(inning_frames)

    def __extract_match_urls(self):
        """Return the absolute URL of every match linked from the season results page.

        Raises:
            ValueError: If the season page does not answer with HTTP 200.
        """
        season_page = requests.get(self.season_url)
        if season_page.status_code != 200:
            raise ValueError("Response status code: {}".format(season_page.status_code))

        soup = BeautifulSoup(season_page.content, 'html.parser')
        match_urls = []
        for match in soup.find_all(class_='col-md-8 col-16'):
            link = match.find('a', href=True)
            # A card without a link cannot be scraped; skip it instead of
            # failing on ``None['href']``.
            if link is None:
                continue
            match_urls.append(self.domain + link['href'])
        return match_urls

    def __extract_batsman_data(self, soup):
        """Build the batting scorecard of a match as a DataFrame.

        Args:
            soup: Parsed match page.

        Returns:
            One row per batsman (plus an ``Extras`` row and one row per
            "Did not bat" player) with an ``inning`` column numbering the
            batting tables in page order from 1. When the page has no batting
            table, an empty frame with only the batting columns is returned.
        """
        columns = ['name', 'wicket', 'runs', 'balls', 'duration', 'fours', 'sixes', 'strike_rate']
        did_not_bat_prefix = 'Did not bat: '

        inning_frames = []
        for inning, batsman_table in enumerate(soup.find_all(class_="table batsman"), start=1):
            rows = batsman_table.find_all('tr')
            batsman_list = []
            # Skip the first row and then read every second row; presumably the
            # rows in between carry no scorecard data -- confirm against the
            # page markup.
            for batsman_row in rows[1::2]:
                cells = [cell.text.strip() for cell in batsman_row.find_all('td')]
                if not cells:
                    # Row without data cells: nothing to record.
                    continue

                if cells[0] == 'Extras':
                    batsman_list.append(['Extras', 'Extras', cells[2], '0', '0', '0', '0', '0'])
                elif len(cells) > 7:
                    batsman_list.append(cells)
                elif cells[0].startswith(did_not_bat_prefix):
                    # A single cell lists all players who did not bat, comma separated.
                    names = cells[0][len(did_not_bat_prefix):].split(',')
                    for name in names:
                        batsman_list.append([name.strip(), 'Did not bat', '0', '0', '0', '0', '0', '0'])

            inning_df = pd.DataFrame(batsman_list, columns=columns)
            inning_df['inning'] = inning
            inning_frames.append(inning_df)

        # Every table is kept, so pages with more than two batting tables are
        # no longer reduced to the last table without an inning number.
        return self._combine_innings(inning_frames, columns)

    def __extract_bowler_data(self, soup):
        """Build the bowling scorecard of a match as a DataFrame.

        Args:
            soup: Parsed match page.

        Returns:
            One row per bowler with an ``inning`` column numbering the bowling
            tables in page order from 1. When the page has no bowling table,
            an empty frame with only the bowling columns is returned.
        """
        columns = ['name', 'overs', 'maidens', 'runs', 'wickets', 'economy', 'dots', 'fours', 'sixes', 'wides', 'no_balls']

        inning_frames = []
        for inning, bowler_table in enumerate(soup.find_all(class_="table bowler"), start=1):
            bowler_list = []
            # The first row is skipped (treated as the table header).
            for bowler_row in bowler_table.find_all('tr')[1:]:
                cells = [cell.text.strip() for cell in bowler_row.find_all('td')]
                # Rows without data cells would only produce all-empty records.
                if cells:
                    bowler_list.append(cells)

            inning_df = pd.DataFrame(bowler_list, columns=columns)
            inning_df['inning'] = inning
            inning_frames.append(inning_df)

        return self._combine_innings(inning_frames, columns)

    def __extract_meta_data(self, soup, match_id, match_url):
        """Collect the match-level facts of a scorecard page into a dict.

        Args:
            soup: Parsed match page.
            match_id: Running number of the match within the season.
            match_url: URL the page was fetched from.

        Returns:
            A dict with the keys ``year``, ``match_id``, ``match_name``,
            ``match_url``, ``location``, ``stadium``, ``date``, ``toss``,
            ``team_1``, ``team_2``, ``team_1_score``, ``team_2_score``,
            ``team_2_final_status``, ``best_player``, ``summary`` and
            ``points``. Optional values that are missing on the page are
            stored as ``'Not Found'``.
        """
        # The header text must consist of exactly four comma-separated parts;
        # the last one is unused.
        match_name, location, date, _ = soup.find(class_='desc text-truncate').get_text().split(',')
        # '/' is replaced because the name is later used as a directory name.
        match_name = match_name.replace('/', ' and ').strip()

        team_names = [
            team.find('span').get('title').strip()
            for team in soup.find_all('a', class_='team-name')
        ]
        score_runs = [score.text.strip() for score in soup.find_all('div', class_='score-run')]

        match_details_rows = soup.find(class_="w-100 table match-details-table").find_all('tr')
        # Rows after the first are read as key/value pairs; the first row
        # holds the stadium link.
        match_details = {}
        for details_row in match_details_rows[1:]:
            cells = details_row.find_all('td')
            match_details[cells[0].get_text().strip()] = cells[1].get_text().strip()

        final_status = soup.find('div', class_='score-extra-score')
        best_player = soup.find(class_='best-player-name')

        meta_data = {}
        meta_data['year'] = self.year
        meta_data['match_id'] = match_id
        meta_data['match_name'] = match_name
        meta_data['match_url'] = match_url
        meta_data['location'] = location.strip()
        meta_data['stadium'] = match_details_rows[0].find('a').get_text().strip()
        meta_data['date'] = date.strip()
        meta_data['toss'] = match_details.get('Toss', 'Not Found')
        meta_data['team_1'] = team_names[0]
        meta_data['team_2'] = team_names[1]
        meta_data['team_1_score'] = score_runs[0]
        meta_data['team_2_score'] = score_runs[1]
        if final_status is not None:
            meta_data['team_2_final_status'] = final_status.get_text().strip()
        else:
            meta_data['team_2_final_status'] = 'Not Found'
        # A missing best player must not overwrite 'team_2_final_status'; it
        # gets its own 'Not Found' entry.
        if best_player is not None:
            meta_data['best_player'] = best_player.find('a').get_text().strip()
        else:
            meta_data['best_player'] = 'Not Found'
        meta_data['summary'] = soup.find('div', class_='summary').get_text().strip()
        meta_data['points'] = match_details.get('Points', 'Not Found')

        return meta_data

    def scrape(self):
        """Download every match of the season and write its data below ``data_dir``.

        Sets ``self.match_urls`` and ``self.season_dir`` as a side effect.

        Raises:
            ValueError: If the season page or a match page does not answer
                with HTTP 200.
        """
        # The listing order is reversed before numbering the matches;
        # presumably the results page shows the latest match first -- confirm.
        self.match_urls = self.__extract_match_urls()[::-1]
        self.season_dir = os.path.join(self.data_dir, str(self.year))
        # makedirs also creates a missing ``data_dir``.
        os.makedirs(self.season_dir, exist_ok=True)

        numbered_urls = enumerate(self.match_urls, start=1)
        for match_id, match_url in tqdm(numbered_urls, desc="Matches", leave=False, total=len(self.match_urls)):
            match_page = requests.get(match_url)
            if match_page.status_code != 200:
                # Fail with a clear message instead of an obscure parsing
                # error on an error page.
                raise ValueError("Response status code: {} for {}".format(match_page.status_code, match_url))
            soup = BeautifulSoup(match_page.content, 'html.parser')

            meta_data = self.__extract_meta_data(soup, match_id, match_url)
            batsman_df = self.__extract_batsman_data(soup)
            bowler_df = self.__extract_bowler_data(soup)

            match_dir = os.path.join(self.season_dir, meta_data['match_name'])
            os.makedirs(match_dir, exist_ok=True)

            with open(os.path.join(match_dir, 'meta_data.json'), 'w') as fp:
                json.dump(meta_data, fp, sort_keys=False, indent=4)
            batsman_df.to_csv(os.path.join(match_dir, 'batsman_df.csv'), index=False)
            bowler_df.to_csv(os.path.join(match_dir, 'bowler_df.csv'), index=False)


In [3]:
#for year in tqdm(range(2010, 2020), 'Seasons'):
#    scrapper = IPLDataScrapper(year=year)
#    scrapper.scrape()

In [4]:
# Scrape the 2020 season; output goes below the default 'Data' directory.
scrapper = IPLDataScrapper(year=2020)
scrapper.scrape()

HBox(children=(FloatProgress(value=0.0, description='Matches', max=7.0, style=ProgressStyle(description_width=…