In [195]:
import pickle
import re
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# # One-off scrape (disabled): build the list of historical Test match scorecard links

# list_of_scorecards = []
# for i in range(1, 1000):
    
#     url = f"https://stats.espncricinfo.com/ci/engine/stats/index.html?class=1;page={i};template=results;type=team;view=results"
#     try:
#         r = requests.get(url)
#         if r.status_code == 404:
#             raise "PageNotFound"
#         else:
#             soup = BeautifulSoup(r.text, 'html.parser')
#             html = soup.findAll('a', string='Match scorecard')
#             list_of_scorecards.append(
#                 [a['href'] for a in html]
#             )
#     except:
#         print(f"Page {i} not found, ending this!")
#         break

# list_of_scorecards = [f"https://stats.espncricinfo.com{item}" for sublist in list_of_scorecards for item in sublist]

In [4]:
# with open("list_of_test_links.txt", "wb") as fp:   #Pickling
#     pickle.dump(list_of_scorecards, fp)

In [5]:
# Load the cached list of historical Test match scorecard links.
# NOTE: despite the .txt extension this file is a pickle; only unpickle files
# you produced yourself, since unpickling can execute arbitrary code.
# NOTE(review): the disabled dump cell above writes "list_of_test_links.txt"
# (no "data/" prefix) -- confirm the file was moved into data/ by hand.
with open("data/list_of_test_links.txt", "rb") as links_file:
    list_of_scorecards = pickle.load(links_file)

In [90]:
# Sanity check: display the last scorecard link in the loaded list.
list_of_scorecards[-1]

'https://stats.espncricinfo.com/ci/engine/match/1243017.html'

In [204]:
def clean_names(string):
    """Strip scorecard decorations from a player-name string.

    Every non-breaking space (U+00A0), every '(c)' marker and every '†'
    marker is deleted (presumably the captain / wicket-keeper flags --
    TODO confirm), then leading and trailing whitespace is trimmed.
    """
    for marker in ('\xa0', '(c)', '†'):
        string = string.replace(marker, '')
    return string.strip()

def get_batsmen_names(batting_df):
    """Return the cleaned names of every player listed on a batting card.

    Parameters
    ----------
    batting_df : pandas.DataFrame
        Batting table with a ``BATSMEN`` column. It must contain a row whose
        ``BATSMEN`` value includes 'Extras'; rows above it are treated as the
        players who batted. The index is assumed to be the default integer
        range (the label arithmetic below relies on it).

    Returns
    -------
    pandas.Series
        Cleaned names of the players who batted, followed by the names parsed
        from the first 'Did not bat' row (if any), with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If no row contains 'Extras'.
    """
    names = batting_df.BATSMEN

    extras_index = names[names.str.contains('Extras')].index
    # .loc slices by label and includes the end label, hence the "- 1" to
    # stop on the row just above 'Extras'.
    did_bat = names.loc[:extras_index[0] - 1].apply(clean_names)

    dnb_rows = names[names.str.contains('Did not bat')]
    if dnb_rows.empty:
        return did_bat.reset_index(drop=True)

    dnb_text = clean_names(dnb_rows.iloc[0].replace('Did not bat:', ''))
    # Strip each name individually: splitting "A, B" on ',' leaves " B", which
    # would never match a cleaned (stripped) name used as a lookup key.
    dnb_names = [name.strip() for name in dnb_text.split(',') if name.strip()]
    did_not_bat = pd.Series(dnb_names, dtype=object)

    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([did_bat, did_not_bat]).reset_index(drop=True)

def get_bowler_names(bowling_df):
    """Return the ``BOWLING`` column of a bowling card with every name cleaned.

    Each entry is passed through ``clean_names``; the original index is kept.
    """
    return bowling_df.BOWLING.apply(clean_names)

class PlayerNotFoundError(Exception):
    """Raised when a player's statistics page responds with HTTP 404."""


def get_player_table(player_id, table_type='batting'):
    """Download a player's innings-by-innings statistics table.

    Parameters
    ----------
    player_id : str or int
        Player identifier interpolated into the stats URL.
    table_type : str
        Value of the ``type`` query parameter; the notebook uses 'batting'
        and 'bowling'.

    Returns
    -------
    pandas.DataFrame
        The parsed table with its 'Start Date' column converted to datetimes.

    Raises
    ------
    PlayerNotFoundError
        If the page responds with HTTP 404.
    requests.HTTPError
        For any other 4xx/5xx response.
    """
    url = f"https://stats.espncricinfo.com/ci/engine/player/{player_id}.html?class=1;template=results;type={table_type};view=innings"
    # A timeout prevents one stalled connection from hanging the whole scrape.
    r = requests.get(url, timeout=30)
    if r.status_code == 404:
        raise PlayerNotFoundError(f"No stats page for player {player_id!r}: {url}")
    # Surface other HTTP errors here rather than as an IndexError further down.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')
    tables = soup.findAll('table', class_='engineTable')
    # The innings list is taken to be the fourth 'engineTable' on the page.
    # read_html is given a file-like object because passing literal HTML
    # strings is deprecated in recent pandas.
    table = pd.read_html(StringIO(str(tables[3])))[0]
    table['Start Date'] = pd.to_datetime(table['Start Date'], format='%d %b %Y')
    return table

def get_recent_batting_avg(player_number, num_innings=8, before=None):
    """Batting average over a player's most recent innings before a date.

    Parameters
    ----------
    player_number : str or int
        Player id forwarded to ``get_player_table``.
    num_innings : int
        Number of most recent batted innings to average over.
    before : datetime-like, optional
        Only innings whose 'Start Date' is strictly earlier are considered.
        When omitted, the notebook-level ``match_date`` (assigned by the
        scraping loop) is used, which keeps existing one-argument calls working.

    Returns
    -------
    float
        Runs scored divided by the number of dismissals over those innings.
        NaN when the player has ``num_innings`` or fewer qualifying innings,
        or was not dismissed in any of them.
    """
    cutoff = match_date if before is None else before
    innings = get_player_table(player_number, 'batting')

    # Strip the '*' suffix and coerce to numbers. regex=False keeps '*' literal
    # on every pandas version, astype(str) copes with a column that was parsed
    # as numeric, and errors='coerce' turns 'DNB' and any other non-numeric
    # marker into NaN instead of raising.
    runs = pd.to_numeric(
        innings['Runs'].astype(str).str.replace('*', '', regex=False),
        errors='coerce',
    )
    innings = innings.assign(_runs=runs)
    batted = innings[(innings['Start Date'] < cutoff) & innings['_runs'].notna()]

    # NOTE(review): strictly MORE than num_innings innings are required (kept
    # from the original) -- confirm whether ">=" was intended.
    if len(batted) <= num_innings:
        return np.nan

    # Takes the last rows; assumes the table is in chronological order -- TODO confirm.
    recent = batted.iloc[-num_innings:]
    not_out = recent['Dismissal'].astype(str).str.contains('not out', regex=False)
    outs = int((~not_out).sum())
    if outs == 0:
        # No dismissals: the average is undefined, so avoid dividing by zero.
        return np.nan
    return recent['_runs'].sum() / outs
    
def get_recent_bowling_avg(player_number, num_innings=8, before=None):
    """Bowling average over a player's most recent innings before a date.

    Parameters
    ----------
    player_number : str or int
        Player id forwarded to ``get_player_table``.
    num_innings : int
        Number of most recent bowled innings to average over.
    before : datetime-like, optional
        Only innings whose 'Start Date' is strictly earlier are considered.
        When omitted, the notebook-level ``match_date`` (assigned by the
        scraping loop) is used, which keeps existing one-argument calls working.

    Returns
    -------
    float
        Runs conceded divided by wickets taken over those innings. NaN when
        the player has ``num_innings`` or fewer qualifying innings, or took
        no wickets in them.
    """
    cutoff = match_date if before is None else before
    spells = get_player_table(player_number, 'bowling')

    # Keep only rows with a numeric 'Overs' value; this drops 'DNB' and any
    # other non-numeric marker that would break the float conversion below.
    overs = pd.to_numeric(spells['Overs'], errors='coerce')
    bowled = spells[(spells['Start Date'] < cutoff) & overs.notna()]

    # NOTE(review): strictly MORE than num_innings innings are required (kept
    # from the original) -- confirm whether ">=" was intended.
    if len(bowled) <= num_innings:
        return np.nan

    # Takes the last rows; assumes the table is in chronological order -- TODO confirm.
    recent = bowled.iloc[-num_innings:]
    recent_wkts = pd.to_numeric(recent['Wkts'], errors='coerce').sum()
    recent_runs = pd.to_numeric(recent['Runs'], errors='coerce').sum()
    if recent_wkts == 0:
        # No wickets: the average is undefined, so avoid dividing by zero.
        return np.nan
    return recent_runs / recent_wkts

In [None]:


# Scrape each scorecard and, per innings, collect:
#   batting_data -> array of recent batting averages for the batting side
#   bowling_data -> array of recent bowling averages for the bowlers used
#   match_data   -> [team total, innings number, innings status, scorecard URL]
# The three lists are appended together, so they stay aligned row for row.
batting_data = []
bowling_data = []
match_data = []

# NOTE(review): the first 200 links are skipped -- the reason is not recorded here.
for match_number in list_of_scorecards[200:]:

    # ---- match-level parsing --------------------------------------------
    # A page that cannot be fetched or parsed is reported and skipped so one
    # bad match does not abort the whole run.
    try:
        r = requests.get(match_number, timeout=30)
        r.raise_for_status()

        soup = BeautifulSoup(r.text, 'html.parser')

        name_number_links_list = soup.findAll('a', class_="small")

        # Cleaned player name -> player id (last path component of the link).
        name_to_number_dict = {
            clean_names(i.text): i['href'].split('/')[-1].replace('.html', '')
            for i in name_number_links_list
        }

        # Tokens 1, 0 and the last token of the date text are rearranged into
        # "day month year" to match the '%d %b %Y' format below.
        date_elem = soup.findAll('div', class_='description')[3].text.split(',')[2].strip().split(' ')
        date_string = ' '.join([date_elem[i] for i in [1, 0, len(date_elem) - 1]])
        # match_date is read as a global by the recent-average helpers.
        match_date = datetime.strptime(date_string, '%d %b %Y')

        innings_html = soup.findAll('div', class_="card content-block match-scorecard-table")
    except Exception as exc:
        print(f"Failed for match {match_number}: {exc!r}")
        continue

    # Iterate only over innings that exist (at most four) instead of indexing
    # blindly and reporting a failure for every innings that was never played.
    for inn, innings_block in enumerate(innings_html[:4]):

        try:
            tables = innings_block.findAll('table')

            # batting info
            batting = pd.read_html(StringIO(str(tables[0])))[0]
            batting = batting.dropna(axis=1, how='all').dropna(axis=0, how='all')
            batting = batting.reset_index(drop=True)

            batsmen_player_numbers = get_batsmen_names(batting).replace(name_to_number_dict)

            # batsmen's averages for innings
            bat_avgs = batsmen_player_numbers.apply(get_recent_batting_avg)

            # bowling info
            bowling = pd.read_html(StringIO(str(tables[1])))[0]

            bowl_player_names = get_bowler_names(bowling).replace(name_to_number_dict)

            # bowl avg for innings
            bowl_avgs = bowl_player_names.apply(get_recent_bowling_avg)

            # innings total: the part before any '/' in the TOTAL row. str()
            # guards against the 'R' column having been parsed as numeric.
            get_total = batting[batting['BATSMEN'] == 'TOTAL']['R'].values[0]
            team_tot = float(str(get_total).split('/')[0])

            inn_info = innings_block.findAll('h5', class_="header-title label")[0].text.strip()

            # get innings info -- reset first so values from the previous
            # innings can never leak into this record.
            innings_num = None
            innings_status = None
            if '1st' in inn_info:
                innings_num = 1
                innings_status = 'complete'
            elif '2nd' in inn_info:
                innings_num = 2
                if 'target' in inn_info:
                    innings_status = 'incomplete'
                else:
                    innings_status = 'complete'
            else:
                raise ValueError(f"Unrecognised innings header: {inn_info!r}")

            batting_data.append(bat_avgs.to_numpy())
            bowling_data.append(bowl_avgs.to_numpy())
            # NOTE: np.array over mixed float/int/str values stores every
            # field as a string; kept as is because later cells may rely on it.
            match_data.append(
                np.array(
                    [
                        team_tot,
                        innings_num,
                        innings_status,
                        match_number
                    ]
                ),
            )
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts
            # the run, and report the cause so failures can be diagnosed.
            print(f"Failed for innings {inn}, match {match_number}: {exc!r}")




  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


In [245]:
# Tabulate the collected innings records; columns 0-3 hold the team total,
# innings number, innings status and scorecard URL, in that order.
pd.DataFrame(match_data)

Unnamed: 0,0,1,2,3
0,220.0,1,complete,https://stats.espncricinfo.com/ci/engine/match...
1,378.0,1,complete,https://stats.espncricinfo.com/ci/engine/match...
2,245.0,2,complete,https://stats.espncricinfo.com/ci/engine/match...
3,90.0,2,incomplete,https://stats.espncricinfo.com/ci/engine/match...
