In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Load resources/people.csv into a DataFrame. Its 'identifier' and 'key_cricinfo'
# columns are the ones read further down in this notebook.
players_df = pd.read_csv('resources/people.csv')

In [None]:
# One dict per row of players_df, holding only 'identifier' and 'key_cricinfo'.
# Missing values are replaced with 0 first, so a row without a Cricinfo key
# ends up with key_cricinfo == 0.
cricinfo_ids = players_df[['identifier','key_cricinfo']].fillna(0).to_dict('records')

In [None]:
# HTTP headers sent with every request made by get_player_career_averages.
# The User-Agent is a desktop Safari string - presumably so the site treats the
# scraper like a regular browser (TODO confirm this is actually required).
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                         "AppleWebKit/605.1.15 (KHTML, like Gecko) "
                         "Version/15.4 Safari/605.1.15"}

In [None]:
def to_float(x):
    """Convert ``x`` to a float, falling back to 0 when it cannot be converted.

    Args:
        x: Any value; in this notebook it is a cell read from a scraped table.

    Returns:
        ``float(x)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures mean "no value". A bare ``except``
        # would also swallow KeyboardInterrupt / SystemExit and make a long
        # scraping run impossible to interrupt cleanly.
        return 0

In [None]:
def _fetch_stats_row(player_id, discipline, timeout):
    """Return the first row of the Statsguru table for one discipline, or None.

    ``discipline`` is the value placed in the ``type=`` query parameter
    (``'batting'`` or ``'bowling'``). None is returned when the request fails,
    the response is not HTTP 200, the page has fewer than three tables, the
    table reports no records, or the parsed table has no rows.
    """
    import io  # local import: only needed to hand the HTML to pd.read_html

    url = (f"https://stats.espncricinfo.com/ci/engine/player/{player_id}.html"
           f"?class=3;template=results;type={discipline};view=cumulative;wrappertype=print")

    try:
        # Without a timeout a single stalled connection would block the whole scrape.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    tables = BeautifulSoup(response.text, 'html.parser').find_all('table')
    if len(tables) < 3:
        # Unexpected page layout: the statistics are read from the third table.
        return None

    stats_table = tables[2]
    if 'No records available to match this query' in stats_table.text:
        return None

    # Newer pandas versions deprecate passing literal HTML; a file-like object
    # is accepted by every version.
    frame = pd.read_html(io.StringIO(str(stats_table)))[0]
    if frame.empty:
        return None
    return frame.iloc[0]


def _per_unit(count, total):
    """Return ``count / total`` as a float, or 0 when ``total`` is not positive."""
    return to_float(count) / total if total > 0 else 0


def get_player_career_averages(player_id, timeout=30):
    """Scrape batting and bowling figures for one player from ESPNcricinfo Statsguru.

    The batting and bowling result pages are fetched for ``player_id``, the
    third ``<table>`` of each page is parsed, and the first row of each table
    is turned into a flat dict. Values that cannot be parsed as numbers become
    0 (see ``to_float``).

    Args:
        player_id: Cricinfo player id interpolated into the Statsguru URLs.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A dict with the keys ``id``, ``batting_average``,
        ``batting_strike_rate``, ``batting_100s_normalized``,
        ``batting_50s_normalized``, ``batting_6s_normalized``,
        ``batting_4s_normalized``, ``batting_highest_score``,
        ``batting_not_outs_normalized``, ``bowling_average``,
        ``bowling_strike_rate``, ``bowling_economy`` and
        ``bowling_maidens_normalized``; or ``None`` when either page cannot be
        fetched or contains no usable statistics table.
    """
    batting = _fetch_stats_row(player_id, 'batting', timeout)
    if batting is None:
        return None
    bowling = _fetch_stats_row(player_id, 'bowling', timeout)
    if bowling is None:
        return None

    batting_innings = to_float(batting['Inns'])
    bowling_overs = to_float(bowling['Overs'])

    # A not-out highest score is displayed with a trailing '*' (e.g. "100*");
    # strip it so the score is not silently turned into 0 by to_float.
    highest_score = to_float(str(batting['HS']).rstrip('*'))

    data_dict = {}
    data_dict['id'] = player_id
    data_dict['batting_average'] = to_float(batting['Ave'])
    data_dict['batting_strike_rate'] = to_float(batting['SR'])
    # Counting stats are divided by innings batted so players are comparable.
    data_dict['batting_100s_normalized'] = _per_unit(batting['100'], batting_innings)
    data_dict['batting_50s_normalized'] = _per_unit(batting['50'], batting_innings)
    data_dict['batting_6s_normalized'] = _per_unit(batting['6s'], batting_innings)
    data_dict['batting_4s_normalized'] = _per_unit(batting['4s'], batting_innings)
    data_dict['batting_highest_score'] = highest_score
    data_dict['batting_not_outs_normalized'] = _per_unit(batting['NO'], batting_innings)
    data_dict['bowling_average'] = to_float(bowling['Ave'])
    data_dict['bowling_strike_rate'] = to_float(bowling['SR'])  # balls bowled per wicket
    data_dict['bowling_economy'] = to_float(bowling['Econ'])  # runs conceded per over
    # Maidens are divided by overs bowled rather than by innings.
    data_dict['bowling_maidens_normalized'] = _per_unit(bowling['Mdns'], bowling_overs)

    return data_dict

In [None]:
# Iterate over the selected players, fetch their career averages and save the
# results (and the players for which nothing was found) to CSV files.
CHECKPOINT_EVERY = 1000  # flush buffered results to disk after this many players
PLAYER_LIMIT = 20        # only the first 20 players are processed; set to None for all


def _save_results(found, not_found, processed):
    """Write both buffers to CSV files tagged with ``processed``; return the results frame."""
    results = pd.DataFrame.from_records(found)
    results.to_csv(f'resources/player_career_averages_{processed}.csv', index=False)
    pd.DataFrame(not_found).to_csv(f'resources/player_info_not_found_{processed}.csv', index=False)
    return results


player_info_not_found = []
records = []
i = 0  # number of players processed so far (stays 0 when there is nothing to process)

for i, player in enumerate(tqdm(cricinfo_ids[:PLAYER_LIMIT]), start=1):
    cricinfo_key = int(player['key_cricinfo'])
    # A key of 0 is the fillna(0) placeholder for a missing Cricinfo id, so
    # there is no page to request for it.
    data_dict = get_player_career_averages(cricinfo_key) if cricinfo_key else None

    if data_dict is None:
        player_info_not_found.append(player)
    else:
        data_dict['identifier'] = player['identifier']
        records.append(data_dict)

    # Counting from 1 means every checkpoint covers exactly CHECKPOINT_EVERY
    # players, instead of flushing right after the very first one.
    if i % CHECKPOINT_EVERY == 0:
        df = _save_results(records, player_info_not_found, i)
        records = []
        player_info_not_found = []

# Save whatever is left after the last checkpoint. The guard avoids overwriting
# a checkpoint file with empty data when the total is an exact multiple of
# CHECKPOINT_EVERY, while still producing output files when no player was processed.
if records or player_info_not_found or i == 0:
    df = _save_results(records, player_info_not_found, i)