In [1]:
import json
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
from tqdm import tqdm

In [9]:
def extract_match_data(match_file, career_averages, player_info = False):

    # extract file name from the path
    match_id = os.path.basename(match_file).split('.')[0]

    # read json file
    with open(match_file) as f:
        data = json.load(f)
    
    info = dict()
    info_reverse = dict()
    info['id'] = match_id
    info_reverse['id'] = match_id
    info['date'] = datetime.strptime(data['info']['dates'][0], '%Y-%m-%d')
    info_reverse['date'] = datetime.strptime(data['info']['dates'][0], '%Y-%m-%d')
    info['match_type'] = data['info']['match_type']
    info_reverse['match_type'] = data['info']['match_type']
    info['venue'] = data['info']['venue']
    info_reverse['venue'] = data['info']['venue']
    info['city'] = data['info']['city'] if 'city' in data['info'] else None
    info_reverse['city'] = data['info']['city'] if 'city' in data['info'] else None
    info['gender'] = data['info']['gender']
    info_reverse['gender'] = data['info']['gender']
    info['team_type'] = data['info']['team_type']
    info_reverse['team_type'] = data['info']['team_type']
    info['team1'] = data['info']['teams'][0]
    info_reverse['team2'] = data['info']['teams'][0]
    info['team2'] = data['info']['teams'][1]
    info_reverse['team1'] = data['info']['teams'][1]
    info['outcome'] = data['info']['outcome']['winner'] if 'winner' in data['info']['outcome'] else 'draw'
    info_reverse['outcome'] = data['info']['outcome']['winner'] if 'winner' in data['info']['outcome'] else 'draw'
    info['toss_winner'] = data['info']['toss']['winner']
    info_reverse['toss_winner'] = data['info']['toss']['winner']
    info['toss_decision'] = data['info']['toss']['decision']
    info_reverse['toss_decision'] = data['info']['toss']['decision']
    try:
        info['target'] = data['innings'][1]['target']['runs'] if 'target' in data['innings'][1] else (1+sum([i for i in map(lambda x: sum([j for j in map(lambda y: y['runs']['total'], x['deliveries'])]), data['innings'][0]['overs'])]))
        info_reverse['target'] = info['target']
    except:
        return None

    if player_info:
        # get team players
        team1_players = career_averages[career_averages['identifier'].isin([data['info']['registry']['people'][player] for player in data['info']['players'][data['info']['teams'][0]]])]
        team2_players = career_averages[career_averages['identifier'].isin([data['info']['registry']['people'][player] for player in data['info']['players'][data['info']['teams'][1]]])]
        # get team averages
        info['team1_batting_average'] = team1_players[team1_players['batting_average'] != 0]['batting_average'].values.mean()
        info['team1_batting_strike_rate'] = team1_players[team1_players['batting_strike_rate'] != 0]['batting_strike_rate'].values.mean()
        info['team1_batting_100s_normalized'] = team1_players[team1_players['batting_100s_normalized'] != 0]['batting_100s_normalized'].values.mean()
        info['team1_batting_50s_normalized'] = team1_players[team1_players['batting_50s_normalized'] != 0]['batting_50s_normalized'].values.mean()
        info['team1_batting_6s_normalized'] = team1_players[team1_players['batting_6s_normalized'] != 0]['batting_6s_normalized'].values.mean()
        info['team1_batting_4s_normalized'] = team1_players[team1_players['batting_4s_normalized'] != 0]['batting_4s_normalized'].values.mean()
        info['team1_bowling_average'] = team1_players[team1_players['bowling_average'] != 0]['bowling_average'].values.mean()
        info['team1_bowling_strike_rate'] = team1_players[team1_players['bowling_strike_rate'] != 0]['bowling_strike_rate'].values.mean()
        info['team1_bowling_economy'] = team1_players[team1_players['bowling_economy'] != 0]['bowling_economy'].values.mean()
        info['team1_bowling_maidens_normalized'] = team1_players[team1_players['bowling_maidens_normalized'] != 0]['bowling_maidens_normalized'].values.mean()

        info['team2_batting_average'] = team2_players[team2_players['batting_average'] != 0]['batting_average'].values.mean()
        info['team2_batting_strike_rate'] = team2_players[team2_players['batting_strike_rate'] != 0]['batting_strike_rate'].values.mean()
        info['team2_batting_100s_normalized'] = team2_players[team2_players['batting_100s_normalized'] != 0]['batting_100s_normalized'].values.mean()
        info['team2_batting_50s_normalized'] = team2_players[team2_players['batting_50s_normalized'] != 0]['batting_50s_normalized'].values.mean()
        info['team2_batting_6s_normalized'] = team2_players[team2_players['batting_6s_normalized'] != 0]['batting_6s_normalized'].values.mean()
        info['team2_batting_4s_normalized'] = team2_players[team2_players['batting_4s_normalized'] != 0]['batting_4s_normalized'].values.mean()
        info['team2_bowling_average'] = team2_players[team2_players['bowling_average'] != 0]['bowling_average'].values.mean()
        info['team2_bowling_strike_rate'] = team2_players[team2_players['bowling_strike_rate'] != 0]['bowling_strike_rate'].values.mean()
        info['team2_bowling_economy'] = team2_players[team2_players['bowling_economy'] != 0]['bowling_economy'].values.mean()
        info['team2_bowling_maidens_normalized'] = team2_players[team2_players['bowling_maidens_normalized'] != 0]['bowling_maidens_normalized'].values.mean()

        info_reverse['team1_batting_average'] = info['team2_batting_average']
        info_reverse['team1_batting_strike_rate'] = info['team2_batting_strike_rate']
        info_reverse['team1_batting_100s_normalized'] = info['team2_batting_100s_normalized']
        info_reverse['team1_batting_50s_normalized'] = info['team2_batting_50s_normalized']
        info_reverse['team1_batting_6s_normalized'] = info['team2_batting_6s_normalized']
        info_reverse['team1_batting_4s_normalized'] = info['team2_batting_4s_normalized']
        info_reverse['team1_bowling_average'] = info['team2_bowling_average']
        info_reverse['team1_bowling_strike_rate'] = info['team2_bowling_strike_rate']
        info_reverse['team1_bowling_economy'] = info['team2_bowling_economy']
        info_reverse['team1_bowling_maidens_normalized'] = info['team2_bowling_maidens_normalized']

        info_reverse['team2_batting_average'] = info['team1_batting_average']
        info_reverse['team2_batting_strike_rate'] = info['team1_batting_strike_rate']
        info_reverse['team2_batting_100s_normalized'] = info['team1_batting_100s_normalized']
        info_reverse['team2_batting_50s_normalized'] = info['team1_batting_50s_normalized']
        info_reverse['team2_batting_6s_normalized'] = info['team1_batting_6s_normalized']
        info_reverse['team2_batting_4s_normalized'] = info['team1_batting_4s_normalized']
        info_reverse['team2_bowling_average'] = info['team1_bowling_average']
        info_reverse['team2_bowling_strike_rate'] = info['team1_bowling_strike_rate']
        info_reverse['team2_bowling_economy'] = info['team1_bowling_economy']
        info_reverse['team2_bowling_maidens_normalized'] = info['team1_bowling_maidens_normalized']

    # replace NaN with -1
    info = {k: -1 if (isinstance(v, float) and np.isnan(v)) else v for k,v in info.items()}
    info_reverse = {k: -1 if (isinstance(v, float) and np.isnan(v)) else v for k,v in info_reverse.items()}
    return [info, info_reverse]

In [3]:
# read all csv files in the folder using glob
career_average_files = glob.glob('resources/temp/player_career_averages*.csv')

In [4]:
# combine all files in the list
career_average_df = pd.concat((pd.read_csv(f) for f in career_average_files))

In [5]:
career_average_df.to_csv('resources/player_career_averages.csv', index=False)

In [6]:
career_averages = pd.read_csv('resources/player_career_averages.csv')

In [7]:
# read all json file names in the folder using glob
match_files = glob.glob('matches/*.json')

In [10]:
# iterate over all files and extract match info
matches_info = []
for match_file in tqdm(match_files):
    match_data = extract_match_data(match_file, career_averages, player_info = True)
    if match_data is not None:
        matches_info.extend(match_data)        

# convert to dataframe
df = pd.DataFrame.from_records(matches_info)
df.to_csv('resources/matches_info.csv', index=False)

  info['team2_batting_100s_normalized'] = team2_players[team2_players['batting_100s_normalized'] != 0]['batting_100s_normalized'].values.mean()
  ret = ret.dtype.type(ret / rcount)
  info['team1_batting_100s_normalized'] = team1_players[team1_players['batting_100s_normalized'] != 0]['batting_100s_normalized'].values.mean()
  info['team1_batting_average'] = team1_players[team1_players['batting_average'] != 0]['batting_average'].values.mean()
  info['team1_batting_strike_rate'] = team1_players[team1_players['batting_strike_rate'] != 0]['batting_strike_rate'].values.mean()
  info['team1_batting_50s_normalized'] = team1_players[team1_players['batting_50s_normalized'] != 0]['batting_50s_normalized'].values.mean()
  info['team1_batting_6s_normalized'] = team1_players[team1_players['batting_6s_normalized'] != 0]['batting_6s_normalized'].values.mean()
  info['team1_batting_4s_normalized'] = team1_players[team1_players['batting_4s_normalized'] != 0]['batting_4s_normalized'].values.mean()
  info