In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
import json
import os
from tqdm import tqdm
from collections import defaultdict

In [3]:
folder_path = 't20s_male_json'
json_files = os.listdir(folder_path)

In [4]:
countries = ['England', 'Australia', 'Afghanistan', 'India', 'New Zealand', 'South Africa', 'West Indies', 'Bangladesh']
top_matches = []
scorecards = []

In [5]:
def process_match_data(data):
    player_stats = defaultdict(lambda: {
        'team': '',
        'player_name': '',
        'runs_scored': 0,
        'wickets_taken': 0,
        'balls_faced': 0,
        'runs_conceded': 0,
        'balls_bowled': 0,
        'fours_hit': 0,
        'sixes_hit': 0,
        'dot_balls_bowled': 0,
        'death_overs_runs_conceded': 0,
        'death_overs_balls_bowled': 0,
        'out': 0
    })
    
    for innings_idx, innings in enumerate(data['innings']):
        team_name = innings['team']

        for over_idx, over_data in enumerate(innings['overs']):
            over_num = over_data['over']
            is_death_over = over_num >= 16 and over_num <= 20
            
            for delivery in over_data['deliveries']:
                batter = delivery['batter']
                bowler = delivery['bowler']
                non_striker = delivery['non_striker']
                runs = delivery.get('runs', {})
                batter_runs = runs.get('batter', 0)
                extras = runs.get('extras', 0)
                total_runs = runs.get('total', 0)
                player_stats[bowler]['team'] = get_opposing_team(team_name, data)
                player_stats[bowler]['player_name'] = bowler
                
                if extras == 0:
                    player_stats[bowler]['balls_bowled'] += 1
                    if is_death_over:
                        player_stats[bowler]['death_overs_balls_bowled'] += 1
                
                player_stats[bowler]['runs_conceded'] += total_runs
                if is_death_over:
                    player_stats[bowler]['death_overs_runs_conceded'] += total_runs
                
                player_stats[batter]['team'] = team_name
                player_stats[batter]['player_name'] = batter
                player_stats[batter]['balls_faced'] += 1
                player_stats[batter]['runs_scored'] += batter_runs
                player_stats[non_striker]['team'] = team_name
                player_stats[non_striker]['player_name'] = non_striker
                
                if total_runs == 0:
                    player_stats[bowler]['dot_balls_bowled'] += 1
                if batter_runs == 4:
                    player_stats[batter]['fours_hit'] += 1
                elif batter_runs == 6:
                    player_stats[batter]['sixes_hit'] += 1
                if 'wickets' in delivery:
                    for wicket in delivery['wickets']:
                        player_out = wicket['player_out']
                        player_stats[bowler]['wickets_taken'] += 1
                        player_stats[player_out]['out'] = 1
    df = pd.DataFrame(player_stats.values())
    return df

def get_opposing_team(batting_team, data):
    all_teams = [innings['team'] for innings in data['innings']]
    for team in all_teams:
        if team != batting_team:
            return team
    return "Unknown Team"

In [6]:
for match_file in tqdm(json_files):
    if '.json' in match_file:
        with open(os.path.join(folder_path, match_file), 'r') as f:
            content = f.read()
            data = json.loads(content)
            if set(data['info']['teams']).issubset(countries):
                top_matches.append(match_file) 
                scorecard = process_match_data(data)
                date = data['info']['dates'][0]
                year = int(date.split('-')[0])
                scorecard['date'] = date
                scorecard['year'] = year
                scorecard['boundary_runs'] = 4 * scorecard['fours_hit'].fillna(0) + 6 * scorecard['sixes_hit'].fillna(0)
                scorecard['boundaries'] = scorecard['fours_hit'].fillna(0) + scorecard['sixes_hit'].fillna(0)
                scorecards.append(scorecard)

100%|██████████| 2604/2604 [00:10<00:00, 237.82it/s]


In [8]:
data['info']['city']

'Colombo'

In [66]:
df = pd.concat(scorecards, ignore_index=True)

In [67]:
df.head()

Unnamed: 0,team,player_name,runs_scored,wickets_taken,balls_faced,runs_conceded,balls_bowled,fours_hit,sixes_hit,dot_balls_bowled,death_overs_runs_conceded,death_overs_balls_bowled,out,date,year,boundary_runs,boundaries
0,New Zealand,BM Wheeler,0,2,0,22,24,0,0,14,9,6,0,2017-01-03,2017,0,0
1,Bangladesh,Tamim Iqbal,11,0,14,0,0,1,0,0,0,0,1,2017-01-03,2017,4,1
2,Bangladesh,Imrul Kayes,0,0,2,0,0,0,0,0,0,0,1,2017-01-03,2017,0,0
3,New Zealand,MJ Henry,0,1,0,44,24,0,0,8,27,12,0,2017-01-03,2017,0,0
4,Bangladesh,Sabbir Rahman,16,0,16,0,0,1,1,0,0,0,1,2017-01-03,2017,10,2


In [70]:
df.drop(columns=['fours_hit', 'sixes_hit'], inplace=True)

In [71]:
years = df['year'].unique().tolist()
years = [year for year in years if year <= 2022 ]
years.sort()

In [72]:
yearwise = []
for year in years:
    year_df = df[df['year'] == year]
    year_df.drop(columns=['team', 'date', 'year'], inplace=True)
    grouped = year_df.groupby(by='player_name').sum()
    grouped['year'] = year
    grouped['strike_rate'] = (grouped['runs_scored'] / grouped['balls_faced']).round(2) * 100
    grouped['economy'] = ((grouped['runs_conceded'] / grouped['balls_bowled']) * 6).round(2)
    grouped['dot_ball_pct'] = (grouped['dot_balls_bowled'] / grouped['balls_bowled']).round(2) * 100
    grouped['bowling_strike_rate'] = (grouped['balls_bowled'] / grouped['wickets_taken']).round(2)
    grouped['death_economy'] = ((grouped['death_overs_runs_conceded'] / grouped['death_overs_balls_bowled']) * 6).round(2)
    grouped['average'] = grouped.apply(lambda x: x['runs_scored'] if x['out'] == 0 else (x['runs_scored'] / x['out']).round(2), axis=1)
    grouped['boundary_pct'] = (grouped['boundary_runs'] / grouped['runs_scored']) * 100
    grouped['balls_per_boundary'] = grouped['balls_faced'] / grouped['boundaries']
    yearwise.append(grouped[['year', 'strike_rate', 'economy', 'dot_ball_pct', 'bowling_strike_rate', 'death_economy', 'average', 
                             'boundary_pct', 'balls_per_boundary', 'runs_scored', 'wickets_taken', 'balls_bowled', 'balls_faced']])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.drop(columns=['team', 'date', 'year'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.drop(columns=['team', 'date', 'year'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.drop(columns=['team', 'date', 'year'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

In [73]:
yearwise = [df.reset_index() for df in yearwise]
sum_cols = ['runs_scored', 'wickets_taken', 'balls_bowled', 'balls_faced']

In [76]:
def combine_dataframes_with_weighted_avg(dataframes, years, sum_cols):
    df_dict = {year: df.copy() for year, df in zip(years, dataframes)}
    
    all_players = set()
    for df in dataframes:
        all_players.update(df['player_name'].tolist())
    
    all_columns = set()
    for df in dataframes:
        all_columns.update(df.columns)
    
    data_columns = [col for col in all_columns if col not in ['player_name', 'year']]
    
    result_data = []
    for player in all_players:
        player_data = {'player_name': player}
        player_years = []
        for year in years:
            df = df_dict[year]
            if player in df['player_name'].values:
                player_years.append(year)
        player_years.sort(reverse=True)
        
        if len(player_years) == 1:
            weights = [1.0]
        elif len(player_years) == 2:
            weights = [0.9, 0.1]
        elif len(player_years) == 3:
            weights = [0.9, 0.09, 0.01]
        else:
            weights = [0.9 * (0.1 ** i) for i in range(len(player_years))]
            weights = [w / sum(weights) for w in weights]
        
        for column in data_columns:
            weighted_sum = 0
            total_weight = 0
            for i, year in enumerate(player_years):
                df = df_dict[year]
                player_row = df[df['player_name'] == player]
                
                if not player_row.empty and column in player_row.columns:
                    value = player_row[column].values[0]
                    if not (pd.isna(value) or value == float('inf') or value == float('-inf')):
                        if column not in sum_cols:
                            weighted_sum += value * weights[i]
                        else:
                            weighted_sum += value
                        total_weight += weights[i]
            if total_weight > 0:
                player_data[column] = weighted_sum / total_weight
            else:
                player_data[column] = np.nan   
        result_data.append(player_data)
    
    result_df = pd.DataFrame(result_data)
    return result_df

In [77]:
stats = combine_dataframes_with_weighted_avg(yearwise, years, sum_cols)

In [78]:
stats.head()

Unnamed: 0,player_name,balls_per_boundary,balls_faced,death_economy,balls_bowled,boundary_pct,economy,strike_rate,runs_scored,wickets_taken,average,dot_ball_pct,bowling_strike_rate
0,DP Hyatt,7.04,88.0,,0.0,65.909091,,89.3,100.0,0.0,7.6,,
1,MN Samuels,4.52437,1592.0,4.696639,718.0,77.549842,10.632685,141.985877,1982.0,40.0,15.98108,27.89965,17.999309
2,VD Philander,,60.0,8.5,164.0,0.0,8.63,47.0,28.0,8.0,3.5,46.0,20.5
3,SS Cottrell,6.0,58.0,9.31268,1374.0,0.720072,8.499936,93.768338,36.0,88.0,0.2034,44.973195,18.841311
4,Avesh Khan,7.0,14.0,18.0,398.0,60.0,9.5,143.0,20.0,18.0,10.0,44.0,22.11


In [79]:
def calculate_percentiles(df, lower_is_better_stats, higher_is_better_stats):
    percentile_df = df.copy()
    for stat in lower_is_better_stats:
        if stat in df.columns:
            percentile_df[stat] = df[stat].rank(pct=True, ascending=False) * 100
    
    for stat in higher_is_better_stats:
        if stat in df.columns:
            percentile_df[stat] = df[stat].rank(pct=True) * 100
    return percentile_df

In [80]:
batting = ['runs_scored', 'average', 'boundary_pct', 'strike_rate', 'balls_per_boundary']
bowling = ['wickets_taken', 'economy', 'dot_ball_pct', 'bowling_strike_rate']
lower_stats = ['economy', 'bowling_strike_rate', 'death_economy', 'balls_per_boundary']
higher_stats = ['runs_scored', 'average', 'boundary_pct', 'strike_rate', 'wickets_taken', 'dot_ball_pct']

In [81]:
percentiles = calculate_percentiles(stats, lower_stats, higher_stats)

In [82]:
percentiles.head(10)

Unnamed: 0,player_name,balls_per_boundary,balls_faced,death_economy,balls_bowled,boundary_pct,economy,strike_rate,runs_scored,wickets_taken,average,dot_ball_pct,bowling_strike_rate
0,DP Hyatt,48.214286,88.0,,0.0,79.882812,,39.130435,55.245347,19.458545,44.670051,,
1,MN Samuels,83.333333,1592.0,94.174757,718.0,92.773438,20.97561,84.310019,94.923858,86.80203,64.128596,24.634146,49.723757
2,VD Philander,,60.0,71.197411,164.0,9.082031,52.560976,13.232514,36.294416,58.967851,31.725888,86.219512,39.364641
3,SS Cottrell,62.261905,58.0,63.430421,1374.0,20.117188,55.609756,42.533081,39.763113,97.800338,17.428088,84.390244,44.475138
4,Avesh Khan,50.119048,14.0,8.899676,398.0,67.578125,34.756098,85.255198,31.302876,72.758037,49.323181,83.04878,30.939227
5,HF Gurney,,0.0,76.699029,46.0,,69.634146,,6.76819,42.047377,6.76819,51.097561,26.104972
6,SCJ Broad,14.047619,196.0,17.799353,1486.0,41.796875,45.365854,53.119093,66.666667,96.362098,54.99154,63.902439,38.121547
7,KH Pandya,47.619048,188.0,85.598706,786.0,59.960938,58.292683,71.928166,70.72758,79.103215,81.218274,27.073171,11.049724
8,JM Vince,79.52381,394.0,,0.0,79.101562,,83.931947,80.71066,19.458545,81.387479,,
9,JJ Roy,44.285714,1812.0,,0.0,73.828125,,52.741021,96.362098,19.458545,73.265651,,


In [45]:
batters = percentiles.dropna(subset=batting)['player_name'].tolist()
bowlers = percentiles.dropna(subset=bowling)['player_name'].tolist()
all_rounders = list(set(batters).intersection(bowlers))

In [46]:
len(all_rounders)

236

In [47]:
percentiles.head(20)

Unnamed: 0,player_name,balls_per_boundary,balls_faced,death_economy,balls_bowled,boundary_pct,economy,strike_rate,runs_scored,wickets_taken,average,dot_ball_pct,bowling_strike_rate
0,DP Hyatt,48.214286,44.0,,0.0,79.882812,,39.130435,55.245347,19.458545,47.208122,,
1,MN Samuels,83.333333,796.0,94.174757,359.0,92.773438,20.97561,84.310019,94.923858,86.80203,65.651438,24.634146,49.723757
2,VD Philander,,30.0,71.197411,82.0,9.082031,52.560976,13.232514,36.294416,58.967851,33.756345,86.219512,39.364641
3,SS Cottrell,62.261905,29.0,63.430421,687.0,20.117188,55.609756,42.533081,39.763113,97.800338,17.428088,84.390244,44.475138
4,Avesh Khan,50.119048,7.0,8.899676,199.0,67.578125,34.756098,85.255198,31.302876,72.758037,51.692047,83.04878,30.939227
5,HF Gurney,,0.0,76.699029,23.0,,69.634146,,6.76819,42.047377,6.76819,51.097561,26.104972
6,SCJ Broad,14.047619,98.0,17.799353,743.0,41.796875,45.365854,53.119093,66.666667,96.362098,56.683587,63.902439,38.121547
7,KH Pandya,47.619048,94.0,85.598706,393.0,59.960938,58.292683,71.928166,70.72758,79.103215,82.233503,27.073171,11.049724
8,JM Vince,79.52381,197.0,,0.0,79.101562,,83.931947,80.71066,19.458545,82.402707,,
9,JJ Roy,44.285714,906.0,,0.0,73.828125,,52.741021,96.362098,19.458545,74.619289,,


In [48]:
stats.to_csv('statistics.csv')
percentiles.to_csv('percetiles.csv')

In [49]:
stats.head(10)

Unnamed: 0,player_name,balls_per_boundary,balls_faced,death_economy,balls_bowled,boundary_pct,economy,strike_rate,runs_scored,wickets_taken,average,dot_ball_pct,bowling_strike_rate
0,DP Hyatt,7.04,44.0,,0.0,65.909091,,89.3,50.0,0.0,7.6,,
1,MN Samuels,4.52437,796.0,4.696639,359.0,77.549842,10.632685,141.985877,991.0,20.0,15.98108,27.89965,17.999309
2,VD Philander,,30.0,8.5,82.0,0.0,8.63,47.0,14.0,4.0,3.5,46.0,20.5
3,SS Cottrell,6.0,29.0,9.31268,687.0,0.720072,8.499936,93.768338,18.0,44.0,0.11295,44.973195,18.841311
4,Avesh Khan,7.0,7.0,18.0,199.0,60.0,9.5,143.0,10.0,9.0,10.0,44.0,22.11
5,HF Gurney,,0.0,7.64,23.0,,7.83,,0.0,1.0,0.0,35.0,23.0
6,SCJ Broad,11.996931,98.0,15.220745,743.0,45.139779,8.985403,103.64007,104.0,40.0,12.294111,38.592207,20.789953
7,KH Pandya,7.060606,94.0,6.71,393.0,56.645963,8.249,126.6,121.0,14.0,24.35,28.7,30.7
8,JM Vince,4.926905,197.0,,0.0,65.52434,,141.78,264.0,0.0,24.77,,
9,JJ Roy,7.367875,906.0,,0.0,62.504302,,103.071326,1179.0,0.0,19.68289,,


In [50]:
stats.isna().sum() / stats.shape[0]

player_name            0.000000
balls_per_boundary     0.289340
balls_faced            0.000000
death_economy          0.477157
balls_bowled           0.000000
boundary_pct           0.133672
economy                0.306261
strike_rate            0.104907
runs_scored            0.000000
wickets_taken          0.000000
average                0.000000
dot_ball_pct           0.306261
bowling_strike_rate    0.387479
dtype: float64