In [2]:
import pandas as pd
import json
import os

In [3]:
# Directory that holds the match JSON files scanned below. Set the
# PAKISTAN_JSON_DIR environment variable to point somewhere else without
# editing the notebook; the original local path is kept as the fallback.
folder_path = os.environ.get(
    "PAKISTAN_JSON_DIR",
    r"D:\Shaaf Projects\Data sets\pakistan_male_json",
)

# Extracting T20 matches

In [5]:
# Collect the names of every JSON file in `folder_path` whose
# info.match_type equals 'T20'. Unreadable or unparsable files are reported
# and skipped so that one bad file does not abort the scan.
T20_FILES = []
# sorted() makes the processing order (and therefore the row order of the
# CSVs built later) independent of the order os.listdir happens to return.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, file_name)
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding, which may not be able to decode the file.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        print(f"Error processing {file_name}: {e}")
        continue
    # A JSON document whose root (or 'info' entry) is not an object cannot
    # describe a match; ignore it rather than raising AttributeError.
    info = data.get('info') if isinstance(data, dict) else None
    if isinstance(info, dict) and info.get('match_type') == 'T20':
        T20_FILES.append(file_name)


# Initializing column accumulators (dicts of lists, later converted to DataFrames)

In [7]:
# Column-oriented accumulator for per-match metadata: every key is a column
# name mapped to its own (initially empty) list of values.
match_information = {
    column: []
    for column in (
        'city',
        'event',
        'stage',
        'dates',
        'match_type',
        'match_type_number',
        'player_of_match',
        'winner',
        'winner_by',
        'overs',
        'season',
        'team_type',
        'team1',
        'team2',
        'toss_winner',
        'toss_decision',
        'venue',
    )
}

# Column-oriented accumulator for player rows: every key is a column name
# mapped to its own (initially empty) list of values.
players_infomation = {
    column: []
    for column in ('name', 'team', 'match_type_number', 'match_type', 'venue')
}

# Data Extraction Functions

In [9]:
def extract_match_info(rawdata):
    """Append one match's metadata to the global ``match_information`` columns.

    Parameters
    ----------
    rawdata : dict
        Parsed match JSON. ``rawdata['info']`` must provide ``dates``,
        ``match_type``, ``match_type_number``, ``outcome``, ``overs``,
        ``season``, ``team_type``, ``teams`` (at least two entries),
        ``toss`` (with ``winner`` and ``decision``) and ``venue``.
        ``city``, ``event`` (whose ``stage`` is itself optional) and
        ``player_of_match`` are optional and stored as ``None`` when absent.

    Raises
    ------
    KeyError, IndexError
        If a required field is missing. The complete row is assembled before
        anything is appended, so a failure leaves every column untouched and
        the columns always keep the same length.
    """
    info = rawdata['info']

    if 'event' in info:
        event = info['event']
        event_name = event['name']
        stage = event.get('stage')
    else:
        event_name = None
        stage = None

    outcome = info['outcome']
    if 'winner' in outcome:
        winner = outcome['winner']
        # Guard against an outcome that names a winner without a 'by'
        # margin: store None instead of raising halfway through the row.
        winner_by = outcome.get('by')
    else:
        winner = 'no result'
        winner_by = 'no result'

    row = {
        'city': info.get('city'),
        'event': event_name,
        'stage': stage,
        # A match may list several dates; keep them separated so they stay
        # readable (a single date is stored unchanged).
        'dates': ', '.join(info['dates']),
        'match_type': info['match_type'],
        'match_type_number': info['match_type_number'],
        # Stored exactly as provided in the JSON.
        'player_of_match': info.get('player_of_match'),
        'winner': winner,
        'winner_by': winner_by,
        'overs': info['overs'],
        'season': info['season'],
        'team_type': info['team_type'],
        'team1': info['teams'][0],
        'team2': info['teams'][1],
        'toss_winner': info['toss']['winner'],
        'toss_decision': info['toss']['decision'],
        'venue': info['venue'],
    }

    # Append only after every value was resolved successfully.
    for column, value in row.items():
        match_information[column].append(value)

In [10]:
def players_info(rawdata):
    """Append one row per listed player to the global ``players_infomation``.

    ``rawdata['info']['players']`` maps each team name to its list of player
    names; every player is recorded together with the match's
    ``match_type_number``, ``match_type`` and ``venue``.
    """
    info = rawdata['info']
    for team_name, squad in info['players'].items():
        for player_name in squad:
            players_infomation['name'].append(player_name)
            players_infomation['match_type_number'].append(info['match_type_number'])
            players_infomation['match_type'].append(info['match_type'])
            players_infomation['team'].append(team_name)
            players_infomation['venue'].append(info['venue'])

In [11]:
def innings_data(raw_data):
    """Flatten a match's ball-by-ball data into one dict per delivery.

    Parameters
    ----------
    raw_data : dict
        Parsed match JSON. ``raw_data['innings']`` is a list of innings, each
        with a ``team`` and a list of ``overs``; each over has an ``over``
        number and a list of ``deliveries``. A missing ``innings`` or
        ``overs`` entry is treated as empty.

    Returns
    -------
    list[dict]
        One row per delivery with the keys ``Team``, ``Over``, ``Batter``,
        ``Bowler``, ``Non_Striker``, ``Runs_Batter``, ``Runs_Extras``,
        ``Runs_Total``, ``Wickets``, ``fielder_name``, ``kind`` and
        ``player_out``. Every row carries every key, so a DataFrame built
        from the rows has the same columns for every match. Only the first
        entry of a delivery's ``wickets`` list (and the first fielder of that
        entry) is recorded.
    """
    rows = []
    for inning in raw_data.get('innings', []):
        team = inning['team']
        for over in inning.get('overs', []):
            over_number = over['over']
            for delivery in over['deliveries']:
                runs = delivery['runs']
                row = {
                    'Team': team,
                    'Over': over_number,
                    'Batter': delivery['batter'],
                    'Bowler': delivery['bowler'],
                    'Non_Striker': delivery.get('non_striker'),
                    'Runs_Batter': runs['batter'],
                    'Runs_Extras': runs.get('extras', 0),
                    'Runs_Total': runs['total'],
                    # Dismissal fields default to "no wicket" so that rows
                    # with and without a wicket share one schema.
                    'Wickets': 0,
                    'fielder_name': None,
                    'kind': None,
                    'player_out': None,
                }
                wickets = delivery.get('wickets')
                if wickets:
                    dismissal = wickets[0]
                    row['Wickets'] = 1
                    row['player_out'] = dismissal['player_out']
                    row['kind'] = dismissal['kind']
                    fielders = dismissal.get('fielders')
                    if fielders:
                        # A fielder entry without a 'name' is stored as None.
                        row['fielder_name'] = fielders[0].get('name')
                rows.append(row)
    return rows

In [12]:
def Finding_all_csv(csvs_list, directory=r"./matches_info_innings_data"):
    """Append the per-match innings CSV file names in ``directory`` to ``csvs_list``.

    Parameters
    ----------
    csvs_list : list
        List that is extended in place with the matching file names
        (base names only, in sorted order).
    directory : str, optional
        Folder to scan. Defaults to the notebook's output folder.

    Only files directly inside ``directory`` are considered, because the
    stored base names are later resolved against that folder. The two summary
    tables and the merged dataset (under either of the names it may have been
    written with) are skipped so that a re-run does not merge them back in.
    Prints a message and returns without touching ``csvs_list`` when the
    directory does not exist.
    """
    if not os.path.isdir(directory):
        print("The specified directory does not exist.")
        return
    excluded = {
        "matches_information.csv",
        "player_information.csv",
        "T20_Pakistan_matches_Dataset.csv",
        "T20_Pakistan_matches_Dataset.csv.csv",
    }
    for file in sorted(os.listdir(directory)):
        if file in excluded or not file.endswith('.csv'):
            continue
        if os.path.isfile(os.path.join(directory, file)):
            csvs_list.append(file)
    print(f"Found {len(csvs_list)} CSV files:")

In [13]:
def merging_all_csv(csvs_list, directory=r"./matches_info_innings_data"):
    """Read every CSV named in ``csvs_list`` and stack them into one DataFrame.

    Parameters
    ----------
    csvs_list : list[str]
        File names (relative to ``directory``) of the form
        ``"<label>_<match_type_number>.csv"``.
    directory : str, optional
        Folder the files are read from. Defaults to the notebook's output
        folder.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..n-1 index and an added
        ``match_type_number`` column holding, as a string, the text after the
        last underscore of the file name with the extension removed. An empty
        DataFrame is returned when ``csvs_list`` is empty.
    """
    frames = []
    for file in csvs_list:
        df = pd.read_csv(os.path.join(directory, file))
        # Drop the extension first so the value is "1234", not "1234.csv",
        # and split from the right so underscores in the label are harmless.
        stem = os.path.splitext(file)[0]
        df['match_type_number'] = stem.rsplit('_', 1)[-1]
        frames.append(df)
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

# Writing to CSV 

In [15]:
def writetocsv(data, filename, directory=r"./matches_info_innings_data"):
    """Write ``data`` as ``<directory>/<filename>.csv`` without the index column.

    Parameters
    ----------
    data : anything accepted by ``pandas.DataFrame``
        For example a dict of equal-length lists, a list of row dicts or an
        existing DataFrame.
    filename : str
        Output name. ``.csv`` is appended unless the name already ends with
        it, so a caller passing ``"x.csv"`` does not end up with
        ``"x.csv.csv"``.
    directory : str, optional
        Output folder; created if it does not exist yet. Defaults to the
        notebook's output folder.
    """
    if not filename.lower().endswith('.csv'):
        filename = f'{filename}.csv'
    os.makedirs(directory, exist_ok=True)
    df = pd.DataFrame(data)
    df.to_csv(os.path.join(directory, filename), index=False)

# Main code

In [17]:
# Start from empty accumulators so that re-running this cell does not append
# the same matches a second time.
for column in match_information.values():
    column.clear()
for column in players_infomation.values():
    column.clear()

for name in T20_FILES:
    # Read from the configured folder instead of repeating the absolute path,
    # and decode as UTF-8 rather than the platform default encoding.
    with open(os.path.join(folder_path, name), encoding='utf-8') as file:
        rawdata = json.load(file)

    # Accumulate the per-match and per-player rows.
    extract_match_info(rawdata)
    players_info(rawdata)

    # One CSV per match, named "<team1> vs <team2>_<match_type_number>".
    innings = innings_data(rawdata)
    # Intermediate names are used because reusing the same quote character
    # inside an f-string is a SyntaxError before Python 3.12.
    teams = rawdata['info']['teams']
    match_number = rawdata['info']['match_type_number']
    innings_filename = f'{teams[0]} vs {teams[1]}_{match_number}'
    writetocsv(innings, innings_filename)

# The two summary tables are cumulative, so write them once after the loop
# instead of rewriting the growing files for every match.
if T20_FILES:
    writetocsv(match_information, 'matches_information')
    writetocsv(players_infomation, 'player_information')

In [18]:
# Gather the per-match innings CSVs, merge them into one table and write it.
csvs_list = []
Finding_all_csv(csvs_list)
if csvs_list:
    DataSet = merging_all_csv(csvs_list)
    # writetocsv adds the ".csv" extension itself, so pass the bare name to
    # avoid producing "T20_Pakistan_matches_Dataset.csv.csv".
    writetocsv(DataSet, 'T20_Pakistan_matches_Dataset')
else:
    # merging_all_csv indexes the first file name, so skip it when the list
    # is empty instead of failing with IndexError.
    print("No innings CSV files found; nothing to merge.")