In [11]:
import os
import requests
import zipfile
import json
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

In [20]:
# --- Configuration ---
# We'll focus on T20s for the initial EDA
match_type = "T20"
url = "https://cricsheet.org/downloads/t20s_json.zip"

# (connect, read) timeouts in seconds, so a stalled connection raises an
# error instead of blocking the kernel indefinitely.
request_timeout = (10, 120)
# Number of bytes written to disk per chunk while streaming the archive.
download_chunk_size = 1 << 20  # 1 MiB

# Define local directories
base_dir = "cricsheet_eda"
zip_path = os.path.join(base_dir, f"{match_type.lower()}.zip")
unzipped_dir = os.path.join(base_dir, f"{match_type.lower()}_json")

# --- Setup ---
# Create the local directories (no-op when they already exist)
os.makedirs(base_dir, exist_ok=True)
os.makedirs(unzipped_dir, exist_ok=True)

print("Directories are ready.")

# --- Download and Unzip ---
# An empty extraction directory is treated as "nothing downloaded yet";
# any existing content skips both the download and the extraction.
if not os.listdir(unzipped_dir):
    print(f"Downloading {match_type} data...")
    # Stream the response so the archive is written chunk by chunk rather
    # than being held in memory in full before it reaches the disk.
    with requests.get(url, stream=True, timeout=request_timeout) as response:
        response.raise_for_status()
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=download_chunk_size):
                f.write(chunk)
    print("Download complete.")

    print("Unzipping data...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(unzipped_dir)
    print("Unzip complete.")
else:
    print("Data already downloaded and unzipped.")

# # Set plot style for better visuals
# sns.set_style("whitegrid")

Directories are ready.
Data already downloaded and unzipped.


In [22]:
# --- Load all JSON files into a list ---
all_matches = []
json_files_path = unzipped_dir

# List all files, ignoring any non-JSON or README files.
# Sorted because os.listdir returns entries in arbitrary order; sorting makes
# the load order (and therefore "the first match" below) the same on every run.
file_names = sorted(
    f for f in os.listdir(json_files_path)
    if f.endswith('.json') and not f.startswith('README')
)

for file_name in file_names:
    file_path = os.path.join(json_files_path, file_name)
    # Decode explicitly as UTF-8 (the JSON interchange encoding); without this,
    # open() falls back to the locale's default encoding, which varies by platform.
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            all_matches.append(json.load(f))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Best-effort load: report the bad file and keep going.
            print(f"Warning: Could not decode JSON from {file_name}. Skipping.")

print(f"Successfully loaded {len(all_matches)} match files.")
print("-" * 30)

# --- Inspect the structure of a single match ---
# Guarded so an empty data directory does not raise an IndexError.
if all_matches:
    print("Top-level keys in the first match file:")
    print(list(all_matches[0].keys()))
    print("\n'info' keys in the first match file:")
    print(list(all_matches[0]['info'].keys()))

Successfully loaded 4342 match files.
------------------------------
Top-level keys in the first match file:
['meta', 'info', 'innings']

'info' keys in the first match file:
['balls_per_over', 'dates', 'event', 'gender', 'match_type', 'match_type_number', 'officials', 'outcome', 'overs', 'player_of_match', 'players', 'registry', 'season', 'team_type', 'teams', 'toss', 'venue']


In [24]:
from collections import Counter

# --- Tally how many matches carry each key of the 'info' dictionary ---
# Keys are fed to the counter match by match, in each dictionary's own order.
info_key_counter = Counter(
    field for match in all_matches for field in match['info']
)

# --- Render the tally as a fixed-width text table ---
total_matches = len(all_matches)
header_cells = [f"{'Key':<25}", f"{'Count':>10}", f"{'Percentage':>12}"]
print(" | ".join(header_cells))
print("-" * 55)

# most_common() yields (key, count) pairs from most to least frequent.
for key, count in info_key_counter.most_common():
    share = (count / total_matches) * 100
    percentage = f"{share:.2f}%"
    print(" | ".join([f"{key:<25}", f"{count:>10}", f"{percentage:>12}"]))

Key                       |      Count |   Percentage
-------------------------------------------------------
balls_per_over            |       4342 |      100.00%
dates                     |       4342 |      100.00%
gender                    |       4342 |      100.00%
match_type                |       4342 |      100.00%
match_type_number         |       4342 |      100.00%
outcome                   |       4342 |      100.00%
overs                     |       4342 |      100.00%
players                   |       4342 |      100.00%
registry                  |       4342 |      100.00%
season                    |       4342 |      100.00%
team_type                 |       4342 |      100.00%
teams                     |       4342 |      100.00%
toss                      |       4342 |      100.00%
venue                     |       4342 |      100.00%
event                     |       4276 |       98.48%
city                      |       4163 |       95.88%
officials                 