## SQLITE DATABASE

## Define Paths & Match Formats

In [None]:
import os, json
import pandas as pd
import sqlite3
from pathlib import Path
from sqlalchemy import create_engine

engine = create_engine("sqlite:///cricket_matches.db")  


match_paths = {
    "Test": "data/tests_json/",
    "ODI": "data/odis_json/",
    "T20": "data/it20s_json/"
}


## Parse JSON into DataFrames

In [58]:
import os, json
import pandas as pd

def flatten_fielders(fielders_raw):
    if isinstance(fielders_raw, list):
        return ', '.join([
            f.get('name') for f in fielders_raw
            if isinstance(f, dict) and f.get('name')
        ])
    return None

def create_match_dataframe(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    info = data.get("info", {})
    match_id = os.path.basename(file_path).split('.')[0]
    meta = {
        'match_id': match_id,
        'match_type': info.get('match_type'),
        'gender': info.get('gender'),
        'city': info.get('city'),
        'venue': info.get('venue'),
        'season': info.get('season'),
        'event_name': info.get('event', {}).get('name'),
        'event_group': info.get('event', {}).get('group'),
        'date': info.get('dates', [None])[0],
        'balls_per_over': info.get('balls_per_over'),
        'player_of_match': ', '.join(info.get('player_of_match', [])),
        'toss_winner': info.get('toss', {}).get('winner'),
        'toss_decision': info.get('toss', {}).get('decision'),
        'winner': info.get('outcome', {}).get('winner'),
        'win_by_wickets': info.get('outcome', {}).get('by', {}).get('wickets'),
        'teams': ', '.join(info.get('teams', []))
    }

    deliveries = []
    for inning_idx, inning in enumerate(data['innings'], start=1):
        team = inning['team']
        for over in inning['overs']:
            over_num = over['over']
            for ball_idx, delivery in enumerate(over['deliveries'], start=1):
                runs = delivery.get('runs', {})
                extras = delivery.get('extras', {})
                wickets = delivery.get('wickets', [])
                fielders_raw = wickets[0].get('fielders', []) if wickets and isinstance(wickets[0], dict) else []
                fielders = flatten_fielders(fielders_raw)
                row = {
                    'inning': inning_idx,
                    'batting_team': team,
                    'over': over_num,
                    'ball': ball_idx,
                    'batter': delivery.get('batter'),
                    'bowler': delivery.get('bowler'),
                    'non_striker': delivery.get('non_striker'),
                    'runs_batter': runs.get('batter', 0),
                    'runs_extras': runs.get('extras', 0),
                    'runs_total': runs.get('total', 0),
                    'extras_type': list(extras.keys())[0] if extras else None,
                    'wicket_kind': wickets[0]['kind'] if wickets else None,
                    'player_out': wickets[0]['player_out'] if wickets else None,
                    'fielders': fielders
                }
                row.update(meta)
                deliveries.append(row)

    return pd.DataFrame(deliveries)

## Create DataFrames by Match Type

In [60]:
def build_match_df(match_type, path):
    records = []
    for file in os.listdir(path):
        if file.endswith(".json"):
            full_path = os.path.join(path, file)
            record = create_match_dataframe(full_path)
            records.append(record)
    return pd.concat(records, ignore_index=True)

test_df = build_match_df("Test", match_paths["Test"])
odi_df = build_match_df("ODI", match_paths["ODI"])
t20_df = build_match_df("T20", match_paths["T20"])

## Insert Data Frame in SQL

In [61]:
test_df.to_sql("test_matches", con=engine, if_exists="replace", index=False)
odi_df.to_sql("odi_matches", con=engine, if_exists="replace", index=False)
t20_df.to_sql("t20_matches", con=engine, if_exists="replace", index=False)

70908

# EDA

In [62]:
!pip install python-pptx



In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pptx import Presentation
from pptx.util import Inches
import os

def generate_visualizations(df, output_dir="plots"):
    os.makedirs(output_dir, exist_ok=True)
    balls_per_over = df['balls_per_over'].iloc[0]

    # 1. Run Accumulation Over Innings
    df['ball_id'] = df['over'] + df['ball'] / balls_per_over
    run_progress = df.groupby(['inning', 'ball_id'])['runs_total'].sum().groupby(level=0).cumsum().reset_index()
    plt.figure()
    sns.lineplot(data=run_progress, x='ball_id', y='runs_total', hue='inning')
    plt.title('Run Accumulation Over Innings')
    plt.savefig(f"{output_dir}/run_accumulation.png", bbox_inches='tight')

    # 2. Wicket Distribution by Over
    plt.figure()
    wickets = df[df['wicket_kind'].notnull()]
    sns.histplot(data=wickets, x='over', hue='inning', multiple='stack', bins=20)
    plt.title('Wickets per Over')
    plt.savefig(f"{output_dir}/wickets_per_over.png", bbox_inches='tight')

    # 3. Top Batters
    plt.figure()
    top_batters = df.groupby('batter')['runs_batter'].sum().nlargest(10).reset_index()
    sns.barplot(data=top_batters, x='runs_batter', y='batter', palette='viridis')
    plt.title('Top 10 Batters by Runs')
    plt.savefig(f"{output_dir}/top_batters.png", bbox_inches='tight')

    # 4. Top Bowlers
    plt.figure()
    top_bowlers = df[df['wicket_kind'].notnull()].groupby('bowler')['wicket_kind'].count().nlargest(10).reset_index()
    sns.barplot(data=top_bowlers, x='wicket_kind', y='bowler', palette='Reds')
    plt.title('Top 10 Bowlers by Wickets')
    plt.savefig(f"{output_dir}/top_bowlers.png", bbox_inches='tight')

    # 5. Extras Breakdown
    plt.figure()
    sns.countplot(data=df[df['extras_type'].notnull()], x='extras_type', palette='Set2')
    plt.title('Extras Distribution by Type')
    plt.savefig(f"{output_dir}/extras_breakdown.png", bbox_inches='tight')

    # 6. Partnership Heatmap
    plt.figure(figsize=(10, 8))
    partnerships = df.groupby(['batter', 'non_striker'])['runs_total'].sum().unstack().fillna(0)
    sns.heatmap(partnerships, cmap='YlGnBu')
    plt.title('Partnership Runs Heatmap')
    plt.savefig(f"{output_dir}/partnership_heatmap.png", bbox_inches='tight')

    # 7. Venue-wise Run Totals
    plt.figure()
    venue_runs = df.groupby('venue')['runs_total'].sum().nlargest(10).reset_index()
    sns.barplot(data=venue_runs, x='runs_total', y='venue', palette='magma')
    plt.title('Top Venues by Total Runs')
    plt.savefig(f"{output_dir}/venue_runs.png", bbox_inches='tight')

    # 8. Dismissal Types
    plt.figure()
    sns.countplot(data=wickets, y='wicket_kind', order=wickets['wicket_kind'].value_counts().index, palette='Spectral')
    plt.title('Types of Dismissals')
    plt.savefig(f"{output_dir}/dismissal_types.png", bbox_inches='tight')

    # 9. Fielders Involved
    plt.figure()
    fielders_series = df['fielders'].dropna().explode()
    top_fielders = fielders_series.value_counts().nlargest(10).reset_index()
    top_fielders.columns = ['fielder', 'count']  # Rename for clarity
    sns.barplot(data=top_fielders, x='count', y='fielder', palette='cubehelix')
    plt.title('Top Fielders Involved in Dismissals')
    plt.savefig(f"{output_dir}/fielders_involved.png", bbox_inches='tight')

    # 10. Toss Decision vs Outcome
    plt.figure()
    toss_outcome = df.drop_duplicates('match_id')[['toss_decision', 'winner', 'toss_winner']]
    toss_outcome['won_match'] = toss_outcome['toss_winner'] == toss_outcome['winner']
    sns.countplot(data=toss_outcome, x='toss_decision', hue='won_match')
    plt.title('Toss Decision Impact on Match Outcome')
    plt.savefig(f"{output_dir}/toss_impact.png", bbox_inches='tight')

    plt.close('all')

def create_ppt_from_visuals(image_dir="plots", ppt_path="Cricket_Analytics_Report.pptx"):
    prs = Presentation()
    title_slide_layout = prs.slide_layouts[0]
    slide = prs.slides.add_slide(title_slide_layout)
    slide.shapes.title.text = "Cricket Analytics Report"
    slide.placeholders[1].text = "Generated from delivery-level match data"

    for img_file in sorted(os.listdir(image_dir)):
        if img_file.endswith(".png"):
            slide = prs.slides.add_slide(prs.slide_layouts[5])
            slide.shapes.title.text = img_file.replace("_", " ").replace(".png", "").title()
            slide.shapes.add_picture(os.path.join(image_dir, img_file), Inches(1), Inches(1.5), width=Inches(8))

    prs.save(ppt_path)

# Usage
generate_visualizations(test_df)
create_ppt_from_visuals()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=top_batters, x='runs_batter', y='batter', palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=top_bowlers, x='wicket_kind', y='bowler', palette='Reds')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df[df['extras_type'].notnull()], x='extras_type', palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=venue_runs, x='runs_total', y='venue', palette='magma')

Passing `pal