# Mumbai Indians: The Ultimate Analytical Deep Dive

## Overview
This notebook provides an exhaustive, 360-degree analysis of the Mumbai Indians (MI) franchise. It goes beyond basic stats to uncover:

1.  **Batting Stability Index**: Measuring consistency vs explosiveness.
2.  **Chasing vs Defending Mastery**: Win/loss record when batting first vs batting second.
3.  **Boundary Distribution**: Which players rely on boundaries vs running?
4.  **Wicket-Taking Timelines**: Probabilistic density of falling wickets.
5.  **Player Phase Performance**: A featured batter's runs and strike rate across match phases (RG Sharma).
6.  **Toss Decisions**: Trend of toss decisions over the years.

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot styling: the seaborn theme is applied first, then the
# matplotlib defaults below are set on top of it.
sns.set_theme(style="whitegrid")
palette_mi = ["#004BA0", "#D1AB3E"]  # MI Blue & Gold
plt.rcParams.update({
    "figure.figsize": (14, 8),
    "font.size": 12,
})

In [None]:
# --- LOAD CLEAN DATA ---
# Only the two file reads live inside the try block, so the handler below is
# reached for a missing CSV and nothing else.
try:
    # low_memory=False prevents DtypeWarning for mixed types (e.g., season strings vs ints)
    df_fact = pd.read_csv("MI_Fact_Deliveries.csv", low_memory=False)
    df_matches = pd.read_csv("MI_Dim_Matches.csv", low_memory=False)
except FileNotFoundError:
    print("❌ Data not found. Please run the initial preparation notebook first.")
    # Re-raise: every later cell needs df_fact / df_matches, so continuing
    # would only surface as a NameError further down the notebook.
    raise

print("Data loaded successfully.")

# --- DATA INTEGRITY FIX ---
# The Match Dimension table might be missing 'toss_decision' depending on the aggregation step.
# When it is, recover the column from the Fact table.
if "toss_decision" not in df_matches.columns:
    print("⚠️ 'toss_decision' column missing in Matches Dim. Retrieving from Fact table...")
    if "toss_decision" in df_fact.columns:
        # Keep exactly one non-null decision per match. De-duplicating on the
        # (match_id, toss_decision) pair could leave several rows for one
        # match, and the left merge would then duplicate match rows.
        toss_info = (
            df_fact[["match_id", "toss_decision"]]
            .dropna(subset=["toss_decision"])
            .drop_duplicates(subset="match_id")
        )
        df_matches = pd.merge(df_matches, toss_info, on="match_id", how="left")
        print("✓ Fixed: merged 'toss_decision' into matches dataframe.")
    else:
        print("⚠️ 'toss_decision' is missing from the Fact table as well; toss-based sections cannot run.")

## 1. Batting Stability Analysis (Avg vs SR)
**Insight**: Segmenting batters into 'Anchors', 'Aggressors', and 'Finishers'.

In [None]:
# Filter MI Batting: keep only deliveries where Mumbai Indians were batting.
mi_batting = df_fact[df_fact["batting_team"] == "Mumbai Indians"].copy()

# Per-batter totals: runs, balls faced and number of distinct matches batted in.
batter_stats = mi_batting.groupby("batter").agg(
    runs=("runs_batter", "sum"),
    balls=("balls_faced", "sum"),
    innings=("match_id", "nunique"),
).reset_index()

# Dismissals are counted per name in `player_out` rather than per striker.
# Counting non-null `player_out` inside the `batter` groupby charges a
# dismissal to whoever faced the ball, even when the player out was someone else.
# NOTE(review): assumes `player_out` holds the dismissed player's name, spelled
# the same way as in `batter` — TODO confirm against the preparation notebook.
dismissals = mi_batting["player_out"].value_counts()
batter_stats["outs"] = batter_stats["batter"].map(dismissals).fillna(0).astype(int)

# Filter for established players (>300 runs)
batter_stats = batter_stats[batter_stats["runs"] > 300].copy()

# Metric Calculations
# A batter who was never dismissed gets runs / 1, i.e. the average equals the run total.
batter_stats["average"] = batter_stats["runs"] / batter_stats["outs"].replace(0, 1)
batter_stats["strike_rate"] = (batter_stats["runs"] / batter_stats["balls"]) * 100

fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(
    data=batter_stats,
    x="average",
    y="strike_rate",
    size="runs",
    sizes=(100, 1000),
    color="#004BA0",
    alpha=0.7,
    ax=ax,
)

# Quadrant Lines: the mean average and mean strike rate split the plot in four.
avg_mean = batter_stats["average"].mean()
sr_mean = batter_stats["strike_rate"].mean()
ax.axvline(x=avg_mean, color='gray', linestyle='--', label=f"Avg Mean ({avg_mean:.1f})")
ax.axhline(y=sr_mean, color='gray', linestyle='--', label=f"SR Mean ({sr_mean:.1f})")

# Annotations: label each point with the batter's name, nudged to the right of the marker.
for row in batter_stats.itertuples(index=False):
    ax.text(
        row.average + 0.5,
        row.strike_rate,
        row.batter,
        horizontalalignment='left',
        size='small',
        color='black',
    )

ax.set_title("Batting Stability Matrix: Average vs Strike Rate", fontsize=16, fontweight='bold')
ax.set_xlabel("Batting Average")
ax.set_ylabel("Strike Rate")
ax.legend()
plt.show()

## 2. Match Win Analysis: Chase vs Defend
**Insight**: Is MI a better Chasing team or Defending team?

In [None]:
# Use df_matches which filters strictly for matches involving MI (from previous steps)
mi_matches = df_matches.copy()

# Safety check for columns needed
required_cols = ["toss_winner", "toss_decision", "match_won_by"]
missing_cols = [c for c in required_cols if c not in mi_matches.columns]

if not missing_cols:
    # Drop matches where any of the required fields is missing. Otherwise a
    # missing toss decision would be classified as "Chasing" and a missing
    # winner would be counted as "Lost".
    # NOTE(review): a no-result match stored as a placeholder string in
    # `match_won_by` is still counted as "Lost" — confirm how those are encoded.
    mi_matches = mi_matches.dropna(subset=required_cols).copy()

    # MI batted first if they won the toss and chose to bat, or if the
    # opponent won the toss and chose to field.
    mi_won_toss = mi_matches["toss_winner"] == "Mumbai Indians"
    chose_bat = mi_matches["toss_decision"] == "bat"
    chose_field = mi_matches["toss_decision"] == "field"
    mi_matches["bat_first"] = (mi_won_toss & chose_bat) | (~mi_won_toss & chose_field)
    mi_matches["result"] = np.where(mi_matches["match_won_by"] == "Mumbai Indians", "Won", "Lost")

    # Reindex so both outcome columns always exist; without it the melt below
    # raises KeyError when one outcome never occurs in the data.
    win_summary = (
        mi_matches.groupby(["bat_first", "result"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=["Won", "Lost"], fill_value=0)
        .reset_index()
    )
    win_summary["bat_first_label"] = win_summary["bat_first"].map({True: "Defending (Bat 1st)", False: "Chasing (Bat 2nd)"})

    # Plotting: reshape to long form so seaborn can draw Won/Lost side by side.
    win_summary_melt = win_summary.melt(id_vars="bat_first_label", value_vars=["Won", "Lost"], var_name="Outcome", value_name="Count")

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=win_summary_melt, x="bat_first_label", y="Count", hue="Outcome", palette={"Won": "#004BA0", "Lost": "#D32F2F"}, ax=ax)
    ax.set_title("MI Win Record: Chasing vs Defending", fontsize=16, fontweight='bold')
    ax.set_xlabel("Match Scenario")
    ax.set_ylabel("Number of Matches")
    plt.show()
else:
    print(f"⚠️ Error: Missing columns for analysis: {missing_cols}")

## 3. Boundary Reliance Analysis
**Insight**: Who deals in boundaries? (Boundary % of Total Runs)

In [None]:
# Runs that came from a 4 or a 6; every other delivery contributes 0.
# Vectorised replacement for a row-wise apply over the whole deliveries table.
is_boundary = mi_batting["runs_batter"].isin([4, 6])
mi_batting["boundary_runs"] = mi_batting["runs_batter"].where(is_boundary, 0)

boundary_stats = mi_batting.groupby("batter").agg(
    total_runs=("runs_batter", "sum"),
    boundary_runs=("boundary_runs", "sum")
).reset_index()

# Keep batters with more than 500 runs, then rank by share of runs from boundaries.
boundary_stats = boundary_stats[boundary_stats["total_runs"] > 500].copy()
boundary_stats["boundary_pct"] = (boundary_stats["boundary_runs"] / boundary_stats["total_runs"]) * 100
boundary_stats = boundary_stats.sort_values("boundary_pct", ascending=False).head(15)

fig, ax = plt.subplots(figsize=(12, 6))
# `palette` without `hue` is deprecated in recent seaborn releases, so the
# category variable is passed as hue as well. dodge=False keeps one full-width
# bar per batter on versions that would otherwise dodge by hue.
sns.barplot(
    data=boundary_stats,
    x="boundary_pct",
    y="batter",
    hue="batter",
    dodge=False,
    palette="YlOrBr_r",
    ax=ax,
)
# The legend would only repeat the y-axis labels; remove it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title("Highest Boundary Reliability (Run % from 4s & 6s)", fontsize=16, fontweight='bold')
ax.set_xlabel("Percentage of Runs from Boundaries")
ax.set_xlim(0, 100)
plt.show()

## 4. Wicket Timeline (Density Plot)
**Insight**: When does MI tend to lose wickets? (Early collapse vs Death over sacrifice)

In [None]:
# MI batting deliveries flagged as a wicket.
wickets = mi_batting.loc[mi_batting["is_wicket"] == 1]

# Smoothed distribution of the over in which those wickets fell.
fig, ax = plt.subplots(figsize=(14, 6))
sns.kdeplot(
    data=wickets,
    x="over",
    fill=True,
    color="#D32F2F",
    alpha=0.6,
    bw_adjust=0.5,
    ax=ax,
)
ax.set_title("Wicket Fall Density: When Does MI Lose Wickets?", fontsize=16, fontweight='bold')
ax.set_xlabel("Over Number")
ax.set_xlim(0, 20)
ax.set_xticks(range(1, 21))
ax.set_ylabel("Density Prediction")
plt.show()

## 5. Player Focus: RG Sharma's Performance by Match Phase
**Insight**: Analyzing Rohit Sharma's performance across different match phases.

In [None]:
# Deliveries faced by the featured batter while batting for MI.
target_batter = "RG Sharma"
player_data = mi_batting.loc[mi_batting["batter"] == target_batter]

# Per-phase totals: runs scored, balls faced and wicket deliveries.
phase_perf = (
    player_data.groupby("phase")
    .agg(
        runs=("runs_batter", "sum"),
        balls=("balls_faced", "sum"),
        outs=("is_wicket", "sum"),
    )
    .reset_index()
)

# Derived rates; a phase with zero outs divides by 1, so its average equals its runs.
phase_perf = phase_perf.assign(
    strike_rate=lambda d: (d["runs"] / d["balls"]) * 100,
    average=lambda d: d["runs"] / d["outs"].replace(0, 1),
)

# Bars show total runs on the left axis; the line shows strike rate on a twin right axis.
fig, ax1 = plt.subplots(figsize=(12, 6))
sns.barplot(data=phase_perf, x="phase", y="runs", color="#8da0cb", alpha=0.6, ax=ax1)
ax1.set_ylabel("Total Runs Scored", color="#8da0cb")

ax2 = ax1.twinx()
sns.lineplot(
    data=phase_perf,
    x="phase",
    y="strike_rate",
    color="#fc8d62",
    marker="o",
    linewidth=3,
    ax=ax2,
)
ax2.set_ylabel("Strike Rate", color="#fc8d62")

plt.title(f"{target_batter} Performance Analysis by Phase", fontsize=16, fontweight='bold')
plt.show()

## 6. Toss Decision Trends Over Years
**Insight**: How has MI's strategy changed? (Batting first vs Fielding first)

In [None]:
# Matches in which Mumbai Indians won the toss.
toss_won_mask = df_matches["toss_winner"] == "Mumbai Indians"
mi_toss_won = df_matches.loc[toss_won_mask].copy()

# Cast season labels to strings so they group under a single dtype.
if "season" in mi_toss_won:
    mi_toss_won["season"] = mi_toss_won["season"].astype(str)

# Rows: season; columns: toss decision; values: number of matches.
toss_trend = (
    mi_toss_won.groupby(["season", "toss_decision"])
    .size()
    .unstack(fill_value=0)
)

ax = toss_trend.plot(kind="bar", stacked=True, figsize=(14, 7), color=["#66b3ff", "#ffcc99"])
ax.set_title("Toss Decision Trends Over Years (When MI Won Toss)", fontsize=16, fontweight='bold')
ax.set_xlabel("Season")
ax.set_ylabel("Frequency")
ax.legend(title="Decision")
plt.show()