In [None]:
import pandas as pd

# Load the raw batting table; the file is decoded as Latin-1.
batting = pd.read_csv(
    "BATTING.csv",
    encoding="latin1",
)


In [None]:
# Preview the first rows of the freshly loaded frame (column names are still raw here).
batting.head()

In [None]:
# Normalise column labels to snake_case: lower-case first, then swap
# spaces for underscores so later cells can use plain identifiers.
lowercase_labels = batting.columns.str.lower()
batting.columns = lowercase_labels.str.replace(" ", "_")

In [None]:
# Columns that should hold numbers. Anything that cannot be parsed
# becomes NaN (errors='coerce') instead of raising.
numeric_cols = ['runs', 'balls', 'minutes', 'fours', 'sixes', 'strick_rate']

batting[numeric_cols] = batting[numeric_cols].apply(pd.to_numeric, errors='coerce')


In [None]:
# Print the frame summary to check the result of the numeric coercion above.
batting.info()


In [None]:
# Preview the first rows again, now with normalised column names and coerced numeric columns.
batting.head()

In [None]:
# Replace missing values in the numeric stat columns with 0.
# NOTE(review): this also zero-fills 'strick_rate', so rows with a missing
# strike rate count as 0 in the per-player / per-team means computed in
# later cells — confirm that is intended.
stat_cols = ['runs', 'balls', 'minutes', 'fours', 'sixes', 'strick_rate']
batting[stat_cols] = batting[stat_cols].fillna(0)


In [None]:
# Total runs per player (grouped on the 'batting' column), then keep
# the five largest totals.
runs_by_player = batting.groupby('batting')['runs'].sum()
top_scorers = runs_by_player.sort_values(ascending=False).head(5)

top_scorers


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the Series (index = player, values = total runs) into a
# two-column DataFrame so it can be passed to seaborn via `data=`.
top_scorers_df = top_scorers.reset_index()
top_scorers_df.columns = ["Player", "Runs"]

plt.figure(figsize=(5, 2))
sns.barplot(
    x="Runs",
    y="Player",
    data=top_scorers_df
)

# The title count is derived from the data: the previous hard-coded
# "Top 10" did not match the five players actually plotted.
# "\u2013" is an en dash; the escape replaces a mis-decoded byte sequence
# that rendered as garbage characters in the figure title.
plt.title(f"Top {len(top_scorers_df)} Run Scorers \u2013 WC23")
plt.xlabel("Total Runs")
plt.ylabel("Batsman")
plt.tight_layout()
plt.show()


In [None]:
# Per-player totals: runs scored and number of distinct matches played.
runs_per_match = batting.groupby('batting').agg(
    total_runs=pd.NamedAgg(column='runs', aggfunc='sum'),
    matches=pd.NamedAgg(column='match_id', aggfunc='nunique'),
)

# Average runs per distinct match.
runs_per_match['runs_per_match'] = runs_per_match['total_runs'].div(
    runs_per_match['matches']
)

# Display the five best averages among players with at least 5 matches.
(
    runs_per_match
    .loc[runs_per_match['matches'] >= 5]
    .sort_values('runs_per_match', ascending=False)
    .head(5)
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Five best runs-per-match averages among players with at least 5 matches;
# reset_index() turns the player name (group key) back into a column.
rpm_top5 = (
    runs_per_match[runs_per_match['matches'] >= 5]
    .sort_values('runs_per_match', ascending=False)
    .head(5)
    .reset_index()
)

sns.set_theme(style="whitegrid")
plt.figure(figsize=(8, 3))

# hue is set to the same column as y (with the legend hidden) so that
# `palette` is applied per bar.
ax = sns.barplot(
    x="runs_per_match",
    y="batting",
    hue="batting",
    data=rpm_top5,
    palette="magma",
    legend=False
)

# "\u2013" is an en dash; the escape replaces a mis-decoded byte sequence
# that rendered as garbage characters in the figure title.
ax.set_title(
    "Top 5 Batsmen by Runs per Match (Min 5 Matches) \u2013 WC23",
    fontsize=14,
    weight="bold"
)
ax.set_xlabel("Runs per Match")
ax.set_ylabel("Batsman")

# Value labels at the end of each bar, one decimal place.
for container in ax.containers:
    ax.bar_label(container, fmt="%.1f", padding=3)

sns.despine(left=True, bottom=True)
plt.tight_layout()
plt.show()


In [None]:
# Per-player totals plus the mean of the per-row strike rates.
player_totals = batting.groupby('batting').agg(
    total_runs=('runs', 'sum'),
    avg_strike_rate=('strick_rate', 'mean'),
    matches=('match_id', 'nunique'),
)

# Keep only players with at least 5 matches and at least 200 runs,
# then take the five highest average strike rates.
meets_threshold = (player_totals['matches'] >= 5) & (player_totals['total_runs'] >= 200)
sr_runs = (
    player_totals[meets_threshold]
    .sort_values('avg_strike_rate', ascending=False)
    .head(5)
)

sr_runs


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Move the player name out of the index so it is available as a label column.
sr_top5 = sr_runs.reset_index()

sns.set_theme(style="whitegrid")
plt.figure(figsize=(6, 4))  # compact figure

ax = sns.scatterplot(
    x="avg_strike_rate",
    y="total_runs",
    data=sr_top5,
    s=80,
    color="steelblue",
)

# Title and axis labels
ax.set_title("Strike Rate vs Runs (Min 5 Matches)", fontsize=11, weight="bold")
ax.set_xlabel("Average Strike Rate", fontsize=9)
ax.set_ylabel("Total Runs", fontsize=9)

# Annotate every point with the player name, nudged slightly up and to
# the right so the text does not sit on top of the marker.
label_triples = zip(
    sr_top5["avg_strike_rate"],
    sr_top5["total_runs"],
    sr_top5["batting"],
)
for strike_rate, runs, player in label_triples:
    ax.text(strike_rate + 0.2, runs + 3, player, fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
# Team-level summary: total runs, mean of per-row strike rates, and
# number of distinct matches, ordered by total runs (highest first).
team_totals = batting.groupby('batting_team').agg(
    total_runs=('runs', 'sum'),
    avg_strike_rate=('strick_rate', 'mean'),
    matches=('match_id', 'nunique'),
)
team_batting = team_totals.sort_values('total_runs', ascending=False)

team_batting


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Turn the team name (group key) back into a regular column for plotting.
team_batting_df = team_batting.reset_index()

sns.set_theme(style="whitegrid")
plt.figure(figsize=(7, 4))   # small and neat

# hue is set to the same column as y (with the legend hidden) so that
# `palette` is applied per bar.
ax = sns.barplot(
    x="total_runs",
    y="batting_team",
    hue="batting_team",
    data=team_batting_df,
    palette="crest",
    legend=False
)

# Title and axis labels.
# "\u2013" is an en dash; the escape replaces a mis-decoded byte sequence
# that rendered as garbage characters in the figure title.
ax.set_title(
    "Total Runs by Team \u2013 WC23",
    fontsize=11,
    weight="bold"
)
ax.set_xlabel("Total Runs", fontsize=9)
ax.set_ylabel("Team", fontsize=9)

# Value labels at the end of each bar.
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=8)

sns.despine(left=True, bottom=True)
plt.tight_layout()
plt.show()


In [None]:
# Total runs contributed from each batting position, largest first.
runs_by_position = batting.groupby('batting_position')['runs'].sum()
position_runs = runs_by_position.sort_values(ascending=False)

position_runs


In [None]:
# Per-player totals of fours and sixes.
boundary_stats = batting.groupby('batting')[['fours', 'sixes']].sum()

# Combined boundary count (fours + sixes).
boundary_stats['total_boundaries'] = boundary_stats['fours'].add(
    boundary_stats['sixes']
)

# Ten players with the most boundaries.
boundary_ranking = boundary_stats.sort_values('total_boundaries', ascending=False)
boundary_ranking.head(10)


In [None]:
# Per-player totals: balls faced and number of distinct matches played.
balls_per_match = (
    batting
    .groupby('batting')
    .agg(
        total_balls=('balls', 'sum'),
        matches=('match_id', 'nunique')
    )
)

# Keep players with at least 5 matches. The explicit .copy() makes the
# filtered result an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
balls_per_match = balls_per_match[balls_per_match['matches'] >= 5].copy()

# Average number of balls faced per distinct match.
balls_per_match['balls_faced_per_match'] = (
    balls_per_match['total_balls'] / balls_per_match['matches']
)

balls_per_match.sort_values('balls_faced_per_match', ascending=False).head(10)


In [None]:
# Per-player aggregates that feed the composite batting rating.
# strike_rate is the mean of the per-row 'strick_rate' values; every other
# metric is a sum, and matches counts distinct match ids.
batting_rating = (
    batting
    .groupby('batting')
    .agg(
        runs=('runs', 'sum'),
        strike_rate=('strick_rate', 'mean'),
        sixes=('sixes', 'sum'),
        fours=('fours', 'sum'),
        minutes=('minutes', 'sum'),
        balls=('balls', 'sum'),
        matches=('match_id', 'nunique')
    )
)

# Keep players with at least 5 matches. The explicit .copy() makes the
# filtered result an independent frame: later cells add columns to
# `batting_rating`, which would otherwise raise SettingWithCopyWarning.
batting_rating = batting_rating[batting_rating['matches'] >= 5].copy()


In [None]:
# Min-max scale each metric into a new "<metric>_n" column in [0, 1].
for col in ['runs', 'strike_rate', 'sixes', 'fours', 'minutes', 'balls']:
    col_min = batting_rating[col].min()
    col_span = batting_rating[col].max() - col_min
    if col_span == 0:
        # Every player has the same value, so the metric cannot rank anyone.
        # Dividing would give 0/0 = NaN and turn every composite rating
        # that uses this column into NaN; contribute 0 instead.
        batting_rating[col + '_n'] = 0.0
    else:
        batting_rating[col + '_n'] = (batting_rating[col] - col_min) / col_span


In [None]:
# Composite rating: weighted sum of the normalised "<metric>_n" columns.
# The terms are accumulated in the order listed here.
rating_weights = [
    ('runs_n', 0.35),
    ('strike_rate_n', 0.25),
    ('sixes_n', 0.15),
    ('fours_n', 0.15),
    ('minutes_n', 0.05),
    ('balls_n', 0.05),
]

batting_rating['batting_rating'] = sum(
    weight * batting_rating[column] for column, weight in rating_weights
)


In [None]:
# Five highest composite ratings.
rating_ranking = batting_rating.sort_values('batting_rating', ascending=False)
rating_ranking.head(5)


In [None]:
# Names of everyone who has at least one row with batting_team == 'India'.
plays_for_india = batting['batting_team'] == 'India'
india_players = batting.loc[plays_for_india, 'batting'].unique()

# Restrict the rating table (indexed by player name) to those players
# and keep the ten highest ratings.
is_india_player = batting_rating.index.isin(india_players)
india_batting_rating = (
    batting_rating[is_india_player]
    .sort_values('batting_rating', ascending=False)
    .head(10)
)

india_batting_rating


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two-column frame (player name, rating) for plotting.
india_df = (
    india_batting_rating
    .reset_index()[["batting", "batting_rating"]]
)
india_df.columns = ["Player", "Batting_Rating"]

sns.set_theme(style="whitegrid")
plt.figure(figsize=(7, 4.5))

# Highlight the top 3 bars with a darker colour.
colors = ["#1f77b4" if i < 3 else "#aec7e8" for i in range(len(india_df))]

plt.barh(
    india_df["Player"],
    india_df["Batting_Rating"],
    color=colors
)

# barh draws the first row at the bottom; invert so the best player is on top.
plt.gca().invert_yaxis()

# The count is taken from the data, because the min-matches filter can leave
# fewer than ten players. "\u2013" is an en dash; the escape replaces a
# mis-decoded byte sequence that rendered as garbage characters in the title.
plt.title(
    f"Top {len(india_df)} Indian Batsmen by Batting Rating \u2013 WC23",
    fontsize=12,
    weight="bold"
)
plt.xlabel("Batting Rating", fontsize=9)
plt.ylabel("Batsman", fontsize=9)

# Value labels. The rating is a weighted sum of min-max-normalised metrics
# whose weights add up to 1, so it lies in [0, 1]. The offset and precision
# are scaled to that range: an offset of 0.5 would push the text half an
# axis-width past the bar, and one decimal would collapse nearby ratings.
for i, value in enumerate(india_df["Batting_Rating"]):
    plt.text(value + 0.01, i, f"{value:.2f}", va="center", fontsize=8)

sns.despine(left=True, bottom=True)
plt.tight_layout()
plt.show()

