In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Palette and typeface shared by every figure in this notebook.
ACCENT = "#4E79A7"
FONT   = "DejaVu Sans"

# Notebook-wide matplotlib defaults: figure and text sizes, an open frame
# (no top/right spines) and a faint dashed grid drawn behind the data.
plt.rcParams.update({
    "figure.figsize": (9,5),
    "font.family": FONT,
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    **dict.fromkeys(("xtick.labelsize", "ytick.labelsize"), 10),
    **dict.fromkeys(("axes.spines.right", "axes.spines.top"), False),
    "axes.grid": True,
    "axes.axisbelow": True,
    "grid.linestyle": "--",
    "grid.alpha": 0.35,
})

def bar_with_labels(labels, values, title, ylabel="Win Rate", ylim=(0, 0.5), outfile=None):
    """Draw a bar chart with a numeric label above each bar, then show it.

    Parameters
    ----------
    labels : sequence
        One tick label per bar, in plotting order.
    values : sequence of float
        Bar heights, same length as ``labels``.
    title : str
        Axes title.
    ylabel : str, default "Win Rate"
        Y-axis label.
    ylim : (float, float) or None, default (0, 0.5)
        Requested y-axis limits. If the tallest bar plus its value label would
        not fit, the upper limit is raised so nothing is cut off. A falsy
        value leaves matplotlib's autoscaling in place.
    outfile : str or None, default None
        When given, the figure is also saved to this path (200 dpi, tight bbox).

    Returns
    -------
    None
    """
    label_pad = 0.01  # vertical gap between the bar top and its value label
    heights = np.asarray(values, dtype=float)
    positions = np.arange(len(heights))

    fig, ax = plt.subplots()
    # Bars are placed at explicit numeric positions so that the tick labels
    # can be attached to fixed tick locations below.
    bars = ax.bar(positions, heights, color=ACCENT, alpha=0.9)
    ax.set_title(title)
    ax.set_ylabel(ylabel)

    if ylim:
        lower, upper = ylim
        finite = heights[np.isfinite(heights)]
        # Only widen the axis when the data would otherwise be clipped; for
        # in-range data the requested limits are used untouched.
        if finite.size and float(finite.max()) + label_pad > upper:
            upper = (float(finite.max()) + label_pad) * 1.1
        ax.set_ylim(lower, upper)

    # Fix the tick positions first, then label them (set_xticklabels on its
    # own does not guarantee labels line up with the bars).
    ax.set_xticks(positions)
    ax.set_xticklabels([str(lab) for lab in labels], rotation=30, ha="right")

    # value labels
    for bar, height in zip(bars, heights):
        if not np.isfinite(height):
            continue  # nothing meaningful to print for a missing value
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_pad, f"{height:.3f}",
                ha="center", va="bottom", fontsize=10)

    fig.tight_layout()
    if outfile:
        fig.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.show()

def line_markers(x, y, title, xlabel="Season", ylabel="Win Rate", ylim=(0,0.5), outfile=None):
    """Draw a line chart with point markers, annotate every second point, then show it.

    Parameters
    ----------
    x : sequence
        X coordinates (e.g. seasons), in plotting order.
    y : sequence of float
        Y values, same length as ``x``.
    title : str
        Axes title.
    xlabel : str, default "Season"
        X-axis label.
    ylabel : str, default "Win Rate"
        Y-axis label.
    ylim : (float, float) or None, default (0, 0.5)
        Requested y-axis limits. If the highest point plus its annotation would
        not fit, the upper limit is raised so the line is not cut off. A falsy
        value leaves matplotlib's autoscaling in place.
    outfile : str or None, default None
        When given, the figure is also saved to this path (200 dpi, tight bbox).

    Returns
    -------
    None
    """
    label_pad = 0.01  # vertical gap between a point and its annotation
    heights = np.asarray(y, dtype=float)

    fig, ax = plt.subplots()
    ax.plot(x, y, marker="o", color=ACCENT, linewidth=2)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if ylim:
        lower, upper = ylim
        finite = heights[np.isfinite(heights)]
        # Only widen the axis when the data would otherwise be clipped; for
        # in-range data the requested limits are used untouched.
        if finite.size and float(finite.max()) + label_pad > upper:
            upper = (float(finite.max()) + label_pad) * 1.1
        ax.set_ylim(lower, upper)

    # annotate every 2nd point to stay clean
    for xi, yi in list(zip(x, heights))[::2]:
        if not np.isfinite(yi):
            continue  # skip gaps rather than printing "nan"
        ax.text(xi, yi + label_pad, f"{yi:.2f}", ha="center", fontsize=9)

    fig.tight_layout()
    if outfile:
        fig.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.show()


# Load the match table and derive, for every decided match between teams from
# different divisions, whether the lower-division side won (`_lower_win`).
CSV_PATH = "dfb_matches.csv"
df = pd.read_csv(CSV_PATH)

# ensure numeric (unparseable entries become NaN)
for c in ["home_score","away_score","home_division","away_division","round","season"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Winner side per match; equal or missing scores leave the entry unset.
winner = pd.Series(index=df.index, dtype=object)
winner[df["home_score"] > df["away_score"]] = "home"
winner[df["home_score"] < df["away_score"]] = "away"
df["_winner_side"] = winner

# Keep only matches with a decided winner.
used = df[df["_winner_side"].isin(["home","away"])].copy()

# A missing division must be removed explicitly: NaN != x evaluates to True, so
# such rows would pass the "different divisions" filter below, and NaN > x
# evaluates to False, so they would then be counted with "away" as the lower side.
used = used.dropna(subset=["home_division", "away_division"])
used = used[used["home_division"] != used["away_division"]].copy()

# lower-division result (the larger division number is treated as the lower side)
used["_lower_side"] = np.where(used["home_division"] > used["away_division"], "home", "away")
used["_lower_win"]  = (used["_lower_side"] == used["_winner_side"]).astype(int)


# Win rate by round
# Numeric round codes 1..6 are translated to display names; the same list fixes
# the left-to-right order of the bars.

order_rounds = ["Round 1","Round 2","Round 3","Quarterfinal","Semifinal","Final"]
round_map = dict(zip(range(1, 7), order_rounds))
used["_round_name"] = used["round"].map(round_map)

# Mean of the 0/1 flag per round; rounds without any match come out as NaN.
rates_round = used.groupby("_round_name")["_lower_win"].mean().reindex(order_rounds)

# Rounds with no data are drawn as zero-height bars.
bar_with_labels(
    labels=list(rates_round.index),
    values=rates_round.fillna(0).to_numpy(),
    title="Lower Division Win Rate by Tournament Round",
    outfile="rq1_winrate_by_round.png",
)
print("RQ1 rates:\n", rates_round)


# Win rate by division gap
# The gap is the absolute difference between the two division numbers.

used["_div_gap"] = used["home_division"].sub(used["away_division"]).abs()
rates_gap = used.groupby("_div_gap")["_lower_win"].mean().sort_index()

gap_labels = list(map(str, rates_gap.index.tolist()))
bar_with_labels(
    labels=gap_labels,
    values=rates_gap.to_numpy(),
    title="Lower Division Win Rate by Division Gap",
    outfile="rq2_winrate_by_gap.png",
)
print("RQ2 rates:\n", rates_gap)


# Win rate by season
# One point per season: the mean of the 0/1 lower-division-win flag.

trend = used.groupby("season")["_lower_win"].mean().sort_index()

line_markers(
    trend.index.to_numpy(),
    trend.to_numpy(),
    "Lower Division Win Rate by Season",
    xlabel="Season",
    outfile="rq4_winrate_by_season.png",
)

from statsmodels.stats.proportion import proportion_confint
from scipy.stats import chi2_contingency, spearmanr
import matplotlib.pyplot as plt

# 95% binomial CIs for win rates by ROUND
round_order = ["Round 1","Round 2","Round 3","Quarterfinal","Semifinal","Final"]
round_grp = used.groupby("_round_name")["_lower_win"]
round_sum = round_grp.sum()      # lower-division wins per round
round_cnt = round_grp.count()    # matches per round
round_rate = (round_sum / round_cnt).reindex(round_order)

# Counts aligned to the fixed round order; rounds without matches become 0.
round_cnt_ordered = round_cnt.reindex(round_order)
wins_for_ci = round_sum.reindex(round_order).fillna(0).astype(int)
nobs_for_ci = round_cnt_ordered.fillna(0).astype(int)

# Wilson CI (better small-sample behavior)
ci_low, ci_high = proportion_confint(
    count=wins_for_ci,
    nobs=nobs_for_ci,
    alpha=0.05,
    method="wilson",
)

round_ci = pd.DataFrame(
    {
        "win_rate": round_rate.values,
        "n": round_cnt_ordered.values,
        "ci_low": ci_low,
        "ci_high": ci_high,
    },
    index=round_order,
)
print("Win rate by ROUND with 95% CI:\n", round_ci)


# 95% binomial CIs for win rates by DIVISION GAP
# Gap = absolute difference between the two division numbers.
used["_div_gap"] = (used["home_division"] - used["away_division"]).abs()
gap_grp = used.groupby("_div_gap")["_lower_win"]
gap_sum = gap_grp.sum()      # lower-division wins per gap
gap_cnt = gap_grp.count()    # matches per gap
gap_rate = gap_sum / gap_cnt

ci_low_g, ci_high_g = proportion_confint(count=gap_sum.astype(int),
                                         nobs=gap_cnt.astype(int),
                                         alpha=0.05, method="wilson")
gap_ci = pd.DataFrame({
    "win_rate": gap_rate.values,
    "n": gap_cnt.values,
    "ci_low": ci_low_g,
    "ci_high": ci_high_g
}, index=gap_rate.index.sort_values())
print("\nWin rate by DIVISION GAP with 95% CI:\n", gap_ci)

# Error-bar half-lengths. The interval bounds are floating-point results, so
# for a rate of exactly 0 or 1 a bound can land a hair on the wrong side of the
# point estimate; matplotlib rejects negative yerr values, hence the clamp at 0.
gap_err_below = (gap_ci["win_rate"] - gap_ci["ci_low"]).clip(lower=0)
gap_err_above = (gap_ci["ci_high"] - gap_ci["win_rate"]).clip(lower=0)

plt.figure(figsize=(9,5))
x = np.arange(len(gap_ci))
plt.bar(x, gap_ci["win_rate"], width=0.6)
plt.errorbar(x, gap_ci["win_rate"],
             yerr=[gap_err_below, gap_err_above],
             fmt='none', capsize=5)
plt.xticks(x, gap_ci.index.astype(str))
plt.title("Lower-Division Win Rate by Division Gap (with 95% CI)")
plt.ylabel("Win rate")
plt.tight_layout()
plt.savefig("rq2_winrate_by_gap_CI.png", dpi=160)
plt.show()

# Chi-square tests: Upset association with ROUND and with DIVISION GAP

# Round contingency, rows in tournament order (rounds without matches are NaN rows).
ct_round = pd.crosstab(used["_round_name"], used["_lower_win"]).reindex(round_order)
# A round with no matches would be an all-zero row, which makes an expected
# frequency zero and causes chi2_contingency to raise; test observed rounds only.
ct_round_obs = ct_round.fillna(0)
ct_round_obs = ct_round_obs.loc[ct_round_obs.sum(axis=1) > 0]
chi2_r, p_r, dof_r, exp_r = chi2_contingency(ct_round_obs)
print("\nChi-square test (Upset vs ROUND): chi2=%.3f, dof=%d, p=%.4g" % (chi2_r, dof_r, p_r))

# Division gap contingency (every row comes from observed data, so none is empty)
ct_gap = pd.crosstab(used["_div_gap"], used["_lower_win"]).sort_index()
chi2_g, p_g, dof_g, exp_g = chi2_contingency(ct_gap)
print("Chi-square test (Upset vs DIVISION GAP): chi2=%.3f, dof=%d, p=%.4g" % (chi2_g, dof_g, p_g))

# Simple trend test over SEASONS (Spearman over season-level rates)
season_rates = used.groupby("season")["_lower_win"].mean().sort_index()
rho, p_s = spearmanr(season_rates.index.values, season_rates.values)
print("\nSpearman trend (season vs win rate): rho=%.3f, p=%.4g" % (rho, p_s))

print("RQ4 rates:\n", trend)

print("\nSaved figures:")
print(" - rq1_winrate_by_round.png")
print(" - rq2_winrate_by_gap.png")
print(" - rq2_winrate_by_gap_CI.png")
print(" - rq4_winrate_by_season.png")


In [None]:
# Win-rate summaries with 95% CIs (by round, by division gap, by season)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.proportion import proportion_confint

# BY ROUND
# Wilson 95% interval for the lower-division win rate in each round, in
# tournament order. Rounds without matches get n = NaN and a NaN rate.
round_order = ["Round 1","Round 2","Round 3","Quarterfinal","Semifinal","Final"]
round_grp = used.groupby("_round_name")["_lower_win"]
round_sum = round_grp.sum()      # lower-division wins per round
round_cnt = round_grp.count()    # matches per round
round_rate = (round_sum / round_cnt).reindex(round_order)

ci_low_r, ci_high_r = proportion_confint(
    count=round_sum.reindex(round_order).fillna(0).astype(int),
    nobs=round_cnt.reindex(round_order).fillna(0).astype(int),
    alpha=0.05, method="wilson"
)

round_ci = pd.DataFrame({
    "n": round_cnt.reindex(round_order).values,
    "win_rate": round_rate.values,
    "ci_low": ci_low_r,
    "ci_high": ci_high_r
}, index=round_order)
print("Win rate by ROUND with 95% CI:\n", round_ci)

# Error-bar half-lengths. The interval bounds are floating-point results, so
# for a rate of exactly 0 or 1 a bound can land a hair on the wrong side of the
# point estimate; matplotlib rejects negative yerr values, hence the clamp at 0.
round_err_below = (round_ci["win_rate"] - round_ci["ci_low"]).clip(lower=0)
round_err_above = (round_ci["ci_high"] - round_ci["win_rate"]).clip(lower=0)

plt.figure(figsize=(9,5))
x = np.arange(len(round_ci))
plt.bar(x, round_ci["win_rate"], width=0.6)
plt.errorbar(x, round_ci["win_rate"],
             yerr=[round_err_below, round_err_above],
             fmt='none', capsize=5)
plt.xticks(x, round_ci.index, rotation=0)
plt.ylabel("Win rate (lower division)")
plt.title("Lower-Division Win Rate by Round (95% CI)")
plt.tight_layout()
plt.savefig("rq1_winrate_by_round_CI.png", dpi=160)
plt.show()

# BY DIVISION GAP
# Wilson 95% interval for the lower-division win rate at each division gap
# (absolute difference between the two division numbers).
used["_div_gap"] = (used["home_division"] - used["away_division"]).abs()
gap_grp = used.groupby("_div_gap")["_lower_win"]
gap_sum = gap_grp.sum()      # lower-division wins per gap
gap_cnt = gap_grp.count()    # matches per gap
gap_rate = gap_sum / gap_cnt

ci_low_g, ci_high_g = proportion_confint(
    count=gap_sum.astype(int), nobs=gap_cnt.astype(int),
    alpha=0.05, method="wilson"
)

gap_ci = pd.DataFrame({
    "n": gap_cnt.values,
    "win_rate": gap_rate.values,
    "ci_low": ci_low_g,
    "ci_high": ci_high_g
}, index=gap_rate.index.sort_values())
print("\nWin rate by DIVISION GAP with 95% CI:\n", gap_ci)

# Error-bar half-lengths. The interval bounds are floating-point results, so
# for a rate of exactly 0 or 1 a bound can land a hair on the wrong side of the
# point estimate; matplotlib rejects negative yerr values, hence the clamp at 0.
gap_err_below = (gap_ci["win_rate"] - gap_ci["ci_low"]).clip(lower=0)
gap_err_above = (gap_ci["ci_high"] - gap_ci["win_rate"]).clip(lower=0)

plt.figure(figsize=(9,5))
x = np.arange(len(gap_ci))
plt.bar(x, gap_ci["win_rate"], width=0.6)
plt.errorbar(x, gap_ci["win_rate"],
             yerr=[gap_err_below, gap_err_above],
             fmt='none', capsize=5)
plt.xticks(x, gap_ci.index.astype(str))
plt.ylabel("Win rate (lower division)")
plt.title("Lower-Division Win Rate by Division Gap (95% CI)")
plt.tight_layout()
plt.savefig("rq2_winrate_by_gap_CI.png", dpi=160)
plt.show()

# BY SEASON (with 95% CI)
# Per-season wins and match counts for the lower-division side, the resulting
# rate, and a Wilson interval drawn as a shaded band around the line.
season_grp = used.groupby("season")["_lower_win"]
season_sum = season_grp.sum()
season_cnt = season_grp.count()
season_rate = season_sum / season_cnt

ci_low_s, ci_high_s = proportion_confint(
    count=season_sum.astype(int),
    nobs=season_cnt.astype(int),
    alpha=0.05,
    method="wilson",
)

season_columns = {
    "n": season_cnt,
    "win_rate": season_rate,
    "ci_low": ci_low_s,
    "ci_high": ci_high_s,
}
season_ci = pd.DataFrame(season_columns).sort_index()
print("\nWin rate by SEASON with 95% CI:\n", season_ci)

season_axis = season_ci.index.values
plt.figure(figsize=(9,5))
plt.plot(season_axis, season_ci["win_rate"], marker="o")
plt.fill_between(season_axis, season_ci["ci_low"], season_ci["ci_high"], alpha=0.2)
plt.xlabel("Season")
plt.ylabel("Win rate (lower division)")
plt.title("Lower-Division Win Rate by Season (95% CI)")
plt.tight_layout()
plt.savefig("rq4_winrate_by_season_CI.png", dpi=160)
plt.show()


In [None]:
# Hypothesis tests: Upset association with Round / Division Gap, and Season trend

import pandas as pd
from scipy.stats import chi2_contingency, spearmanr

# Chi-square: Upset (0/1) vs ROUND
# Rows are put in tournament order; rounds without matches appear as NaN rows.
ct_round = pd.crosstab(used["_round_name"], used["_lower_win"]).reindex(
    ["Round 1","Round 2","Round 3","Quarterfinal","Semifinal","Final"]
)
# A round with no matches would be an all-zero row, which makes an expected
# frequency zero and causes chi2_contingency to raise; test observed rounds only.
ct_round_obs = ct_round.fillna(0)
ct_round_obs = ct_round_obs.loc[ct_round_obs.sum(axis=1) > 0]
chi2_r, p_r, dof_r, exp_r = chi2_contingency(ct_round_obs)
print("Chi-square (Upset vs ROUND): chi2=%.3f, dof=%d, p=%.4g" % (chi2_r, dof_r, p_r))
print("Contingency (ROUND):\n", ct_round, "\n")

# Chi-square: Upset (0/1) vs DIVISION GAP
# Every row here comes from observed data, so none is empty.
ct_gap = pd.crosstab(used["_div_gap"], used["_lower_win"]).sort_index()
chi2_g, p_g, dof_g, exp_g = chi2_contingency(ct_gap)
print("Chi-square (Upset vs DIVISION GAP): chi2=%.3f, dof=%d, p=%.4g" % (chi2_g, dof_g, p_g))
print("Contingency (DIV GAP):\n", ct_gap, "\n")

# Season trend: rank correlation between season and that season's win rate
season_rates = used.groupby("season")["_lower_win"].mean().sort_index()
rho, p_s = spearmanr(season_rates.index.values, season_rates.values)
print("Spearman trend (season vs lower-division win rate): rho=%.3f, p=%.4g" % (rho, p_s))

