In [None]:
import pandas as pd

# Load the two CSV inputs used by the rest of the notebook.
players_df, matches_df = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_players.csv", "dfb_matches.csv")
)

In [None]:
# Impute missing player values with the mean of the known values for the same
# (team, season) pair.
avg_team_value = (
    players_df
    .dropna(subset=['value'])           # only known values feed the average
    .groupby(['team', 'season'])['value']
    .mean()
    .rename('avg_team_value')
    .reset_index()
)

# The left merge keeps every player row; the helper column `avg_team_value`
# stays in the result next to the (now filled) `value` column.
players_with_filled_value = (
    players_df
    .merge(avg_team_value, how='left', on=['team', 'season'])
    .assign(value=lambda merged: merged['value'].fillna(merged['avg_team_value']))
)

In [None]:
# Compute total team value per season
# min_count=1: a (team, season) group whose player values are all still missing
# sums to NaN instead of 0.0, so it is not mistaken for a squad worth nothing
# and can be excluded by NaN-based filtering downstream.
team_value_filled = (
    players_with_filled_value
    .groupby(['team', 'season'])['value']
    .sum(min_count=1)
    .reset_index()
    .rename(columns={'value': 'team_value'})
)

In [None]:
# Merge team values into match data
# The same team-season totals are attached twice: once keyed on the home team
# and once keyed on the away team.
home_values_filled = team_value_filled.rename(columns={'team': 'home_team', 'team_value': 'home_team_value'})
away_values_filled = team_value_filled.rename(columns={'team': 'away_team', 'team_value': 'away_team_value'})

matches_with_values = (
    matches_df
    .merge(home_values_filled, on=['home_team', 'season'], how='left')
    .merge(away_values_filled, on=['away_team', 'season'], how='left')
)

# Keep only matches where both sides have a team value. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
matches_with_values_clean = (
    matches_with_values
    .dropna(subset=['home_team_value', 'away_team_value'])
    .copy()
)

In [None]:
# Cross-Division Team Value Difference Analysis
def get_higher_div_team(row):
    """Tell which side of a match plays in the higher division.

    The side with the smaller division number is treated as the higher one.

    Args:
        row: Mapping-like match record with 'home_division' and
            'away_division' entries.

    Returns:
        'home' or 'away' for the higher-division side, or 'same' when neither
        division compares as smaller than the other.
    """
    home_div, away_div = row['home_division'], row['away_division']
    if home_div < away_div:
        return 'home'
    if away_div < home_div:
        return 'away'
    return 'same'

# Tag every match with the side that plays in the higher division.
# .assign rebinds the name to a new frame instead of writing into a frame that
# may be a slice of another one (avoids SettingWithCopyWarning).
matches_with_values_clean = matches_with_values_clean.assign(
    division_comparison=matches_with_values_clean.apply(get_higher_div_team, axis=1)
)

# Cross-division fixtures only. The .copy() gives an independent frame so the
# columns added to it afterwards are plain assignments, not writes to a slice.
cross_div_matches = matches_with_values_clean[
    matches_with_values_clean['division_comparison'] != 'same'
].copy()

# Winner by score; anything that is not a strict win for either side is a 'draw'.
cross_div_matches['winner'] = cross_div_matches.apply(
    lambda row: 'home' if row['home_score'] > row['away_score']
    else ('away' if row['away_score'] > row['home_score'] else 'draw'),
    axis=1
)

def get_value_diff(row):
    """Return the team-value gap from the higher-division side's perspective.

    Args:
        row: Mapping-like match record with 'division_comparison',
            'home_team_value' and 'away_team_value' entries.

    Returns:
        Home value minus away value when 'division_comparison' is 'home';
        away value minus home value for any other 'division_comparison'.
    """
    if row['division_comparison'] == 'home':
        minuend_key, subtrahend_key = 'home_team_value', 'away_team_value'
    else:
        minuend_key, subtrahend_key = 'away_team_value', 'home_team_value'
    return row[minuend_key] - row[subtrahend_key]

def get_result_from_high_div_team(row):
    """Express the match outcome from the higher-division side's perspective.

    Args:
        row: Mapping-like match record with 'division_comparison' (the
            higher-division side) and 'winner' entries.

    Returns:
        'won' when 'winner' equals 'division_comparison', 'draw' when
        'winner' is 'draw', otherwise 'lost'.
    """
    higher_side = row['division_comparison']
    outcome = row['winner']
    if higher_side == outcome:
        return 'won'
    return 'draw' if outcome == 'draw' else 'lost'

# Derive the two per-match columns by running the row helpers over every
# cross-division match.
value_diff_by_match = cross_div_matches.apply(get_value_diff, axis='columns')
cross_div_matches['value_diff'] = value_diff_by_match

high_div_result_by_match = cross_div_matches.apply(get_result_from_high_div_team, axis='columns')
cross_div_matches['high_div_team_result'] = high_div_result_by_match

In [None]:
# Visualize the result with Boxplot
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

# Draw the outcomes in a fixed order rather than in order of first appearance
# in the data; outcomes that never occur are skipped so no empty slot is drawn.
observed_results = set(cross_div_matches['high_div_team_result'])
result_order = [result for result in ('won', 'draw', 'lost') if result in observed_results]

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=cross_div_matches,
    x='high_div_team_result',
    y='value_diff',
    order=result_order,
    # Passing `palette` without `hue` is deprecated in recent seaborn releases;
    # mapping hue to the x variable keeps the same one-colour-per-box look.
    hue='high_div_team_result',
    hue_order=result_order,
    dodge=False,
    palette='coolwarm',
    ax=ax,
)
# The hue legend would only repeat the x-axis labels; drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title("Team Value Difference vs Match Outcome\n(Higher-Division Team Perspective)", fontsize=14)
ax.set_xlabel("Result of Higher-Division Team", fontsize=12)
ax.set_ylabel("Team Value Difference", fontsize=12)
ax.grid(True, axis='y', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

In [None]:
# Statistical Hypothesis Testing for the Money Factor

import numpy as np
import pandas as pd

from scipy.stats import mannwhitneyu, ttest_ind
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# Source frame for both tests below. `matches_with_filled_values_clean` was
# never assigned before this point (NameError on a fresh kernel), and no earlier
# frame carries both a `winner` column and a home-minus-away `value_diff`, so it
# is built here from the matches that have both team values.
# NOTE(review): assumes `matches_with_values_clean` is the intended base — confirm.
matches_with_filled_values_clean = matches_with_values_clean.assign(
    winner=lambda m: np.select(
        [m["home_score"] > m["away_score"], m["away_score"] > m["home_score"]],
        ["home", "away"],
        default="draw",
    ),
    # Positive => home side is the more valuable team (relied on further down).
    value_diff=lambda m: m["home_team_value"] - m["away_team_value"],
)

dfv = matches_with_filled_values_clean.copy()

# Keep only decided matches (no draws) and cross-division fixtures.
# Rows with a missing division are dropped first: NaN != NaN is True, so they
# would otherwise pass the cross-division filter and be labelled arbitrarily.
dfv = dfv[dfv["winner"].isin(["home", "away"])].copy()
dfv = dfv.dropna(subset=["home_division", "away_division"])
dfv = dfv[dfv["home_division"] != dfv["away_division"]].copy()

# Define which side is lower-division (larger division number) and whether it's an upset
dfv["lower_side"] = np.where(dfv["home_division"] > dfv["away_division"], "home", "away")
dfv["is_upset"] = (dfv["winner"] == dfv["lower_side"]).astype(int)

# Absolute value gap for distribution comparisons
dfv["value_gap_abs"] = dfv["value_diff"].abs()

print("Row counts:")
print("  Total cross-division decided matches:", len(dfv))
print("  Upsets:", int(dfv["is_upset"].sum()))
print("  Non-upsets:", int((1 - dfv["is_upset"]).sum()))

# Mann–Whitney U and Welch t-test on |value_diff|

g_upset = dfv.loc[dfv["is_upset"] == 1, "value_gap_abs"].values
g_non   = dfv.loc[dfv["is_upset"] == 0, "value_gap_abs"].values

print("\n[Money gap distribution tests: |value_diff|]")
if len(g_upset) == 0 or len(g_non) == 0:
    # Both tests need observations in each group; report instead of crashing.
    mw_stat = mw_p = t_stat = t_p = np.nan
    print("  → Result: Not enough data — need at least one upset and one non-upset.")
else:
    mw_stat, mw_p = mannwhitneyu(g_upset, g_non, alternative="two-sided")
    t_stat, t_p   = ttest_ind(g_upset, g_non, equal_var=False)  # Welch's t-test

    print("  Mann–Whitney U: U = %.2f, p = %.4g" % (mw_stat, mw_p))
    print("  Welch t-test:   t = %.2f, p = %.4g" % (t_stat, t_p))
    print("  Means:   upset = %.3g, non-upset = %.3g" % (g_upset.mean(), g_non.mean()))
    print("  Medians: upset = %.3g, non-upset = %.3g" % (np.median(g_upset), np.median(g_non)))

    if mw_p < 0.05 or t_p < 0.05:
        # Both tests are two-sided, so the direction has to be read from the
        # data: compare medians, falling back to means when the medians tie.
        center_upset, center_non = np.median(g_upset), np.median(g_non)
        if center_upset == center_non:
            center_upset, center_non = g_upset.mean(), g_non.mean()
        direction = "smaller" if center_upset < center_non else "larger"
        print("  → Result: Significant difference. Upsets tend to occur with %s money gaps." % direction)
    else:
        print("  → Result: No significant difference detected at 5% level.")

# Second test: does the more valuable team win more than half of the decided
# matches between sides at most one division apart (same division included)?
df1 = matches_with_filled_values_clean.copy()
df1 = df1[df1["winner"].isin(["home", "away"])].copy()
df1 = df1[(df1["home_division"] - df1["away_division"]).abs() <= 1].copy()

df1["higher_value_team"] = np.where(df1["value_diff"] > 0, "home",
                                np.where(df1["value_diff"] < 0, "away", "equal"))
df1 = df1[df1["higher_value_team"] != "equal"].copy()

df1["higher_team_won"] = (df1["winner"] == df1["higher_value_team"]).astype(int)

succ = int(df1["higher_team_won"].sum())
nobs = int(df1["higher_team_won"].count())
phat = (succ / nobs) if nobs else np.nan

print("\n[Higher-value team win rate within ±1 division]")
if nobs == 0:
    # A proportion test on zero observations would divide by zero.
    z_stat = p_val = ci_low = ci_high = np.nan
    print("  → Result: No decided matches with unequal team values — nothing to test.")
else:
    # One-sample z-test against 0.5
    z_stat, p_val = proportions_ztest(count=succ, nobs=nobs, value=0.5)

    # 95% Wilson CI for the win proportion
    ci_low, ci_high = proportion_confint(count=succ, nobs=nobs, alpha=0.05, method="wilson")

    print("  successes = %d, n = %d, p̂ = %.3f" % (succ, nobs, phat))
    print("  z = %.3f, p = %.4g" % (z_stat, p_val))
    print("  95%% CI (Wilson) = [%.3f, %.3f]" % (ci_low, ci_high))

    # These messages are printed without %-formatting, so a single '%' is
    # correct here ('%%' would be shown literally).
    if p_val < 0.05 and phat > 0.5:
        print("  → Result: Significantly above 50% — richer teams win more often even within ±1 division.")
    elif p_val < 0.05 and phat < 0.5:
        print("  → Result: Significantly below 50% — richer teams win less often than chance (unexpected).")
    else:
        print("  → Result: Not significantly different from 50% at 5% level.")