In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess


In [4]:
# Toggle for the last cell: when True, that cell runs the push_assets.sh script via subprocess.
push_updates = True


## Simulate Games to calculate xP

Basic strategy: 

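- simulate each shot N times as a Bernoulli trial whose success probability is the shot's xG
- within each simulation, sum the simulated goals by event.id and team
- within each simulation, compare the two teams' goal totals for the event
- estimate each team's probability of a win, draw, or loss as the share of simulations ending in that outcome, then compute xP = 3 × P(win) + 1 × P(draw)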

In [5]:
# load the shot-level dataset; the CSV's saved "Unnamed: 0" column becomes the index
shots_csv_path = "/Users/harrisonward/Desktop/CS/Git/xG/datasets/23_24_shotmaps_augmented.csv"
shots = pd.read_csv(shots_csv_path, index_col="Unnamed: 0")

In [6]:
# keep only the per-shot goal probability (xG) and the columns that identify the match
simulation_columns = ["xG", "team", "opponent", "event.id"]
sim_df = shots[simulation_columns]

In [21]:
# number of simulations to run, and the per-shot goal probabilities
# (double brackets keep xG two-dimensional: one row per shot, one column)
N_samples = 10_000
shot_sim_weights = sim_df[["xG"]].to_numpy()

In [22]:
# simulate each of the shots: one Bernoulli draw (binomial with n=1) per shot per
# simulation, with success probability equal to that shot's xG
#
# The draws come from an explicitly seeded Generator so that the simulated matrix,
# and every table derived from it, is identical after a kernel restart.
# Change SIMULATION_SEED to obtain a different (but still repeatable) draw.
SIMULATION_SEED = 42
rng = np.random.default_rng(SIMULATION_SEED)

# rows are simulations, columns are shots
shot_sim_matrix = rng.binomial(
    1, shot_sim_weights.squeeze(), size=(N_samples, shot_sim_weights.shape[0])
)
shot_sim_matrix.shape

(10000, 1912)

In [23]:
# create column names for the simulated df and concat with match info
#
# shot_sim_matrix is (simulations, shots); transposing gives one row per shot and
# one "Simulation_k" column per simulation.
simulation_labels = [f"Simulation_{i+1}" for i in range(shot_sim_matrix.shape[0])]
simulated_shots_df = pd.DataFrame(shot_sim_matrix.T, columns=simulation_labels)

# pd.concat(axis=1) aligns rows on index labels, not on position. The simulated
# frame has a fresh 0..n-1 index while `shots` keeps whatever index was loaded
# from the CSV, so the match info is re-indexed positionally first; otherwise any
# gap or reordering in the `shots` index would attach the wrong match to a shot
# or introduce NaN rows.
match_info = shots[["event.id", "team", "opponent"]].reset_index(drop=True)
assert len(match_info) == len(simulated_shots_df), "one simulated row per shot expected"

simulated_shots_df = pd.concat([simulated_shots_df, match_info], axis=1)
simulated_shots_df.head()

Unnamed: 0,Simulation_1,Simulation_2,Simulation_3,Simulation_4,Simulation_5,Simulation_6,Simulation_7,Simulation_8,Simulation_9,Simulation_10,...,Simulation_9994,Simulation_9995,Simulation_9996,Simulation_9997,Simulation_9998,Simulation_9999,Simulation_10000,event.id,team,opponent
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11352251,Arsenal,Nottingham Forest
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,11352251,Nottingham Forest,Arsenal
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11352251,Arsenal,Nottingham Forest
3,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,1,0,0,11352251,Arsenal,Nottingham Forest
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11352251,Arsenal,Nottingham Forest


In [24]:
# total the simulated goals per (event, team, opponent), then transpose so that
# each row is one simulation and each column is one side of one match
group_keys = ["event.id", "team", "opponent"]
simulated_events_df = simulated_shots_df.groupby(by=group_keys).sum()
simulated_events_df_t = simulated_events_df.transpose()
simulated_events_df_t.head()

event.id,11352250,11352250,11352251,11352251,11352252,11352252,11352253,11352253,11352254,11352254,...,11352642,11352642,11352643,11352643,11352644,11352644,11352645,11352645,11352646,11352646
team,Bournemouth,West Ham United,Arsenal,Nottingham Forest,Brighton & Hove Albion,Luton Town,Everton,Fulham,Crystal Palace,Sheffield United,...,Burnley,Newcastle United,Brentford,Nottingham Forest,Liverpool,Tottenham Hotspur,Sheffield United,West Ham United,Manchester City,Wolverhampton
opponent,West Ham United,Bournemouth,Nottingham Forest,Arsenal,Luton Town,Brighton & Hove Albion,Fulham,Everton,Sheffield United,Crystal Palace,...,Newcastle United,Burnley,Nottingham Forest,Brentford,Tottenham Hotspur,Liverpool,West Ham United,Sheffield United,Wolverhampton,Manchester City
Simulation_1,2,0,2,1,5,1,1,1,0,0,...,1,3,1,1,0,3,1,3,0,1
Simulation_2,3,0,1,1,1,1,2,1,1,0,...,0,3,2,0,1,1,0,3,0,1
Simulation_3,1,1,2,1,2,2,3,1,4,0,...,0,3,1,1,1,1,0,1,0,0
Simulation_4,1,1,2,0,3,1,2,1,1,0,...,2,5,1,0,0,1,1,4,0,0
Simulation_5,1,1,4,1,2,1,1,0,1,0,...,0,4,1,0,0,3,0,2,0,0


In [25]:
# calculate possible outcomes for each event
#
# Every column of simulated_events_df_t is keyed by (event.id, team, opponent)
# and holds that team's simulated goal total, one entry per simulation.
# Tuple keys are used for the lookups instead of chained [a][b][c] indexing.
match_keys = list(simulated_events_df_t.columns)
goals_by_key = {key: simulated_events_df_t[key].to_numpy() for key in match_keys}

# A side that took no shots has no rows in the shot data and therefore no column
# here; it scored zero goals in every simulation. Without this fallback the
# lookup of the opposing side raises KeyError, and the shotless side would get
# no result row (and so no xP) for the match.
no_goals = np.zeros(simulated_events_df_t.shape[0], dtype=int)
shotless_keys = [
    (event_id, opponent, team)
    for event_id, team, opponent in match_keys
    if (event_id, opponent, team) not in goals_by_key
]

simulated_results = []
for event_id, team, opponent in match_keys + shotless_keys:
    goals_for = goals_by_key.get((event_id, team, opponent), no_goals)
    goals_against = goals_by_key.get((event_id, opponent, team), no_goals)

    # per-simulation goal difference from `team`'s point of view
    results = goals_for - goals_against

    # 1 = win, 0 = draw, -1 = loss
    outcome_code = np.sign(results)

    # share of simulations won / drawn
    win_pct = np.mean(outcome_code == 1)
    draw_pct = np.mean(outcome_code == 0)

    simulated_results.append(
        {
            "event.id": event_id,
            "team": team,
            "opponent": opponent,
            "results": results,
            "outcome_code": outcome_code,
            "win.pct": win_pct,
            "draw.pct": draw_pct,
            # 3 points per win, 1 per draw
            "xP": 3 * win_pct + 1 * draw_pct,
        }
    )

In [26]:
# turn the per-match records into a frame, save it as a CSV asset, and preview it
# NOTE(review): "results" and "outcome_code" hold NumPy arrays; in the CSV they
# appear in string form, which NumPy abbreviates for long arrays — confirm no
# downstream reader relies on those two columns.
simulated_match_csv = '/Users/harrisonward/Desktop/CS/Git/xG/assets/simulated_match_data.csv'
simulated_results_df = pd.DataFrame(data=simulated_results)
simulated_results_df.to_csv(simulated_match_csv)
simulated_results_df.head()

Unnamed: 0,event.id,team,opponent,results,outcome_code,win.pct,draw.pct,xP
0,11352250,Bournemouth,West Ham United,"[2, 3, 0, 0, 0, 1, 1, -1, -1, -1, 0, 0, -1, 0,...","[1, 1, 0, 0, 0, 1, 1, -1, -1, -1, 0, 0, -1, 0,...",0.4254,0.3885,1.6647
1,11352250,West Ham United,Bournemouth,"[-2, -3, 0, 0, 0, -1, -1, 1, 1, 1, 0, 0, 1, 0,...","[-1, -1, 0, 0, 0, -1, -1, 1, 1, 1, 0, 0, 1, 0,...",0.1861,0.3885,0.9468
2,11352251,Arsenal,Nottingham Forest,"[1, 0, 1, 2, 3, 1, 3, 1, 2, 0, 1, -1, 4, 0, 1,...","[1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, -1, 1, 0, 1,...",0.7106,0.2193,2.3511
3,11352251,Nottingham Forest,Arsenal,"[-1, 0, -1, -2, -3, -1, -3, -1, -2, 0, -1, 1, ...","[-1, 0, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, ...",0.0701,0.2193,0.4296
4,11352252,Brighton & Hove Albion,Luton Town,"[4, 0, 0, 2, 1, 3, 2, 4, 2, 2, 3, 0, 2, 2, 5, ...","[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...",0.923,0.0639,2.8329


In [27]:
# total xP per team across all simulated matches, highest first
xp_table = (
    simulated_results_df.groupby("team", as_index=False)["xP"]
    .sum()
    .sort_values(by="xP", ascending=False)
    .reset_index(drop=True)
)

# rank from 1 rather than 0 so the index reads as a table position
xp_table.index = range(1, xp_table.shape[0] + 1)

# Round numerically. The previous string round-trip ("{:,.2f}" then astype float)
# inserted a thousands separator, which makes the float cast fail for any value
# of 1000 or more.
xp_table["xP"] = xp_table["xP"].round(2)
xp_table = xp_table.rename(columns={"team": "Team"})

In [28]:
# load the previously saved xG table and join the xP column onto it by team name;
# the outer join keeps teams that appear in only one of the two tables
xg_table_csv = "/Users/harrisonward/Desktop/CS/Git/xG/assets/xG_table.csv"
xg_table = pd.read_csv(xg_table_csv, index_col="Unnamed: 0")

full_table = pd.merge(
    xg_table, xp_table, how="outer", left_on="Team", right_on="Team"
).sort_values(by="xP", ascending=False)

# rank from 1 in descending xP order
full_table.index = range(1, len(full_table) + 1)

In [29]:
# save the combined table as both CSV and Markdown assets, then show the top 20 rows
full_table_csv = "/Users/harrisonward/Desktop/CS/Git/xG/assets/full_table.csv"
full_table_md = "/Users/harrisonward/Desktop/CS/Git/xG/assets/full_table.md"

full_table.to_csv(full_table_csv)
full_table.to_markdown(full_table_md)
full_table.head(20)

Unnamed: 0,Team,xG,xGA,xG Differential,xP
1,Manchester City,13.84,2.9,10.94,17.47
2,Arsenal,14.05,5.66,8.39,15.71
3,Tottenham Hotspur,14.23,7.8,6.42,14.31
4,Aston Villa,14.79,9.35,5.43,13.2
5,Newcastle United,15.86,6.67,9.19,13.0
6,Liverpool,13.37,8.98,4.38,12.52
7,Brighton & Hove Albion,14.55,12.44,2.11,12.35
8,West Ham United,12.99,10.69,2.31,11.58
9,Crystal Palace,8.6,6.95,1.64,10.54
10,Chelsea,8.66,6.56,2.11,10.52


In [30]:
# Publish the regenerated assets by running the push script, but only when the
# push_updates flag is set.
if push_updates:
    # check=True raises subprocess.CalledProcessError when the script exits with a
    # non-zero status, so a failed commit/push is not silently ignored.
    subprocess.run(
        ["/Users/harrisonward/Desktop/CS/Git/xG/scripts/push_assets.sh"],
        check=True,
    )


[main fdc52c2] Auto-commit all assets
 3 files changed, 177 insertions(+), 177 deletions(-)
 rewrite assets/full_table.csv (82%)
 rewrite assets/simulated_match_data.csv (99%)


remote: Bypassed rule violations for refs/heads/main:        
remote: 
remote: - Changes must be made through a pull request.        
remote: 
To https://github.com/Harrison-Ward/xG.git
   0c217c8..fdc52c2  main -> main
