In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
# Read match data; the first CSV column becomes the row index.
all_matches = pd.read_csv("matches.csv", index_col=0)

# Clean goals for/against: keep only the text before any "(" and parse it as a number.
# Casting to str first keeps the .str accessor usable even when pandas has already
# parsed the column as numeric (no parenthesised values in the file), where a bare
# `.str` call would raise AttributeError.
for goal_col in ("gf", "ga"):
    all_matches[goal_col] = pd.to_numeric(
        all_matches[goal_col].astype(str).str.split("(").str[0].str.strip(),
        errors="coerce",
    )

# Convert penalty columns to float64; values that cannot be parsed become NaN.
for penalty_col in ("pk", "pkatt"):
    all_matches[penalty_col] = pd.to_numeric(
        all_matches[penalty_col], errors="coerce"
    ).astype("float64")

In [3]:
# Remove columns that are not used further down. drop(..., errors="ignore") keeps
# this cell idempotent: re-running it after the columns are gone does not raise
# KeyError, which `del all_matches[...]` would.
all_matches = all_matches.drop(columns=["comp", "notes"], errors="ignore")

In [4]:
# Run the `date` column through pd.to_datetime so later cells can use `.dt`
# accessors and date comparisons on it.
all_matches = all_matches.assign(date=pd.to_datetime(all_matches["date"]))


In [5]:
# Derive the model inputs and the label in a single assign() call.
all_matches = all_matches.assign(
    # Integer category codes for the venue and opponent columns
    venue_code=all_matches["venue"].astype("category").cat.codes,
    opponent_code=all_matches["opponent"].astype("category").cat.codes,
    # Kick-off hour: drop everything from the first ":" onward, then cast to int
    hour=all_matches["time"].str.replace(":.+", "", regex=True).astype("int"),
    # Day of week as an integer (Monday=0 ... Sunday=6)
    day_code=all_matches["date"].dt.dayofweek,
    # Binary target: 1 when result is "W", 0 for anything else
    target=(all_matches["result"] == "W").astype("int"),
)

In [6]:
# Hyperparameters gathered in one mapping, then unpacked into the classifier.
rf_params = dict(
    n_estimators=750,         # how many trees the forest grows
    max_depth=15,             # upper bound on the depth of each tree
    min_samples_split=35,     # a node needs at least this many samples to be split
    class_weight='balanced',  # reweight classes to counter imbalance
    random_state=1,           # fixed seed
)
rf = RandomForestClassifier(**rf_params)

In [7]:
# Chronological split: fit on matches before 2024, evaluate on 2024 onward.
# The test side uses `>=` (not `>`) so a match dated exactly 2024-01-01 lands in
# the test set instead of being dropped from both sets.
train = all_matches[all_matches["date"] < '2024-01-01']
test = all_matches[all_matches["date"] >= '2024-01-01']

In [8]:
# Baseline feature set: encoded venue, encoded opponent, kick-off hour, weekday.
predictors = [
    "venue_code",
    "opponent_code",
    "hour",
    "day_code",
]

In [9]:
# Fit the forest on the training rows using the baseline predictors.
rf.fit(X=train[predictors], y=train["target"])

In [10]:
# Predicted target (1 = win, 0 = not win) for every row of the test set.
preds = rf.predict(X=test[predictors])

In [11]:
# accuracy_score is the share of test matches classified correctly, so store it
# under an accurate name; precision is computed separately with precision_score.
accuracy = accuracy_score(test["target"], preds)
# Backward-compatible alias: the following cell displays this value as `precision`.
precision = accuracy

In [12]:
# NOTE: despite the name, this holds the accuracy_score result assigned in the previous cell.
precision

0.6390728476821192

In [13]:
# Actual and predicted labels side by side, indexed like the test set.
combined = pd.DataFrame({"actual": test["target"], "predicted": preds})

In [14]:
# Confusion table: rows are the actual labels, columns the predicted labels.
pd.crosstab(combined["actual"], combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,114,54
1,55,79


In [15]:
# Precision of the baseline model: of the matches predicted as wins, the share that were wins.
precision_score(y_true=test["target"], y_pred=preds)

0.5939849624060151

In [16]:
# One group of matches per team.
grouped_matches = all_matches.groupby(by="team")


In [17]:
# Pull out a single team's matches in chronological order to try the rolling features on.
arsenal_matches = grouped_matches.get_group("Arsenal")
group = arsenal_matches.sort_values(by="date")

In [18]:
def rolling_averages(group, cols, new_cols, window=10):
    """Append trailing rolling means of ``cols`` to one team's matches.

    For each row the mean is taken over the ``window`` matches that precede it
    in date order. ``closed='left'`` leaves the current match out of its own
    window, so a match never contributes to its own average.

    Parameters
    ----------
    group : pandas.DataFrame
        Matches to process; must contain a ``date`` column and every name in ``cols``.
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Names of the derived columns, paired with ``cols`` by position.
    window : int, default 10
        Number of preceding matches in each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows where any
        rolling mean is missing (fewer than ``window`` earlier matches, or a
        NaN inside the window) are dropped. The input frame is not modified.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` have different lengths.
    """
    if len(cols) != len(new_cols):
        raise ValueError(
            f"cols and new_cols must have the same length, got {len(cols)} and {len(new_cols)}"
        )
    # sort_values returns a new frame, so the caller's data is left untouched.
    ordered = group.sort_values("date")
    # Trailing window over the previous `window` rows, excluding the current one.
    rolling_stats = ordered[cols].rolling(window, closed='left').mean()
    ordered[new_cols] = rolling_stats
    # Rows without a full window of history have NaN averages; remove them.
    return ordered.dropna(subset=new_cols)

In [19]:
# Per-match stats to average, and the matching "<stat>_rolling" output names.
columns = [
    "gf", "ga",
    "sh", "sot",
    "dist", "fk",
    "pk", "pkatt",
]
new_columns = list(map(lambda c: f"{c}_rolling", columns))

# Try the helper on the single-team sample first.
rolling_averages(group, cols=columns, new_cols=new_columns)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2024-09-19,21:00,League phase,Thu,Away,D,0.0,0.0,Atalanta,0.8,...,3,0,1.9,0.8,12.0,4.2,15.24,0.1,0.1,0.1
8,2024-10-01,20:00,League phase,Tue,Home,W,2.0,0.0,Paris S-G,0.7,...,1,1,1.5,0.8,10.8,3.6,15.29,0.2,0.1,0.1
11,2024-10-22,20:00,League phase,Tue,Home,W,1.0,0.0,Shakhtar,1.8,...,1,1,1.6,0.6,10.3,3.5,15.68,0.3,0.1,0.1
15,2024-11-06,21:00,League phase,Wed,Away,L,0.0,1.0,Inter,1.8,...,2,0,1.5,0.5,10.1,3.5,15.64,0.3,0.1,0.2
18,2024-11-26,20:00,League phase,Tue,Away,W,5.0,1.0,Sporting Cp,4.0,...,1,1,1.3,0.6,11.0,3.6,15.47,0.2,0.1,0.2
22,2024-12-11,20:00,League phase,Wed,Home,W,3.0,0.0,Monaco,2.3,...,2,1,1.2,0.7,10.9,3.7,15.45,0.2,0.1,0.2


In [20]:
# Build the rolling features team by team. Iterating over the groups and
# concatenating avoids DataFrameGroupBy.apply operating on the grouping column,
# which newer pandas versions warn about; if `team` were excluded from each group,
# later cells that select matches_rolling["team"] would fail.
# Passing a mapping plus names=["team"] gives the same ("team", original index)
# MultiIndex that groupby.apply produced, so the droplevel('team') cell still works.
matches_rolling = pd.concat(
    {
        team: rolling_averages(team_matches, columns, new_columns)
        for team, team_matches in all_matches.groupby("team")
    },
    names=["team"],
)

  matches_rolling = all_matches.groupby("team").apply(lambda x: rolling_averages(x, columns, new_columns))


In [21]:
# Remove the `team` index level added by the per-team step; `team` is still a regular column.
matches_rolling = matches_rolling.droplevel(level="team")

In [22]:
# Give every row a unique 0..n-1 label so later index-based joins are unambiguous.
matches_rolling.index = pd.RangeIndex(len(matches_rolling))

In [23]:
def make_predictions(data, predictors, cutoff='2024-01-01', model=None):
    """Fit the classifier on matches before ``cutoff`` and score it on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date``, ``target`` and every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default '2024-01-01'
        Rows with ``date < cutoff`` are used for training and rows with
        ``date >= cutoff`` for testing, so a match on the cutoff date itself is
        evaluated rather than dropped from both sets.
    model : estimator with ``fit``/``predict``, optional
        Defaults to the notebook-level ``rf``. The model is refit in place.

    Returns
    -------
    tuple
        ``(combined, precision)`` where ``combined`` is a DataFrame with
        ``actual`` and ``predicted`` columns indexed like the test rows, and
        ``precision`` is ``precision_score`` on the test set.
    """
    if model is None:
        model = rf
    # Chronological split around the cutoff date.
    train = data[data["date"] < cutoff]
    test = data[data["date"] >= cutoff]
    # Fit on the earlier matches, predict the later ones.
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    # Keep the test index so results can be joined back to the match rows.
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [24]:
# Re-run the evaluation with the rolling-average columns added to the baseline features.
rolling_predictors = predictors + new_columns
combined, precision = make_predictions(matches_rolling, rolling_predictors)

In [25]:
# Precision returned by make_predictions for the model that also uses the rolling-average features.
precision

0.6666666666666666

In [26]:
# Attach the identifying match columns to each prediction, joining on the shared row index.
match_info = matches_rolling[["date", "team", "opponent", "result"]]
combined = combined.merge(match_info, left_index=True, right_index=True)

In [27]:
# Preview the first five predictions together with their match details.
combined.head(n=5)

Unnamed: 0,actual,predicted,date,team,opponent,result
4,0,0,2024-09-19,Arsenal,Atalanta,D
5,1,0,2024-10-01,Arsenal,Paris S-G,W
6,1,1,2024-10-22,Arsenal,Shakhtar,W
7,0,0,2024-11-06,Arsenal,Inter,L
8,1,1,2024-11-26,Arsenal,Sporting Cp,W


In [28]:
class MissingDict(dict):
    """A dict whose lookups fall back to the key itself when the key is absent."""

    def __missing__(self, missing_key):
        # Invoked by dict.__getitem__ for unknown keys; the key is returned unchanged
        # and is not stored in the dict.
        return missing_key

# Names as they appear in the `team` column (keys) mapped to the spelling used
# in the `opponent` column (values). Names not listed are left unchanged by
# MissingDict.
map_values = {
    "Atletico Madrid": "Atlético Madrid",
    "Bayer Leverkusen": "Leverkusen",
    "Internazionale": "Inter",
    "Paris Saint Germain": "Paris S-G",
    "PSV Eindhoven": "Psv Eindhoven",
    "RB Leipzig": "Rb Leipzig",
    "Red Bull Salzburg": "Rb Salzburg",
    "Shakhtar Donetsk": "Shakhtar",
    "Sporting CP": "Sporting Cp",
}

mapping = MissingDict(map_values)

In [29]:
# Normalise team names to the spelling used in the `opponent` column so the two can be matched.
combined = combined.assign(new_team=combined["team"].map(mapping))

In [30]:
# Pair each prediction (_x) with the prediction made for the other side of the
# same match (_y): same date, and this row's team equals the other row's opponent.
merged_data = pd.merge(
    combined,
    combined,
    left_on=["date", "new_team"],
    right_on=["date", "opponent"],
)

In [31]:

# Preview the first ten paired rows.
merged_data.head(n=10)

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,0,2024-10-01,Arsenal,Paris S-G,W,Arsenal,0,1,Paris Saint Germain,Arsenal,L,Paris S-G
1,1,1,2024-10-22,Arsenal,Shakhtar,W,Arsenal,0,0,Shakhtar Donetsk,Arsenal,L,Shakhtar
2,0,0,2024-11-06,Arsenal,Inter,L,Arsenal,1,1,Internazionale,Arsenal,W,Inter
3,1,1,2024-11-26,Arsenal,Sporting Cp,W,Arsenal,0,0,Sporting CP,Arsenal,L,Sporting Cp
4,0,1,2024-12-10,Atalanta,Real Madrid,L,Atalanta,1,1,Real Madrid,Atalanta,W,Real Madrid
5,0,1,2024-02-20,Atletico Madrid,Inter,L,Atlético Madrid,1,0,Internazionale,Atlético Madrid,W,Inter
6,1,1,2024-03-13,Atletico Madrid,Inter,W,Atlético Madrid,0,0,Internazionale,Atlético Madrid,L,Inter
7,1,1,2024-04-10,Atletico Madrid,Dortmund,W,Atlético Madrid,0,0,Dortmund,Atlético Madrid,L,Dortmund
8,0,0,2024-04-16,Atletico Madrid,Dortmund,L,Atlético Madrid,1,0,Dortmund,Atlético Madrid,W,Dortmund
9,1,1,2024-09-19,Atletico Madrid,Rb Leipzig,W,Atlético Madrid,0,0,RB Leipzig,Atlético Madrid,L,Rb Leipzig


In [32]:
# Matches where the model predicts a win for team x and no win for its opponent;
# count how many of those team x actually won (1) or did not win (0).
one_sided_win = (merged_data["predicted_x"] == 1) & (merged_data["predicted_y"] == 0)
merged_data.loc[one_sided_win, "actual_x"].value_counts()

actual_x
1    18
0    15
Name: count, dtype: int64