In [63]:
import pandas as pd
import urllib.request
import os

# 1. Official Data Source (2022-2023 Season and 2023-2024 Season)
# We combine two seasons to give the Machine Learning model more to learn from
urls = [
    "https://www.football-data.co.uk/mmz4281/2324/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2223/E0.csv"
]
save_path = "../data/matches.csv"

# Only the network fetch is wrapped in the try-block, so the "Download failed"
# message is printed exclusively for download problems. The error is re-raised
# because every later cell needs `matches`; swallowing it would only postpone
# the failure to a confusing NameError further down the notebook.
print("Fetching data from official football archives... ⚽")
try:
    all_data = []
    for url in urls:
        df = pd.read_csv(url)
        all_data.append(df)
except Exception as e:
    print(f"❌ Download failed: {e}")
    raise

# Combine seasons into one frame with a fresh 0..n-1 index
matches_raw = pd.concat(all_data, ignore_index=True)

# Defensive: drop rows that lack a team name or a full-time result, otherwise
# they would silently turn into NaN results in the mapping step below.
matches_raw = matches_raw.dropna(subset=['HomeTeam', 'AwayTeam', 'FTR'])

# 2. RENAME COLUMNS to match your project requirements
# HomeTeam -> team, AwayTeam -> opponent, FTR -> result
# We create two rows per match (one for home, one for away) to match the tutorial style
home_matches = matches_raw[['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']].copy()
home_matches.columns = ['date', 'team', 'opponent', 'result', 'gf', 'ga']
home_matches['venue'] = 'Home'

# For the away perspective the team/opponent and goals-for/goals-against
# columns are selected in swapped order before renaming.
away_matches = matches_raw[['Date', 'AwayTeam', 'HomeTeam', 'FTR', 'FTAG', 'FTHG']].copy()
away_matches.columns = ['date', 'team', 'opponent', 'result', 'gf', 'ga']
away_matches['venue'] = 'Away'

# Translate the H/A/D code into W/L/D from each row's own team perspective
# (a home win 'H' is a loss for the away side, and vice versa).
away_matches['result'] = away_matches['result'].map({'H': 'L', 'A': 'W', 'D': 'D'})
home_matches['result'] = home_matches['result'].map({'H': 'W', 'A': 'L', 'D': 'D'})

matches = pd.concat([home_matches, away_matches], ignore_index=True)

# 3. Save and Display
# Create the target directory first so a fresh checkout without ../data works.
os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
matches.to_csv(save_path, index=False)
print(f"✅ Success! {len(matches)} match-perspectives loaded and saved.")
display(matches.head())

Fetching data from official football archives... ⚽
✅ Success! 1520 match-perspectives loaded and saved.


Unnamed: 0,date,team,opponent,result,gf,ga,venue
0,11/08/2023,Burnley,Man City,L,0,3,Home
1,12/08/2023,Arsenal,Nott'm Forest,W,2,1,Home
2,12/08/2023,Bournemouth,West Ham,D,1,1,Home
3,12/08/2023,Brighton,Luton,W,4,1,Home
4,12/08/2023,Everton,Fulham,L,0,1,Home


In [64]:
# Step 1: parse the date strings into real datetimes.
# dayfirst=True because the source uses the UK layout (DD/MM/YYYY).
matches["date"] = pd.to_datetime(matches["date"], dayfirst=True)

# Step 2: binary target -- 1 for a win, 0 for a draw or a loss.
matches["target"] = matches["result"].eq("W").astype("int")

# Step 3: venue flag -- 1 when the row's team played at home, 0 when away.
matches["venue_code"] = matches["venue"].eq("Home").astype("int")

# Step 4: one integer code per distinct opponent name.
matches["opp_code"] = pd.Categorical(matches["opponent"]).codes

# Step 5: weekday number of the fixture (Monday = 0 ... Sunday = 6).
matches["day_code"] = matches["date"].dt.weekday

print("Preprocessing Complete! ⚽")

# Preview only the identifying columns plus the freshly derived features.
preview_cols = ["date", "team", "opponent", "venue_code", "opp_code", "day_code", "target"]
display(matches[preview_cols].head())

Preprocessing Complete! ⚽


Unnamed: 0,date,team,opponent,venue_code,opp_code,day_code,target
0,2023-08-11,Burnley,Man City,1,14,4,0
1,2023-08-12,Arsenal,Nott'm Forest,1,17,5,1
2,2023-08-12,Bournemouth,West Ham,1,21,5,0
3,2023-08-12,Brighton,Luton,1,13,5,1
4,2023-08-12,Everton,Fulham,1,9,5,0


In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Features the baseline model is allowed to see.
predictors = ["venue_code", "opp_code", "day_code"]

# Chronological hold-out: fit on fixtures before 2024, evaluate on 2024 fixtures.
cutoff = '2024-01-01'
train = matches.loc[matches["date"] < cutoff]
test = matches.loc[matches["date"] >= cutoff]

# Random forest with 50 trees; a node needs at least 10 samples before it may
# be split further. random_state pins the result across re-runs.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

# Fit on the training window, then predict the held-out window.
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])

# Share of held-out rows whose win / not-win label was predicted correctly.
acc = accuracy_score(test["target"], preds)
print(f"Initial Model Accuracy: {acc*100:.2f}%")

Initial Model Accuracy: 61.41%


In [66]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier

# Re-read the per-team match table from disk so this cell does not depend on
# in-memory frames built by earlier cells.
df_raw = pd.read_csv("../data/matches.csv")

# Derive all model features in one chained step. Each lambda receives the
# frame as built so far, so `day_code` sees the already-parsed `date`.
df_raw = df_raw.assign(
    date=lambda frame: pd.to_datetime(frame["date"], dayfirst=True),
    target=lambda frame: frame["result"].eq("W").astype("int"),
    venue_code=lambda frame: frame["venue"].eq("Home").astype("int"),
    opp_code=lambda frame: frame["opponent"].astype("category").cat.codes,
    day_code=lambda frame: frame["date"].dt.dayofweek,
)

# 3. ROLLING AVERAGES LOGIC
def make_rolling_averages(group, cols, new_cols, window=3):
    """Add trailing-average features computed from a group's *previous* rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a ``"date"`` column and every name in
        ``cols``. The caller's frame is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the resulting average columns, positionally paired with
        ``cols``.
    window : int, default 3
        Number of preceding rows that make up each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows that do
        not yet have ``window`` earlier rows (NaN average) are dropped.
    """
    # sort_values returns a new frame, so the assignment below never touches
    # the object the caller passed in.
    group = group.sort_values("date")
    # closed='left' leaves the current row out of its own window, so the
    # feature only reflects rows that come earlier in date order.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    # The first `window` rows have an incomplete history and therefore NaN.
    return group.dropna(subset=new_cols)

cols = ["gf", "ga"]
new_cols = [f"{c}_rolling" for c in cols]

# 4. Build rolling features team by team.
# Iterating the groupby (instead of groupby.apply) hands each team's rows over
# as a normal DataFrame, so the 'team' column stays intact. sort=False keeps
# the teams in order of first appearance, and the frame is split once rather
# than rescanned with a boolean mask for every team.
all_teams = []
for team, team_data in df_raw.groupby("team", sort=False):
    team_with_rolling = make_rolling_averages(team_data, cols, new_cols)
    all_teams.append(team_with_rolling)

app_data = pd.concat(all_teams).reset_index(drop=True)

# 5. TRAIN FINAL MODEL
# Fitted on every row that has rolling features; no hold-out split is made here.
predictors = ["venue_code", "opp_code", "day_code", "gf_rolling", "ga_rolling"]
rf_final = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
rf_final.fit(app_data[predictors], app_data["target"])

# 6. SAVE EVERYTHING
# Create the destination folders first so the dump / CSV write do not fail on
# a fresh checkout where ../outputs or ../data is missing.
from pathlib import Path

model_path = '../outputs/epl_model.pkl'
processed_path = '../data/processed_matches.csv'
for out_path in (model_path, processed_path):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(rf_final, model_path)
app_data.to_csv(processed_path, index=False)

print("--- CLEAN SETUP COMPLETE ---")
print(f"✅ Success! Found 'team' column: {'team' in app_data.columns}")
print(f"✅ Verified Team Name: {app_data['team'].iloc[0]}") # Should be a name, not a number
display(app_data[['team', 'opponent', 'gf_rolling']].head())

--- CLEAN SETUP COMPLETE ---
✅ Success! Found 'team' column: True
✅ Verified Team Name: Burnley


Unnamed: 0,team,opponent,gf_rolling
0,Burnley,Nott'm Forest,1.0
1,Burnley,Man United,1.333333
2,Burnley,Newcastle,1.0
3,Burnley,Luton,0.333333
4,Burnley,Chelsea,0.666667
