In [1]:
import pandas as pd
import urllib.request
import os

# 1. Official Data Source (2022-2023 Season and 2023-2024 Season)
# We combine two seasons to give the Machine Learning model more to learn from
urls = [
    "https://www.football-data.co.uk/mmz4281/2324/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2223/E0.csv"
]
save_path = "../data/matches.csv"

# Only the network fetch is guarded: a failure here is reported and then
# re-raised so the notebook stops at this cell instead of failing later with a
# confusing NameError on `matches`.
print("Fetching data from official football archives... ‚öΩ")
try:
    all_data = [pd.read_csv(url) for url in urls]
except Exception as e:
    print(f"‚ùå Download failed: {e}")
    raise

# Combine seasons
matches_raw = pd.concat(all_data, ignore_index=True)

# 2. RENAME COLUMNS to match your project requirements
# HomeTeam -> team, AwayTeam -> opponent, FTR -> result
# We create two rows per match (one for home, one for away) to match the tutorial style
home_matches = matches_raw[['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']].copy()
home_matches.columns = ['date', 'team', 'opponent', 'result', 'gf', 'ga']
home_matches['venue'] = 'Home'
# FTR is written from the fixture's point of view (H/A/D); translate it to the
# home side's own W/L/D.
home_matches['result'] = home_matches['result'].map({'H': 'W', 'A': 'L', 'D': 'D'})

# Same match seen from the away side: team/opponent and gf/ga are swapped.
away_matches = matches_raw[['Date', 'AwayTeam', 'HomeTeam', 'FTR', 'FTAG', 'FTHG']].copy()
away_matches.columns = ['date', 'team', 'opponent', 'result', 'gf', 'ga']
away_matches['venue'] = 'Away'
# Flip the result for the away team
away_matches['result'] = away_matches['result'].map({'H': 'L', 'A': 'W', 'D': 'D'})

matches = pd.concat([home_matches, away_matches], ignore_index=True)

# 3. Save and Display
# Create the target folder first so a fresh checkout without ../data still works.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
matches.to_csv(save_path, index=False)
print(f"‚úÖ Success! {len(matches)} match-perspectives loaded and saved.")
display(matches.head())

Fetching data from official football archives... ‚öΩ
‚úÖ Success! 1520 match-perspectives loaded and saved.


Unnamed: 0,date,team,opponent,result,gf,ga,venue
0,11/08/2023,Burnley,Man City,L,0,3,Home
1,12/08/2023,Arsenal,Nott'm Forest,W,2,1,Home
2,12/08/2023,Bournemouth,West Ham,D,1,1,Home
3,12/08/2023,Brighton,Luton,W,4,1,Home
4,12/08/2023,Everton,Fulham,L,0,1,Home


In [2]:
# --- Feature engineering on the per-team match table ---

# Dates arrive as UK-style strings (DD/MM/YYYY), so parse them day-first.
matches["date"] = pd.to_datetime(matches["date"], dayfirst=True)

# Binary target: 1 when the row's team won, 0 for a draw or a loss.
matches["target"] = matches["result"].eq("W").astype("int")

# Venue flag: 1 for a home fixture, 0 for an away fixture.
matches["venue_code"] = matches["venue"].eq("Home").astype("int")

# Each distinct opponent name is replaced by its categorical integer code.
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

# Weekday of the fixture, Monday = 0 through Sunday = 6.
matches["day_code"] = matches["date"].dt.weekday

print("Preprocessing Complete! ‚öΩ")
# Preview only the identifying columns plus the newly derived ones.
preview_columns = ["date", "team", "opponent", "venue_code", "opp_code", "day_code", "target"]
display(matches[preview_columns].head())

Preprocessing Complete! ‚öΩ


Unnamed: 0,date,team,opponent,venue_code,opp_code,day_code,target
0,2023-08-11,Burnley,Man City,1,14,4,0
1,2023-08-12,Arsenal,Nott'm Forest,1,17,5,1
2,2023-08-12,Bournemouth,West Ham,1,21,5,0
3,2023-08-12,Brighton,Luton,1,13,5,1
4,2023-08-12,Everton,Fulham,1,9,5,0


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Baseline classifier: 50 trees, a node needs at least 10 samples before it
# may be split, and a fixed seed so the run is repeatable.
rf = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=10)

# Numeric columns the model is allowed to look at.
predictors = ["venue_code", "opp_code", "day_code"]

# Chronological split: everything dated before 2024 trains the model,
# everything from 1 January 2024 onwards is held out for evaluation.
played_before_2024 = matches["date"] < '2024-01-01'
played_from_2024 = matches["date"] >= '2024-01-01'
train = matches[played_before_2024]
test = matches[played_from_2024]

# Fit on the earlier matches, then predict the held-out ones.
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])

# Share of held-out rows whose win / not-win label was predicted correctly.
acc = accuracy_score(test["target"], preds)
print(f"Initial Model Accuracy: {acc*100:.2f}%")

Initial Model Accuracy: 61.41%


In [4]:
import os
import urllib.request

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# 1. EXPANDED DATA SOURCE (Adding 24/25 and 25/26)
# Format: https://www.football-data.co.uk/mmz4281/YYZZ/E0.csv
urls = [
    "https://www.football-data.co.uk/mmz4281/2526/E0.csv", # Current Season (Ongoing)
    "https://www.football-data.co.uk/mmz4281/2425/E0.csv", # Last Season
    "https://www.football-data.co.uk/mmz4281/2324/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2223/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2122/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2021/E0.csv",
    "https://www.football-data.co.uk/mmz4281/1920/E0.csv"
]

# Every column the pipeline reads below. A row missing any of them (blank
# trailer lines, or a fixture with no final score yet) cannot be used: without
# this filter such a row would be mapped to a NaN result and then silently
# labelled target=0.
required_cols = ['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']

print("Fetching 7 seasons of Premier League data (including 2026)... ‚öΩ")
all_seasons = []
for url in urls:
    try:
        # We use low_memory=False to handle mixed types in current season data
        df_season = pd.read_csv(url, low_memory=False)
        # Drop empty or result-less rows (see required_cols above)
        df_season = df_season.dropna(subset=required_cols)
        all_seasons.append(df_season)
        print(f"‚úÖ Loaded: {url.split('/')[-2]}")
    except Exception as e:
        # Best effort per season: one unavailable file must not abort the rest.
        print(f"‚ö†Ô∏è Skipping {url} (Season might not have started or link changed). Error: {e}")

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what
# actually went wrong instead.
if not all_seasons:
    raise ValueError("No season could be loaded from any of the configured URLs; cannot build the dataset.")

raw_data = pd.concat(all_seasons, ignore_index=True)

# 2. STANDARDISE COLUMNS
# Home perspective: FTR (H/A/D) is translated into the home side's own W/L/D.
home = raw_data[['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']].copy()
home.columns = ['date', 'team', 'opponent', 'result', 'gf', 'ga']
home['venue'] = 'Home'
home['result'] = home['result'].map({'H': 'W', 'A': 'L', 'D': 'D'})

# Away perspective: team/opponent and gf/ga are swapped and the result flipped.
away = raw_data[['Date', 'AwayTeam', 'HomeTeam', 'FTR', 'FTAG', 'FTHG']].copy()
away.columns = ['date', 'team', 'opponent', 'result', 'gf', 'ga']
away['venue'] = 'Away'
away['result'] = away['result'].map({'H': 'L', 'A': 'W', 'D': 'D'})

# Two rows per fixture: one from each side's point of view.
df_combined = pd.concat([home, away], ignore_index=True)

# 3. PREPROCESSING
# Source dates are day-first (UK format).
df_combined["date"] = pd.to_datetime(df_combined["date"], dayfirst=True)
# 1 = win for the row's team, 0 = draw or loss.
df_combined["target"] = (df_combined["result"] == "W").astype("int")
df_combined["venue_code"] = (df_combined["venue"] == "Home").astype("int")

# Important: We re-calculate categorical codes to include any new teams promoted in 25/26
df_combined["opp_code"] = df_combined["opponent"].astype("category").cat.codes
# Monday = 0 ... Sunday = 6
df_combined["day_code"] = df_combined["date"].dt.dayofweek

# 4. ROLLING AVERAGES LOGIC
def make_rolling_averages(group, cols, new_cols, window=3):
    """Add trailing averages of previous matches to one team's match rows.

    The rows are ordered by ``date`` and, for every row, the mean of each
    column in ``cols`` is taken over the ``window`` rows immediately before it.
    The current row is excluded from its own window, so a match's own score
    never feeds its feature.

    Args:
        group: DataFrame with a ``date`` column and every column in ``cols``.
            It is not modified; a sorted copy is returned.
        cols: Names of the numeric columns to average.
        new_cols: Names for the resulting columns, positionally matched to
            ``cols``.
        window: Number of preceding rows to average (default 3).

    Returns:
        The date-sorted rows with ``new_cols`` added, minus the rows that do
        not yet have ``window`` earlier observations.

    Raises:
        ValueError: If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError(
            f"cols and new_cols must have the same length "
            f"(got {len(cols)} and {len(new_cols)})"
        )

    ordered = group.sort_values("date")
    # closed='left' leaves the current row out of its own window.
    prior_means = ordered[cols].rolling(window, closed='left').mean()
    ordered[new_cols] = prior_means
    # Rows without a full window of history have NaN features; drop them.
    return ordered.dropna(subset=new_cols)

# Columns to average and the names of the features derived from them.
cols = ["gf", "ga"]
new_cols = [f"{c}_rolling" for c in cols]

# 5. APPLY ROLLING STATS
# Form is computed club by club so one team's results never leak into
# another's averages. Each club's rows are copied before being handed over.
all_teams_data = [
    make_rolling_averages(
        df_combined[df_combined['team'] == club].copy(), cols, new_cols
    )
    for club in df_combined['team'].unique()
]

# Stitch the per-club frames back together under a fresh 0..n-1 index.
app_data = pd.concat(all_teams_data).reset_index(drop=True)

# 6. TRAIN REINFORCED MODEL
# The baseline predictors plus the two rolling-form features.
predictors = ["venue_code", "opp_code", "day_code", "gf_rolling", "ga_rolling"]
rf_reinforced = RandomForestClassifier(n_estimators=200, min_samples_split=5, random_state=1)
# Fitted on every available row: this is the model that gets saved, so no
# hold-out set is carved off in this cell.
rf_reinforced.fit(app_data[predictors], app_data["target"])

# 7. SAVE REINFORCED FILES
model_file = '../outputs/epl_model.pkl'
processed_file = '../data/processed_matches.csv'
# Neither joblib.dump nor to_csv creates missing parent folders, so make sure
# they exist before writing (a fresh checkout may not have them yet).
for target_file in (model_file, processed_file):
    os.makedirs(os.path.dirname(target_file), exist_ok=True)

joblib.dump(rf_reinforced, model_file)
app_data.to_csv(processed_file, index=False)

print(f"\n--- 2026 MODEL UPGRADE COMPLETE ---")
print(f"Total Match Perspectives: {len(app_data)}")
print(f"‚úÖ Model trained on 7 seasons and saved! Your predictions are now up-to-date.")

Fetching 7 seasons of Premier League data (including 2026)... ‚öΩ
‚úÖ Loaded: 2526
‚úÖ Loaded: 2425
‚úÖ Loaded: 2324
‚úÖ Loaded: 2223
‚úÖ Loaded: 2122
‚úÖ Loaded: 2021
‚úÖ Loaded: 1920

--- 2026 MODEL UPGRADE COMPLETE ---
Total Match Perspectives: 4994
‚úÖ Model trained on 7 seasons and saved! Your predictions are now up-to-date.


In [5]:
import os
import joblib

# --- PROJECT ROOT DISCOVERY ---
# The notebook may be started either from the project root or from its
# 'notebooks' sub-folder. Only the LAST path component is inspected: a plain
# substring test on the whole path would misfire whenever some ancestor
# directory merely contains the text "notebooks".
current_path = os.getcwd()
if os.path.basename(current_path) == "notebooks":
    # If we are inside /notebooks, go up one level to root
    project_root = os.path.dirname(current_path)
else:
    # Otherwise the working directory is taken to be the root
    project_root = current_path

# Define absolute paths for the data and outputs folders
data_dir = os.path.join(project_root, "data")
output_dir = os.path.join(project_root, "outputs")

# Create folders if they don't exist (Safety first)
os.makedirs(data_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Define final file paths
model_path = os.path.join(output_dir, "epl_model.pkl")
processed_data_path = os.path.join(data_dir, "processed_matches.csv")

# --- SAVE THE FILES ---
# Persists the model and feature table produced by the training cell.
print("Saving system files... üíæ")
joblib.dump(rf_reinforced, model_path)
app_data.to_csv(processed_data_path, index=False)

# --- VERIFICATION CHECK ---
# Confirm both artefacts are really on disk before reporting success.
print("-" * 30)
if os.path.exists(model_path) and os.path.exists(processed_data_path):
    print("‚úÖ SUCCESS: Files saved to correct project folders!")
    print(f"Model Location: {model_path}")
    print(f"Data Location: {processed_data_path}")
else:
    print("‚ùå FAILED: Files were not found after saving. Check your folder structure.")
print("-" * 30)

Saving system files... üíæ
------------------------------
‚úÖ SUCCESS: Files saved to correct project folders!
Model Location: /Users/kenesuowolowolo/Documents/epl-match-prediction/outputs/epl_model.pkl
Data Location: /Users/kenesuowolowolo/Documents/epl-match-prediction/data/processed_matches.csv
------------------------------
