In [None]:
'''

ENGLISH PREMIER LEAGUE CHAMPION PREDICTION PROJECT

TECHNIQUE 3: Simulation (Monte Carlo) Approach


Dataset:
Premier League Matches 1992-2022
https://www.kaggle.com/datasets/evangower/premier-league-matches-19922022/data?select=premier-league-matches.csv

'''

'\n\nENGLISH PREMIER LEAGUE CHAMPION PREDICTION PROJECT\n\nTECHNIQUE 3: Simulation (Monte Carlo) Approach\n\n\nDataset:\nPremier League Matches 1992-2022\nhttps://www.kaggle.com/datasets/evangower/premier-league-matches-19922022/data?select=premier-league-matches.csv\n\n'

# **IMPORT NECESSARY MODULES**

**Load Modules**

In [None]:
# data
import numpy as np
import pandas as pd

# Randomization
import random

# Evaluate Performance
from sklearn.metrics import mean_absolute_error

# **LOAD & PREPROCESS DATASET**

**Correctly choosing the Datasets folder and file**

Ensures folder is correct and found correctly with the "../datasets" etc

In [None]:
import os

original_path = "/content/Datasets"
target_path = "/content/../Datasets"
download_url = "https://raw.githubusercontent.com/KraeBM/PremierPredictionTool/refs/heads/master/Datasets/premier-league-matches.csv"
csv_filename = "premier-league-matches.csv"

#first if the final target exists, it skips everything
if not os.path.exists(target_path):
  # fixes issue regarding folder not being present in content folder
# - would lead to errors since dataset folder never existed.
   if not os.path.exists(original_path):
    os.makedirs(original_path)
    print(f"'Datasets' folder created at {original_path}")

# This downloads the CSV file into the original_path folder if it doesn’t exist
csv_path = os.path.join(original_path, csv_filename)
if not os.path.exists(csv_path):
    os.system(f"wget -O {csv_path} {download_url}")
    print(f"Downloaded '{csv_filename}' into '{original_path}'")
else:
    print(f"'{csv_filename}' already exists in '{original_path}'")

# Check if the original path exists before renaming - also moves it out of the content folder , for preventing later issues in pycharm
if os.path.exists(original_path):
    os.rename(original_path, target_path)
elif os.path.exists(target_path):
    print("Datasets folder is already in the correct location.")

'Datasets' folder created at /content/Datasets
Downloaded 'premier-league-matches.csv' into '/content/Datasets'


**Load Dataset**

reads and loads the dataset into a pandas dataframe

In [None]:
premier_league_matches_dataset = pd.read_csv('../Datasets/premier-league-matches.csv')

**Read Dataset**

In [None]:
  print("Read Dataset Completed Sucessfully\n")
    #Verifies if data has been loaded correctly and displays structure of dataset
  premier_league_matches_dataset.head(5)

Read Dataset Completed Sucessfully



Unnamed: 0,Season_End_Year,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H
1,1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H
2,1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H
3,1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D
4,1993,1,1992-08-15,Arsenal,2,4,Norwich City,A


# **GENERATE LEAGUE STANDINGS**

In [None]:
# Create an empty list to store league tables
league_standings = []

# Process standings for each season
for season in premier_league_matches_dataset['Season_End_Year'].unique():
    # Filter matches for this season and stores it
    season_matches = premier_league_matches_dataset[premier_league_matches_dataset['Season_End_Year'] == season]

    # Initialize a dictionary to track team preformance stats
    team_stats = {}

    for _, match in season_matches.iterrows():
        home_team = match['Home']
        away_team = match['Away']
        home_goals = match['HomeGoals']
        away_goals = match['AwayGoals']
        result = match['FTR']   # 'H' = Home win,'A' = Away win,'D' = Draw

        # Ensure teams exist in dictionary
        if home_team not in team_stats:
            team_stats[home_team] = {'Pld': 0, 'W': 0, 'D': 0, 'L': 0, 'GF': 0, 'GA': 0, 'GD': 0, 'Pts': 0}
        if away_team not in team_stats:
            team_stats[away_team] = {'Pld': 0, 'W': 0, 'D': 0, 'L': 0, 'GF': 0, 'GA': 0, 'GD': 0, 'Pts': 0}

        # Update stats for home team
        team_stats[home_team]['Pld'] += 1
        team_stats[home_team]['GF'] += home_goals
        team_stats[home_team]['GA'] += away_goals
        team_stats[home_team]['GD'] = team_stats[home_team]['GF'] - team_stats[home_team]['GA']

        # Update stats for away team
        team_stats[away_team]['Pld'] += 1
        team_stats[away_team]['GF'] += away_goals
        team_stats[away_team]['GA'] += home_goals
        team_stats[away_team]['GD'] = team_stats[away_team]['GF'] - team_stats[away_team]['GA']

        # Assign points based on result
        if result == 'H':  # Home win
            team_stats[home_team]['W'] += 1
            team_stats[home_team]['Pts'] += 3
            team_stats[away_team]['L'] += 1
        elif result == 'A':  # Away win
            team_stats[away_team]['W'] += 1
            team_stats[away_team]['Pts'] += 3
            team_stats[home_team]['L'] += 1
        else:  # Draw
            team_stats[home_team]['D'] += 1
            team_stats[away_team]['D'] += 1
            team_stats[home_team]['Pts'] += 1
            team_stats[away_team]['Pts'] += 1

    # Convert dictionary to DataFrame
    standings_df = pd.DataFrame.from_dict(team_stats, orient='index')
    standings_df.insert(0, 'Team', standings_df.index)
    standings_df.insert(1, 'Season', season)

    # Sort by Points -> GD -> GF
    standings_df = standings_df.sort_values(by=['Pts', 'GD', 'GF'], ascending=[False, False, False]).reset_index(drop=True)

    # Assign position (Rank)
    standings_df.insert(0, 'Pos', standings_df.index + 1)

    # **Filter to only keep the top 10 teams per season**
    standings_df = standings_df.head(10)

    # Append standings for this season
    league_standings.append(standings_df)

# Combine all seasons into one DataFrame
final_league_standings = pd.concat(league_standings, ignore_index=True)

# Define the path where you want to save the file in Google Drive
save_path = "../Datasets/Premier_League_Standings.csv"

# Save the DataFrame to Google Drive - found in Datasets folder !
final_league_standings.to_csv(save_path, index=False)

print(f"File saved successfully at: {save_path}")

# KEY
print('\nKEY:\nPos: League Ranking\nPld: Played\nW: Won\nD: Drawn\nL: Lost\nGF: Goals For\nGA: Goals Against\nGD: Goal Difference\nPts: Points\n')

# Display the final league standings - shows it works :D
final_league_standings.head(200)  # Display first 5 rows

File saved successfully at: ../Datasets/Premier_League_Standings.csv

KEY:
Pos: League Ranking
Pld: Played
W: Won
D: Drawn
L: Lost
GF: Goals For
GA: Goals Against
GD: Goal Difference
Pts: Points



Unnamed: 0,Pos,Team,Season,Pld,W,D,L,GF,GA,GD,Pts
0,1,Manchester Utd,1993,42,24,12,6,67,31,36,84
1,2,Aston Villa,1993,42,21,11,10,57,40,17,74
2,3,Norwich City,1993,42,21,9,12,61,65,-4,72
3,4,Blackburn,1993,42,20,11,11,68,46,22,71
4,5,QPR,1993,42,17,12,13,63,55,8,63
...,...,...,...,...,...,...,...,...,...,...,...
195,6,Chelsea,2012,38,18,10,10,65,46,19,64
196,7,Everton,2012,38,15,11,12,50,40,10,56
197,8,Liverpool,2012,38,14,10,14,47,40,7,52
198,9,Fulham,2012,38,14,10,14,48,51,-3,52


#  Generating TEAM STRENGTHS and weaknesses + computing historical averages



Loads the newly made Dataset

In [None]:
data = pd.read_csv('../../Datasets/Premier_League_Standings.csv')

**Compute historical performance averages (Points, Goals, etc.)**

In [None]:
# Give much higher importance to last 5 years
# e.g. 2018 = 1 + (2018-2015)/5 = 1.6 (more important that 2015 which was 1)
data["SeasonWeight"] = data["Season"].apply(lambda x: 1 if x < 2015 else (1 + (x - 2015) / 5))

# Compute weighted team performance
team_performance = data.groupby("Team").apply(lambda x: pd.Series({
    "Weighted_Pts": np.average(x["Pts"], weights=x["SeasonWeight"]),
    "GF": np.average(x["GF"], weights=x["SeasonWeight"]),
    "GA": np.average(x["GA"], weights=x["SeasonWeight"]),
    "GD": np.average(x["GD"], weights=x["SeasonWeight"]),
    "W": np.average(x["W"], weights=x["SeasonWeight"]),
    "D": np.average(x["D"], weights=x["SeasonWeight"]),
    "L": np.average(x["L"], weights=x["SeasonWeight"]),
    "Pld": np.average(x["Pld"], weights=x["SeasonWeight"])
})).reset_index()

  team_performance = data.groupby("Team").apply(lambda x: pd.Series({


**Normalize team strengths**

In [None]:
 # Calculates the teams Strength, Weakness and Win rate
team_performance["Attack_Strength"] = team_performance["GF"] / team_performance["Pld"]
team_performance["Defense_Strength"] = team_performance["GA"] / team_performance["Pld"]
team_performance["Win_Rate"] = team_performance["W"] / team_performance["Pld"]

print(team_performance.head(5))  # Verify and display the computed values with 5 teams

              Team  Weighted_Pts         GF         GA         GD          W  \
0          Arsenal     71.833333  68.489247  39.489247  29.000000  21.005376   
1      Aston Villa     58.215909  50.602273  44.920455   5.681818  15.840909   
2  Birmingham City     50.000000  40.500000  47.500000  -7.000000  12.500000   
3        Blackburn     62.909091  57.272727  46.636364  10.636364  17.818182   
4           Bolton     55.750000  48.250000  48.250000   0.000000  15.250000   

           D          L        Pld  Attack_Strength  Defense_Strength  \
0   8.817204   8.392473  38.215054         1.792206          1.033343   
1  10.693182  11.920455  38.454545         1.315898          1.168144   
2  12.500000  13.000000  38.000000         1.065789          1.250000   
3   9.454545  11.818182  39.090909         1.465116          1.193023   
4  10.000000  12.750000  38.000000         1.269737          1.269737   

   Win_Rate  
0  0.549662  
1  0.411939  
2  0.328947  
3  0.455814  
4  0.40131

**Convert to dictionary for fast lookup**



In [None]:
#Converts into dictionary for fast lookup for later
team_stats_dict = team_performance.set_index("Team")[
    ["Attack_Strength", "Defense_Strength", "Win_Rate"]
].to_dict(orient="index")

# **Define a Monte Carlo simulation function**

In [None]:
def simulate_match(home_team, away_team):
    """Simulates a match outcome using historical performance metrics with valid probabilities."""
   #checks if home team is present in the team stats dictionary.
    if home_team not in team_stats_dict or away_team not in team_stats_dict:
        return np.random.choice(["H", "D", "A"], p=[1/3, 1/3, 1/3])  # No data = fair randomised probability of winning.#
                                                                     # Each outcome has a 1/3 of winning #

    # increase/decrease the strength by 5% to simulate realistic outcomes.
    home_attack = team_stats_dict[home_team]["Attack_Strength"] * random.uniform(0.95, 1.05)
    away_attack = team_stats_dict[away_team]["Attack_Strength"] * random.uniform(0.95, 1.05)
    home_defense = team_stats_dict[home_team]["Defense_Strength"] * random.uniform(0.95, 1.05)
    away_defense = team_stats_dict[away_team]["Defense_Strength"] * random.uniform(0.95, 1.05)

    # Compute expected goals (Ensure no negative values)
    home_expected_goals = max((home_attack + away_defense) / 2, 0.1)  # Minimum 0.1 to avoid zero division
    away_expected_goals = max((away_attack + home_defense) / 2, 0.1)

    # Compute probabilities (Ensure they sum to 1)
    total_goals = home_expected_goals + away_expected_goals
    home_win_prob = home_expected_goals / total_goals
    away_win_prob = away_expected_goals / total_goals
    draw_prob = max(1 - (home_win_prob + away_win_prob), 0)  # Ensure it's not negative

    # Adjust Draw Probability Dynamically (Avoid 0% Draws)
    #Prevents it being too low /too high by comparing strengths of both teams
    draw_prob = max(1 - (home_win_prob + away_win_prob), min(0.3, abs(home_expected_goals - away_expected_goals) / 3))

    # Normalize probabilities to sum exactly to 1
    total_prob = home_win_prob + draw_prob + away_win_prob
    home_win_prob /= total_prob
    away_win_prob /= total_prob
    draw_prob /= total_prob

    #Debugging of Simulation #
    print(f"\n Match Simulation: {home_team} (Home) vs {away_team} (Away)")
    print(f"-----------------------------------------------------")
    print(f" Adjusted Attack & Defense Strengths:")  #in 3 decimal Places
    print(f" {home_team} - Attack: {home_attack:.3f}, Defense: {home_defense:.3f}")
    print(f" {away_team} - Attack: {away_attack:.3f}, Defense: {away_defense:.3f}")

    print(f"\n Expected Goals:")
    print(f"   {home_team} Expected Goals: {home_expected_goals:.3f}")
    print(f"   {away_team} Expected Goals: {away_expected_goals:.3f}")

    print(f"\n Win/Loss/Draw Probabilities:")
    print(f"   {home_team} Win Probability: {home_win_prob:.4f} ({home_win_prob * 100:.2f}%)") #Made as a Precentage
    print(f"   Draw Probability: {draw_prob:.4f} ({draw_prob * 100:.2f}%)")
    print(f"   {away_team} Win Probability: {away_win_prob:.4f} ({away_win_prob * 100:.2f}%)")#Made as a Precentage

    print(f"\n Probability Sum Check: {home_win_prob + draw_prob + away_win_prob:.4f} (Should be 1.0000)")
    print(f"-----------------------------------------------------\n")


    return np.random.choice(["H", "D", "A"], p=[home_win_prob, draw_prob, away_win_prob])

    #Testing of Monte Carlo simulation #

home_team = "Manchester City"
away_team = "Liverpool"

# Run the simulation for testing
for i in range(1):
    result = simulate_match(home_team, away_team)
    print(f"Match {i+1}: {home_team} vs {away_team} -> Result: {result}")


 Match Simulation: Manchester City (Home) vs Liverpool (Away)
-----------------------------------------------------
 Adjusted Attack & Defense Strengths:
 Manchester City - Attack: 2.040, Defense: 0.926
 Liverpool - Attack: 1.844, Defense: 0.989

 Expected Goals:
   Manchester City Expected Goals: 1.515
   Liverpool Expected Goals: 1.385

 Win/Loss/Draw Probabilities:
   Manchester City Win Probability: 0.5007 (50.07%)
   Draw Probability: 0.0416 (4.16%)
   Liverpool Win Probability: 0.4577 (45.77%)

 Probability Sum Check: 1.0000 (Should be 1.0000)
-----------------------------------------------------

Match 1: Manchester City vs Liverpool -> Result: A


# **Simulate an Entire Premier League Season**

In [None]:
def simulate_future_season(teams):
    """Simulates a generic future Premier League season using historical team performance metrics."""
    standings = {team: 0 for team in teams}  # Initialize points table starting with 0
    matches = [(home, away) for home in teams for away in teams if home != away]

    print("Example Matches:", matches[:5])  # shows example of first 5 matches

    # Use historical team stats to simulate match outcomes
    results = [simulate_match(home, away) for home, away in matches]  #Calls the monte carlo simulation  for each match

      # Print sample results
    print(" Debug: First 5 Simulated Match Results:")
    for i in range(5):
        print(f"  {matches[i][0]} vs {matches[i][1]} -> Result: {results[i]}")


    for (home, away), result in zip(matches, results):
        if result == "H":
            standings[home] += 3
        elif result == "D":
            standings[home] += 1
            standings[away] += 1
        else:
            standings[away] += 3

    df = pd.DataFrame(standings.items(), columns=["Team", "Points"])

    # Add a small random noise to prevent extreme overconfidence
    df["Points"] += np.random.uniform(-3, 3, len(df))
    # Sort again after adding noise
    df = df.sort_values(by="Points", ascending=False)

    # Print final top 5 teams
    print("\n Debug : Final Top 5 Teams:\n", df.head())

    return df


# **Run Monte Carlo Simulations (1,000 Seasons)**

In [None]:
# Get teams that played in the latest season
latest_season = data["Season"].max()
valid_teams = data[data["Season"] == latest_season]["Team"].tolist()

# Filter historical teams to only those still in the Premier League
historical_teams = [team for team in team_performance["Team"] if team in valid_teams]

# Run Monte Carlo Simulation (1,000 seasons)
num_simulations = 1000
final_rankings = {team: [] for team in historical_teams}

for _ in range(num_simulations):
    season_results = simulate_future_season(historical_teams)
    for rank, row in season_results.iterrows():
        if row["Team"] in final_rankings:  # Ensure team exists before adding
            final_rankings[row["Team"]].append(rank + 1)  # Store ranking for each team

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 Adjusted Attack & Defense Strengths:
 Aston Villa - Attack: 1.305, Defense: 1.136
 Arsenal - Attack: 1.877, Defense: 1.010

 Expected Goals:
   Aston Villa Expected Goals: 1.158
   Arsenal Expected Goals: 1.507

 Win/Loss/Draw Probabilities:
   Aston Villa Win Probability: 0.3892 (38.92%)
   Draw Probability: 0.1042 (10.42%)
   Arsenal Win Probability: 0.5066 (50.66%)

 Probability Sum Check: 1.0000 (Should be 1.0000)
-----------------------------------------------------


 Match Simulation: Aston Villa (Home) vs Brentford (Away)
-----------------------------------------------------
 Adjusted Attack & Defense Strengths:
 Aston Villa - Attack: 1.335, Defense: 1.184
 Brentford - Attack: 1.579, Defense: 1.206

 Expected Goals:
   Aston Villa Expected Goals: 1.271
   Brentford Expected Goals: 1.382

 Win/Loss/Draw Probabilities:
   Aston Villa Win Probability: 0.4620 (46.20%)
   Draw Probability: 0.0357 (3.57%)
   Brentford 

# **Compute Final Predicted Standings**

In [None]:
final_rankings_df = pd.DataFrame({team: np.mean(ranks) for team, ranks in final_rankings.items()}, index=["Avg Position"]).T
final_rankings_df = final_rankings_df.sort_values(by="Avg Position").head(10)

# Print top 10 predicted teams
print("\n **Predicted Future Premier League Standings (Using Historical Data):**\n")
print(final_rankings_df)


 **Predicted Future Premier League Standings (Using Historical Data):**

                 Avg Position
Arsenal                   1.0
Aston Villa               2.0
Brentford                 3.0
Brighton                  4.0
Fulham                    5.0
Liverpool                 6.0
Manchester City           7.0
Manchester Utd            8.0
Newcastle Utd             9.0
Tottenham                10.0


## **Evaluate Performance**

**Compute MAE against Actual 2022-23 Standings**

In [None]:
# Extract actual 2022-23 rankings
latest_season = data["Season"].max()
latest_season_data = data[data["Season"] == latest_season]
actual_rankings_top_10 = dict(zip(latest_season_data["Team"], latest_season_data["Pos"]))

# Convert predicted rankings to a dictionary
predicted_rankings = final_rankings_df["Avg Position"].to_dict()

# Ensure both actual and predicted rankings have the same teams
common_teams = set(actual_rankings_top_10.keys()) & set(predicted_rankings.keys())

# Compute MAE
actual_values = [actual_rankings_top_10[team] for team in common_teams]
predicted_values = [predicted_rankings[team] for team in common_teams]

mae = mean_absolute_error(actual_values, predicted_values)
print(f" Mean Absolute Error (MAE) for Future Predictions vs 2022-23: {mae}")

 Mean Absolute Error (MAE) for Future Predictions vs 2022-23: 3.8


# **Predict the Most Likely Champion**

In [None]:
winner_counts = {team: final_rankings[team].count(1) for team in final_rankings.keys()}
winner_probabilities = {team: count / num_simulations for team, count in winner_counts.items()}

# Convert to DataFrame
winner_df = pd.DataFrame(list(winner_probabilities.items()), columns=["Team", "Win Probability"])
winner_df = winner_df.sort_values(by="Win Probability", ascending=False)

# Print the most likely winner
print("\n🏆 **Most Likely Premier League Winner (Based on Simulations):**\n")
print(winner_df.head(1))  # Show the most likely champion


🏆 **Most Likely Premier League Winner (Based on Simulations):**

      Team  Win Probability
0  Arsenal              1.0
