In [1]:
import pandas as pd

# 1. Load our clean, processed dataset
df = pd.read_csv(r'C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\cleaned_matches.csv')
df['date'] = pd.to_datetime(df['date'])


# --- 2. Calculate Rolling Form Features ---
print("Calculating rolling form features...")
ROLLING_WINDOW = 5

# Get a list of all unique team names from the entire dataset
teams = pd.unique(df[['home_team', 'away_team']].values.ravel('K'))

# Initialize the new form feature columns
form_features = ['form_points', 'form_goals_scored', 'form_goals_conceded', 'form_goal_difference']
for side in ['home', 'away']:
    for feature in form_features:
        df[f'{side}_{feature}'] = 0.0

# Loop through each team to calculate their historical form
for team in teams:
    # Filter for all matches played by the current team, sorted by date
    team_matches = df[(df['home_team'] == team) | (df['away_team'] == team)].sort_values('date')

    # Calculate points, goals scored, and goals conceded for the team in each match
    points = team_matches.apply(lambda row: 3 if (row['home_team'] == team and row['result'] == 'HOME_TEAM') or \
                                                 (row['away_team'] == team and row['result'] == 'AWAY_TEAM') else \
                                             1 if row['result'] == 'DRAW' else 0, axis=1)
    goals_scored = team_matches.apply(lambda row: row['home_goals'] if row['home_team'] == team else row['away_goals'], axis=1)
    goals_conceded = team_matches.apply(lambda row: row['away_goals'] if row['home_team'] == team else row['home_goals'], axis=1)
    
    # Calculate the rolling averages, shifting to prevent data leakage
    rolling_stats = {
        'form_points': points.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean(),
        'form_goals_scored': goals_scored.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean(),
        'form_goals_conceded': goals_conceded.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean(),
        'form_goal_difference': (goals_scored - goals_conceded).shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean()
    }

    # Update the main DataFrame with the calculated form stats for the team's matches
    for feature, values in rolling_stats.items():
        # Update home matches
        df.loc[team_matches[team_matches['home_team'] == team].index, f'home_{feature}'] = values
        # Update away matches
        df.loc[team_matches[team_matches['away_team'] == team].index, f'away_{feature}'] = values

# The first few matches of a team's history will have NaNs, so we fill them with 0
df.fillna(0, inplace=True)

# --- Verification ---
print("\\nForm features calculated successfully.")
print("Displaying the last 5 rows with the new features:")
print(df.tail())

Calculating rolling form features...
\nForm features calculated successfully.
Displaying the last 5 rows with the new features:
      season                      date                  home_team  \
2655     0.0 2025-08-30 16:30:00+00:00            Leeds United FC   
2656     0.0 2025-08-31 13:00:00+00:00  Brighton & Hove Albion FC   
2657     0.0 2025-08-31 13:00:00+00:00       Nottingham Forest FC   
2658     0.0 2025-08-31 15:30:00+00:00               Liverpool FC   
2659     0.0 2025-08-31 18:00:00+00:00             Aston Villa FC   

                away_team  home_goals  away_goals     result  target  \
2655  Newcastle United FC           0           0       DRAW       0   
2656   Manchester City FC           2           1  HOME_TEAM       1   
2657   West Ham United FC           0           3  AWAY_TEAM       2   
2658           Arsenal FC           1           0  HOME_TEAM       1   
2659    Crystal Palace FC           0           3  AWAY_TEAM       2   

      home_form_points  

In [2]:
# The previous .fillna(0) call might have turned the 'season' column into a float.
# Let's ensure it's an integer for clean processing.
df['season'] = df['season'].astype(int)

# --- Feature #2: Calculate League Table Standings ---
print("Calculating league table features...")

# Initialize a dictionary to hold the stats for each team within a season
team_stats = {}
current_season = None

# Initialize new columns for our league table features
for side in ['home', 'away']:
    df[f'{side}_rank'] = 20 # Default to a low rank
    df[f'{side}_points'] = 0
    df[f'{side}_goals_for'] = 0
    df[f'{side}_goals_against'] = 0

# Loop through each match to calculate stats chronologically
for index, row in df.iterrows():
    # Check if a new season has started
    if row['season'] != current_season:
        current_season = row['season']
        team_stats = {} # Reset stats for the new season
        print(f"--- Processing season: {current_season}-{current_season+1} ---")

    home_team = row['home_team']
    away_team = row['away_team']

    # Initialize a team's stats if they are new to the season
    for team in [home_team, away_team]:
        if team not in team_stats:
            team_stats[team] = {'points': 0, 'goals_for': 0, 'goals_against': 0}

    # --- Record the stats BEFORE the match ---
    # Create a temporary league table from the current stats to calculate ranks
    if len(team_stats) > 0:
        temp_table = pd.DataFrame.from_dict(team_stats, orient='index')
        temp_table['goal_difference'] = temp_table['goals_for'] - temp_table['goals_against']
        temp_table = temp_table.sort_values(['points', 'goal_difference'], ascending=False)
        temp_table['rank'] = range(1, len(temp_table) + 1)

        # Assign the pre-match stats to the current row in the main DataFrame
        df.loc[index, 'home_rank'] = temp_table.loc[home_team, 'rank']
        df.loc[index, 'away_rank'] = temp_table.loc[away_team, 'rank']
    
    df.loc[index, 'home_points'] = team_stats[home_team]['points']
    df.loc[index, 'away_points'] = team_stats[away_team]['points']
    
    # --- Update the stats AFTER the match has been 'played' ---
    # Update points based on the result
    if row['result'] == 'HOME_TEAM':
        team_stats[home_team]['points'] += 3
    elif row['result'] == 'AWAY_TEAM':
        team_stats[away_team]['points'] += 3
    else: # Draw
        team_stats[home_team]['points'] += 1
        team_stats[away_team]['points'] += 1
    
    # Update goals for and against
    team_stats[home_team]['goals_for'] += row['home_goals']
    team_stats[home_team]['goals_against'] += row['away_goals']
    team_stats[away_team]['goals_for'] += row['away_goals']
    team_stats[away_team]['goals_against'] += row['home_goals']

# --- Verification ---
print("\nLeague table features calculated successfully.")
print("Displaying the last 5 rows with all features added so far:")
print(df.tail())

Calculating league table features...
--- Processing season: 2018-2019 ---
--- Processing season: 2019-2020 ---
--- Processing season: 2020-2021 ---
--- Processing season: 2021-2022 ---
--- Processing season: 2022-2023 ---
--- Processing season: 2023-2024 ---
--- Processing season: 2024-2025 ---
--- Processing season: 0-1 ---

League table features calculated successfully.
Displaying the last 5 rows with all features added so far:
      season                      date                  home_team  \
2655       0 2025-08-30 16:30:00+00:00            Leeds United FC   
2656       0 2025-08-31 13:00:00+00:00  Brighton & Hove Albion FC   
2657       0 2025-08-31 13:00:00+00:00       Nottingham Forest FC   
2658       0 2025-08-31 15:30:00+00:00               Liverpool FC   
2659       0 2025-08-31 18:00:00+00:00             Aston Villa FC   

                away_team  home_goals  away_goals     result  target  \
2655  Newcastle United FC           0           0       DRAW       0   
2656   

In [3]:
from sklearn.ensemble import RandomForestClassifier

# --- Final Model Training ---
print("Training the final model with all available features...")

# 1. Define our complete list of features for the model to use
features = [
    # Form features
    'home_form_points', 'home_form_goals_scored', 'home_form_goals_conceded', 'home_form_goal_difference',
    'away_form_points', 'away_form_goals_scored', 'away_form_goals_conceded', 'away_form_goal_difference',
    # League table features
    'home_rank', 'home_points',
    'away_rank', 'away_points'
]

X = df[features]
y = df['target']

# 2. Create a realistic train-test split based on time
# We train on data before a cutoff date and test on data after.
train_cutoff_date = '2024-08-01'
df_train = df[df['date'] < train_cutoff_date]
df_test = df[df['date'] >= train_cutoff_date]

# Drop the very first season from our training data because its features are not fully formed
first_season = df_train['season'].min()
df_train = df_train[df_train['season'] > first_season]

X_train = df_train[features]
y_train = df_train['target']
X_test = df_test[features]
y_test = df_test['target']

print(f"Training the model on {len(X_train)} matches (from {first_season+1} to 2024).")
print(f"Testing the model on {len(X_test)} recent matches (2024 onwards).")

# 3. Initialize and train the final Random Forest model
# min_samples_leaf=5 helps prevent the model from memorizing the data (overfitting)
final_model = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, random_state=42)
final_model.fit(X_train, y_train)

# 4. Evaluate the model on the unseen test data
final_accuracy = final_model.score(X_test, y_test)

print("\\n--- FINAL MODEL EVALUATION ---")
print(f"Final Model Accuracy: {final_accuracy*100:.2f}%")



Training the final model with all available features...
Training the model on 1900 matches (from 2019 to 2024).
Testing the model on 380 recent matches (2024 onwards).
\n--- FINAL MODEL EVALUATION ---
Final Model Accuracy: 48.42%


In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Get the feature importances from our trained model
importances = final_model.feature_importances_
feature_names = X_train.columns

# Create a DataFrame for visualization
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importance for Football Match Prediction')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()

# Save the plot to a file
plt.savefig('feature_importance.png')

print("Feature Importance:")
print(feature_importance_df)

ModuleNotFoundError: No module named 'seaborn'

In [9]:
!pip install matplotlib




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
