In [18]:
# Team Ranking Prediction using Random Forest with Cross-Validation
# Requirements: Python 3, pandas, numpy, scikit-learn
# Save as predict_team_ranking_rf.py
# Data: premier-player-23-24 - premier-player-23-24.csv

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Step 1: Load Data
df = pd.read_csv('premier-player-23-24 - premier-player-23-24.csv')

# Step 2: Aggregate Stats per Team
agg_cols = ['MP', 'Gls', 'Ast', 'xG', 'xAG', 'PrgC', 'PrgP']
team_stats = df.groupby('Team')[agg_cols].sum().reset_index()

# Add derived features: per-match stats and goal difference
team_stats['Gls_per_MP'] = team_stats['Gls'] / team_stats['MP']
team_stats['xG_per_MP'] = team_stats['xG'] / team_stats['MP']
team_stats['PrgC_per_MP'] = team_stats['PrgC'] / team_stats['MP']

# Conceded goals (replace with actual data if available)
conceded_goals = {
    'Manchester City': 34, 'Arsenal': 29, 'Liverpool': 41, 'Aston Villa': 61,
    'Tottenham Hotspur': 61, 'Chelsea': 63, 'Newcastle United': 62, 'Manchester United': 58,
    'West Ham United': 74, 'Crystal Palace': 58, 'Brighton & Hove Albion': 62, 'Bournemouth': 67,
    'Fulham': 61, 'Wolverhampton Wanderers': 65, 'Everton': 51, 'Brentford': 65,
    'Nottingham Forest': 67, 'Luton Town': 85, 'Burnley': 78, 'Sheffield United': 104
}
team_stats['Conceded'] = team_stats['Team'].map(conceded_goals)
team_stats['Goal_Diff'] = team_stats['Gls'] - team_stats['Conceded']

# Step 3: Add Actual Ranks/Points
standings = {
    'Manchester City': {'rank': 1, 'points': 91}, 'Arsenal': {'rank': 2, 'points': 89},
    'Liverpool': {'rank': 3, 'points': 82}, 'Aston Villa': {'rank': 4, 'points': 68},
    'Tottenham Hotspur': {'rank': 5, 'points': 66}, 'Chelsea': {'rank': 6, 'points': 63},
    'Newcastle United': {'rank': 7, 'points': 60}, 'Manchester United': {'rank': 8, 'points': 60},
    'West Ham United': {'rank': 9, 'points': 52}, 'Crystal Palace': {'rank': 10, 'points': 49},
    'Brighton & Hove Albion': {'rank': 11, 'points': 48}, 'Bournemouth': {'rank': 12, 'points': 48},
    'Fulham': {'rank': 13, 'points': 47}, 'Wolverhampton Wanderers': {'rank': 14, 'points': 46},
    'Everton': {'rank': 15, 'points': 40}, 'Brentford': {'rank': 16, 'points': 39},
    'Nottingham Forest': {'rank': 17, 'points': 32}, 'Luton Town': {'rank': 18, 'points': 26},
    'Burnley': {'rank': 19, 'points': 24}, 'Sheffield United': {'rank': 20, 'points': 16}
}
team_stats['points'] = team_stats['Team'].map(lambda t: standings.get(t, {}).get('points', 0))
team_stats['rank'] = team_stats['Team'].map(lambda t: standings.get(t, {}).get('rank', 20))

# Step 4: Preprocessing
feature_cols = ['Gls', 'Ast', 'xG', 'xAG', 'PrgC', 'PrgP', 'Gls_per_MP', 'xG_per_MP', 'PrgC_per_MP', 'Goal_Diff', 'points']
features = team_stats[feature_cols].fillna(0).values
targets = team_stats['rank'].values
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Step 5: Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(features)):
    print(f'Fold {fold+1}')
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = targets[train_idx], targets[test_idx]

    # Train Random Forest
    model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    mae_scores.append(mae)
    print(f'Fold MAE: {mae:.4f}')

# Step 6: Feature Importance
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Step 7: Final Results
avg_mae = np.mean(mae_scores)
print(f'Average MAE: {avg_mae:.4f}')

Fold 1
Fold MAE: 1.6550
Fold 2
Fold MAE: 0.9825
Fold 3
Fold MAE: 3.1475
Fold 4
Fold MAE: 3.0625
Fold 5
Fold MAE: 1.0575

Feature Importance:
        Feature  Importance
10       points    0.331431
6    Gls_per_MP    0.221777
0           Gls    0.210101
7     xG_per_MP    0.055548
8   PrgC_per_MP    0.040374
4          PrgC    0.039469
5          PrgP    0.035121
2            xG    0.021337
1           Ast    0.018381
9     Goal_Diff    0.016490
3           xAG    0.009972
Average MAE: 1.9810
