# Football Match Prediction - Exploratory Data Analysis

This notebook explores the structure and patterns in football match data to inform feature engineering and model development.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add the sibling `src` directory to the import path so the `spx` imports
# below can be resolved (presumably that is where the package lives — verify).
# The entry is relative, so it is interpreted against the kernel's current
# working directory; run the notebook from its own folder.
sys.path.append('../src')

from spx.adapters.football.epl import EPLAdapter
from spx.core.ratings import EloRatingSystem
from spx.core.features import FeatureEngineer

# Set up plotting: apply matplotlib's 'seaborn-v0_8' style sheet and make
# seaborn's 'husl' palette the default colour palette for the figures below.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')
%matplotlib inline

## 1. Data Loading and Basic Statistics

In [None]:
# Load sample data
#
# Produces two notebook-level names used by later cells:
#   matches - the list of match objects returned by the adapter
#             (an empty list when the fallback data is used)
#   df      - one row per match, used for all pandas-based analysis
SEASON = '2024-25'
data_dir = Path('../tests/data')
adapter = EPLAdapter(data_dir)

# Load matches; on any failure fall back to a tiny hand-written sample so the
# rest of the notebook can still be run end to end.
try:
    matches = adapter.load_matches(SEASON)
    if len(matches) == 0:
        # An empty season would give a column-less DataFrame and break every
        # later cell, so treat it exactly like a failed load.
        raise ValueError(f"no matches found for season {SEASON}")
    print(f"Loaded {len(matches)} matches")

    # Convert to DataFrame for analysis
    # NOTE(review): relies on every match object exposing `.dict()` — confirm
    # against the adapter's model class.
    match_data = [match.dict() for match in matches]
    df = pd.DataFrame(match_data)

    print("\nDataFrame shape:", df.shape)
    print("\nColumn names:")
    print(df.columns.tolist())

except Exception as e:
    print(f"Error loading data: {e}")
    # There are no real match objects to go with the dummy frame. Reset the
    # name explicitly so later cells never see an undefined variable or a
    # stale list left over from a previous kernel run.
    matches = []
    # Create dummy data for demonstration
    df = pd.DataFrame({
        'date': pd.date_range('2024-08-11', periods=10),
        'home_team': ['Arsenal', 'Chelsea', 'Man City'] * 3 + ['Brighton'],
        'away_team': ['Wolves', 'Brighton', 'Arsenal'] * 3 + ['Everton'],
        'home_goals': [2, 1, 3, 0, 2, 1, 1, 2, 0, 1],
        'away_goals': [0, 1, 1, 2, 0, 3, 2, 1, 0, 0],
        'outcome': ['H', 'D', 'H', 'A', 'H', 'A', 'A', 'H', 'D', 'H']
    })
    print("Using dummy data for demonstration")

# Display basic info
print("\nFirst few rows:")
df.head()

## 2. Goal Distribution Analysis

In [None]:
# Analyze goal distributions


def integer_bin_edges(values, min_upper):
    """Return histogram bin edges centred on the integers 0..upper.

    `upper` is the larger of `min_upper` and the largest value in `values`,
    so no observation falls outside the plotted range. Each bin spans
    [k - 0.5, k + 0.5), so every goal count gets its own bar centred on its
    tick. Integer edges such as range(0, 8) would instead merge the two
    largest values into the final closed bin and drop anything above it.
    """
    largest = values.max()
    upper = min_upper if pd.isna(largest) else max(int(largest), min_upper)
    return np.arange(-0.5, upper + 1.5, 1.0)


OUTCOME_NAMES = {'H': 'Home Win', 'D': 'Draw', 'A': 'Away Win'}

total_goals = df['home_goals'] + df['away_goals']
outcome_counts = df['outcome'].value_counts()

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (axes, data, title, x-label, minimum upper goal count shown on the x-axis)
goal_panels = [
    (axes[0, 0], df['home_goals'], 'Home Goals Distribution', 'Goals', 7),
    (axes[0, 1], df['away_goals'], 'Away Goals Distribution', 'Goals', 7),
    (axes[1, 0], total_goals, 'Total Goals Distribution', 'Total Goals', 11),
]

for ax, goals, title, xlabel, min_upper in goal_panels:
    edges = integer_bin_edges(goals, min_upper)
    ax.hist(goals, bins=edges, alpha=0.7, edgecolor='black')
    # One tick per goal count, sitting in the middle of its bar.
    ax.set_xticks(np.arange(len(edges) - 1))
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

# Outcome distribution
axes[1, 1].bar(outcome_counts.index, outcome_counts.values)
axes[1, 1].set_title('Match Outcome Distribution')
axes[1, 1].set_xlabel('Outcome')
axes[1, 1].set_ylabel('Count')

plt.tight_layout()
plt.show()

# Print summary statistics
print("\n=== GOAL STATISTICS ===")
print(f"Average home goals: {df['home_goals'].mean():.2f}")
print(f"Average away goals: {df['away_goals'].mean():.2f}")
print(f"Average total goals: {total_goals.mean():.2f}")

print("\n=== OUTCOME PERCENTAGES ===")
outcome_pcts = df['outcome'].value_counts(normalize=True) * 100
for outcome, pct in outcome_pcts.items():
    # Fall back to the raw label so an unexpected outcome code is reported
    # instead of aborting the cell with a KeyError.
    outcome_name = OUTCOME_NAMES.get(outcome, str(outcome))
    print(f"{outcome_name}: {pct:.1f}%")

## 3. Team Performance Analysis

In [None]:
# Analyze team performance

# Column order of the per-team summary table. Passing it to the DataFrame
# constructor keeps the frame well-formed (and sortable) even with no rows.
TEAM_STAT_COLUMNS = [
    'team', 'games', 'points', 'ppg',
    'goals_for', 'goals_against', 'goal_diff', 'gpg', 'gapg',
]


def summarize_team(match_df, team):
    """Build one summary record for `team` from the match table.

    Reads the 'home_team', 'away_team', 'home_goals', 'away_goals' and
    'outcome' columns of `match_df`. Points follow the 3 / 1 / 0 scheme:
    a home side wins on outcome 'H', an away side on 'A', and 'D' is a draw.

    Returns a dict keyed by TEAM_STAT_COLUMNS, or None when the team has no
    matches (which avoids dividing by zero in the per-game figures).
    """
    home_matches = match_df[match_df['home_team'] == team]
    away_matches = match_df[match_df['away_team'] == team]

    games = len(home_matches) + len(away_matches)
    if games == 0:
        return None

    wins = int((home_matches['outcome'] == 'H').sum()
               + (away_matches['outcome'] == 'A').sum())
    drawn = int((home_matches['outcome'] == 'D').sum()
                + (away_matches['outcome'] == 'D').sum())
    points = 3 * wins + drawn

    goals_for = home_matches['home_goals'].sum() + away_matches['away_goals'].sum()
    goals_against = home_matches['away_goals'].sum() + away_matches['home_goals'].sum()

    return {
        'team': team,
        'games': games,
        'points': points,
        'ppg': points / games,
        'goals_for': goals_for,
        'goals_against': goals_against,
        'goal_diff': goals_for - goals_against,
        'gpg': goals_for / games,
        'gapg': goals_against / games
    }


# Sorted so the team list (which later cells slice) is identical on every
# run; plain set ordering of strings changes between kernel sessions.
teams = sorted(pd.concat([df['home_team'], df['away_team']]).dropna().unique())
print(f"Teams in dataset: {len(teams)}")
print(teams)

# Calculate basic team stats
team_stats = [
    stats
    for stats in (summarize_team(df, team) for team in teams)
    if stats is not None
]

team_df = (
    pd.DataFrame(team_stats, columns=TEAM_STAT_COLUMNS)
    .sort_values('ppg', ascending=False)
)
print("\n=== TEAM PERFORMANCE ===")
print(team_df.round(2))

## 4. Home Advantage Analysis

In [None]:
# Analyze home advantage
#
# Compares how often the home side wins, draws or loses, and how many goals
# are scored on average at home versus away, then plots both comparisons.
result_column = df['outcome']
home_wins = (result_column == 'H').sum()
draws = (result_column == 'D').sum()
away_wins = (result_column == 'A').sum()
total_games = len(df)

# Venue averages are needed for both the printed summary and the bar chart.
home_goal_mean = df['home_goals'].mean()
away_goal_mean = df['away_goals'].mean()

print("=== HOME ADVANTAGE ANALYSIS ===")
for caption, tally in (("Home wins", home_wins), ("Draws", draws), ("Away wins", away_wins)):
    print(f"{caption}: {tally} ({tally/total_games*100:.1f}%)")

# Goals by venue
print(f"\nAverage home goals: {home_goal_mean:.2f}")
print(f"Average away goals: {away_goal_mean:.2f}")
print(f"Home advantage (goals): {home_goal_mean - away_goal_mean:.2f}")

# Visualize: outcome shares on the left, average goals per venue on the right
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Outcome pie chart
outcome_counts = [home_wins, draws, away_wins]
labels = ['Home Win', 'Draw', 'Away Win']
colors = ['green', 'gray', 'red']

pie_ax.pie(outcome_counts, labels=labels, autopct='%1.1f%%', colors=colors)
pie_ax.set_title('Match Outcomes')

# Goals comparison
venues = ['Home', 'Away']
avg_goals = [home_goal_mean, away_goal_mean]

bar_ax.bar(venues, avg_goals, color=['blue', 'orange'])
bar_ax.set_title('Average Goals by Venue')
bar_ax.set_ylabel('Average Goals')

plt.tight_layout()
plt.show()

## 5. Elo Rating Development

In [None]:
# Test Elo rating system
#
# Builds an Elo system from the loaded matches, prints the final ratings from
# strongest to weakest, and plots the rating history of the strongest teams.
TOP_N_TEAMS = 6  # number of highest-rated teams shown in the evolution plot

try:
    elo_system = EloRatingSystem(
        k_factor=20.0,
        base_rating=1500.0,
        home_advantage=30.0,
        margin_multiplier=0.1
    )

    # Update ratings with our sample data. The return value is not needed
    # here; the results are read back through the accessors below.
    elo_system.update_ratings(matches)

    # Show current ratings, strongest first
    current_ratings = elo_system.get_current_ratings()
    ranked_teams = sorted(current_ratings.items(), key=lambda item: item[1], reverse=True)

    print("=== CURRENT ELO RATINGS ===")
    for team, rating in ranked_teams:
        print(f"{team:<15}: {rating:.0f}")

    # Plot rating evolution
    rating_history = elo_system.get_rating_history()

    fig, ax = plt.subplots(figsize=(12, 8))

    # Take the teams from the ranking above so the plot really shows the top
    # rated sides rather than whichever teams happen to come first in a list.
    lines_drawn = 0
    for team, _ in ranked_teams[:TOP_N_TEAMS]:
        if team not in rating_history:
            continue
        history_dates = list(rating_history[team].keys())
        history_values = list(rating_history[team].values())

        if history_dates:
            ax.plot(history_dates, history_values, marker='o', label=team, linewidth=2)
            lines_drawn += 1

    ax.set_title('Elo Rating Evolution')
    ax.set_xlabel('Date')
    ax.set_ylabel('Elo Rating')
    if lines_drawn:
        # A legend with no labelled lines only produces a warning.
        ax.legend()
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()

except Exception as e:
    # Best effort: this section is optional for the rest of the notebook.
    print(f"Error with Elo analysis: {e}")

## 6. Feature Engineering Analysis

In [None]:
# Analyze feature correlations
#
# Builds the engineered feature table, ranks the numeric features by the
# absolute value of their correlation with the match outcome, and shows how
# the features correlate with each other.
if len(df) > 5:  # Need sufficient data
    try:
        # Create features
        engineer = FeatureEngineer(lookback_games=3)
        # Rating history is optional: use it only if the Elo cell managed to
        # create `elo_system` in the notebook namespace.
        rating_dict = elo_system.get_rating_history() if 'elo_system' in globals() else None

        features_df = engineer.create_features(matches, rating_dict)

        print("=== FEATURE SUMMARY ===")
        print(f"Features shape: {features_df.shape}")

        # Show feature correlations with outcome. The goal columns are left
        # out because they determine the outcome directly.
        numeric_features = features_df.select_dtypes(include=[np.number]).columns
        feature_cols = [col for col in numeric_features
                        if col not in ['home_goals', 'away_goals']]

        if len(feature_cols) > 0:
            # Encode outcome for correlation
            features_df['outcome_numeric'] = features_df['outcome'].map({'A': 0, 'D': 1, 'H': 2})

            # Drop the target's correlation with itself by label. After the
            # descending sort that 1.0 entry is first, not last, so slicing
            # off the final row would discard a real feature instead.
            correlations = (
                features_df[feature_cols + ['outcome_numeric']]
                .corr()['outcome_numeric']
                .drop('outcome_numeric')
                .abs()
                .sort_values(ascending=False)
            )

            print("\nFeature correlations with outcome:")
            for feature, corr in correlations.items():
                print(f"{feature:<25}: {corr:.3f}")

            # Plot correlation heatmap
            plt.figure(figsize=(10, 8))
            corr_matrix = features_df[feature_cols].corr()
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
            plt.title('Feature Correlation Matrix')
            plt.tight_layout()
            plt.show()

        # Show sample features
        print("\nSample features:")
        display_cols = ['home_team', 'away_team', 'outcome'] + feature_cols[:5]
        print(features_df[display_cols].head().round(2))

    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(f"Error with feature analysis: {e}")
else:
    print("Insufficient data for feature analysis")

## 7. Scoreline Pattern Analysis

In [None]:
# Analyze scoreline patterns
#
# Lists the most frequent exact scorelines and draws a home-goals by
# away-goals heatmap of how often each scoreline occurs.
MAX_DISPLAY_GOALS = 6  # goal counts above this share the last heatmap bucket

scorelines = df['home_goals'].astype(str) + '-' + df['away_goals'].astype(str)
scoreline_counts = scorelines.value_counts().head(10)

print("=== MOST COMMON SCORELINES ===")
for scoreline, count in scoreline_counts.items():
    pct = count / len(df) * 100
    print(f"{scoreline}: {count} ({pct:.1f}%)")

# Create scoreline heatmap
observed_max = int(max(df['home_goals'].max(), df['away_goals'].max()))
max_goals = min(MAX_DISPLAY_GOALS, observed_max)
scoreline_matrix = np.zeros((max_goals + 1, max_goals + 1))

# Clip each side's goals into the displayed range and count all matches at
# once; np.add.at accumulates correctly when the same cell is hit repeatedly.
home_idx = df['home_goals'].clip(upper=max_goals).to_numpy(dtype=int)
away_idx = df['away_goals'].clip(upper=max_goals).to_numpy(dtype=int)
np.add.at(scoreline_matrix, (home_idx, away_idx), 1)

# Normalize to percentages
scoreline_matrix = scoreline_matrix / len(df) * 100

# When clipping actually happened the last bucket means "this many or more",
# so label it that way instead of presenting it as an exact goal count.
tick_labels = [str(goals) for goals in range(max_goals + 1)]
if observed_max > max_goals:
    tick_labels[-1] = f"{max_goals}+"

plt.figure(figsize=(8, 6))
sns.heatmap(scoreline_matrix, annot=True, fmt='.1f', cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.title('Scoreline Frequency Heatmap (%)')
plt.xlabel('Away Goals')
plt.ylabel('Home Goals')
plt.tight_layout()
plt.show()

## 8. Model Performance Preview

In [None]:
# Simple baseline predictions for evaluation framework
#
# Fits a prior-probability DummyClassifier on all but the last two matches
# and "predicts" those two, purely to show what the evaluation output looks
# like. The features are placeholders; this strategy ignores them.
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

HOLDOUT_MATCHES = 2  # number of trailing matches kept back for "prediction"
OUTCOME_CODES = {'A': 0, 'D': 1, 'H': 2}

if len(df) > 10:
    # Prepare simple features. The expanding means are shifted by one row so
    # each match only sees earlier matches; the first row gets a fixed default.
    X_simple = pd.DataFrame({
        'home_advantage': [1] * len(df),
        'home_goals_avg': df['home_goals'].expanding().mean().shift(1).fillna(1.5),
        'away_goals_avg': df['away_goals'].expanding().mean().shift(1).fillna(1.2)
    })

    y = df['outcome'].map(OUTCOME_CODES)

    # Explicit positional split: everything before the holdout is training.
    X_train, X_holdout = X_simple.iloc[:-HOLDOUT_MATCHES], X_simple.iloc[-HOLDOUT_MATCHES:]
    y_train, y_true = y.iloc[:-HOLDOUT_MATCHES], y.iloc[-HOLDOUT_MATCHES:]

    # Train dummy classifier
    dummy = DummyClassifier(strategy='prior')
    dummy.fit(X_train, y_train)  # Leave last 2 for "prediction"

    # "Predict" on last 2 matches
    y_pred = dummy.predict(X_holdout)

    print("=== BASELINE MODEL PERFORMANCE ===")
    print("This shows the evaluation framework structure")
    print(f"Predictions: {y_pred}")
    print(f"Actual: {y_true.values}")

    # Show probabilities
    y_proba = dummy.predict_proba(X_holdout)

    # predict_proba has one column per class seen during fit, ordered as in
    # `classes_`. Look columns up by class code so a class missing from the
    # training rows reads as probability 0 instead of shifting the columns or
    # raising an IndexError.
    column_of = {code: position for position, code in enumerate(dummy.classes_)}

    def class_probability(prob_row, code):
        """Return the predicted probability of `code`, or 0.0 if never seen."""
        return prob_row[column_of[code]] if code in column_of else 0.0

    print("\nProbability predictions:")
    for i, (pred_probs, true_outcome) in enumerate(zip(y_proba, y_true)):
        away_p = class_probability(pred_probs, OUTCOME_CODES['A'])
        draw_p = class_probability(pred_probs, OUTCOME_CODES['D'])
        home_p = class_probability(pred_probs, OUTCOME_CODES['H'])
        print(f"Match {i+1}: Away={away_p:.3f}, Draw={draw_p:.3f}, Home={home_p:.3f} (True: {true_outcome})")

else:
    print("Insufficient data for model evaluation demo")

## Next Steps

This EDA provides insights for:

1. **Feature Engineering**: Form metrics, Elo ratings, home advantage
2. **Model Development**: XGBoost for outcomes, Poisson for scorelines
3. **Evaluation Framework**: Calibration plots, Brier scores, log loss
4. **Simulation Setup**: Monte Carlo with realistic distributions

To continue:
- Run `spx ingest` to process full datasets
- Run `spx train` to build production models
- Run `spx simulate` for season predictions