# ⚽ FIFA World Cup & International Football — ML Analysis

**Datasets**: international_matches.csv (17,769 matches, 1872-2022), world_cup_matches.csv (900 WC matches),
world_cups.csv (22 tournaments), 2022_world_cup_squads.csv (831 players)

## ML Tasks
| # | Task | Type | Target |
|---|------|------|--------|
| 1 | Total Goals Regression | Regression | Total goals per match |
| 2 | Match Outcome Prediction | Multi-class Classification | Home Win / Draw / Away Win |
| 3 | Player Position Classification | Multi-class Classification | GK / DEF / MID / FWD |
| 4 | Country Performance Clustering | Unsupervised | K-Means / DBSCAN |

In [None]:
import warnings, os, base64, io, pathlib
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from jinja2 import Template

from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV,
                                     learning_curve, StratifiedKFold)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              GradientBoostingClassifier, GradientBoostingRegressor,
                              AdaBoostClassifier, VotingClassifier, StackingClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             mean_absolute_error, mean_squared_error, r2_score)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA

# One seed is reused for NumPy and for every `random_state=` argument below.
SEED = 42
np.random.seed(SEED)
sns.set_style('whitegrid')
# max_open_warning=0 turns off matplotlib's "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0, 'figure.dpi': 120})

# All figures are written here; the directory is created if it is missing.
PLOT_DIR = pathlib.Path('outputs/plots')
PLOT_DIR.mkdir(parents=True, exist_ok=True)

all_results = {}   # collector for HTML report
saved_plots = []   # list of (title, path)

def save(fig, name, title=None):
    """Write *fig* to ``PLOT_DIR / name``, close it and register the plot.

    The figure is saved with a tight bounding box on a white background and
    then closed. A ``(title, path)`` tuple is appended to ``saved_plots``;
    when *title* is falsy, the title is derived from *name* by removing
    '.png', replacing underscores with spaces and title-casing the result.
    """
    target = PLOT_DIR / name
    fig.savefig(target, bbox_inches='tight', facecolor='white')
    plt.close(fig)

    if title:
        label = title
    else:
        label = name.replace('.png', '').replace('_', ' ').title()
    saved_plots.append((label, str(target)))
    print(f'  âœ“ {name}')

# Print the library versions in use so a run can be reproduced later.
import sklearn
print(f'All imports successful')
print(f'  scikit-learn {sklearn.__version__}')
print(f'  pandas {pd.__version__}, numpy {np.__version__}')

## 1 · Data Loading

In [None]:
# Folder containing the raw CSV files, relative to the working directory.
DATA = pathlib.Path('World+Cup')

intl = pd.read_csv(DATA / 'international_matches.csv')
wc_matches = pd.read_csv(DATA / 'world_cup_matches.csv')
wc_summary = pd.read_csv(DATA / 'world_cups.csv')
# NOTE(review): only this file is decoded as latin1 — presumably it is not
# valid UTF-8; confirm that player names come out correctly.
squads = pd.read_csv(DATA / '2022_world_cup_squads.csv', encoding='latin1')
groups = pd.read_csv(DATA / '2022_world_cup_groups.csv')

# Shape summary of every table, then the schema and first rows of the main one.
print(f'international_matches: {intl.shape}')
print(f'world_cup_matches:     {wc_matches.shape}')
print(f'world_cups:            {wc_summary.shape}')
print(f'squads:                {squads.shape}')
print(f'groups:                {groups.shape}')
print()
print('=== international_matches columns ===')
print(intl.dtypes.to_string())
print()
intl.head()

In [None]:
# ── Parse dates & basic features ──
intl['Date'] = pd.to_datetime(intl['Date'])
intl['year'] = intl['Date'].dt.year
intl['month'] = intl['Date'].dt.month
# Truncate the year to the start of its decade (e.g. 1987 -> 1980).
intl['decade'] = intl['year'] - intl['year'] % 10

intl['total_goals'] = intl['Home Goals'].add(intl['Away Goals'])
intl['goal_diff'] = intl['Home Goals'].sub(intl['Away Goals'])

# Match result (target for classification): the sign of the goal difference
# decides the label; anything that is neither positive nor zero is 'Away Win'.
intl['result'] = np.select(
    [intl['goal_diff'] > 0, intl['goal_diff'] == 0],
    ['Home Win', 'Draw'],
    default='Away Win',
)

# Tournament importance encoding, written as importance level -> tournaments
# and then flattened into the name -> level lookup used by `.map`.
importance_tiers = {
    5: ['FIFA World Cup'],
    4: ['FIFA World Cup qualification', 'Confederations Cup'],
    3: ['Copa America', 'UEFA Euro', 'African Cup of Nations',
        'Gold Cup', 'AFC Asian Cup'],
    2: ['UEFA Euro qualification', 'African Cup of Nations qualification',
        'Copa America qualification', 'AFC Asian Cup qualification',
        'British Championship'],
    1: ['Friendly'],
}
tourn_map = {
    tournament: level
    for level, tournaments in importance_tiers.items()
    for tournament in tournaments
}
# Tournaments missing from the lookup fall back to importance 2.
mapped_importance = intl['Tournament'].map(tourn_map)
intl['tournament_importance'] = mapped_importance.fillna(2).astype(int)

# Is World Cup match
intl['is_world_cup'] = intl['Tournament'].eq('FIFA World Cup').astype(int)

print(f'Processed: {intl.shape}')
print('\nResult distribution:')
print(intl['result'].value_counts())
print(f'\nYear range: {intl["year"].min()} â€“ {intl["year"].max()}')
print(f'Home Stadium: {intl["Home Stadium"].value_counts().to_dict()}')

## 2 · Feature Engineering — Team Strength

In [None]:
# ── Build historical team stats (rolling) ──
# Every match is unrolled into two rows — one from each team's point of
# view — so that per-team running statistics can be computed afterwards.
# `result` is 1 for a win, 0.5 for a draw and 0 otherwise.
records = []
match_columns = zip(intl['Date'], intl['Home Team'], intl['Away Team'],
                    intl['Home Goals'], intl['Away Goals'], intl['goal_diff'])
for played_on, home_team, away_team, home_goals, away_goals, diff in match_columns:
    if diff > 0:
        home_points, away_points = 1, 0
    elif diff < 0:
        home_points, away_points = 0, 1
    elif diff == 0:
        home_points, away_points = 0.5, 0.5
    else:
        # Goal difference is not comparable (e.g. NaN): neither side is credited.
        home_points, away_points = 0, 0

    records.append({
        'date': played_on, 'team': home_team,
        'goals_for': home_goals, 'goals_against': away_goals,
        'is_home': 1,
        'result': home_points,
    })
    records.append({
        'date': played_on, 'team': away_team,
        'goals_for': away_goals, 'goals_against': home_goals,
        'is_home': 0,
        'result': away_points,
    })

team_log = pd.DataFrame(records)
team_log = team_log.sort_values('date').reset_index(drop=True)

# Compute expanding stats per team.
# Every statistic is shifted by one row so that a match only ever sees
# values computed from the team's earlier rows in the date-sorted log.
team_stats = {}
for team in team_log['team'].unique():
    history = team_log.loc[team_log['team'] == team].copy()

    wins_before = history['result'].eq(1).cumsum().shift(1)
    # 0, 1, 2, ... earlier matches; 0 becomes NaN to avoid dividing by zero.
    matches_before = pd.Series(range(len(history)), index=history.index).replace(0, np.nan)

    history['cum_wins'] = wins_before
    history['cum_matches'] = matches_before
    history['win_rate'] = wins_before / matches_before
    history['avg_gf'] = history['goals_for'].expanding().mean().shift(1)
    history['avg_ga'] = history['goals_against'].expanding().mean().shift(1)

    stat_columns = ['date', 'win_rate', 'avg_gf', 'avg_ga']
    team_stats[team] = history[stat_columns].set_index('date')

print(f'Built historical stats for {len(team_stats)} teams')

# Merge back into intl
def get_team_stat(team, date, stat):
    """Return the latest *stat* value stored for *team* on or before *date*.

    Looks *team* up in the module-level ``team_stats`` mapping (one
    date-indexed frame per team) and takes the last row whose index is
    ``<= date``. Returns ``np.nan`` when the team is unknown or has no row
    up to that date.
    """
    if team not in team_stats:
        return np.nan
    history = team_stats[team]
    up_to_date = history.index <= date
    if not up_to_date.any():
        return np.nan
    return history.loc[up_to_date, stat].iloc[-1]

# Row-by-row lookup (not vectorised): for every match, fetch each side's
# last known stats at the match date.
fixtures = list(zip(intl['Home Team'], intl['Away Team'], intl['Date']))

home_wr = [get_team_stat(home, when, 'win_rate') for home, _, when in fixtures]
away_wr = [get_team_stat(away, when, 'win_rate') for _, away, when in fixtures]
home_gf = [get_team_stat(home, when, 'avg_gf') for home, _, when in fixtures]
away_gf = [get_team_stat(away, when, 'avg_gf') for _, away, when in fixtures]
home_ga = [get_team_stat(home, when, 'avg_ga') for home, _, when in fixtures]
away_ga = [get_team_stat(away, when, 'avg_ga') for _, away, when in fixtures]

stat_columns = [
    ('home_win_rate', home_wr), ('away_win_rate', away_wr),
    ('home_avg_gf', home_gf), ('away_avg_gf', away_gf),
    ('home_avg_ga', home_ga), ('away_avg_ga', away_ga),
]
for column, values in stat_columns:
    intl[column] = values

# Derived features: home-minus-away gaps in win rate and scoring rate.
intl['win_rate_diff'] = intl['home_win_rate'].sub(intl['away_win_rate'])
intl['attack_diff'] = intl['home_avg_gf'].sub(intl['away_avg_gf'])

# Drop rows with NaN team stats (first few matches per team)
required_stats = ['home_win_rate', 'away_win_rate', 'home_avg_gf', 'away_avg_gf']
df = intl.dropna(subset=required_stats).copy()
print(f'After dropping NaN team stats: {len(df)} matches')
print(f'\nFeature columns added: home_win_rate, away_win_rate, home_avg_gf, away_avg_gf, '
      f'home_avg_ga, away_avg_ga, win_rate_diff, attack_diff')
print('\nSample:')
preview_columns = ['Home Team', 'Away Team', 'year', 'result', 'home_win_rate',
                   'away_win_rate', 'win_rate_diff', 'total_goals']
df[preview_columns].head(5)

## 3 · Exploratory Data Analysis

In [None]:
# ── Plot 1: Goals per match over decades ──
# Bar chart of the mean `total_goals` per decade, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 5))
decade_goals = df.groupby('decade')['total_goals'].mean()
ax.bar(decade_goals.index.astype(str), decade_goals.values, color='#2E86AB', edgecolor='white')
ax.set_xlabel('Decade'); ax.set_ylabel('Avg Goals per Match')
ax.set_title('Average Goals per Match by Decade')
# Bars sit at integer positions 0..n-1 because the x values are strings.
for i, v in enumerate(decade_goals.values):
    ax.text(i, v + 0.05, f'{v:.2f}', ha='center', fontsize=8)
save(fig, 'goals_per_decade.png', 'Goals per Match by Decade')

# ── Plot 2: Match outcome distribution ──
# Left: pie chart of overall outcome shares. Right: outcome percentages
# within each tournament-importance level.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
colors = ['#2E86AB', '#A23B72', '#F18F01']
df['result'].value_counts().plot.pie(ax=axes[0], colors=colors, autopct='%1.1f%%',
                                      startangle=90, textprops={'fontsize': 10})
axes[0].set_ylabel(''); axes[0].set_title('Match Outcome Distribution')

# By tournament importance
# normalize='index' makes every importance level sum to 100%.
ct = pd.crosstab(df['tournament_importance'], df['result'], normalize='index') * 100
ct[['Home Win', 'Draw', 'Away Win']].plot.bar(ax=axes[1], color=colors, edgecolor='white')
axes[1].set_xlabel('Tournament Importance'); axes[1].set_ylabel('Percentage')
axes[1].set_title('Outcome by Tournament Importance')
axes[1].legend(fontsize=8); axes[1].tick_params(axis='x', rotation=0)
fig.tight_layout()
save(fig, 'outcome_distribution.png', 'Match Outcome Distribution')

# ── Plot 3: Top 20 teams by win rate (min 50 matches) ──
# Stack the home and away views of every match so each team gets one row
# per match played, with `win` = 1 when that side won.
all_teams = pd.concat([
    df[['Home Team', 'result']].rename(columns={'Home Team': 'team'}).assign(
        win=lambda x: (x['result'] == 'Home Win').astype(int)),
    df[['Away Team', 'result']].rename(columns={'Away Team': 'team'}).assign(
        win=lambda x: (x['result'] == 'Away Win').astype(int))
])
team_agg = all_teams.groupby('team')['win'].agg(['sum', 'count'])
team_agg.columns = ['wins', 'matches']
team_agg['win_rate'] = team_agg['wins'] / team_agg['matches']
# Keep teams with at least 50 matches, then take the 20 highest win rates.
team_top = team_agg[team_agg['matches'] >= 50].nlargest(20, 'win_rate')

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(range(len(team_top)), team_top['win_rate'], color='#2E86AB', edgecolor='white')
ax.set_yticks(range(len(team_top)))
ax.set_yticklabels(team_top.index, fontsize=9)
ax.set_xlabel('Win Rate'); ax.set_title('Top 20 Teams by Win Rate (min 50 matches)')
# Invert so the best team is drawn at the top.
ax.invert_yaxis()
# Label each bar with "win rate (match count)".
for i, (wr, m) in enumerate(zip(team_top['win_rate'], team_top['matches'])):
    ax.text(wr + 0.005, i, f'{wr:.2f} ({m})', va='center', fontsize=8)
fig.tight_layout()
save(fig, 'top20_teams.png', 'Top 20 Teams by Win Rate')

# ── Plot 4: World Cup goals trend ──
# Tournaments without a 'Goals Scored' value are dropped before plotting.
wc_s = wc_summary.dropna(subset=['Goals Scored']).copy()
wc_s['goals_per_match'] = wc_s['Goals Scored'] / wc_s['Matches Played']

# Bars (left axis) show total goals; the line (right axis) shows goals per match.
fig, ax1 = plt.subplots(figsize=(10, 5))
ax1.bar(wc_s['Year'].astype(str), wc_s['Goals Scored'], color='#2E86AB', alpha=0.7, label='Total Goals')
ax1.set_xlabel('Year'); ax1.set_ylabel('Total Goals', color='#2E86AB')
ax1.tick_params(axis='x', rotation=45)
ax2 = ax1.twinx()
ax2.plot(wc_s['Year'].astype(str), wc_s['goals_per_match'], 'o-', color='#F18F01', linewidth=2, label='Goals/Match')
ax2.set_ylabel('Goals per Match', color='#F18F01')
# Merge the handles of both axes into a single legend.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
ax1.set_title('FIFA World Cup â€” Goals Over the Years')
fig.tight_layout()
save(fig, 'wc_goals_trend.png', 'World Cup Goals Trend')

# ── Plot 5: Home advantage over time ──
# Share of matches won by the home side, per decade, as a percentage.
home_win_flag = df['result'].eq('Home Win')
ha = home_win_flag.groupby(df['decade']).mean().mul(100).reset_index()
ha.columns = ['decade', 'home_win_pct']

decade_labels = ha['decade'].astype(str)
tick_positions = range(len(ha))

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(decade_labels, ha['home_win_pct'], 'o-', color='#2E86AB', linewidth=2, markersize=8)
# Dashed 50% reference line, with the gap to the curve shaded.
ax.axhline(y=50, color='gray', linestyle='--', alpha=0.5)
ax.fill_between(tick_positions, 50, ha['home_win_pct'], alpha=0.2, color='#2E86AB')
ax.set_xticks(tick_positions)
ax.set_xticklabels(decade_labels)
ax.set_xlabel('Decade')
ax.set_ylabel('Home Win %')
ax.set_title('Home Advantage Over Time')
fig.tight_layout()
save(fig, 'home_advantage.png', 'Home Advantage Over Time')

# ── Plot 6: Correlation heatmap ──
# NOTE(review): 'Home Stadium' is fed to .corr() and later cast to float —
# presumably a boolean/0-1 flag; confirm against the CSV.
num_cols = ['Home Goals', 'Away Goals', 'total_goals', 'goal_diff',
            'home_win_rate', 'away_win_rate', 'home_avg_gf', 'away_avg_gf',
            'home_avg_ga', 'away_avg_ga', 'win_rate_diff', 'attack_diff',
            'tournament_importance', 'Home Stadium', 'year']
fig, ax = plt.subplots(figsize=(12, 10))
corr = df[num_cols].corr()
# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='RdBu_r',
            center=0, square=True, linewidths=0.5, ax=ax, annot_kws={'size': 7})
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
save(fig, 'correlation_heatmap.png', 'Feature Correlation Heatmap')

print('All EDA plots saved!')

## 4 · Task 1 — Total Goals Regression
Predict the total number of goals scored in a match using team strength features, tournament info and home advantage.

In [None]:
# ── Prepare regression data ──
# Inputs are the pre-match team-strength features plus match context;
# the target is the match's total goal count.
reg_features = ['home_win_rate', 'away_win_rate', 'home_avg_gf', 'away_avg_gf',
                'home_avg_ga', 'away_avg_ga', 'win_rate_diff', 'attack_diff',
                'tournament_importance', 'Home Stadium', 'year', 'is_world_cup']

reg_df = df.dropna(subset=reg_features + ['total_goals']).copy()
X_reg = reg_df[reg_features].astype(float)
y_reg = reg_df['total_goals']

# Random 80/20 split (not chronological).
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=SEED)

# The scaler is fitted on the training split only and reused for the test split.
scaler_r = StandardScaler()
X_train_rs = scaler_r.fit_transform(X_train_r)
X_test_rs = scaler_r.transform(X_test_r)

print(f'Regression dataset: {len(reg_df)} rows')
print(f'Train: {len(X_train_r)}  |  Test: {len(X_test_r)}')

# Candidate regressors, keyed by the display name used in reports and plots.
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.01),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=SEED),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, max_depth=10, random_state=SEED, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=200, max_depth=5, random_state=SEED),
}

reg_results = {}
print('\nTotal Goals Regression Results:')
print('=' * 70)
for name, model in regressors.items():
    # Only the two regularised linear models are fed standardised inputs.
    if name in ('Ridge Regression', 'Lasso Regression'):
        Xtr, Xte = X_train_rs, X_test_rs
    else:
        Xtr, Xte = X_train_r, X_test_r

    pred = model.fit(Xtr, y_train_r).predict(Xte)

    r2 = r2_score(y_test_r, pred)
    mae = mean_absolute_error(y_test_r, pred)
    rmse = np.sqrt(mean_squared_error(y_test_r, pred))
    # The fitted model is stored with its scores for the importance plot.
    reg_results[name] = {'RÂ²': r2, 'MAE': mae, 'RMSE': rmse, 'model': model}
    print(f'  {name:30s} RÂ²={r2:.4f}  MAE={mae:.2f}  RMSE={rmse:.2f}')

# ── Plot regression comparison ──
# Two horizontal bar charts over the same models: R² (left) and MAE (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
names = list(reg_results.keys())
r2s = [reg_results[n]['RÂ²'] for n in names]
maes = [reg_results[n]['MAE'] for n in names]

axes[0].barh(names, r2s, color='#2E86AB', edgecolor='white')
axes[0].set_xlabel('RÂ² Score'); axes[0].set_title('Regression â€” RÂ² Comparison')
# Write each value just past the end of its bar.
for i, v in enumerate(r2s):
    axes[0].text(v + 0.002, i, f'{v:.4f}', va='center', fontsize=8)

axes[1].barh(names, maes, color='#F18F01', edgecolor='white')
axes[1].set_xlabel('MAE'); axes[1].set_title('Regression â€” MAE Comparison')
for i, v in enumerate(maes):
    axes[1].text(v + 0.02, i, f'{v:.2f}', va='center', fontsize=8)
fig.tight_layout()
save(fig, 'regression_results.png', 'Total Goals Regression Results')

# ── Feature importance (best tree model) ──
# Only the tree-based regressors expose `feature_importances_`; use the one
# with the highest test R².
tree_candidates = ['Decision Tree', 'Random Forest', 'Gradient Boosting']
best_tree_name = max(tree_candidates, key=lambda n: reg_results[n]['RÂ²'])
best_tree = reg_results[best_tree_name]['model']
imp = pd.Series(best_tree.feature_importances_, index=reg_features).sort_values()

fig, ax = plt.subplots(figsize=(8, 5))
imp.plot.barh(ax=ax, color='#2E86AB', edgecolor='white')
ax.set_xlabel('Feature Importance')
ax.set_title(f'Regression Feature Importance ({best_tree_name})')
fig.tight_layout()
save(fig, 'regression_feature_importance.png', 'Regression Feature Importance')

# Keep only the numeric scores for the report; fitted models are left out.
all_results['regression'] = {
    model_name: {metric: value for metric, value in scores.items() if metric != 'model'}
    for model_name, scores in reg_results.items()
}

best_reg = max(reg_results, key=lambda n: reg_results[n]['RÂ²'])
print(f'\nBest: {best_reg} (RÂ² = {reg_results[best_reg]["RÂ²"]:.4f})')

## 5 · Task 2 — Match Outcome Classification
Predict whether a match ends in **Home Win**, **Draw**, or **Away Win** using pre-match features.

In [None]:
# ── Classification setup ──
# Same pre-match feature set as the regression task.
clf_features = ['home_win_rate', 'away_win_rate', 'home_avg_gf', 'away_avg_gf',
                'home_avg_ga', 'away_avg_ga', 'win_rate_diff', 'attack_diff',
                'tournament_importance', 'Home Stadium', 'year', 'is_world_cup']

clf_df = df.dropna(subset=clf_features).copy()
# Encode the string outcome as integers; `class_names` keeps the label
# order used by the encoder for later plots.
le_result = LabelEncoder()
clf_df['result_enc'] = le_result.fit_transform(clf_df['result'])
class_names = list(le_result.classes_)

X_clf = clf_df[clf_features].astype(float)
y_clf = clf_df['result_enc']

# Stratified 80/20 split keeps the class proportions in both splits.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=SEED, stratify=y_clf)

# The scaler is fitted on the training split only.
scaler_c = StandardScaler()
X_train_cs = scaler_c.fit_transform(X_train_c)
X_test_cs = scaler_c.transform(X_test_c)

print(f'Classification dataset: {len(clf_df)} rows')
print(f'Classes: {class_names}')
print(clf_df['result'].value_counts())
print(f'\nTrain: {len(X_train_c)}  |  Test: {len(X_test_c)}')

def eval_clf(name, model, Xtr, ytr, Xte, yte, needs_scale=False):
    """Fit *model* on the training data and score it on the evaluation data.

    When *needs_scale* is true, both feature sets are first passed through
    the already-fitted module-level ``scaler_c``. *name* is accepted for
    call-site readability and is not used inside the function.

    Returns a dict with weighted 'Accuracy', 'Precision', 'Recall' and
    'F1 Score', plus the fitted 'model' and its 'predictions'.
    """
    if needs_scale:
        Xtr_, Xte_ = scaler_c.transform(Xtr), scaler_c.transform(Xte)
    else:
        Xtr_, Xte_ = Xtr, Xte

    model.fit(Xtr_, ytr)
    pred = model.predict(Xte_)

    return {
        'Accuracy': accuracy_score(yte, pred),
        'Precision': precision_score(yte, pred, average='weighted'),
        'Recall': recall_score(yte, pred, average='weighted'),
        'F1 Score': f1_score(yte, pred, average='weighted'),
        'model': model,
        'predictions': pred,
    }

In [None]:
# Results of every match-outcome classifier, keyed by display name.
clf_results = {}

# Each entry is (display name, unfitted estimator, needs standardised inputs).
classifiers = [
    ('Logistic Regression', LogisticRegression(max_iter=2000, random_state=SEED), True),
    ('Decision Tree', DecisionTreeClassifier(max_depth=10, random_state=SEED), False),
    ('Random Forest', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1), False),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED), False),
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=SEED), False),
    ('SVM (linear)', SVC(kernel='linear', random_state=SEED), True),
    ('SVM (rbf)', SVC(kernel='rbf', random_state=SEED), True),
    ('Naive Bayes', GaussianNB(), True),
    ('MLP Neural Network', MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500,
                                          random_state=SEED, early_stopping=True), True),
]

print('Match Outcome Classification')
print('=' * 65)

# Train and score each model on the shared train/test split.
for name, model, needs_scale in classifiers:
    r = eval_clf(name, model, X_train_c, y_train_c, X_test_c, y_test_c, needs_scale)
    clf_results[name] = r
    print(f'  {name:30s} Acc={r["Accuracy"]:.4f}  F1={r["F1 Score"]:.4f}')

# KNN with k search
# k is chosen by stratified 5-fold cross-validation on the TRAINING split only.
# Picking k by test-set accuracy and then reporting that same test set would
# leak the test data into model selection and inflate the reported KNN score.
# The scaler sits inside the pipeline so it is re-fitted within every fold.
from sklearn.pipeline import make_pipeline

knn_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
k_scores = {}
for k in [3, 5, 7, 9, 11]:
    knn = make_pipeline(StandardScaler(),
                        KNeighborsClassifier(n_neighbors=k, n_jobs=-1))
    fold_acc = cross_val_score(knn, X_train_c, y_train_c, cv=knn_cv, scoring='accuracy')
    # Plain float keeps the printed dict readable across NumPy versions.
    k_scores[k] = float(fold_acc.mean())
best_k = max(k_scores, key=k_scores.get)
print(f'  KNN by k: {k_scores} -> best k={best_k}')

# The test split is touched once, for the selected k only.
r = eval_clf(f'KNN (k={best_k})', KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1),
             X_train_c, y_train_c, X_test_c, y_test_c, needs_scale=True)
clf_results[f'KNN (k={best_k})'] = r
print(f'  {"KNN (k="+str(best_k)+")":30s} Acc={r["Accuracy"]:.4f}  F1={r["F1 Score"]:.4f}')

print(f'\nAll {len(clf_results)} classifiers trained!')

## 6 · Task 3 — Player Position Classification
Predict a player's **position** (Goalkeeper, Defender, Midfielder, Forward) from age, caps, goals, and World Cup goals.

In [None]:
# ── Player position classification using squad data ──
# Rows with any missing value in the five columns used are dropped.
pos_df = squads[['Position', 'Age', 'Caps', 'Goals', 'WC Goals']].dropna().copy()
le_pos = LabelEncoder()
pos_df['pos_enc'] = le_pos.fit_transform(pos_df['Position'])
pos_classes = list(le_pos.classes_)

X_pos = pos_df[['Age', 'Caps', 'Goals', 'WC Goals']].astype(float)
y_pos = pos_df['pos_enc']

# Stratified 80/20 split; the scaler is fitted on the training split only.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_pos, y_pos, test_size=0.2, random_state=SEED, stratify=y_pos)

scaler_p = StandardScaler()
X_train_ps = scaler_p.fit_transform(X_train_p)
X_test_ps = scaler_p.transform(X_test_p)

print(f'Player position dataset: {len(pos_df)} rows')
print(f'Classes: {pos_classes}')
print(pos_df['Position'].value_counts())
print(f'Train: {len(X_train_p)}  |  Test: {len(X_test_p)}')

# Results of every position classifier, keyed by display name.
pos_results = {}
# (display name, unfitted estimator, needs standardised inputs)
pos_classifiers = [
    ('Logistic Regression', LogisticRegression(max_iter=2000, random_state=SEED), True),
    ('Decision Tree', DecisionTreeClassifier(max_depth=8, random_state=SEED), False),
    ('Random Forest',
     RandomForestClassifier(n_estimators=200, random_state=SEED, n_jobs=-1), False),
    ('Gradient Boosting',
     GradientBoostingClassifier(n_estimators=200, random_state=SEED), False),
    ('SVM (rbf)', SVC(kernel='rbf', random_state=SEED), True),
    ('KNN (k=5)', KNeighborsClassifier(n_neighbors=5, n_jobs=-1), True),
    ('MLP',
     MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500,
                   random_state=SEED, early_stopping=True), True),
]

print('\nPlayer Position Classification')
print('=' * 65)

for name, model, needs_scale in pos_classifiers:
    # Scale-sensitive models get the standardised copies of both splits.
    if needs_scale:
        Xtr_, Xte_ = X_train_ps, X_test_ps
    else:
        Xtr_, Xte_ = X_train_p, X_test_p

    pred = model.fit(Xtr_, y_train_p).predict(Xte_)

    acc = accuracy_score(y_test_p, pred)
    prec = precision_score(y_test_p, pred, average='weighted')
    rec = recall_score(y_test_p, pred, average='weighted')
    f1 = f1_score(y_test_p, pred, average='weighted')
    pos_results[name] = {
        'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1 Score': f1,
        'model': model, 'predictions': pred,
    }
    print(f'  {name:30s} Acc={acc:.4f}  F1={f1:.4f}')

# ── Position classification plots ──
# Left: weighted F1 per model. Right: confusion matrix of the best model.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
names_p = list(pos_results.keys())
f1s_p = [pos_results[n]['F1 Score'] for n in names_p]
colors_p = plt.cm.Set2(np.linspace(0, 1, len(names_p)))
axes[0].barh(names_p, f1s_p, color=colors_p, edgecolor='white')
axes[0].set_xlabel('F1 Score'); axes[0].set_title('Player Position â€” Model F1 Scores')
for i, v in enumerate(f1s_p):
    axes[0].text(v + 0.005, i, f'{v:.4f}', va='center', fontsize=8)

# Best model confusion matrix
# "Best" means the highest weighted F1 on the test split.
best_pos_name = max(pos_results, key=lambda n: pos_results[n]['F1 Score'])
cm = confusion_matrix(y_test_p, pos_results[best_pos_name]['predictions'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=pos_classes,
            yticklabels=pos_classes, ax=axes[1])
axes[1].set_xlabel('Predicted'); axes[1].set_ylabel('Actual')
axes[1].set_title(f'Confusion Matrix â€” {best_pos_name}')
fig.tight_layout()
save(fig, 'position_classification.png', 'Player Position Classification')

# Store only the numeric scores for the report (no models, no prediction arrays).
all_results['position_clf'] = {k: {m: v for m, v in v.items() if m not in ('model', 'predictions')}
                                for k, v in pos_results.items()}
print(f'\nBest: {best_pos_name} (F1 = {pos_results[best_pos_name]["F1 Score"]:.4f})')

## 7 · Task 4 — Country Performance Clustering
Cluster countries by their historical performance metrics: win rate, average goals, total matches.

In [None]:
# ── Build country profiles ──
def _venue_profile(team_col, win_label, gf_col, ga_col, prefix):
    """Per-team match count, win count and mean goals for/against on one side.

    *team_col* is the column to group by, *win_label* the `result` value that
    counts as a win for that side, and *prefix* ('home'/'away') names the
    output columns.
    """
    summary = df.groupby(team_col).agg(**{
        f'{prefix}_matches': ('result', 'count'),
        f'{prefix}_wins': ('result', lambda outcomes: (outcomes == win_label).sum()),
        f'{prefix}_gf': (gf_col, 'mean'),
        f'{prefix}_ga': (ga_col, 'mean'),
    })
    return summary.rename_axis('team')


home_stats = _venue_profile('Home Team', 'Home Win', 'Home Goals', 'Away Goals', 'home')
away_stats = _venue_profile('Away Team', 'Away Win', 'Away Goals', 'Home Goals', 'away')

# Outer join keeps teams seen on only one side; their missing half counts as 0.
country_profiles = home_stats.join(away_stats, how='outer').fillna(0)
country_profiles['total_matches'] = (
    country_profiles['home_matches'] + country_profiles['away_matches'])
country_profiles['total_wins'] = (
    country_profiles['home_wins'] + country_profiles['away_wins'])
country_profiles['win_rate'] = (
    country_profiles['total_wins'] / country_profiles['total_matches'])

# Overall goal averages are the match-count-weighted mean of both sides.
for stat in ('gf', 'ga'):
    weighted_goals = (
        country_profiles[f'home_{stat}'] * country_profiles['home_matches']
        + country_profiles[f'away_{stat}'] * country_profiles['away_matches'])
    country_profiles[f'avg_{stat}'] = weighted_goals / country_profiles['total_matches']

# Filter: at least 20 matches
enough_matches = country_profiles['total_matches'] >= 20
cp = country_profiles[enough_matches].copy()
cluster_features = ['win_rate', 'avg_gf', 'avg_ga', 'total_matches']

X_clust = cp[cluster_features].values
scaler_cl = StandardScaler()
X_clust_s = scaler_cl.fit_transform(X_clust)

print(f'Clustering: {len(cp)} countries (min 20 matches)')

# ── Elbow + Silhouette ──
from sklearn.metrics import silhouette_score

# Fit K-Means for k = 2..10 and record inertia and silhouette for each k.
K_range = range(2, 11)
inertias, sils = [], []
for k in K_range:
    km = KMeans(n_clusters=k, random_state=SEED, n_init=10).fit(X_clust_s)
    inertias.append(km.inertia_)
    sils.append(silhouette_score(X_clust_s, km.labels_))

k_values = list(K_range)
fig, (ax_elbow, ax_sil) = plt.subplots(1, 2, figsize=(12, 4))

ax_elbow.plot(k_values, inertias, 'o-', color='#2E86AB', linewidth=2)
ax_elbow.set_xlabel('k')
ax_elbow.set_ylabel('Inertia')
ax_elbow.set_title('Elbow Method')

ax_sil.plot(k_values, sils, 'o-', color='#F18F01', linewidth=2)
ax_sil.set_xlabel('k')
ax_sil.set_ylabel('Silhouette Score')
ax_sil.set_title('Silhouette Scores')

fig.tight_layout()
save(fig, 'elbow_silhouette.png', 'Elbow & Silhouette Analysis')

# The k with the highest silhouette score is used for the final clustering.
best_k_cl = k_values[np.argmax(sils)]
print(f'Best k = {best_k_cl} (silhouette = {max(sils):.4f})')

# ── Final clustering ──
# Refit K-Means with the selected k and attach the labels to the profiles.
km_final = KMeans(n_clusters=best_k_cl, random_state=SEED, n_init=10)
cp['cluster'] = km_final.fit_predict(X_clust_s)

# PCA for visualisation
# Project the scaled features onto two components for the scatter plots.
pca = PCA(n_components=2, random_state=SEED)
X_pca = pca.fit_transform(X_clust_s)

# DBSCAN
# Runs on the same scaled features; a label of -1 marks noise points.
db = DBSCAN(eps=1.5, min_samples=3)
cp['dbscan_cluster'] = db.fit_predict(X_clust_s)

# Side-by-side PCA scatter plots: K-Means labels (left), DBSCAN labels (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# NOTE(review): `colors_cl` is computed but never passed to scatter(), so the
# clusters use matplotlib's default colour cycle — confirm whether
# `color=colors_cl[cl]` was intended.
colors_cl = plt.cm.Set1(np.linspace(0, 1, best_k_cl))
for cl in range(best_k_cl):
    mask = cp['cluster'] == cl
    axes[0].scatter(X_pca[mask, 0], X_pca[mask, 1], label=f'Cluster {cl}',
                     alpha=0.7, s=60, edgecolors='white', linewidths=0.5)
# Annotate top teams
# Row i of `cp` corresponds to row i of X_pca, so the positional index is used.
for i, (team, row) in enumerate(cp.iterrows()):
    if row['total_matches'] > 200:
        axes[0].annotate(team, (X_pca[i, 0], X_pca[i, 1]), fontsize=7, alpha=0.8)
axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
axes[0].set_title(f'K-Means Clustering (k={best_k_cl})')
axes[0].legend(fontsize=8)

# DBSCAN
unique_db = sorted(cp['dbscan_cluster'].unique())
for cl in unique_db:
    mask = cp['dbscan_cluster'] == cl
    # Negative labels are shown as 'Noise' in the legend.
    label = f'Cluster {cl}' if cl >= 0 else 'Noise'
    axes[1].scatter(X_pca[mask, 0], X_pca[mask, 1], label=label, alpha=0.7, s=60,
                     edgecolors='white', linewidths=0.5)
axes[1].set_xlabel(f'PC1'); axes[1].set_ylabel(f'PC2')
axes[1].set_title('DBSCAN Clustering')
axes[1].legend(fontsize=8)
fig.tight_layout()
save(fig, 'clustering_results.png', 'Country Clustering Results')

# Cluster profiles
# Per-cluster mean of each clustering feature, together with the member count.
print(f'\nCluster Profiles:')
print(cp.groupby('cluster')[cluster_features].agg(['mean', 'count']).round(2).to_string())

# Record the chosen k, its silhouette score and the cluster means for the report.
all_results['clustering'] = {
    'k': best_k_cl, 'silhouette': max(sils),
    'profiles': cp.groupby('cluster')[cluster_features].mean().round(3).to_dict()
}

## 8 · Hyperparameter Tuning
GridSearchCV and RandomizedSearchCV on the best classifiers for match outcome prediction.

In [None]:
# ── GridSearchCV — Random Forest ──
# Exhaustive 3-fold search over the grid, optimising weighted F1.
print('GridSearchCV: Random Forest ...')
rf_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5],
)
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=SEED, n_jobs=-1),
    param_grid=rf_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
    verbose=0,
)
gs_rf.fit(X_train_c, y_train_c)

# Score the refitted best estimator on the held-out test split.
pred_gs = gs_rf.predict(X_test_c)
acc_gs = accuracy_score(y_test_c, pred_gs)
f1_gs = f1_score(y_test_c, pred_gs, average='weighted')
print(f'  Best params: {gs_rf.best_params_}')
print(f'  Best CV F1:  {gs_rf.best_score_:.4f}')
print(f'  Test Acc:    {acc_gs:.4f}  F1: {f1_gs:.4f}')

# Register the tuned model next to the untuned classifiers.
rf_tuned = {'Accuracy': acc_gs, 'F1 Score': f1_gs}
rf_tuned['Precision'] = precision_score(y_test_c, pred_gs, average='weighted')
rf_tuned['Recall'] = recall_score(y_test_c, pred_gs, average='weighted')
rf_tuned['model'] = gs_rf.best_estimator_
rf_tuned['predictions'] = pred_gs
clf_results['RF (Tuned)'] = rf_tuned

# ── RandomizedSearchCV — Gradient Boosting ──
# Samples 20 parameter combinations, 3-fold CV each, optimising weighted F1.
print('\nRandomizedSearchCV: Gradient Boosting ...')
gb_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1, 0.2],
    min_samples_split=[2, 5, 10],
)
rs_gb = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=SEED),
    param_distributions=gb_dist,
    n_iter=20,
    scoring='f1_weighted',
    cv=3,
    random_state=SEED,
    n_jobs=-1,
    verbose=0,
)
rs_gb.fit(X_train_c, y_train_c)

# Score the refitted best estimator on the held-out test split.
pred_rs = rs_gb.predict(X_test_c)
acc_rs = accuracy_score(y_test_c, pred_rs)
f1_rs = f1_score(y_test_c, pred_rs, average='weighted')
print(f'  Best params: {rs_gb.best_params_}')
print(f'  Test Acc:    {acc_rs:.4f}  F1: {f1_rs:.4f}')

# Register the tuned model next to the untuned classifiers.
gb_tuned = {'Accuracy': acc_rs, 'F1 Score': f1_rs}
gb_tuned['Precision'] = precision_score(y_test_c, pred_rs, average='weighted')
gb_tuned['Recall'] = recall_score(y_test_c, pred_rs, average='weighted')
gb_tuned['model'] = rs_gb.best_estimator_
gb_tuned['predictions'] = pred_rs
clf_results['GB (Tuned)'] = gb_tuned

print('\nHyperparameter tuning complete!')

## 9 · Cross-Validation, Confusion Matrices & Learning Curves

In [None]:
# ── 5-Fold Cross-Validation ──
# Maps a display name to (unfitted estimator, needs standardised inputs).
cv_models = {
    'Logistic Regression': (LogisticRegression(max_iter=2000, random_state=SEED), True),
    'Random Forest': (RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1), False),
    'Gradient Boosting': (GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED), False),
    'MLP': (MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=SEED, early_stopping=True), True),
}

# Weighted-F1 scores from stratified 5-fold CV on the training split only.
cv_scores = {}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for name, (model, needs_scale) in cv_models.items():
    X_cv = X_train_cs if needs_scale else X_train_c
    scores = cross_val_score(model, X_cv, y_train_c, cv=skf, scoring='f1_weighted', n_jobs=-1)
    cv_scores[name] = scores
    print(f'{name:30s} CV F1: {scores.mean():.4f} Â± {scores.std():.4f}')

fig, ax = plt.subplots(figsize=(10, 5))
# Pass real lists instead of dict views, and label the boxes with
# set_xticklabels instead of boxplot's `labels=` keyword. Matplotlib 3.9
# deprecated `labels=` in favour of `tick_labels=`; this form works on
# both older and newer releases.
ax.boxplot(list(cv_scores.values()), patch_artist=True,
           boxprops=dict(facecolor='#2E86AB', alpha=0.7))
ax.set_xticklabels(list(cv_scores.keys()))
ax.set_ylabel('F1 Score'); ax.set_title('5-Fold Cross-Validation â€” F1 Scores')
ax.tick_params(axis='x', rotation=15)
fig.tight_layout()
save(fig, 'cv_comparison.png', 'Cross-Validation Comparison')

# -- Feature importance of the random-forest match-outcome classifier --
# Prefer the tuned forest and fall back to the baseline one. When neither
# entry is present rf_model is None, hasattr() below is False and the plot is
# skipped instead of failing on a None lookup.
rf_entry = clf_results.get('RF (Tuned)') or clf_results.get('Random Forest')
rf_model = rf_entry['model'] if rf_entry else None
if hasattr(rf_model, 'feature_importances_'):
    # Ascending sort so the most important feature ends up at the top of the
    # horizontal bar chart.
    imp_clf = pd.Series(rf_model.feature_importances_, index=clf_features).sort_values()
    fig, ax = plt.subplots(figsize=(8, 5))
    imp_clf.plot.barh(ax=ax, color='#2E86AB', edgecolor='white')
    ax.set_xlabel('Feature Importance')
    ax.set_title('Match Outcome â€” Feature Importance (Random Forest)')
    fig.tight_layout()
    save(fig, 'feature_importance.png', 'Match Outcome Feature Importance')

# -- Confusion matrices for the four classifiers with the highest weighted F1 --
ranked_by_f1 = sorted(clf_results.items(),
                      key=lambda item: item[1]['F1 Score'],
                      reverse=True)
top4 = [model_name for model_name, _ in ranked_by_f1[:4]]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, name in zip(axes.flat, top4):
    entry = clf_results[name]
    cm = confusion_matrix(y_test_c, entry['predictions'])
    # Rows are the actual classes, columns the predicted ones.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_title(f'{name}\n(F1={entry["F1 Score"]:.4f})')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
fig.suptitle('Confusion Matrices â€” Top 4 Classifiers', fontsize=14, y=1.02)
fig.tight_layout()
save(fig, 'confusion_matrices.png', 'Confusion Matrices â€” Top 4')

# -- Learning curve for the classifier with the highest test F1 (top4[0]) --
from sklearn.base import clone

best_clf_name = top4[0]

# Name-based choice of feature matrix: the listed model families (and any KNN
# variant) get X_train_cs, everything else gets X_train_c.
# NOTE(review): presumably X_train_cs is the standardised matrix these models
# were trained on earlier in the file -- confirm.
best_needs_scale = best_clf_name in ['Logistic Regression', 'SVM (linear)', 'SVM (rbf)',
                                      'MLP Neural Network', 'MLP', 'Naive Bayes'] or 'KNN' in best_clf_name
X_lc = X_train_cs if best_needs_scale else X_train_c

# Plot the estimator the title names: clone() returns an unfitted copy with
# the same hyperparameters as the stored winner, so tuned models keep their
# tuned settings and no model family is silently replaced by another.
best_stored_model = clf_results[best_clf_name].get('model')
if best_stored_model is not None:
    lc_model = clone(best_stored_model)
else:
    # No estimator stored for this entry: fall back to a plain logistic
    # regression so the cell still produces a curve.
    lc_model = LogisticRegression(max_iter=2000, random_state=SEED)

# Ten training-set sizes from 10% to 100%, each evaluated with 5-fold CV.
train_sizes, train_scores, val_scores = learning_curve(
    lc_model, X_lc, y_train_c, cv=5, scoring='f1_weighted',
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1)

# Per-size mean and standard deviation across the CV folds.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_sizes, train_mean, 'o-', color='#2E86AB', label='Train')
ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                alpha=0.1, color='#2E86AB')
ax.plot(train_sizes, val_mean, 'o-', color='#F18F01', label='Validation')
ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std,
                alpha=0.1, color='#F18F01')
ax.set_xlabel('Training Size')
ax.set_ylabel('F1 Score')
ax.set_title(f'Learning Curve â€” {best_clf_name}')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
save(fig, 'learning_curves.png', f'Learning Curve â€” {best_clf_name}')

print('CV, confusion matrices, and learning curves complete!')

## 10 · Ensemble Methods & Final Comparison

In [None]:
# -- Voting classifier: hard (majority) vote of RF, GB and logistic regression --
from sklearn.pipeline import make_pipeline

print('Training Voting Classifier ...')
voting = VotingClassifier(estimators=[
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1)),
    ('gb', GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED)),
    # The ensemble is fitted on the unscaled matrix, which suits the tree
    # models. Logistic regression is given standardised inputs through its
    # own pipeline; the scaler is fitted only on the data passed to fit().
    ('lr', make_pipeline(StandardScaler(),
                         LogisticRegression(max_iter=2000, random_state=SEED))),
], voting='hard', n_jobs=-1)
voting.fit(X_train_c, y_train_c)
pred_v = voting.predict(X_test_c)

# Test-set metrics, stored with the same keys as the other classifiers.
clf_results['Voting Ensemble'] = {
    'Accuracy': accuracy_score(y_test_c, pred_v),
    'F1 Score': f1_score(y_test_c, pred_v, average='weighted'),
    'Precision': precision_score(y_test_c, pred_v, average='weighted'),
    'Recall': recall_score(y_test_c, pred_v, average='weighted'),
    'model': voting, 'predictions': pred_v}
print(f'  Voting Ensemble               Acc={clf_results["Voting Ensemble"]["Accuracy"]:.4f}  '
      f'F1={clf_results["Voting Ensemble"]["F1 Score"]:.4f}')

# -- Stacking classifier: RF, GB and a decision tree feed a logistic-regression
# meta-learner; base-model outputs for the meta-learner come from 3-fold CV --
print('Training Stacking Classifier ...')
stack_base_learners = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1)),
    ('gb', GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED)),
    ('dt', DecisionTreeClassifier(max_depth=10, random_state=SEED)),
]
stack_meta_learner = LogisticRegression(max_iter=2000, random_state=SEED)
stacking = StackingClassifier(estimators=stack_base_learners,
                              final_estimator=stack_meta_learner,
                              cv=3, n_jobs=-1)
stacking.fit(X_train_c, y_train_c)
pred_s = stacking.predict(X_test_c)

# Test-set metrics; key order matches the other clf_results entries.
stack_entry = {'Accuracy': accuracy_score(y_test_c, pred_s)}
for metric_label, metric_fn in (('F1 Score', f1_score),
                                ('Precision', precision_score),
                                ('Recall', recall_score)):
    stack_entry[metric_label] = metric_fn(y_test_c, pred_s, average='weighted')
stack_entry['model'] = stacking
stack_entry['predictions'] = pred_s
clf_results['Stacking Ensemble'] = stack_entry

print(f'  Stacking Ensemble             Acc={stack_entry["Accuracy"]:.4f}  '
      f'F1={stack_entry["F1 Score"]:.4f}')

# -- Final comparison table: one row per classifier, metric columns only,
# ordered by weighted F1 (best first) --
metric_rows = {}
for name, metrics in clf_results.items():
    metric_rows[name] = {
        key: value for key, value in metrics.items()
        if key not in ('model', 'predictions')
    }
comp = pd.DataFrame(metric_rows).T.sort_values('F1 Score', ascending=False)
comp.index.name = 'Model'

print('\nFinal Model Comparison:')
print(comp.to_string(float_format='{:.6f}'.format))

# -- Horizontal bar chart of F1 per model, best model at the top --
n_models = len(comp)
bar_positions = range(n_models)
f1_values = comp['F1 Score']

fig, ax = plt.subplots(figsize=(10, 7))
colors_bar = plt.cm.viridis(np.linspace(0.2, 0.9, n_models))
bars = ax.barh(bar_positions, f1_values, color=colors_bar, edgecolor='white')
ax.set_yticks(bar_positions)
ax.set_yticklabels(comp.index, fontsize=9)
ax.set_xlabel('F1 Score (weighted)')
ax.set_title('Match Outcome Classification â€” All Models')
# Write each score just to the right of its bar.
for i, v in enumerate(f1_values):
    ax.text(v + 0.002, i, f'{v:.4f}', va='center', fontsize=8)
ax.invert_yaxis()
fig.tight_layout()
save(fig, 'model_comparison.png', 'Model Comparison â€” Match Outcome')

# Keep the metric table for the report and announce the winner.
all_results['match_clf'] = comp.to_dict('index')

best_overall = comp.index[0]
print(f'\nBest model: {best_overall} (F1 = {comp.loc[best_overall, "F1 Score"]:.4f})')

## 11 · HTML Report Generation

In [None]:
# ── Generate HTML Report ──
def img_to_b64(path):
    """Return the bytes of the file at *path* as a base64-encoded string."""
    with open(path, mode='rb') as fh:
        encoded = base64.b64encode(fh.read())
    return encoded.decode()

# Jinja2 source of the standalone HTML report. It is rendered with
# jinja2.Template, not with %-formatting, so literal percent signs in the CSS
# are written as a single '%'.
# Context variables read by the template:
#   n_matches, n_teams, year_range, n_wc, n_players  - header / metric cards
#   eda_plots, reg_plots, clf_plots, pos_plots,
#   cluster_plots, analysis_plots, final_plots       - iterables of (title, base64) pairs
#   reg_results, clf_table, pos_table                - mappings of model name -> metric dict;
#                                                      the first row is highlighted as best
#   cluster_k, cluster_sil, sklearn_ver              - clustering summary and footer
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Football ML Analysis Report</title>
<style>
  :root { --primary: #1B4332; --accent: #2D6A4F; --light: #D8F3DC; --bg: #F8FAF9; }
  * { margin: 0; padding: 0; box-sizing: border-box; }
  body { font-family: 'Segoe UI', system-ui, sans-serif; background: var(--bg); color: #333; line-height: 1.6; }
  .header { background: linear-gradient(135deg, var(--primary), var(--accent)); color: white;
             padding: 3rem 2rem; text-align: center; }
  .header h1 { font-size: 2.2rem; margin-bottom: 0.5rem; }
  .header p { opacity: 0.9; font-size: 1.1rem; }
  .container { max-width: 1200px; margin: 0 auto; padding: 2rem; }
  .section { background: white; border-radius: 12px; padding: 2rem; margin-bottom: 2rem;
             box-shadow: 0 2px 8px rgba(0,0,0,0.08); }
  .section h2 { color: var(--primary); border-bottom: 3px solid var(--accent);
                 padding-bottom: 0.5rem; margin-bottom: 1.5rem; font-size: 1.5rem; }
  .section h3 { color: var(--accent); margin: 1rem 0 0.5rem; }
  table { width: 100%; border-collapse: collapse; margin: 1rem 0; font-size: 0.9rem; }
  th { background: var(--accent); color: white; padding: 10px 12px; text-align: left; }
  td { padding: 8px 12px; border-bottom: 1px solid #e0e0e0; }
  tr:nth-child(even) { background: var(--light); }
  tr:hover { background: #B7E4C7; }
  .best { background: #95D5B2 !important; font-weight: bold; }
  img { max-width: 100%; border-radius: 8px; margin: 1rem 0; }
  .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 1.5rem; }
  .metric-card { background: var(--light); border-radius: 8px; padding: 1.5rem; text-align: center; }
  .metric-card .value { font-size: 2rem; font-weight: bold; color: var(--primary); }
  .metric-card .label { color: #666; font-size: 0.9rem; }
  .footer { text-align: center; padding: 2rem; color: #888; font-size: 0.85rem; }
</style>
</head>
<body>
<div class="header">
  <h1>âš½ FIFA World Cup & International Football â€” ML Report</h1>
  <p>Comprehensive Machine Learning Analysis â€¢ {{ n_matches }} matches â€¢ {{ n_teams }} teams â€¢ {{ year_range }}</p>
</div>
<div class="container">

  <!-- Key metrics -->
  <div class="grid" style="margin-bottom:2rem;">
    <div class="metric-card"><div class="value">{{ n_matches }}</div><div class="label">International Matches</div></div>
    <div class="metric-card"><div class="value">{{ n_teams }}</div><div class="label">Unique Teams</div></div>
    <div class="metric-card"><div class="value">{{ n_wc }}</div><div class="label">World Cup Editions</div></div>
    <div class="metric-card"><div class="value">{{ n_players }}</div><div class="label">Squad Players (2022)</div></div>
  </div>

  <!-- EDA -->
  <div class="section">
    <h2>1 Â· Exploratory Data Analysis</h2>
    <div class="grid">
    {% for title, b64 in eda_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <!-- Regression -->
  <div class="section">
    <h2>2 Â· Total Goals Regression</h2>
    <table>
      <tr><th>Model</th><th>RÂ²</th><th>MAE</th><th>RMSE</th></tr>
      {% for name, m in reg_results.items() %}
      <tr{% if loop.index == 1 %} class="best"{% endif %}>
        <td>{{ name }}</td><td>{{ "%.4f"|format(m['RÂ²']) }}</td>
        <td>{{ "%.2f"|format(m['MAE']) }}</td><td>{{ "%.2f"|format(m['RMSE']) }}</td></tr>
      {% endfor %}
    </table>
    <div class="grid">
    {% for title, b64 in reg_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <!-- Classification -->
  <div class="section">
    <h2>3 Â· Match Outcome Classification</h2>
    <table>
      <tr><th>Model</th><th>Accuracy</th><th>Precision</th><th>Recall</th><th>F1 Score</th></tr>
      {% for name, m in clf_table.items() %}
      <tr{% if loop.index == 1 %} class="best"{% endif %}>
        <td>{{ name }}</td><td>{{ "%.4f"|format(m['Accuracy']) }}</td>
        <td>{{ "%.4f"|format(m['Precision']) }}</td><td>{{ "%.4f"|format(m['Recall']) }}</td>
        <td>{{ "%.4f"|format(m['F1 Score']) }}</td></tr>
      {% endfor %}
    </table>
    <div class="grid">
    {% for title, b64 in clf_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <!-- Position Classification -->
  <div class="section">
    <h2>4 Â· Player Position Classification</h2>
    <table>
      <tr><th>Model</th><th>Accuracy</th><th>Precision</th><th>Recall</th><th>F1 Score</th></tr>
      {% for name, m in pos_table.items() %}
      <tr{% if loop.index == 1 %} class="best"{% endif %}>
        <td>{{ name }}</td><td>{{ "%.4f"|format(m['Accuracy']) }}</td>
        <td>{{ "%.4f"|format(m['Precision']) }}</td><td>{{ "%.4f"|format(m['Recall']) }}</td>
        <td>{{ "%.4f"|format(m['F1 Score']) }}</td></tr>
      {% endfor %}
    </table>
    <div class="grid">
    {% for title, b64 in pos_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <!-- Clustering -->
  <div class="section">
    <h2>5 Â· Country Performance Clustering</h2>
    <div class="grid" style="margin-bottom:1rem;">
      <div class="metric-card"><div class="value">{{ cluster_k }}</div><div class="label">Optimal Clusters</div></div>
      <div class="metric-card"><div class="value">{{ "%.4f"|format(cluster_sil) }}</div><div class="label">Silhouette Score</div></div>
    </div>
    <div class="grid">
    {% for title, b64 in cluster_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <!-- Analysis -->
  <div class="section">
    <h2>6 Â· Cross-Validation & Analysis</h2>
    <div class="grid">
    {% for title, b64 in analysis_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <!-- Final comparison -->
  <div class="section">
    <h2>7 Â· Final Model Comparison</h2>
    {% for title, b64 in final_plots %}
      <h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}">
    {% endfor %}
  </div>

</div>
<div class="footer">
  Generated automatically Â· Football ML Analysis Â· scikit-learn {{ sklearn_ver }}
</div>
</body></html>
"""

# Report sections mapped to the file stems (file name without extension) of
# the plots that belong to each of them.
plot_categories = {
    'eda': [
        'goals_per_decade',
        'outcome_distribution',
        'top20_teams',
        'wc_goals_trend',
        'home_advantage',
        'correlation_heatmap',
    ],
    'reg': ['regression_results', 'regression_feature_importance'],
    'clf': ['confusion_matrices', 'feature_importance'],
    'pos': ['position_classification'],
    'cluster': ['elbow_silhouette', 'clustering_results'],
    'analysis': ['cv_comparison', 'learning_curves'],
    'final': ['model_comparison'],
}


def get_plots(category):
    """Return ``(title, base64_data)`` pairs for the saved plots in *category*.

    Walks ``saved_plots`` in its own order and keeps every ``(title, path)``
    entry whose file stem is listed under ``plot_categories[category]``; the
    file contents are encoded with ``img_to_b64``. An unknown *category*
    raises ``KeyError``.
    """
    wanted_stems = plot_categories[category]
    return [
        (title, img_to_b64(path))
        for title, path in saved_plots
        if pathlib.Path(path).stem in wanted_stems
    ]

# Order every results table best-first; the template highlights the first row.
def _ranked(results, metric):
    """Return *results* as a dict ordered by descending ``metric`` value."""
    ordered = sorted(results.items(),
                     key=lambda item: item[1][metric],
                     reverse=True)
    return dict(ordered)


# Regression models, best RÂ² first.
reg_sorted = _ranked(all_results['regression'], 'RÂ²')

# Match-outcome classifiers, best F1 first; fitted models and prediction
# arrays are dropped so that only the metric values remain.
clf_metrics_only = {}
for clf_name, clf_entry in clf_results.items():
    clf_metrics_only[clf_name] = {
        metric: value for metric, value in clf_entry.items()
        if metric not in ('model', 'predictions')
    }
clf_sorted = _ranked(clf_metrics_only, 'F1 Score')

# Player-position classifiers, best F1 first.
pos_sorted = _ranked(all_results['position_clf'], 'F1 Score')

import sklearn

# Render the Jinja2 template with dataset counts, the sorted metric tables and
# the base64-embedded plots of every report section.
tmpl = Template(HTML_TEMPLATE)
html = tmpl.render(
    n_matches=f'{len(intl):,}',
    n_teams=pd.concat([intl['Home Team'], intl['Away Team']]).nunique(),
    year_range=f'{intl["year"].min()}â€“{intl["year"].max()}',
    n_wc=len(wc_summary),
    n_players=len(squads),
    eda_plots=get_plots('eda'),
    reg_results=reg_sorted,
    reg_plots=get_plots('reg'),
    clf_table=clf_sorted,
    clf_plots=get_plots('clf'),
    pos_table=pos_sorted,
    pos_plots=get_plots('pos'),
    cluster_k=all_results['clustering']['k'],
    cluster_sil=all_results['clustering']['silhouette'],
    cluster_plots=get_plots('cluster'),
    analysis_plots=get_plots('analysis'),
    final_plots=get_plots('final'),
    sklearn_ver=sklearn.__version__,
)

report_path = pathlib.Path('outputs/football_ml_report.html')
# Make sure the target directory exists before writing.
report_path.parent.mkdir(parents=True, exist_ok=True)
# The page declares <meta charset="UTF-8"> and contains non-ASCII text, so
# write it as UTF-8 explicitly instead of relying on the platform's default
# encoding.
report_path.write_text(html, encoding='utf-8')
print(f'âœ… HTML Report generated: {report_path}')
print(f'   File size: {report_path.stat().st_size / 1024:.1f} KB')
print(f'   Embedded images: {html.count("data:image/png;base64,")}')
print(f'\nðŸŽ‰ Analysis complete! Open the HTML file to view the full report.')