In [3]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt

In [None]:
# ===========================================
# FIFA World Cup Goals - Feature Engineering
# ===========================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch

# ============================
# 1. LOAD CSV WITH CORRECT ENCODING
# ============================
df = pd.read_csv(
    "data/FIFA World Cup All Goals 1930-2022.csv",
    encoding='latin1'  # solves UnicodeDecodeError
)

# ============================
# 2. RENAME COLUMNS AND CREATE SCORER
# ============================
df.rename(columns={
    'team_name': 'Team',
    'minute_label': 'Minute',
    'family_name': 'FamilyName',
    'given_name': 'GivenName'
}, inplace=True)

# Combine first + last name
df['Scorer'] = df['GivenName'].astype(str) + ' ' + df['FamilyName'].astype(str)

# ============================
# 3. EXTRACT NUMERIC MINUTE
# ============================
def extract_minute(m):
    if isinstance(m, str):
        m = m.replace("'", "")  # remove apostrophe
        if "+" in m:
            parts = m.split("+")
            return int(parts[0]) + int(parts[1])
        else:
            return int(m)
    return int(m)

df['Minute'] = df['Minute'].apply(extract_minute)

# Drop rows missing essential info
df = df.dropna(subset=['Scorer', 'Team', 'Minute'])

# ============================
# 4. FEATURE ENGINEERING
# ============================

# Minute category
def minute_category(m):
    if m <= 15: return "Early"
    elif m <= 45: return "MidFirst"
    elif m <= 60: return "EarlySecond"
    elif m <= 75: return "MidSecond"
    else: return "Late"

df['MinuteCat'] = df['Minute'].apply(minute_category)

# Team Strength Proxy: normalize team goals
team_strength = df['Team'].value_counts() / df['Team'].value_counts().max()
df['TeamStrength'] = df['Team'].map(team_strength)

# Encode categorical features
cat_features = ['Team', 'Scorer', 'MinuteCat']
encoders = {}
for col in cat_features:
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    encoders[col] = enc

# ============================
# 5. FINAL FEATURE MATRIX
# ============================
feature_cols = ['Minute', 'MinuteCat', 'Team', 'Scorer', 'TeamStrength']
X = df[feature_cols].values

# Scale numeric features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Target placeholder (all goals = 1)
y = np.ones(len(df))

# ============================
# 6. TRAIN-TEST SPLIT
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ============================
# 7. SAVE AS PYTORCH TENSORS
# ============================
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor  = torch.tensor(y_test, dtype=torch.float32)

torch.save(X_train_tensor, "data/X_train.pt")
torch.save(X_test_tensor,  "data/X_test.pt")
torch.save(y_train_tensor, "data/y_train.pt")
torch.save(y_test_tensor,  "data/y_test.pt")


âœ… Feature Engineering Complete! Tensors saved in /data/


In [5]:
# FIFA World Cup Goal Prediction with EM Clustering + PyTorch


# ---------------------------
# 1. Load & Preprocess Data
# ---------------------------
# Example dataset: FIFA World Cup Shot Events (StatsBomb or Kaggle)
data = pd.read_csv("data/FIFA World Cup All Goals 1930-2022.csv")

# Core numeric features
numeric_features = ['shot_distance', 'shot_angle']

# Optional defensive context (only if dataset has it)
extra_features = ['pressure', 'player_position']  # remove if not available
available_extras = [f for f in extra_features if f in data.columns]
numeric_features += available_extras

# Categorical features
categorical_features = ['shot_type', 'body_part', 'player']

for col in categorical_features:
    if col in data.columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

# Final features for EM + NN
features_for_nn = numeric_features + categorical_features
X = data[features_for_nn].values
y = data['goal'].values

# Standardize numeric data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# 2. EM Clustering (Shot Types)
# ---------------------------
n_clusters = 5  # latent shot categories (tweak if needed)
gmm = GaussianMixture(n_components=n_clusters, random_state=42)
cluster_labels = gmm.fit_predict(X_scaled)

data['shot_cluster'] = cluster_labels
X_final = np.hstack([X_scaled, cluster_labels.reshape(-1,1)])
features_for_nn.append('shot_cluster')

# ---------------------------
# 3. Train/Test Split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

X_train = torch.tensor(X_train, dtype=torch.float32)
X_test  = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test  = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# ---------------------------
# 4. PyTorch Model
# ---------------------------
class GoalPredictor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = GoalPredictor(X_train.shape[1])

# ---------------------------
# 5. Train Model
# ---------------------------
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 60
losses = []

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item():.4f}")

plt.plot(losses); plt.title("Training Loss"); plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.show()

# ---------------------------
# 6. Evaluate Model
# ---------------------------
model.eval()
with torch.no_grad():
    y_prob = model(X_test)
    y_pred = (y_prob >= 0.5).float()

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
print(f"Test Accuracy: {acc:.4f}")
print(f"ROC-AUC: {auc:.4f}")

# ---------------------------
# 7. Visualize Shot Clusters
# ---------------------------
plt.scatter(data['shot_angle'], data['shot_distance'], 
            c=data['shot_cluster'], cmap='viridis', alpha=0.6)
plt.xlabel("Shot Angle (degrees)")
plt.ylabel("Shot Distance (meters)")
plt.title("EM Clusters of World Cup Shots")
plt.colorbar(label="Cluster")
plt.show()


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 814: invalid continuation byte