# Win Probability Prediction with PCA and Role/Champion Analysis

This notebook demonstrates:
1. Data exploration and role/champion-specific performance analysis
2. PCA feature reduction with consideration for role/champion context
3. Training and comparison of multiple classification models
4. Identifying outlier matches where predictions diverge from reality

## Problem Statement

**Challenge:** Performance metrics that indicate a "good" performance vary significantly by role and champion:
- Support players have different typical stats than carries
- Tank champions have different damage/vision patterns than assassins
- Raw stats don't account for these contextual differences

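**Approach:** We'll consider multiple strategies to handle this (strategies 1 and 4 are implemented and compared below, strategy 2 is prepared as a feature set, and strategy 3 is left as a recommendation):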
1. **Role-specific normalization**: Normalize features within each role
2. **Role/champion as features**: Include role/champion as categorical features
3. **Separate models**: Train separate models for different roles/champions
4. **Combined approach**: Use PCA on normalized features + role context

In [None]:
# Imports
import asyncio
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

# Suppress every warning category for the rest of the session so cell output
# stays readable. NOTE(review): this also hides library warnings raised later.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn white-grid theme and a (12, 6) default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('Libraries imported successfully')

In [None]:
# Setup database connection
# Put the relative `../src` directory at the front of the import path. This
# has to run before the `lol_data_center` imports below.
# NOTE(review): the relative path presumably assumes the kernel's working
# directory is a sibling of `src` - confirm.
import sys
sys.path.insert(0, '../src')

# Project-local helpers used by the data-loading cell: the async session
# context manager and the match feature extractor.
from lol_data_center.database.engine import get_async_session
from lol_data_center.ml.data_extraction import MatchDataExtractor

print('Database modules loaded')

## 1. Data Extraction and Exploration

In [None]:
# Extract match data
async def load_data():
    """Fetch the modeling inputs through a single async database session.

    Opens a session with ``get_async_session``, wraps it in a
    ``MatchDataExtractor`` and awaits its three extraction calls in sequence.

    Returns:
        A 3-tuple ``(match_features, champion_stats, role_stats)`` holding the
        results of ``extract_match_features()``, ``get_champion_stats()`` and
        ``get_role_stats()`` respectively.
    """
    async with get_async_session() as db_session:
        match_extractor = MatchDataExtractor(db_session)
        match_features = await match_extractor.extract_match_features()
        per_champion = await match_extractor.get_champion_stats()
        per_role = await match_extractor.get_role_stats()
    # The session is closed at this point; only the extracted results leave.
    return match_features, per_champion, per_role

df, champion_stats, role_stats = await load_data()

print(f'Loaded {len(df)} match records')
print(f'\nDataset shape: {df.shape}')
print(f'\nColumns: {list(df.columns)}')

In [None]:
# Basic data exploration
print('Dataset Info:')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print('\n' + '='*80 + '\n')
print('First few rows:')
# Last expression of the cell: the notebook renders it as the cell output
df.head()

In [None]:
# Overall share of rows whose `win` flag is set
win_rate = df['win'].mean()
print(f'Overall win rate: {win_rate:.2%}')

# Two side-by-side panels: win rate per role (left) and for the most-played
# champions (right). A panel stays empty when its column is missing.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
role_ax, champ_ax = ax

if 'team_position' in df.columns:
    role_wins = (
        df.groupby('team_position')['win']
        .agg(['mean', 'count'])
        .rename(columns={'mean': 'win_rate'})
    )
    # Keep only roles with at least 10 rows
    role_wins = role_wins.loc[role_wins['count'] >= 10]

    role_ax.bar(role_wins.index, role_wins['win_rate'])
    role_ax.axhline(y=0.5, color='r', linestyle='--', label='50%')
    role_ax.set_title('Win Rate by Role')
    role_ax.set_xlabel('Role')
    role_ax.set_ylabel('Win Rate')
    role_ax.tick_params(axis='x', rotation=45)
    role_ax.legend()

if 'champion_name' in df.columns:
    champ_wins = (
        df.groupby('champion_name')['win']
        .agg(['mean', 'count'])
        .rename(columns={'mean': 'win_rate'})
    )
    # The 15 champions with the most rows, ordered by row count
    top_champs = champ_wins.nlargest(15, 'count')
    bar_positions = range(len(top_champs))

    champ_ax.barh(bar_positions, top_champs['win_rate'])
    champ_ax.set_yticks(bar_positions)
    champ_ax.set_yticklabels(top_champs.index)
    champ_ax.axvline(x=0.5, color='r', linestyle='--', label='50%')
    champ_ax.set_title('Win Rate - Top 15 Champions by Games')
    champ_ax.set_xlabel('Win Rate')
    champ_ax.legend()

plt.tight_layout()
plt.show()

## 2. Role and Champion Context Analysis

Here we analyze how performance metrics vary by role and champion to understand the challenge.

In [None]:
# Metrics whose per-role distributions are compared
key_metrics = ['damage_per_min', 'gold_per_min', 'cs_per_min', 'vision_score_per_min', 'kda']

# 2x3 grid, flattened so each panel can be paired with one metric
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for panel, metric in zip(axes, key_metrics):
    # Leave the panel blank when the metric or the role column is absent
    if metric not in df.columns or 'team_position' not in df.columns:
        continue

    df.boxplot(column=metric, by='team_position', ax=panel)
    panel.set_title(f'{metric} by Role')
    panel.set_xlabel('Role')
    panel.set_ylabel(metric)
    # Rotate the role labels on this panel via the pyplot current-axes API
    plt.sca(panel)
    plt.xticks(rotation=45)

# Six panels were created for five metrics: drop the unused last one
fig.delaxes(axes[-1])

plt.suptitle('Performance Metrics Vary Significantly by Role', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

print(
    '\nKey Observation: Different roles have very different stat distributions!',
    'This confirms the need for role-aware feature engineering.',
    sep='\n',
)

In [None]:
# Statistical comparison of roles
if 'team_position' in df.columns:
    # Restrict to the metrics that exist in the frame. Selecting a missing
    # column on the groupby raises KeyError, so the same guard that protects
    # the per-metric loop is applied to the aggregate table as well.
    available_metrics = [metric for metric in key_metrics if metric in df.columns]

    if available_metrics:
        metrics_by_role = df.groupby('team_position')[available_metrics]

        role_stats_detailed = metrics_by_role.agg(['mean', 'std']).round(2)
        print('Detailed Role Statistics:')
        print(role_stats_detailed)

        print('\nCoefficient of Variation (CV) by metric across roles:')
        # One groupby pass yields the per-role means for every metric
        all_role_means = metrics_by_role.mean()
        for metric in available_metrics:
            role_means = all_role_means[metric]
            # Spread of the role means relative to their average: how strongly
            # the typical value of this metric depends on the role
            cv = role_means.std() / role_means.mean()
            print(f'{metric}: {cv:.2%} variation across roles')
    else:
        print('None of the key metrics are present in the dataset')

## 3. Feature Engineering with Role Context

We'll create multiple feature sets to compare approaches:
1. **Raw features**: No normalization
2. **Role-normalized features**: Z-score normalization within each role
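3. **Features + role encoding**: Include role (one-hot) and champion (label-encoded) as categorical features. This set is built for reference only and is not part of the model comparison in Section 5.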

In [None]:
# Columns that are never model inputs: row identifiers, champion/position
# labels and the target itself
exclude_cols = ['match_id', 'puuid', 'champion_id', 'champion_name',
                'team_position', 'individual_position', 'win']
# Everything else becomes a feature, in the frame's own column order.
# NOTE(review): assumes every remaining column is numeric - confirm against
# the extractor's schema.
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()

print(f'Selected {len(feature_cols)} features for modeling:')
print(feature_cols)

# Unscaled feature matrix and the integer-coded target
X_raw = df.loc[:, feature_cols].copy()
y = df['win'].astype(int)

print(f'\nFeature matrix shape: {X_raw.shape}')
print(f'Target distribution: {y.value_counts().to_dict()}')

In [None]:
# Create role-normalized features: z-score every feature within its role group
X_role_normalized = X_raw.copy().astype(float)  # float dtype so z-scores are not truncated

if 'team_position' in df.columns:
    # Rows with a missing role form their own 'UNKNOWN' group so that they are
    # normalized too instead of silently keeping their raw values.
    # Labels are passed as a plain array so grouping is positional.
    role_labels = df['team_position'].astype(object).fillna('UNKNOWN').to_numpy()
    features_by_role = X_role_normalized.groupby(role_labels)

    # Per-row broadcast of each role's mean / sample std (ddof=1)
    role_mean = features_by_role.transform('mean')
    role_std = features_by_role.transform('std')

    # Where a role has no spread in a feature (std == 0) or only a single row
    # (std is NaN), divide by 1 so the value is still centered to 0. Leaving
    # such cells un-normalized would mix raw-scale values with z-scores in the
    # same column.
    safe_std = role_std.where(role_std > 0, 1.0)
    X_role_normalized = (X_role_normalized - role_mean) / safe_std

    # NOTE(review): the statistics are computed on all rows, i.e. before the
    # train/test split performed later in the notebook.
    print('Created role-normalized features')
else:
    print('No role information available - using standard normalization')
    scaler = StandardScaler()
    X_role_normalized = pd.DataFrame(
        scaler.fit_transform(X_raw),
        columns=X_raw.columns,
        index=X_raw.index
    )

In [None]:
# Feature set that carries role/champion context alongside the unscaled features
X_with_role = X_raw.copy()

if 'team_position' not in df.columns or 'champion_name' not in df.columns:
    print('Role/champion information not available')
else:
    # One indicator column per distinct role value, prefixed with 'role'
    role_dummies = pd.get_dummies(df['team_position'], prefix='role')
    X_with_role = pd.concat([X_with_role, role_dummies], axis=1)

    # Champions get a single integer-code column rather than one column each,
    # which keeps the feature count small
    le_champion = LabelEncoder()
    X_with_role['champion_encoded'] = le_champion.fit_transform(df['champion_name'])

    print(f'Features with role encoding shape: {X_with_role.shape}')
    print(f'Added {role_dummies.shape[1]} role features and 1 champion encoding')

## 4. PCA Feature Reduction

We'll apply PCA to reduce dimensionality while preserving variance.

In [None]:
# Standardize both feature matrices ahead of PCA. The same scaler object is
# refit by the second call, so afterwards it holds the statistics of the
# role-normalized matrix.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)
X_role_norm_scaled = scaler.fit_transform(X_role_normalized)

# Unrestricted PCA on the standardized raw features, used to inspect how the
# variance is distributed over components
pca_full = PCA().fit(X_scaled)
variance_ratio = pca_full.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_ratio)
component_numbers = range(1, len(variance_ratio) + 1)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scree_ax, cumulative_ax = axes

# Left panel: variance explained by each individual component
scree_ax.plot(component_numbers, variance_ratio, 'bo-')
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Explained Variance Ratio')
scree_ax.grid(True)

# Right panel: running total, with the 95% and 90% reference levels
cumulative_ax.plot(component_numbers, cumulative_variance, 'ro-')
for level, line_color, line_label in (
    (0.95, 'g', '95% variance'),
    (0.90, 'b', '90% variance'),
):
    cumulative_ax.axhline(y=level, color=line_color, linestyle='--', label=line_label)
cumulative_ax.set_title('Cumulative Explained Variance')
cumulative_ax.set_xlabel('Number of Components')
cumulative_ax.set_ylabel('Cumulative Explained Variance')
cumulative_ax.legend()
cumulative_ax.grid(True)

plt.tight_layout()
plt.show()

# Smallest component count whose cumulative variance reaches each target:
# argmax returns the first True position, +1 converts it to a count
n_components_90, n_components_95 = (
    np.argmax(cumulative_variance >= target) + 1 for target in (0.90, 0.95)
)

print(f'Components needed for 90% variance: {n_components_90}')
print(f'Components needed for 95% variance: {n_components_95}')
print(f'Original features: {X_raw.shape[1]}')

In [None]:
# Component count: enough for 95% of the raw-feature variance, capped at 15
# for interpretability
n_components = min(n_components_95, 15)

# Two separate projections of equal width, one per standardized matrix
pca = PCA(n_components=n_components)
pca_role = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)
X_pca_role_norm = pca_role.fit_transform(X_role_norm_scaled)

# The explained-variance figure below refers to the raw-feature projection
print(
    f'Reduced from {X_raw.shape[1]} to {n_components} features',
    f'Explained variance: {pca.explained_variance_ratio_.sum():.2%}',
    f'\nPCA-transformed data shape: {X_pca.shape}',
    sep='\n',
)

In [None]:
# Scatter of the first two principal components, colored by match outcome,
# for the raw-feature projection (left) and the role-normalized one (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

projections = [
    (axes[0], X_pca, pca, 'PCA - Raw Features'),
    (axes[1], X_pca_role_norm, pca_role, 'PCA - Role-Normalized Features'),
]

for panel, projected, fitted_pca, panel_title in projections:
    points = panel.scatter(projected[:, 0], projected[:, 1],
                           c=y, cmap='RdYlGn', alpha=0.5, s=10)
    # Axis labels carry the share of variance explained by each component
    panel.set_xlabel(f'PC1 ({fitted_pca.explained_variance_ratio_[0]:.1%} var)')
    panel.set_ylabel(f'PC2 ({fitted_pca.explained_variance_ratio_[1]:.1%} var)')
    panel.set_title(panel_title)
    plt.colorbar(points, ax=panel, label='Win')

plt.tight_layout()
plt.show()

## 5. Model Training and Comparison

We'll compare multiple models:
1. Logistic Regression (baseline)
2. Random Forest
3. SVM (Support Vector Machine)

For each model, we'll test:
- Raw features (standardized, no PCA)
- PCA-reduced features
- Role-normalized + PCA features

In [None]:
# Hold-out configuration shared by every feature set
test_size = 0.2
random_state = 42

# Feature matrices to compare; all share the same target vector
datasets = {
    'Raw Features': (X_scaled, y),
    'PCA (Raw)': (X_pca, y),
    'PCA (Role-Normalized)': (X_pca_role_norm, y),
}

# name -> (X_train, X_test, y_train, y_test), stratified on the target and
# seeded identically for each feature set
splits = {
    name: tuple(
        train_test_split(
            X, y_data, test_size=test_size, random_state=random_state, stratify=y_data
        )
    )
    for name, (X, y_data) in datasets.items()
}

for name, split in splits.items():
    print(f'{name}: Train={split[0].shape}, Test={split[1].shape}')

In [None]:
# Candidate classifiers, all seeded with the shared random_state
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1),
    'SVM': SVC(probability=True, random_state=random_state),
}

# One result row per (model, feature set) combination
results = []

for model_name, model in models.items():
    print(f'\nTraining {model_name}...')

    for dataset_name, (X_train, X_test, y_train, y_test) in splits.items():
        print(f'  - {dataset_name}...', end=' ')

        # The same estimator instance is refit for every feature set
        model.fit(X_train, y_train)

        # Hold-out accuracy and ROC-AUC, plus mean 5-fold CV accuracy on the
        # training portion
        row = {
            'Model': model_name,
            'Dataset': dataset_name,
            'Accuracy': accuracy_score(y_test, model.predict(X_test)),
            'ROC-AUC': roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
            'CV Accuracy': cross_val_score(
                model, X_train, y_train, cv=5, scoring='accuracy'
            ).mean(),
        }
        results.append(row)

        print(
            f"Acc={row['Accuracy']:.3f}, "
            f"AUC={row['ROC-AUC']:.3f}, "
            f"CV={row['CV Accuracy']:.3f}"
        )

# Tabulate and print every combination
results_df = pd.DataFrame(results)
banner = '=' * 80
print('\n' + banner)
print('RESULTS SUMMARY')
print(banner)
print(results_df.to_string(index=False))

In [None]:
# Visualize results: grouped bars per model, one bar per feature set
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Reshape the long results table to Model x Dataset for each metric
pivot_acc = results_df.pivot(index='Model', columns='Dataset', values='Accuracy')
pivot_auc = results_df.pivot(index='Model', columns='Dataset', values='ROC-AUC')

for panel, pivot_table, metric in zip(axes, (pivot_acc, pivot_auc), ('Accuracy', 'ROC-AUC')):
    pivot_table.plot(kind='bar', ax=panel, rot=0)
    panel.set_title(f'Model {metric} Comparison')
    panel.set_ylabel(metric)
    # NOTE(review): the axis starts at 0.5, so a bar below chance is not drawn
    panel.set_ylim([0.5, 1.0])

    # The baseline is drawn BEFORE the legend is built. legend() only lists
    # artists that already exist, so adding the labeled line afterwards left
    # 'Baseline (50%)' out of the legend.
    panel.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='Baseline (50%)')
    panel.legend(title='Dataset', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## 6. Best Model Analysis

Let's analyze the best performing model in detail.

In [None]:
# Pick the results row with the highest hold-out ROC-AUC
best_idx = results_df['ROC-AUC'].idxmax()
best_model_name = results_df.at[best_idx, 'Model']
best_dataset_name = results_df.at[best_idx, 'Dataset']
best_roc_auc = results_df.at[best_idx, 'ROC-AUC']

print(
    f'Best Model: {best_model_name}',
    f'Best Dataset: {best_dataset_name}',
    f'ROC-AUC: {best_roc_auc:.3f}',
    sep='\n',
)

# Refit on the winning feature set: the estimator stored in `models` was
# reused across feature sets during training, so its current fit may belong
# to a different one
best_model = models[best_model_name]
X_train, X_test, y_train, y_test = splits[best_dataset_name]
best_model.fit(X_train, y_train)

# Hard labels and win-class probabilities for the hold-out rows
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=['Loss', 'Win']))

In [None]:
# Hold-out diagnostics for the best model: confusion matrix (left), ROC (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
cm_ax, roc_ax = axes
class_labels = ['Loss', 'Win']

# Counts of true vs predicted labels, annotated as integers
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
            xticklabels=class_labels, yticklabels=class_labels)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')

# ROC from the win-class probabilities; the dashed diagonal marks a random
# classifier
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {best_roc_auc:.3f})', linewidth=2)
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random (AUC = 0.5)')
roc_ax.set_title('ROC Curve')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
roc_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Importances are shown only for a Random Forest trained on the un-projected
# feature set, where each importance lines up with one name in feature_cols
is_forest_on_raw = best_model_name == 'Random Forest' and best_dataset_name == 'Raw Features'

if is_forest_on_raw:
    importance_table = pd.DataFrame({
        'feature': feature_cols,
        'importance': best_model.feature_importances_
    })
    # Fifteen largest importances, biggest first
    feature_importance = importance_table.sort_values('importance', ascending=False).head(15)

    fig, ax = plt.subplots(figsize=(10, 6))
    bar_positions = range(len(feature_importance))
    ax.barh(bar_positions, feature_importance['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(feature_importance['feature'])
    ax.set_xlabel('Importance')
    ax.set_title('Top 15 Feature Importances (Random Forest)')
    # Flip the axis so the most important feature sits at the top
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

    print('\nTop 10 Most Important Features:')
    print(feature_importance.head(10).to_string(index=False))

## 7. Outlier Detection

Identify matches where the prediction diverges significantly from reality.

In [None]:
# Outlier analysis on the held-out test split only (the best model was fitted
# on the training split, so these rows were not used for fitting).
# The frame keeps y_test's index so every row can be traced back to `df`.
predictions_df = pd.DataFrame(
    {
        'win': y_test.values,
        'win_probability': y_pred_proba,
        'predicted_win': y_pred,
    },
    index=y_test.index,
)

# Attach whatever identifying columns the source frame offers, so the
# "most surprising matches" listing says which match/champion/role each row is.
# Skipped when the source index is not unique, because label lookup would then
# return more rows than the test split has.
if df.index.is_unique:
    for context_col in ('match_id', 'champion_name', 'team_position'):
        if context_col in df.columns:
            predictions_df[context_col] = df.loc[y_test.index, context_col].to_numpy()

# Identify outliers
threshold = 0.7  # High confidence threshold

actual_win = predictions_df['win'] == 1
win_probability = predictions_df['win_probability']

# Unexpected wins: won but had low win probability
unexpected_wins = actual_win & (win_probability < (1 - threshold))

# Unexpected losses: lost but had high win probability
unexpected_losses = ~actual_win & (predictions_df['win'] == 0) & (win_probability > threshold)

outliers = predictions_df[unexpected_wins | unexpected_losses].copy()
outlier_is_win = outliers['win'] == 1
outliers['outlier_type'] = np.where(outlier_is_win, 'Unexpected Win', 'Unexpected Loss')
# Surprise = probability mass the model put on the outcome that did NOT happen
outliers['surprise_score'] = np.where(
    outlier_is_win,
    1 - outliers['win_probability'],
    outliers['win_probability']
)

outliers = outliers.sort_values('surprise_score', ascending=False)

print(f'Total outliers: {len(outliers)} ({len(outliers)/len(predictions_df)*100:.1f}%)')
print(f'Unexpected wins: {unexpected_wins.sum()}')
print(f'Unexpected losses: {unexpected_losses.sum()}')
print('\nTop 10 Most Surprising Matches:')
print(outliers.head(10).to_string(index=False))

In [None]:
# Visualize outliers
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Probability distribution for wins and losses
wins_proba = predictions_df[predictions_df['win'] == 1]['win_probability']
losses_proba = predictions_df[predictions_df['win'] == 0]['win_probability']

# Both histograms share the same 30 bins over [0, 1]; with per-series bin edges
# the overlaid bars would have different widths and not be comparable.
axes[0].hist(wins_proba, bins=30, range=(0, 1), alpha=0.6, label='Actual Wins',
             color='green', edgecolor='black')
axes[0].hist(losses_proba, bins=30, range=(0, 1), alpha=0.6, label='Actual Losses',
             color='red', edgecolor='black')
# Vertical guides at the two confidence cut-offs used for outlier detection
axes[0].axvline(x=threshold, color='blue', linestyle='--', label=f'Threshold ({threshold})')
axes[0].axvline(x=1-threshold, color='blue', linestyle='--')
axes[0].set_xlabel('Predicted Win Probability')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribution of Win Probabilities')
axes[0].legend()

# Outlier types
# value_counts() orders by frequency, so a positional color list could paint
# 'Unexpected Loss' green. Colors are looked up per label instead, and the bars
# are put in a fixed order (a type with no outliers shows as a zero bar).
outlier_colors = {'Unexpected Win': 'green', 'Unexpected Loss': 'red'}
outlier_counts = (
    outliers['outlier_type']
    .value_counts()
    .reindex(list(outlier_colors), fill_value=0)
)
axes[1].bar(
    outlier_counts.index,
    outlier_counts.values,
    color=[outlier_colors[label] for label in outlier_counts.index],
)
axes[1].set_title('Outlier Types')
axes[1].set_ylabel('Count')
axes[1].set_xlabel('Outlier Type')

plt.tight_layout()
plt.show()

## 8. Findings and Recommendations

### Key Findings

1. **Role Context Matters**: Performance metrics vary significantly across roles (e.g., supports have lower damage, higher vision scores)

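2. **PCA Effectiveness**: PCA successfully reduces dimensionality while retaining most variance. Role-normalized features remove between-role differences in each metric before PCA, so their components describe how a player performed relative to others in the same role rather than which role was played.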

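3. **Model Performance**: The best model/feature-set combination depends on the data loaded; see the results summary table in Section 5 and the best-model report in Section 6 for the current run.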

4. **Outlier Detection**: The model can identify surprising match outcomes where performance metrics didn't align with the result

### Recommendations

1. **Feature Engineering Strategy**:
   - Use role-normalized features when role context is important
   - Consider training separate models per role for maximum accuracy
   - Alternatively, include role as a categorical feature in a unified model

2. **Model Selection**:
   - Random Forest tends to perform well with mixed feature types
   - Logistic Regression provides interpretability
   - Choose based on accuracy vs. interpretability trade-off

3. **PCA Usage**:
   - Beneficial for visualization and reducing overfitting
   - May sacrifice some interpretability
   - Consider using 90-95% variance threshold

4. **Implementation Path**:
   - Save the best model, scaler, and PCA transformer
   - Create API for real-time win probability prediction
   - Implement role-specific prediction pipelines if needed
   - Use outlier detection to highlight unusual match outcomes

5. **Future Improvements**:
   - Collect more data for better generalization
   - Experiment with deep learning models
   - Add temporal features (player improvement over time)
   - Include team composition features

## 9. Save Best Model

In [None]:
# Save the best model and associated artifacts
import pickle

model_dir = Path('../models')
# parents=True so the cell also works when intermediate directories are missing
model_dir.mkdir(parents=True, exist_ok=True)

# Determine which scaler and PCA to save based on best dataset.
# A fresh scaler is fitted on the matrix the winning pipeline was standardized
# from: the role-normalized matrix for 'PCA (Role-Normalized)', the raw feature
# matrix for the other two feature sets.
uses_role_normalization = best_dataset_name == 'PCA (Role-Normalized)'
scaler_to_save = StandardScaler()
scaler_to_save.fit(X_role_normalized if uses_role_normalization else X_raw)

# 'Raw Features' has no PCA step, so nothing is looked up and None is stored
pca_to_save = {
    'PCA (Role-Normalized)': pca_role,
    'PCA (Raw)': pca,
}.get(best_dataset_name)

# The role-normalized pipeline z-scores each feature within its role BEFORE the
# scaler and PCA are applied. Without the per-role mean/std the saved scaler and
# PCA cannot be applied to new matches, so those statistics are persisted too
# (sample std, ddof=1, one row per role).
role_normalization = None
if uses_role_normalization and 'team_position' in df.columns:
    features_by_role = X_raw.astype(float).groupby(df['team_position'].to_numpy())
    role_normalization = {
        'mean': features_by_role.mean(),
        'std': features_by_role.std(),
    }

# Save model
# NOTE: pickle files should only ever be loaded from a trusted location.
model_path = model_dir / 'win_probability_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump({
        'model': best_model,
        'feature_names': feature_cols,
        'model_name': best_model_name,
        'dataset_name': best_dataset_name,
        # None unless the role-normalized pipeline won
        'role_normalization': role_normalization,
    }, f)
print(f'Saved model to {model_path}')

# Save scaler
scaler_path = model_dir / 'scaler.pkl'
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler_to_save, f)
print(f'Saved scaler to {scaler_path}')

# Save PCA if used
if pca_to_save is not None:
    pca_path = model_dir / 'pca.pkl'
    with open(pca_path, 'wb') as f:
        pickle.dump(pca_to_save, f)
    print(f'Saved PCA to {pca_path}')

print('\nModel artifacts saved successfully!')
print(f'Best configuration: {best_model_name} with {best_dataset_name}')
print(f'ROC-AUC: {best_roc_auc:.3f}')

## Conclusion

This notebook has demonstrated:
1. ✅ Data extraction and role/champion-specific analysis
2. ✅ PCA feature reduction with role context considerations
3. ✅ Comparison of multiple classification models (Logistic Regression, Random Forest, SVM)
4. ✅ Outlier detection for identifying surprising match outcomes
5. ✅ Model persistence for deployment

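The trained model can now be used to predict win probabilities for new matches. Role context is reflected in the predictions only when the role-normalized feature set is selected as the best configuration; champion context is explored in the analysis above but is not yet an input to the trained models, even though "good stats" differ across positions and characters.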