# Exploratory Analysis of League of Legends Match Data

This notebook explores the match data collected from the Riot Games API.

## Objectives
- Load and understand the structure of match data
- Perform basic statistical analysis
- Visualize key performance metrics
- Identify patterns in player and team performance

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import List, Dict

# Apply seaborn's white grid theme and a wide default canvas to every figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully")

## 1. Load Match Data

Load every collected match file (`match_*.json`) from the `../data/raw` directory.

In [None]:
# Set data directory path
data_dir = Path('../data/raw')

# Collect all match files. glob() yields entries in arbitrary filesystem
# order, so sort them to make the load order (and matches[0]) reproducible.
match_files = sorted(data_dir.glob('match_*.json'))
print(f"Found {len(match_files)} match files")

# Load matches into list. Decode explicitly as UTF-8: the default encoding is
# platform dependent, and player names routinely contain non-ASCII characters.
matches = [
    json.loads(match_file.read_text(encoding='utf-8'))
    for match_file in match_files
]

print(f"Loaded {len(matches)} matches")

## 2. Understand Match Structure

Examine the structure of a single match to understand the data format.

In [None]:
# Peek at the first loaded match to learn how the payload is laid out
if matches:
    sample_match = matches[0]
    print("Match keys:", sample_match.keys())

    sample_metadata = sample_match['metadata']
    print("\nMetadata keys:", sample_metadata.keys())

    sample_info = sample_match['info']
    print("\nInfo keys:", sample_info.keys())

    sample_participants = sample_info['participants']
    print("\nNumber of participants:", len(sample_participants))
    print("\nSample participant keys:", sample_participants[0].keys())

## 3. Create Player Performance DataFrame

Extract player-level statistics from all matches.

In [None]:
def extract_player_data(matches: List[Dict]) -> pd.DataFrame:
    """
    Extract player-level data from matches.

    Produces one row per participant per match. Each row carries the match
    id and match duration alongside the participant's own statistics.

    Args:
        matches: List of match dictionaries. Each must contain
            ``metadata.matchId``, ``info.gameDuration`` and
            ``info.participants`` (a list of participant dictionaries).

    Returns:
        DataFrame with player statistics. The column set and order are fixed,
        so an empty ``matches`` list yields an empty DataFrame that still has
        every expected column. ``gameDuration`` is expressed in seconds.

    Raises:
        KeyError: If a match or participant lacks one of the required fields.
    """
    # Fields copied verbatim from each participant, in output column order.
    participant_fields = (
        'puuid',
        'summonerName',
        'championName',
        'championId',
        'teamId',
        'win',
        'kills',
        'deaths',
        'assists',
        'goldEarned',
        'totalDamageDealt',
        'totalDamageDealtToChampions',
        'visionScore',
        'totalMinionsKilled',
        'neutralMinionsKilled',
        'wardsPlaced',
        'wardsKilled',
        'lane',
        'role',
    )
    columns = ['matchId', 'gameDuration', *participant_fields]

    player_data = []

    for match in matches:
        match_id = match['metadata']['matchId']
        info = match['info']
        game_duration = info['gameDuration']

        # Per the Riot MATCH-V5 docs, gameDuration is in milliseconds when the
        # response has no gameEndTimestamp (matches before patch 11.20) and in
        # seconds otherwise. Normalize to seconds so per-minute metrics are
        # comparable across all matches.
        if 'gameEndTimestamp' not in info:
            game_duration = game_duration / 1000

        for participant in info['participants']:
            row = {'matchId': match_id, 'gameDuration': game_duration}
            row.update((field, participant[field]) for field in participant_fields)
            player_data.append(row)

    # Passing columns explicitly keeps the schema stable even with zero rows;
    # otherwise downstream lookups such as df['kills'] raise KeyError.
    return pd.DataFrame(player_data, columns=columns)

# Flatten every loaded match into one row per participant, then preview it
df = extract_player_data(matches)
print(f"Created DataFrame with {len(df)} player records")
df.head(n=5)

## 4. Basic Statistics

Compute basic descriptive statistics for the dataset.

In [None]:
# Report size, completeness and column types before any analysis
print("Dataset shape:", df.shape)

print("\nMissing values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

print("\nData types:")
column_types = df.dtypes
print(column_types)

print("\nDescriptive statistics:")
df.describe()

## 5. Performance Metrics Analysis

In [None]:
# Calculate KDA: (kills + assists) per death. Deathless games divide by 1, so
# a perfect game scores kills + assists instead of infinity.
df['kda'] = (df['kills'] + df['assists']) / df['deaths'].replace(0, 1)

# Match length in minutes. A zero-length match would turn every per-minute
# rate into inf (or NaN for 0/0), and inf distorts describe() and corr()
# downstream, so treat a zero duration as missing instead.
game_minutes = (df['gameDuration'] / 60).replace(0, np.nan)

# Calculate CS (Creep Score) per minute: lane minions plus jungle monsters
df['cs'] = df['totalMinionsKilled'] + df['neutralMinionsKilled']
df['csPerMinute'] = df['cs'] / game_minutes

# Calculate damage per minute (damage dealt to champions only)
df['damagePerMinute'] = df['totalDamageDealtToChampions'] / game_minutes

print("Calculated derived metrics")
df[['kills', 'deaths', 'assists', 'kda', 'csPerMinute', 'damagePerMinute']].describe()

## 6. Win Rate Analysis

In [None]:
# Share of all player records that ended in a win
win_rate = df['win'].mean()
print(f"Overall win rate: {win_rate:.2%}")

# Games played and win rate per champion, rounded to three decimals
champion_stats = (
    df.groupby('championName')['win']
    .agg(games='count', win_rate='mean')
    .round(3)
)

# Drop champions with fewer than 5 games, then rank by win rate
has_enough_games = champion_stats['games'] >= 5
champion_stats = champion_stats.loc[has_enough_games].sort_values('win_rate', ascending=False)

print("\nTop 10 champions by win rate (min 5 games):")
print(champion_stats.head(10))

## 7. Visualization: KDA Distribution

In [None]:
# Compare KDA between wins and losses: boxplot on the left, overlaid
# histograms on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
box_ax, hist_ax = axes

# Left panel: KDA boxplot grouped by outcome, y-axis clipped to 0-10
df.boxplot(column='kda', by='win', ax=box_ax)
box_ax.set_title('KDA Distribution by Match Outcome')
box_ax.set(xlabel='Win', ylabel='KDA')
box_ax.set_ylim(0, 10)

# Right panel: one semi-transparent histogram per outcome
for outcome, outcome_label in ((True, 'Win'), (False, 'Loss')):
    outcome_kda = df.loc[df['win'] == outcome, 'kda']
    outcome_kda.hist(bins=30, alpha=0.7, label=outcome_label, ax=hist_ax)
hist_ax.set_title('KDA Histogram by Match Outcome')
hist_ax.set(xlabel='KDA', ylabel='Frequency')
hist_ax.legend()
hist_ax.set_xlim(0, 10)

plt.tight_layout()
plt.show()

## 8. Visualization: Performance Metrics Correlation

In [None]:
# Metrics to correlate; the boolean 'win' entry is swapped for a 0/1 column below
metrics = ['kills', 'deaths', 'assists', 'goldEarned', 'totalDamageDealtToChampions',
           'visionScore', 'csPerMinute', 'damagePerMinute', 'win']

# Encode the outcome as 0/1 so it takes part in the correlation
df['win_numeric'] = df['win'].astype(int)
correlation_columns = [name for name in metrics if name != 'win'] + ['win_numeric']
metrics_df = df[correlation_columns]

# Pairwise correlation between every selected metric
correlation_matrix = metrics_df.corr()

# Annotated heatmap with a diverging palette centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Correlation Matrix of Performance Metrics')
plt.tight_layout()
plt.show()

## 9. Champion Play Rate Analysis

In [None]:
# The 15 champions appearing in the most player records
champion_counts = df['championName'].value_counts().iloc[:15]

# Bar chart of pick counts, with slanted labels so champion names stay legible
plt.figure(figsize=(12, 6))
play_rate_ax = champion_counts.plot.bar()
play_rate_ax.set_title('Top 15 Most Played Champions')
play_rate_ax.set_xlabel('Champion')
play_rate_ax.set_ylabel('Number of Games')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 10. Next Steps

Based on this exploratory analysis, potential next steps include:

1. **Feature Engineering**: Create additional features based on domain knowledge
2. **Machine Learning Models**: Build predictive models for match outcomes
3. **Player Clustering**: Group players by performance characteristics
4. **Team Analysis**: Analyze team composition effects on win rate
5. **Temporal Analysis**: Examine how performance metrics change over time
6. **Advanced Visualizations**: Create interactive dashboards with Plotly or Dash

In [None]:
# Persist the enriched player table so later notebooks can skip re-extraction
output_path = Path('../data/processed')
output_path.mkdir(parents=True, exist_ok=True)

csv_path = output_path / 'player_stats.csv'
df.to_csv(csv_path, index=False)
print(f"Saved processed data to {csv_path}")