# League of Legends (LoL) Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

### Load all CSV files into pandas DataFrames

In [16]:
# Load every CSV file found directly inside data_dir into a dict of DataFrames,
# keyed by the file name without its extension (e.g. "MatchTbl.csv" -> "MatchTbl").
data_dir = "data"

# os.listdir returns entries in arbitrary, platform-dependent order. Sorting makes
# the load order -- and therefore the iteration order of data_frames in later
# cells -- identical on every machine and every run.
all_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

data_frames = {}
for file in all_files:
    df_name = os.path.splitext(file)[0]
    data_frames[df_name] = pd.read_csv(os.path.join(data_dir, file))
    # Report the shape right away so a truncated or empty file is noticed early
    print(f"Loaded {file} into DataFrame '{df_name}' with shape {data_frames[df_name].shape}")

Loaded ChampionTbl.csv into DataFrame 'ChampionTbl' with shape (172, 2)
Loaded ItemTbl.csv into DataFrame 'ItemTbl' with shape (635, 2)
Loaded MatchStatsTbl.csv into DataFrame 'MatchStatsTbl' with shape (78863, 31)
Loaded MatchTbl.csv into DataFrame 'MatchTbl' with shape (35421, 5)
Loaded RankTbl.csv into DataFrame 'RankTbl' with shape (11, 2)
Loaded SummonerMatchTbl.csv into DataFrame 'SummonerMatchTbl' with shape (78863, 4)
Loaded TeamMatchTbl.csv into DataFrame 'TeamMatchTbl' with shape (35045, 24)


In [20]:
def get_dataframe(data, name):
    """Look up a DataFrame by name in a dictionary of DataFrames.

    Args:
        data: Mapping from table name to the object stored for it.
        name: Key of the table to fetch.

    Returns:
        The object stored under ``name``.

    Raises:
        ValueError: If ``name`` is not a key of ``data``, or if the value
            stored under it is ``None``.
    """
    frame = data.get(name)
    # Guard clause: a missing key and an explicit None are both "not found"
    if frame is None:
        raise ValueError(f"DataFrame '{name}' not found.")
    return frame

# Fetch the match table from the loaded frames; raises ValueError if it was not loaded
df_MatchTbl = get_dataframe(data_frames, 'MatchTbl') 
# Bare last expression so the notebook renders a preview of the first rows
df_MatchTbl.head()

Unnamed: 0,MatchId,Patch,QueueType,RankFk,GameDuration
0,EUW1_6681382047,13.22.541.9804,CLASSIC,0,1050
1,EUW1_6681412019,13.22.541.9804,CLASSIC,0,778
2,EUW1_6681445530,13.22.541.9804,ARAM,0,753
3,EUW1_6681464371,13.22.541.9804,ARAM,0,853
4,EUW1_6681718380,13.22.541.9804,ARAM,0,1226


## Exploratory Data Analysis (EDA)

Let's explore the League of Legends match data to understand patterns, distributions, and relationships.

### 1. Dataset Overview
First, let's examine the structure and basic information about each dataset.

In [None]:
# Print a per-table overview: shape, column dtypes, null counts and a 3-row preview
banner = '=' * 60

for name, df in data_frames.items():
    # Header framed by two rules, preceded by a blank line
    print("\n" + banner, f"Dataset: {name}", banner, sep="\n")
    print(f"Shape: {df.shape} (rows, columns)")

    print("\nColumn Names and Types:", df.dtypes, sep="\n")
    print("\nMissing Values:", df.isnull().sum(), sep="\n")
    print("\nFirst few rows:", df.head(3), sep="\n")

### 2. Match Statistics Analysis
Let's analyze the match statistics to understand game outcomes and performance metrics.

In [None]:
# Fetch the per-player match statistics table; later cells reuse df_MatchStats
df_MatchStats = get_dataframe(data_frames, 'MatchStatsTbl')

# Summary statistics (count, mean, std, quartiles, ...) as computed by describe()
summary = df_MatchStats.describe()
print("Match Statistics Summary:")
print(summary)

# Names of all numeric columns; later cells reuse numeric_cols for plotting
# and for the correlation matrix
numeric_frame = df_MatchStats.select_dtypes(include=[np.number])
numeric_cols = list(numeric_frame.columns)
print(f"\nNumeric columns: {numeric_cols}")

In [None]:
# Histograms for (at most) the first six numeric match statistics on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Distribution of Match Statistics', fontsize=16, fontweight='bold')

# Slicing already copes with fewer than six columns, so no length check is needed
stats_to_plot = numeric_cols[:6]

# Flatten the grid in row-major order so panel i holds the i-th statistic
panels = axes.ravel()

for ax, col in zip(panels, stats_to_plot):
    ax.hist(df_MatchStats[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(col, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Switch off any panels that did not receive a histogram
for ax in panels[len(stats_to_plot):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

### 3. Champion Analysis
Analyze champion usage and performance across matches.

In [None]:
# Fetch the champion table and print a short preview plus its size and columns
df_Champion = get_dataframe(data_frames, 'ChampionTbl')

champion_count = df_Champion.shape[0]
champion_columns = list(df_Champion.columns)

print("Champion Data Overview:")
print(df_Champion.head(10))
print(f"\nTotal Champions: {champion_count}")
print(f"\nChampion Columns: {champion_columns}")

### 4. Rank Distribution Analysis
Examine the distribution of player ranks across the dataset.

In [None]:
# Fetch the rank table and print a preview
# NOTE(review): RankTbl loads with only 11 rows and 2 columns, so it looks like a
# lookup table -- value counts on it will presumably all be 1. The real rank
# distribution probably lives in MatchTbl's RankFk column; confirm.
df_Rank = get_dataframe(data_frames, 'RankTbl')

print("Rank Data Overview:")
print(df_Rank.head())
print(f"\nTotal Rank Entries: {df_Rank.shape[0]}")

# Heuristic: any column whose name mentions "rank" or "tier" counts as rank-related.
# Later cells reuse rank_cols.
rank_cols = [
    col for col in df_Rank.columns
    if any(keyword in col.lower() for keyword in ('rank', 'tier'))
]

if rank_cols:
    print(f"\nRank-related columns: {rank_cols}")
    # Report at most three columns, ten most frequent values each
    for col in rank_cols[:3]:
        print(f"\n{col} distribution:")
        top_values = df_Rank[col].value_counts().head(10)
        print(top_values)

In [None]:
# Horizontal bar charts of the most frequent values for up to two rank columns
if not rank_cols:
    print("No rank columns found to visualize")
else:
    cols_shown = rank_cols[:2]

    fig, axes = plt.subplots(1, len(cols_shown), figsize=(14, 6))
    fig.suptitle('Rank Distribution', fontsize=16, fontweight='bold')

    # plt.subplots hands back a bare Axes for a single panel; wrap it so the
    # loop below can treat both cases alike
    axes = [axes] if len(cols_shown) == 1 else list(axes)

    for ax, col in zip(axes, cols_shown):
        rank_counts = df_Rank[col].value_counts().head(15)
        positions = range(len(rank_counts))

        ax.barh(positions, rank_counts.values, color='steelblue', edgecolor='black')
        ax.set_yticks(positions)
        ax.set_yticklabels(rank_counts.index)
        ax.set_xlabel('Count')
        ax.set_title(f'{col} Distribution', fontweight='bold')
        ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

### 5. Match Duration and Outcome Analysis
Analyze match durations and win/loss patterns.

In [None]:
# Preview the match table and discover duration- and outcome-like columns by name
print("Match Table Analysis:")
print(df_MatchTbl.head())
print(f"\nMatch Table Shape: {df_MatchTbl.shape}")
print(f"\nColumn Names: {list(df_MatchTbl.columns)}")

# Case-insensitive substring match on the column names; later cells reuse time_cols
time_cols = [
    col for col in df_MatchTbl.columns
    if any(keyword in col.lower() for keyword in ('duration', 'time', 'length'))
]
win_cols = [
    col for col in df_MatchTbl.columns
    if any(keyword in col.lower() for keyword in ('win', 'result', 'outcome'))
]

print(f"\nTime-related columns: {time_cols}")
print(f"Win-related columns: {win_cols}")

In [None]:
# Histogram and box plot of the first time-related column found in the match table
if time_cols:
    time_col = time_cols[0]
    # Drop missing values once and reuse the cleaned series for both panels
    durations = df_MatchTbl[time_col].dropna()

    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))
    fig.suptitle('Match Duration Analysis', fontsize=16, fontweight='bold')

    # Left panel: distribution of the values
    hist_ax.hist(durations, bins=50, edgecolor='black', alpha=0.7, color='teal')
    hist_ax.set_xlabel(time_col)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'Distribution of {time_col}', fontweight='bold')
    hist_ax.grid(True, alpha=0.3)

    # Right panel: box plot. Vertical is the default orientation, so the `vert`
    # keyword (being deprecated by Matplotlib in favour of `orientation`) is
    # not passed.
    box_ax.boxplot(durations)
    box_ax.set_ylabel(time_col)
    box_ax.set_title(f'Box Plot of {time_col}', fontweight='bold')
    box_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # describe() includes nulls in neither count nor moments, so the original
    # column is summarised directly
    print(f"\n{time_col} Statistics:")
    print(df_MatchTbl[time_col].describe())
else:
    print("No time-related columns found")

### 6. Item Analysis
Explore item usage patterns in matches.

In [None]:
# Fetch the item table and print a preview, its size, its columns and their dtypes
df_Item = get_dataframe(data_frames, 'ItemTbl')

item_count = df_Item.shape[0]
item_columns = list(df_Item.columns)

print("Item Data Overview:")
print(df_Item.head(15))
print(f"\nTotal Items: {item_count}")
print(f"\nItem Table Columns: {item_columns}")
print("\nItem Data Types:", df_Item.dtypes, sep="\n")

### 7. Team Performance Analysis
Analyze team-level statistics and performance.

In [None]:
# Fetch the team-level match table and print a preview, its size and its columns
df_TeamMatch = get_dataframe(data_frames, 'TeamMatchTbl')

print("Team Match Data Overview:")
print(df_TeamMatch.head())
print(f"\nTotal Team Match Records: {df_TeamMatch.shape[0]}")
print(f"\nColumns: {list(df_TeamMatch.columns)}")

# Names of the numeric columns; later cells reuse team_numeric_cols for plotting
team_numeric_frame = df_TeamMatch.select_dtypes(include=[np.number])
team_numeric_cols = list(team_numeric_frame.columns)
print(f"\nNumeric columns in Team data: {team_numeric_cols}")

In [None]:
# Histograms for (at most) the first six numeric team columns on a 2x3 grid
if not team_numeric_cols:
    print("No numeric columns found in team data")
else:
    cols_to_plot = team_numeric_cols[:6]

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle('Team Statistics Distribution', fontsize=16, fontweight='bold')

    # Flatten the grid in row-major order so panel i holds the i-th column
    panels = axes.ravel()

    for ax, col in zip(panels, cols_to_plot):
        ax.hist(df_TeamMatch[col].dropna(), bins=30, edgecolor='black', alpha=0.7, color='coral')
        ax.set_title(col, fontweight='bold')
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # Switch off any panels that did not receive a histogram
    for ax in panels[len(cols_to_plot):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

### 8. Correlation Analysis
Examine correlations between different match statistics.

In [None]:
# Correlation heatmap for (at most) the first ten numeric match statistics,
# followed by a ranking of the strongest positive pairwise correlations.
if len(numeric_cols) > 1:
    # Cap at ten columns: more than that makes the annotated heatmap hard to read.
    # Slicing already handles shorter lists.
    cols_for_corr = numeric_cols[:10]

    corr_matrix = df_MatchStats[cols_for_corr].corr()

    fig, ax = plt.subplots(figsize=(12, 10))

    # Fix the colour scale to [-1, 1] so colours are comparable between runs
    im = ax.imshow(corr_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)

    # One tick per column, labelled with the column name
    tick_positions = np.arange(len(cols_for_corr))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(cols_for_corr, rotation=45, ha='right')
    ax.set_yticklabels(cols_for_corr)

    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label('Correlation Coefficient', rotation=270, labelpad=20)

    # Annotate every cell with its coefficient (x = column index, y = row index)
    for i in range(len(cols_for_corr)):
        for j in range(len(cols_for_corr)):
            ax.text(j, i, f'{corr_matrix.iloc[i, j]:.2f}',
                    ha="center", va="center", color="black", fontsize=8)

    ax.set_title('Correlation Matrix of Match Statistics', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    print("\nHighest Correlations (excluding diagonal):")
    # The matrix is symmetric, so unstacking all off-diagonal cells would list
    # every pair twice ((A, B) and (B, A)) and halve the number of distinct pairs
    # shown. Keep only the strict upper triangle (k=1 also drops the diagonal)
    # so each pair appears exactly once.
    upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    high_corr = (
        corr_matrix.where(upper_triangle)
        .unstack()
        .dropna()  # removes the masked cells and any undefined (NaN) correlations
        .sort_values(ascending=False)
        .head(10)
    )
    print(high_corr)
else:
    print("Not enough numeric columns for correlation analysis")

### 9. Data Quality Summary
Summary of data quality issues and recommendations.

In [None]:
# Per-table data quality report (size, nulls, duplicate rows, dtype mix),
# followed by a fixed list of generic recommendations
rule = "=" * 70

print(rule)
print("DATA QUALITY SUMMARY REPORT")
print(rule)

for name, df in data_frames.items():
    print(f"\n{name}:")
    print(f"  Rows: {len(df):,}")
    print(f"  Columns: {len(df.columns)}")

    # Null count per column, and the grand total across the table
    missing = df.isnull().sum()
    total_missing = missing.sum()
    if total_missing > 0:
        print(f"  Missing Values: {total_missing} total")
        print(f"    Columns with missing data: {list(missing[missing > 0].index)}")
    else:
        print("  Missing Values: None")

    # Rows that are exact copies of an earlier row
    duplicates = df.duplicated().sum()
    print(f"  Duplicate Rows: {duplicates}")

    # How many columns there are of each dtype
    print(f"  Data Types: {df.dtypes.value_counts().to_dict()}")

print("\n" + rule)
print("RECOMMENDATIONS:")
print(rule)

recommendations = (
    "1. Handle missing values through imputation or removal",
    "2. Check for and remove duplicate records if necessary",
    "3. Verify data types are appropriate for analysis",
    "4. Consider feature engineering for better insights",
    "5. Investigate outliers in numeric columns",
)
for recommendation in recommendations:
    print(recommendation)