# 1. Data Loading and Basic EDA

This notebook focuses on loading the raw datasets, standardizing them, and performing initial Exploratory Data Analysis (EDA) as per the project requirements.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the parent of the current working directory importable so that the
# `src` package used by the imports below can be resolved. Note that this adds
# the parent folder itself (presumably the project root), not `src/`.
# The path is resolved relative to the working directory, so this assumes the
# kernel was started from the notebook's own folder — TODO confirm.
sys.path.append(os.path.abspath('../'))

from src.data_loader import load_all_csvs, standardize_datasets
from src.preprocessing import process_injuries, create_panel_base

%matplotlib inline
# Lift the column display limit so wide DataFrames are shown without
# truncated ("...") columns.
pd.set_option('display.max_columns', None)

## 1.1 Load and Standardize Data

In [None]:
# Data folders, given relative to the notebook's working directory.
RAW_DIR = "../data/raw"
INTERIM_DIR = "../data/interim"

# Load the raw tables with the project helper and feed the result directly
# into standardization. `datasets` is used below as a mapping of
# table name -> DataFrame (it is iterated with .keys() / .items()).
datasets = standardize_datasets(load_all_csvs(RAW_DIR))

print("Loaded datasets:", datasets.keys())

## 1.2 Identify Key Tables

In [None]:
# Human-readable label for each logical table this notebook works with.
_TABLE_LABELS = {
    'injuries': 'Injuries',
    'appearances': 'Appearances',
    'games': 'Games',
    'players': 'Players',
}


def _table_role(name):
    """Classify a dataset name as one of the logical tables, or return None.

    The substring rules are checked in a fixed order and the first hit wins,
    so a name containing both 'player' and 'injur' counts as an injuries
    table. Matching is case-insensitive.
    """
    key = str(name).lower()
    if 'injur' in key:
        return 'injuries'
    if 'appearan' in key:
        return 'appearances'
    if 'game' in key and 'appear' not in key:
        return 'games'
    if 'player' in key and 'appear' not in key:
        return 'players'
    return None


def _pick_table(role, names):
    """Choose one dataset name among several candidates for the same role.

    A name that equals the role exactly (ignoring case and a trailing '.csv')
    is preferred. Otherwise the last candidate is used, which is what a plain
    overwrite-in-a-loop would have selected. Returns None when there are no
    candidates.
    """
    if not names:
        return None
    for candidate in names:
        key = str(candidate).lower()
        if key.endswith('.csv'):
            key = key[:-4]
        if key == role:
            return candidate
    return names[-1]


# Group every dataset name under the role it matches. Several names can match
# the same role (any name containing 'game' or 'player' qualifies), so the
# choice is made explicitly below instead of letting later matches silently
# overwrite earlier ones.
_candidates = {role: [] for role in _TABLE_LABELS}
for name in datasets:
    role = _table_role(name)
    if role is not None:
        _candidates[role].append(name)

_selected = {}
for role, names in _candidates.items():
    chosen = _pick_table(role, names)
    _selected[role] = datasets[chosen] if chosen is not None else None
    if chosen is None:
        continue
    print(f"{_TABLE_LABELS[role]} table found: {chosen}")
    ignored = [other for other in names if other != chosen]
    if ignored:
        # Surface the ambiguity so a wrong pick is visible in the output.
        print(f"  WARNING: other datasets also matched '{role}' and were ignored: {ignored}")

# Each variable stays None when no dataset matched; later cells check for that.
injuries_df = _selected['injuries']
appearances_df = _selected['appearances']
games_df = _selected['games']
players_df = _selected['players']

## 1.3 Injury Data EDA

In [None]:
# Thresholds used in this cell.
TIME_LOSS_MIN_DAYS = 7   # an episode counts as "time-loss" from this many days out
DAYS_OUT_XMAX = 200      # right edge of the zoomed histogram (x-axis limit)

if injuries_df is not None:
    # Preprocess with the project helper (standardize dates, compute days_out).
    # The code below relies on the result having a 'days_out' column.
    injuries_clean = process_injuries(injuries_df)

    n_episodes = len(injuries_clean)
    print(f"Total Injury Episodes: {n_episodes}")

    # Time-loss injuries: episodes with at least TIME_LOSS_MIN_DAYS days out.
    tl_injuries = injuries_clean[injuries_clean['days_out'] >= TIME_LOSS_MIN_DAYS]

    if n_episodes > 0:
        # Distribution of days out. The axis limit only zooms the view; the
        # 50 bins are still computed over the full range of the data.
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.histplot(injuries_clean['days_out'], bins=50, kde=True, ax=ax)
        ax.set_title("Distribution of Days Out")
        ax.set_xlim(0, DAYS_OUT_XMAX)  # Zoom in
        plt.show()

        print(f"Time-Loss Injuries (>={TIME_LOSS_MIN_DAYS} days): {len(tl_injuries)} ({len(tl_injuries)/n_episodes:.1%})")
    else:
        # With zero episodes the share above would divide by zero and the
        # histogram would have nothing to draw, so both are skipped.
        print("No injury episodes after preprocessing; skipping distribution plot and time-loss share.")

    # Missingness (per-column count of null values)
    print("\nMissing Values:")
    print(injuries_clean.isnull().sum())

else:
    print("No injury data to analyze.")

## 1.4 Appearances & Minutes EDA

In [None]:
# This section reports only when both the appearances and the games tables
# were identified; if either is missing it prints a notice and does nothing.
if appearances_df is None or games_df is None:
    print("No appearance/game data to analyze.")
else:
    appearance_cols = appearances_df.columns

    # Date span of the games table. The 'date' column is parsed to datetimes
    # and written back onto games_df itself.
    if 'date' in games_df.columns:
        games_df['date'] = pd.to_datetime(games_df['date'])
        first_game = games_df['date'].min()
        last_game = games_df['date'].max()
        print(f"Games Date Range: {first_game} to {last_game}")

    # Histogram of minutes played per appearance row.
    if 'minutes_played' in appearance_cols:
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.histplot(appearances_df['minutes_played'], bins=30, ax=ax)
        ax.set_title("Distribution of Minutes Played per Match")
        plt.show()

    # Number of distinct player ids present in the appearances table.
    if 'player_id' in appearance_cols:
        n_players = appearances_df['player_id'].nunique()
        print(f"Unique players with minutes: {n_players}")