# 01 Data Cleaning

This notebook cleans the raw IPL data.

**Steps:**
1. Load `matches.csv` and `deliveries.csv`.
2. Handle missing values and abandoned matches.
3. Standardize team names (e.g., *Delhi Daredevils* -> *Delhi Capitals*).
4. Derive critical columns (`is_wicket`).
5. Filter valid deliveries (remove Super Overs).
6. Save cleaned data to `data/processed/`.

In [None]:
import pandas as pd
import numpy as np
import os

# Locations of the raw inputs and of the cleaned outputs. All three are
# relative paths, so they resolve against the current working directory.
PROCESSED_DIR = '../data/processed/'
MATCHES_PATH = '../data/raw/matches.csv'
DELIVERIES_PATH = '../data/raw/deliveries.csv'

# Make sure the output directory exists before anything is written to it;
# exist_ok=True keeps re-runs from failing when it is already there.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# --- 1. Load Data ---
# Both raw CSVs are required. If either one is missing, print a hint about
# where the files belong and re-raise so the run stops here instead of
# continuing without data.
try:
    matches = pd.read_csv(MATCHES_PATH)
    deliveries = pd.read_csv(DELIVERIES_PATH)
except FileNotFoundError:
    print("❌ Raw data not found. Please place 'matches.csv' and 'deliveries.csv' in 'data/raw/'")
    raise
else:
    # Reached only when both reads succeeded.
    print("✅ Data Loaded successfully")

# --- 2. Clean Matches ---
print("Cleaning Matches...")

# Harmonise key column names: 'id' becomes 'match_id' and 'Season' becomes
# 'season'. Names that are not present are left alone by rename().
matches = matches.rename(columns={'id': 'match_id', 'Season': 'season'})

# Keep only matches that produced a winner: rows with a missing 'winner'
# are dropped, and so are rows explicitly marked 'no result' when the
# 'result' column is available.
matches = matches.dropna(subset=['winner'])
if 'result' in matches.columns:
    matches = matches[matches['result'] != 'no result']

# Old team name -> name used throughout the cleaned data.
# NOTE(review): some entries fold one franchise into a differently named
# one (e.g. 'Gujarat Lions' -> 'Gujarat Titans', 'Pune Warriors' ->
# 'Rising Pune Supergiant'); confirm this merging is intended.
team_mapping = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Pune Warriors': 'Rising Pune Supergiant',
    'Rising Pune Supergiants': 'Rising Pune Supergiant',
    'Gujarat Lions': 'Gujarat Titans',
    'Kings XI Punjab': 'Punjab Kings',
}

# Apply the mapping to every column that holds a team name.
for _team_col in ('team1', 'team2', 'winner', 'toss_winner'):
    matches[_team_col] = matches[_team_col].replace(team_mapping)

# Reduce the table to the columns of interest, in this order, skipping any
# that the raw file does not provide.
cols_to_keep = [
    'match_id', 'season', 'city', 'venue', 'winner',
    'toss_winner', 'toss_decision', 'team1', 'team2', 'date',
]
matches = matches[[c for c in cols_to_keep if c in matches.columns]]

# --- 3. Clean Deliveries ---
print("Cleaning Deliveries...")

# Standardize team names with the same mapping that was applied to matches.
for _team_col in ('batting_team', 'bowling_team'):
    deliveries[_team_col] = deliveries[_team_col].replace(team_mapping)

# Remove Super Overs.
# .copy() detaches the filtered rows from the loaded frame, so the column
# assignments further down operate on an independent DataFrame.
if 'is_super_over' in deliveries.columns:
    deliveries = deliveries[deliveries['is_super_over'] == 0].copy()
elif 'inning' in deliveries.columns:
    # Fallback for files without an explicit flag: a regular match has two
    # innings, so any higher innings number is treated as a Super Over.
    # NOTE(review): assumes 'inning' is 1-based and numbers Super Over
    # innings from 3 upwards - confirm against the raw file.
    # Rows whose innings number cannot be parsed are kept, not dropped.
    _inning_no = pd.to_numeric(deliveries['inning'], errors='coerce')
    deliveries = deliveries[~(_inning_no > 2)].copy()
else:
    # Say so explicitly instead of silently skipping the step.
    print("Warning: Neither 'is_super_over' nor 'inning' found. Super Overs were not removed.")

# Derive is_wicket as an integer column.
if 'is_wicket' in deliveries.columns:
    # Column already provided: missing values become 0 and the column is
    # cast to int. Existing non-missing values are kept as they are.
    deliveries['is_wicket'] = deliveries['is_wicket'].fillna(0).astype(int)
elif 'dismissal_kind' in deliveries.columns:
    # No flag column: a delivery counts as a wicket (1) whenever
    # 'dismissal_kind' is non-null, otherwise 0.
    deliveries['is_wicket'] = deliveries['dismissal_kind'].notna().astype(int)
else:
    # Nothing to derive from: warn and fall back to "no wickets".
    print("Warning: Neither 'is_wicket' nor 'dismissal_kind' found.")
    deliveries['is_wicket'] = 0

# --- 4. Save Processed Data ---
# Write both cleaned tables into PROCESSED_DIR, without the DataFrame index.
matches_out = os.path.join(PROCESSED_DIR, 'matches_clean.csv')
deliveries_out = os.path.join(PROCESSED_DIR, 'deliveries_clean.csv')
matches.to_csv(matches_out, index=False)
deliveries.to_csv(deliveries_out, index=False)

# Report where the files went and the final (rows, columns) of each table.
print(
    f"✅ Processed data saved to {PROCESSED_DIR}",
    f"Matches: {matches.shape}",
    f"Deliveries: {deliveries.shape}",
    sep="\n",
)