# 01 Data Cleaning

This notebook cleans the raw IPL data.

**Steps:**
1. Load `matches.csv` and `deliveries.csv`.
2. Handle abandoned matches by dropping rows whose `result` is `no result`.
3. Standardize team names (e.g., *Delhi Daredevils* -> *Delhi Capitals*).
4. Save cleaned data to `data/processed/`.

In [None]:
import pandas as pd
import numpy as np
import os

# File locations used by this notebook: the two raw input CSVs and the
# directory that receives the cleaned output. The paths are relative, so they
# resolve against the current working directory.
MATCHES_PATH = '../data/raw/matches.csv'
DELIVERIES_PATH = '../data/raw/deliveries.csv'
PROCESSED_DIR = '../data/processed/'

# Create the output directory up front; exist_ok=True keeps re-runs from failing
# when the directory is already there.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# --- 1. Load Data ---
def _demo_matches():
    """Build the two-row placeholder matches table used when raw files are absent."""
    return pd.DataFrame({
        'id': [1, 2],
        'season': [2023] * 2,
        'city': ['Mumbai', 'Delhi'],
        'date': ['2023-04-01', '2023-04-02'],
        'team1': ['Mumbai Indians', 'Delhi Daredevils'],
        'team2': ['Chennai Super Kings', 'Royal Challengers Bangalore'],
        'toss_winner': ['Mumbai Indians', 'Delhi Daredevils'],
        'toss_decision': ['bat', 'field'],
        'result': ['normal'] * 2,
        'dl_applied': [0] * 2,
        'winner': ['Mumbai Indians', 'Royal Challengers Bangalore'],
        'win_by_runs': [10, 0],
        'win_by_wickets': [0, 5],
        'player_of_match': ['RG Sharma', 'V Kohli'],
        'venue': ['Wankhede Stadium', 'Arun Jaitley Stadium'],
    })


def _demo_deliveries():
    """Build the four-row placeholder deliveries table used when raw files are absent."""
    n_rows = 4
    zeros = [0] * n_rows        # extras / flags that are all zero in the demo rows
    blanks = [np.nan] * n_rows  # dismissal details: no wicket falls in the demo rows
    return pd.DataFrame({
        'match_id': [1, 1, 2, 2],
        'inning': [1, 2, 1, 2],
        'batting_team': ['Mumbai Indians', 'Chennai Super Kings', 'Delhi Daredevils', 'Royal Challengers Bangalore'],
        'bowling_team': ['Chennai Super Kings', 'Mumbai Indians', 'Royal Challengers Bangalore', 'Delhi Daredevils'],
        'over': [1] * n_rows,
        'ball': [1] * n_rows,
        'batsman': ['RG Sharma', 'MS Dhoni', 'DA Warner', 'V Kohli'],
        'non_striker': ['Ishan Kishan', 'RA Jadeja', 'PP Shaw', 'F du Plessis'],
        'bowler': ['DL Chahar', 'JJ Bumrah', 'Mohammed Siraj', 'A Nortje'],
        'is_super_over': zeros,
        'wide_runs': zeros,
        'bye_runs': zeros,
        'legbye_runs': zeros,
        'noball_runs': zeros,
        'penalty_runs': zeros,
        'batsman_runs': [4, 1, 0, 6],
        'extra_runs': zeros,
        'total_runs': [4, 1, 0, 6],
        'player_dismissed': blanks,
        'dismissal_kind': blanks,
        'fielder': blanks,
    })


# Read both raw CSVs; if either file is missing, fall back to the placeholder
# frames for BOTH tables so the rest of the notebook can still run end to end.
try:
    matches = pd.read_csv(MATCHES_PATH)
    deliveries = pd.read_csv(DELIVERIES_PATH)
except FileNotFoundError:
    print("❌ Raw data not found. Please place 'matches.csv' and 'deliveries.csv' in 'data/raw/'")
    print("⚠️ Creating dummy data for demonstration...")
    matches = _demo_matches()
    deliveries = _demo_deliveries()
else:
    print("✅ Data Loaded successfully")

# --- 2. Remove No Results ---
# Drop matches recorded as 'no result'. Not every dataset variant carries a
# 'result' column, so the filter only runs when the column is present.
if 'result' in matches.columns:
    # .copy() makes the filtered frame independent of the unfiltered one, so
    # later mutations of `matches` do not raise SettingWithCopyWarning.
    matches = matches[matches['result'] != 'no result'].copy()

# --- 3. Standardize Team Names ---
# Old or variant team name -> the single name kept in the cleaned data.
team_mapping = {
    'Delhi Daredevils': 'Delhi Capitals',
    # NOTE(review): Deccan Chargers and Pune Warriors are folded into other
    # team names here — confirm this merging is intended for the analysis.
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Pune Warriors': 'Rising Pune Supergiant',
    'Rising Pune Supergiants': 'Rising Pune Supergiant',
    # Simplification for continuity; technically a different franchise.
    'Gujarat Lions': 'Gujarat Titans',
    'Kings XI Punjab': 'Punjab Kings',
}

# The mapping is applied to every column of both frames (any cell equal to an
# old name is rewritten). The result is rebound instead of using inplace=True:
# an in-place replace on a frame produced by row filtering can raise
# SettingWithCopyWarning, while the returned frame is always a fresh object.
matches = matches.replace(team_mapping)
deliveries = deliveries.replace(team_mapping)

# --- 4. Date Conversion ---
# Parse the match dates with pandas when the column is available.
# NOTE(review): no explicit format is passed, so parsing relies on pandas'
# inference — confirm the raw dates all share one format.
if 'date' in matches:
    parsed_dates = pd.to_datetime(matches['date'])
    matches['date'] = parsed_dates

# --- 5. Save Processed Data ---
# Build the output paths with os.path.join so they stay valid whether or not
# PROCESSED_DIR ends with a path separator. index=False leaves the DataFrame
# row index out of the written CSVs.
matches.to_csv(os.path.join(PROCESSED_DIR, 'matches_clean.csv'), index=False)
deliveries.to_csv(os.path.join(PROCESSED_DIR, 'deliveries_clean.csv'), index=False)

print(f"✅ Processed data saved to {PROCESSED_DIR}")