# 02 Feature Engineering

This notebook generates advanced metrics for analysis.

**Features Created:**
- **Player Level**: Runs, Strike Rate, *Consistency Index* (Wickets and Economy are planned but not yet computed in this notebook).
- **Match Level**: *Match Intensity Index* (Engagement Proxy).
- **Team Level**: *Team Efficiency* (Wins/Runs).
- **Venue Level**: Home Advantage Flag.

In [None]:
import pandas as pd
import numpy as np

# Folder containing the cleaned CSV inputs; outputs are written here too.
# The trailing slash matters because file names are appended directly.
PROCESSED_DIR = '../data/processed/'

# --- 1. Load Clean Data ---
# Ball-by-ball records and one-row-per-match records.
deliveries = pd.read_csv(f'{PROCESSED_DIR}deliveries_clean.csv')
matches = pd.read_csv(f'{PROCESSED_DIR}matches_clean.csv')

# Attach the match-level columns to every delivery
# (inner join on deliveries.match_id == matches.id).
ipl = pd.merge(deliveries, matches, left_on='match_id', right_on='id')

# --- 2. Player Level Metrics ---
# A. Batting Stats
# One row per batsman: total runs, number of delivery rows, distinct matches.
# NOTE(review): 'ball' is counted for every delivery row attributed to the
# batsman; if the cleaned data still contains wides, they inflate balls_faced
# and lower strike_rate — confirm against the cleaning notebook.
batsman_stats = ipl.groupby('batsman').agg({
    'batsman_runs': 'sum',
    'ball': 'count',
    'match_id': 'nunique'
}).rename(columns={'match_id': 'matches_played', 'ball': 'balls_faced'})

batsman_stats['strike_rate'] = (batsman_stats['batsman_runs'] / batsman_stats['balls_faced']) * 100
batsman_stats['runs_per_match'] = batsman_stats['batsman_runs'] / batsman_stats['matches_played']

# B. Consistency Index (mean / std of per-match runs, i.e. the inverse of the
# coefficient of variation — higher means more consistent).
runs_by_match = ipl.groupby(['batsman', 'match_id'])['batsman_runs'].sum()
batsman_stats['run_std'] = runs_by_match.groupby('batsman').std()

# A batsman with a single match has NaN std; one who scored the same total in
# every match has std == 0. Dividing by zero would write `inf` into the CSV,
# so treat a zero spread as "undefined" (NaN) instead.
positive_std = batsman_stats['run_std'].where(batsman_stats['run_std'] > 0)
batsman_stats['consistency_index'] = batsman_stats['runs_per_match'] / positive_std

batsman_stats.reset_index(inplace=True)

# --- 3. Match Intensity (Engagement Proxy) ---
# Formula: Total Runs + (Wickets * 20) + (Boundaries * 5)
# Per-match totals: runs scored and number of non-null dismissals.
match_agg = ipl.groupby('match_id').agg(
    total_runs=('total_runs', 'sum'),
    player_dismissed=('player_dismissed', 'count'),
)

# Boundaries are deliveries on which the batsman scored exactly 4 or 6.
is_boundary = ipl['batsman_runs'].isin([4, 6])
boundaries = ipl.loc[is_boundary].groupby('match_id').size()

# Matches without any boundary get NaN from the index alignment; make them 0.
match_agg = match_agg.assign(boundaries=boundaries).fillna(0)

wicket_points = match_agg['player_dismissed'] * 20
boundary_points = match_agg['boundaries'] * 5
match_agg['match_intensity'] = match_agg['total_runs'] + wicket_points + boundary_points

# Attach the score to the match table.
# NOTE(review): this is an inner join, so any match with no rows in `ipl`
# is dropped from `matches` here — confirm that is intended.
matches = pd.merge(matches, match_agg[['match_intensity']], left_on='id', right_index=True)

# --- 4. Team Efficiency (Spend Proxy) ---
# efficiency_score = wins / total runs scored while batting.

# Wins per team. Naming the axis explicitly avoids relying on the column
# names produced by value_counts().reset_index(), which differ across pandas
# versions. Matches with a missing winner are ignored by value_counts().
team_wins = matches['winner'].value_counts().rename_axis('team').reset_index(name='wins')

team_runs = ipl.groupby('batting_team')['total_runs'].sum().reset_index()
team_runs.columns = ['team', 'total_runs_scored']

# Teams that batted but never won do not appear in `team_wins`; an inner merge
# would silently drop them. Add them with 0 wins (appended after the existing
# rows so the original ordering is preserved).
winless = team_runs.loc[~team_runs['team'].isin(team_wins['team']), ['team']].assign(wins=0)
if not winless.empty:
    team_wins = pd.concat([team_wins, winless], ignore_index=True)

team_efficiency = team_wins.merge(team_runs, on='team')

# Guard against division by zero: a team with no runs has an undefined score.
runs_scored = team_efficiency['total_runs_scored']
team_efficiency['efficiency_score'] = team_efficiency['wins'] / runs_scored.where(runs_scored != 0)

# --- 5. Venue Advantage ---
# A proper check needs a team -> home-city mapping, which we do not have yet.
# Until then, approximate: flag team1 as the home side when the match city appears in team1's name.

def is_home(row) -> int:
    """Return 1 if the match city appears in team1's name, else 0.

    This is a naive home-ground heuristic, e.g. 'Mumbai' in 'Mumbai Indians'.
    The comparison is a case-sensitive substring test.

    Args:
        row: A mapping-like object (such as a DataFrame row) with 'city' and
            'team1' entries.

    Returns:
        1 when the city, stripped of surrounding whitespace, is a non-empty
        substring of team1. 0 otherwise, including when either value is
        missing or not a string.
    """
    city, team = row['city'], row['team1']

    # Missing values (None / NaN) and any other non-string data cannot match;
    # checking the type also avoids a TypeError from the `in` test below.
    if not isinstance(city, str) or not isinstance(team, str):
        return 0

    # An empty string is a substring of every string, so a blank city would
    # otherwise flag every team as playing at home.
    city = city.strip()
    if not city:
        return 0

    return 1 if city in team else 0

# Flag, per match, whether team1 looks like the home side (see is_home).
matches['team1_home_advantage'] = matches.apply(is_home, axis=1)

# --- 6. Save Enriched Data ---

# We save three enriched datasets:
# 1. Player Stats (for Player Explorer)
# 2. Match Enriched (for Team/Venue analysis)
# 3. Team Efficiency

batsman_stats.to_csv(f'{PROCESSED_DIR}player_stats.csv', index=False)
matches.to_csv(f'{PROCESSED_DIR}matches_enriched.csv', index=False)
team_efficiency.to_csv(f'{PROCESSED_DIR}team_efficiency.csv', index=False)

# Also save the full merged ball-by-ball with key metrics for SQL
ipl_enriched = ipl.merge(match_agg[['match_intensity']], left_on='match_id', right_index=True)
ipl_enriched.to_csv(f'{PROCESSED_DIR}ipl_ball_by_ball_enriched.csv', index=False)

# The original literal was a mis-decoded UTF-8 check mark ("âœ…"); restored.
print("✅ Feature Engineering Complete. Files saved to processed/.")