In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

# Load data
deliveries = pd.read_csv('all_matches_updated.csv')
df = deliveries.copy()

# Data preparation (following your exact pattern)
df = df.rename(columns={'striker': 'batsman'})
df = df.rename(columns={'runs_off_bat': 'runs_of_bat'})
df['innings'] = df['innings'].astype(int)

# Fill missing values
df['wides'] = df['wides'].fillna(0)
df['noballs'] = df['noballs'].fillna(0)

# Calculate total runs
#df['total_runs'] = df['runs_of_bat'] + df['wides'] + df['noballs']
df['total_runs'] = df['runs_of_bat'] + df['extras']
# Create bowling-specific indicators (following your bowling code pattern)
def is_bowler_wicket(player_dismissed, dismissal_kind):
    if type(player_dismissed) == str:
        if dismissal_kind not in ['run out', 'retired hurt', 'retired out']:
            return 1
    return 0

df['isBowlerWk'] = df.apply(lambda x: is_bowler_wicket(x.get('player_dismissed'), x.get('wicket_type', '')), axis=1)
df['isDot'] = df['runs_of_bat'].apply(lambda x: 1 if x == 0 else 0)
df['isFour'] = df['runs_of_bat'].apply(lambda x: 1 if x == 4 else 0)
df['isSix'] = df['runs_of_bat'].apply(lambda x: 1 if x == 6 else 0)

def enhanced_bowler_analysis_with_overall_stats(df, venue_name, input_bowlers, threshold=80):
    """Enhanced bowler analysis with comprehensive overall statistics and abbreviations"""
    
    venue_bowlers = df[df['venue'] == venue_name]['bowler'].unique().tolist()
    
    if not venue_bowlers:
        print(f"❌ No bowlers found at venue: {venue_name}")
        return
    
    print(f"🎳 BOWLER PERFORMANCE ANALYSIS")
    print(f"📍 Venue: {venue_name}")
    print("=" * 120)
    
    # Enhanced fuzzy search for bowlers
    matched_bowlers = {}
    exact_matches = []
    
    for bowler in input_bowlers:
        matches = process.extract(bowler, venue_bowlers, limit=3)
        good_matches = [(name, score) for name, score in matches if score >= threshold]
        
        if good_matches:
            best_match = good_matches[0][0]
            exact_matches.append(best_match)
            matched_bowlers[bowler] = {
                'found': True,
                'match': best_match,
                'confidence': good_matches[0][1]
            }
        else:
            matched_bowlers[bowler] = {
                'found': False,
                'suggestions': matches[:2]
            }
    
    # Display search results
    print(f"\n🔍 BOWLER SEARCH RESULTS:")
    print("-" * 50)
    found_count = 0
    for input_bowler, result in matched_bowlers.items():
        if result['found']:
            confidence_emoji = "🎯" if result['confidence'] == 100 else "✅"
            print(f"{confidence_emoji} {input_bowler} → {result['match']} ({result['confidence']}%)")
            found_count += 1
        else:
            print(f"❌ {input_bowler} → Not found")
            suggestions = ", ".join([f"{name} ({score}%)" for name, score in result['suggestions']])
            print(f"   💡 Try: {suggestions}")
    
    if found_count == 0:
        print("\n⚠️ No bowlers found at this venue.")
        return
    
    print(f"\n📊 Found {found_count} out of {len(input_bowlers)} bowlers")
    print("=" * 120)
    
    # Analyze each found bowler with comprehensive metrics
    all_summaries = []
    
    for bowler in exact_matches:
        # Get bowler data at this venue
        venue_bowler_data = df[(df['venue'] == venue_name) & (df['bowler'] == bowler)]
        
        # Get bowler data across ALL venues for overall statistics
        all_venues_bowler_data = df[df['bowler'] == bowler]
        
        if venue_bowler_data.empty:
            continue
        
        # VENUE-SPECIFIC BOWLING CALCULATIONS
        venue_total_matches = venue_bowler_data['match_id'].nunique()
        venue_total_balls = len(venue_bowler_data)
        venue_total_overs = venue_total_balls / 6
        venue_runs_conceded = venue_bowler_data['total_runs'].sum()
        venue_wickets_taken = venue_bowler_data['isBowlerWk'].sum()
        venue_dots = venue_bowler_data['isDot'].sum()
        venue_fours_conceded = venue_bowler_data['isFour'].sum()
        venue_sixes_conceded = venue_bowler_data['isSix'].sum()
        venue_boundaries_conceded = venue_fours_conceded + venue_sixes_conceded
        
        # OVERALL (ALL VENUES) BOWLING CALCULATIONS
        overall_total_matches = all_venues_bowler_data['match_id'].nunique()
        overall_total_balls = len(all_venues_bowler_data)
        overall_total_overs = overall_total_balls / 6
        overall_runs_conceded = all_venues_bowler_data['total_runs'].sum()
        overall_wickets_taken = all_venues_bowler_data['isBowlerWk'].sum()
        overall_dots = all_venues_bowler_data['isDot'].sum()
        overall_fours_conceded = all_venues_bowler_data['isFour'].sum()
        overall_sixes_conceded = all_venues_bowler_data['isSix'].sum()
        overall_boundaries_conceded = overall_fours_conceded + overall_sixes_conceded
        
        # Calculate bowling averages and metrics
        venue_economy = venue_runs_conceded / venue_total_overs if venue_total_overs > 0 else 0
        overall_economy = overall_runs_conceded / overall_total_overs if overall_total_overs > 0 else 0
        
        venue_bowling_avg = venue_runs_conceded / venue_wickets_taken if venue_wickets_taken > 0 else 0
        overall_bowling_avg = overall_runs_conceded / overall_wickets_taken if overall_wickets_taken > 0 else 0
        
        venue_strike_rate = venue_total_balls / venue_wickets_taken if venue_wickets_taken > 0 else 0
        overall_strike_rate = overall_total_balls / overall_wickets_taken if overall_wickets_taken > 0 else 0
        
        venue_dot_percentage = (venue_dots / venue_total_balls * 100) if venue_total_balls > 0 else 0
        overall_dot_percentage = (overall_dots / overall_total_balls * 100) if overall_total_balls > 0 else 0
        
        venue_boundary_percentage = (venue_boundaries_conceded / venue_total_balls * 100) if venue_total_balls > 0 else 0
        overall_boundary_percentage = (overall_boundaries_conceded / overall_total_balls * 100) if overall_total_balls > 0 else 0
        
        # VENUE innings-wise analysis
        venue_innings_data = venue_bowler_data.groupby(['match_id', 'innings']).agg({
            'total_runs': 'sum',
            'isBowlerWk': 'sum'
        }).reset_index()
        
        venue_inn1_data = venue_innings_data[venue_innings_data['innings'] == 1]
        venue_inn1_matches = len(venue_inn1_data)
        venue_inn1_wickets = venue_inn1_data['isBowlerWk'].sum()
        venue_inn1_runs = venue_inn1_data['total_runs'].sum()
        venue_inn1_avg_wickets = venue_inn1_data['isBowlerWk'].mean() if venue_inn1_matches > 0 else 0
        
        venue_inn2_data = venue_innings_data[venue_innings_data['innings'] == 2]
        venue_inn2_matches = len(venue_inn2_data)
        venue_inn2_wickets = venue_inn2_data['isBowlerWk'].sum()
        venue_inn2_runs = venue_inn2_data['total_runs'].sum()
        venue_inn2_avg_wickets = venue_inn2_data['isBowlerWk'].mean() if venue_inn2_matches > 0 else 0
        
        # OVERALL innings-wise analysis
        overall_innings_data = all_venues_bowler_data.groupby(['match_id', 'innings']).agg({
            'total_runs': 'sum',
            'isBowlerWk': 'sum'
        }).reset_index()
        
        overall_inn1_data = overall_innings_data[overall_innings_data['innings'] == 1]
        overall_inn1_avg_wickets = overall_inn1_data['isBowlerWk'].mean() if len(overall_inn1_data) > 0 else 0
        
        overall_inn2_data = overall_innings_data[overall_innings_data['innings'] == 2]
        overall_inn2_avg_wickets = overall_inn2_data['isBowlerWk'].mean() if len(overall_inn2_data) > 0 else 0
        
        # Best bowling figures
        venue_best_figures = venue_innings_data['isBowlerWk'].max() if not venue_innings_data.empty else 0
        overall_best_figures = overall_innings_data['isBowlerWk'].max() if not overall_innings_data.empty else 0
        
        # Calculate BPD (Balls Per Dismissal) - same as strike rate for bowlers
        venue_bpd = venue_strike_rate
        overall_bpd = overall_strike_rate
        
        # Calculate BPB (Balls Per Boundary conceded)
        venue_bpb = round(venue_total_balls / venue_boundaries_conceded, 2) if venue_boundaries_conceded > 0 else 0
        overall_bpb = round(overall_total_balls / overall_boundaries_conceded, 2) if overall_boundaries_conceded > 0 else 0
        
        # Bowling milestones (3+ wickets, 4+ wickets, 5+ wickets)
        venue_three_wickets = (venue_innings_data['isBowlerWk'] >= 3).sum()
        venue_four_wickets = (venue_innings_data['isBowlerWk'] >= 4).sum()
        venue_five_wickets = (venue_innings_data['isBowlerWk'] >= 5).sum()
        
        overall_three_wickets = (overall_innings_data['isBowlerWk'] >= 3).sum()
        overall_four_wickets = (overall_innings_data['isBowlerWk'] >= 4).sum()
        overall_five_wickets = (overall_innings_data['isBowlerWk'] >= 5).sum()
        
        # Display individual bowler summary
        print(f"\n🎳 {bowler}")
        print("-" * 80)
        print(f"📈 Bowling Performance Comparison:")
        print(f"   • Overall: {overall_total_matches} Mat, {overall_wickets_taken} Wkts, Avg: {overall_bowling_avg:.1f}, Eco: {overall_economy:.1f}")
        print(f"   • At Venue: {venue_total_matches} Mat, {venue_wickets_taken} Wkts, Avg: {venue_bowling_avg:.1f}, Eco: {venue_economy:.1f}")
        print(f"   • Overall Dot%: {overall_dot_percentage:.1f} | Venue Dot%: {venue_dot_percentage:.1f}")
        print(f"   • Overall BF: {overall_best_figures} | Venue BF: {venue_best_figures}")
        print(f"   • Overall Milestones: {overall_three_wickets}|{overall_four_wickets}|{overall_five_wickets} | Venue: {venue_three_wickets}|{venue_four_wickets}|{venue_five_wickets}")
        
        # Store for comparison table with ALL requested bowling columns
        all_summaries.append({
            'Bowler': bowler,
            'O_Mat': overall_total_matches,           # Overall Matches
            'O_Wkts': overall_wickets_taken,          # Overall Wickets
            'O_Avg': round(overall_bowling_avg, 1),   # Overall Bowling Average
            'O_1st': round(overall_inn1_avg_wickets, 1),  # Overall 1st Inn Avg Wickets
            'O_2nd': round(overall_inn2_avg_wickets, 1),  # Overall 2nd Inn Avg Wickets
            'O_BPD': round(overall_bpd, 1),           # Overall BPD (Strike Rate)
            'O_Mile': f"{overall_three_wickets}|{overall_four_wickets}|{overall_five_wickets}",  # Overall Milestones
            'V_Mat': venue_total_matches,             # Venue Matches
            'V_Wkts': venue_wickets_taken,            # Venue Wickets
            'V_Avg': round(venue_bowling_avg, 1),     # Venue Bowling Average
            'V_Eco': round(venue_economy, 1),         # Venue Economy Rate
            'V_BF': venue_best_figures,               # Venue Best Figures
            'V_Dot%': round(venue_dot_percentage, 1), # Venue Dot Percentage
            'V_1st': round(venue_inn1_avg_wickets, 1), # Venue 1st Inn Avg Wickets
            'V_2nd': round(venue_inn2_avg_wickets, 1), # Venue 2nd Inn Avg Wickets
            'V_BPD': round(venue_bpd, 1),             # Venue BPD (Strike Rate)
            'V_BPB': venue_bpb,                       # Venue BPB (Balls Per Boundary)
            'V_Mile': f"{venue_three_wickets}|{venue_four_wickets}|{venue_five_wickets}"  # Venue Milestones
        })
    
    # Display comparison table with abbreviated column names
    if len(all_summaries) > 1:
        comparison_df = pd.DataFrame(all_summaries)
        comparison_df = comparison_df.sort_values('V_Wkts', ascending=False)
        comparison_df.index = range(1, len(comparison_df) + 1)
        
        print(f"\n📋 COMPREHENSIVE BOWLER COMPARISON AT {venue_name.upper()}")
        print("=" * 120)
        print("Legend: O_ = Overall (All Venues), V_ = Venue Specific")
        print("Mat=Matches, Wkts=Wickets, Avg=Bowling Average, Eco=Economy, BF=Best Figures, Dot%=Dot Ball %")
        print("BPD=Balls Per Dismissal, BPB=Balls Per Boundary, Mile=Milestones (3+|4+|5+ wickets)")
        print("-" * 120)
        
        from IPython.display import display
        display(comparison_df)
        
        # Enhanced insights
        best_overall_avg = comparison_df.loc[comparison_df['O_Avg'].idxmin()] if comparison_df['O_Avg'].max() > 0 else None
        best_venue_avg = comparison_df.loc[comparison_df['V_Avg'].idxmin()] if comparison_df['V_Avg'].max() > 0 else None
        most_overall_wickets = comparison_df.loc[comparison_df['O_Wkts'].idxmax()]
        most_venue_wickets = comparison_df.loc[comparison_df['V_Wkts'].idxmax()]
        best_economy = comparison_df.loc[comparison_df['V_Eco'].idxmin()] if comparison_df['V_Eco'].max() > 0 else None
        
        print(f"\n💡 QUICK INSIGHTS:")
        if best_overall_avg is not None:
            print(f"🌟 Best Overall Average: {best_overall_avg['Bowler']} ({best_overall_avg['O_Avg']})")
        if best_venue_avg is not None:
            print(f"🏟️ Best Venue Average: {best_venue_avg['Bowler']} ({best_venue_avg['V_Avg']})")
        print(f"🏆 Most Overall Wickets: {most_overall_wickets['Bowler']} ({most_overall_wickets['O_Wkts']} wickets)")
        print(f"🎯 Most Venue Wickets: {most_venue_wickets['Bowler']} ({most_venue_wickets['V_Wkts']} wickets)")
        if best_economy is not None:
            print(f"💰 Best Economy Rate: {best_economy['Bowler']} ({best_economy['V_Eco']})")

def quick_bowler_search(df, venue_name, max_results=10):
    """Quick function to see top bowlers at a venue"""
    
    venue_data = df[df['venue'] == venue_name]
    if venue_data.empty:
        print(f"❌ No data for venue: {venue_name}")
        return
    
    # Get top bowling performers
    bowler_stats = venue_data.groupby('bowler').agg({
        'isBowlerWk': 'sum',
        'match_id': 'nunique',
        'total_runs': 'sum'
    }).rename(columns={'isBowlerWk': 'total_wickets', 'match_id': 'matches', 'total_runs': 'runs_conceded'})
    
    bowler_stats = bowler_stats[bowler_stats['matches'] >= 2]  # Minimum 2 matches
    bowler_stats['bowling_avg'] = bowler_stats['runs_conceded'] / bowler_stats['total_wickets']
    bowler_stats['bowling_avg'] = bowler_stats['bowling_avg'].fillna(0)
    bowler_stats = bowler_stats.sort_values('total_wickets', ascending=False).head(max_results)
    
    print(f"🎳 TOP BOWLERS AT {venue_name.upper()}")
    print("=" * 70)
    for i, (bowler, stats) in enumerate(bowler_stats.iterrows(), 1):
        avg_display = f"({stats['bowling_avg']:.1f} avg)" if stats['total_wickets'] > 0 else "(No wickets)"
        print(f"{i:2d}. {bowler:<25} {stats['total_wickets']:>3} wickets {avg_display}")

# Example usage with comprehensive bowling statistics
print("🎳 ENHANCED CRICKET BOWLER ANALYSIS")
print("=" * 50)

# Show top bowlers first
quick_bowler_search(df,"Edgbaston, Birmingham")

print("\n" + "="*120)

# Analyze specific bowlers with comprehensive stats
test_bowlers = [    "Caleb Jewell", "AHT Donald", "DL Lloyd", "WL Madsen", "SR Patel", 
        "MK Andersson", "RA Whiteley", "Mohammad Ghazanfar", "Nick Potts", "PR Brown","RS Patel","RH Patel", 
    "SG Budinger", "LPJ Kimber", "Shan Masood", "LJ Hill",
    "OB Cox", "LV van Beek", "L Trevaskis", "TAR Scriven", "MET Salisbury", "RI Walker"]

enhanced_bowler_analysis_with_overall_stats(df,"Edgbaston, Birmingham",test_bowlers)


🎳 ENHANCED CRICKET BOWLER ANALYSIS
🎳 TOP BOWLERS AT EDGBASTON, BIRMINGHAM
 1. DR Briggs                 45.0 wickets (15.7 avg)
 2. JB Lintott                39.0 wickets (18.8 avg)
 3. DR Mousley                27.0 wickets (16.0 avg)
 4. CN Miles                  20.0 wickets (26.3 avg)
 5. CR Brathwaite             16.0 wickets (20.2 avg)
 6. HJH Brookes               15.0 wickets (17.5 avg)
 7. MM Ali                    13.0 wickets (17.5 avg)
 8. WMH Rhodes                9.0 wickets (10.7 avg)
 9. BA Raine                  9.0 wickets (13.8 avg)
10. GHS Garton                9.0 wickets (22.1 avg)

🎳 BOWLER PERFORMANCE ANALYSIS
📍 Venue: Edgbaston, Birmingham

🔍 BOWLER SEARCH RESULTS:
--------------------------------------------------
❌ Caleb Jewell → Not found
   💡 Try: LWP Wells (57%), LC Norwell (55%)
❌ AHT Donald → Not found
   💡 Try: LA Dawson (53%), AT Thomson (50%)
❌ DL Lloyd → Not found
   💡 Try: DJ Willey (59%), L Wood (57%)
❌ WL Madsen → Not found
   💡 Try: JL Denly (59%

Unnamed: 0,Bowler,O_Mat,O_Wkts,O_Avg,O_1st,O_2nd,O_BPD,O_Mile,V_Mat,V_Wkts,V_Avg,V_Eco,V_BF,V_Dot%,V_1st,V_2nd,V_BPD,V_BPB,V_Mile
1,SR Patel,145,135,28.9,0.9,0.9,23.2,9|1|0,5,6,17.8,7.0,2,41.3,1.3,1.0,15.3,8.36,0|0|0
2,SR Patel,145,135,28.9,0.9,0.9,23.2,9|1|0,5,6,17.8,7.0,2,41.3,1.3,1.0,15.3,8.36,0|0|0
3,SR Patel,145,135,28.9,0.9,0.9,23.2,9|1|0,5,6,17.8,7.0,2,41.3,1.3,1.0,15.3,8.36,0|0|0
4,PR Brown,86,119,23.3,1.2,1.6,15.5,16|3|0,4,5,37.2,11.2,2,28.0,1.2,0.0,20.0,3.7,0|0|0
5,L Trevaskis,64,59,29.4,1.0,0.9,22.9,2|1|0,3,5,19.4,8.7,2,43.3,2.0,1.5,13.4,4.79,0|0|0
6,LJ Fletcher,71,81,26.3,1.3,0.9,18.0,9|2|2,2,2,12.5,5.0,2,30.0,2.0,0.0,15.0,0.0,0|0|0
7,LPJ Kimber,10,4,40.0,0.1,1.0,25.8,0|0|0,2,1,45.0,8.7,1,45.2,0.0,1.0,31.0,7.75,0|0|0
8,LV van Beek,28,31,25.3,0.9,1.6,17.3,4|1|0,1,1,33.0,9.9,1,60.0,1.0,0.0,20.0,3.33,0|0|0



💡 QUICK INSIGHTS:
🌟 Best Overall Average: PR Brown (23.3)
🏟️ Best Venue Average: LJ Fletcher (12.5)
🏆 Most Overall Wickets: SR Patel (135 wickets)
🎯 Most Venue Wickets: SR Patel (6 wickets)
💰 Best Economy Rate: LJ Fletcher (5.0)
