In [None]:
# 03_debug_dls_comparison.ipynb

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
import json
import os
from glob import glob
import warnings
warnings.filterwarnings('ignore')

# Banner marking the start of this cell's debug output.
print("🏏 DEBUG: Real DLS vs Your Model Comparison")
print("="*60)

# Resource table used as the DLS reference in this notebook.
#   outer key : whole overs remaining (0-20)
#   inner key : wickets lost (0-9)
#   value     : resource percentage for that (overs, wickets) state
# NOTE(review): for 2-8 overs remaining the value for 1 wicket lost is larger
# than the value for 0 wickets lost, and the 1-over row is all zeros. Confirm
# these figures against the published table before trusting any comparison.
DLS_RESOURCE_TABLE = {
    20: {0: 100.0, 1: 93.4, 2: 85.1, 3: 74.9, 4: 62.7, 5: 49.0, 6: 34.9, 7: 22.6, 8: 12.2, 9: 4.2},
    19: {0: 96.4, 1: 90.3, 2: 82.4, 3: 72.8, 4: 61.1, 5: 47.9, 6: 34.2, 7: 22.2, 8: 12.0, 9: 4.1},
    18: {0: 92.3, 1: 86.7, 2: 79.1, 3: 70.1, 4: 59.1, 5: 46.4, 6: 33.2, 7: 21.6, 8: 11.7, 9: 4.0},
    17: {0: 87.5, 1: 82.4, 2: 75.1, 3: 66.9, 4: 56.7, 5: 44.7, 6: 32.0, 7: 20.8, 8: 11.3, 9: 3.9},
    16: {0: 82.1, 1: 77.8, 2: 70.7, 3: 63.2, 4: 54.0, 5: 42.7, 6: 30.6, 7: 20.0, 8: 10.8, 9: 3.7},
    15: {0: 76.1, 1: 72.6, 2: 65.8, 3: 59.0, 4: 50.8, 5: 40.2, 6: 28.9, 7: 18.9, 8: 10.2, 9: 3.5},
    14: {0: 69.6, 1: 66.9, 2: 60.5, 3: 54.4, 4: 47.4, 5: 37.6, 6: 27.0, 7: 17.7, 8: 9.6, 9: 3.3},
    13: {0: 62.7, 1: 60.7, 2: 54.8, 3: 49.4, 4: 43.6, 5: 34.6, 6: 24.9, 7: 16.4, 8: 8.9, 9: 3.0},
    12: {0: 55.6, 1: 54.1, 2: 48.8, 3: 44.2, 4: 39.4, 5: 31.4, 6: 22.6, 7: 14.9, 8: 8.1, 9: 2.8},
    11: {0: 48.3, 1: 47.4, 2: 42.6, 3: 38.6, 4: 34.8, 5: 27.8, 6: 20.1, 7: 13.3, 8: 7.2, 9: 2.5},
    10: {0: 40.9, 1: 40.5, 2: 36.4, 3: 33.0, 4: 30.0, 5: 24.1, 6: 17.5, 7: 11.6, 8: 6.3, 9: 2.2},
    9: {0: 33.6, 1: 33.6, 2: 30.3, 3: 27.5, 4: 25.2, 5: 20.3, 6: 14.8, 7: 9.8, 8: 5.3, 9: 1.8},
    8: {0: 26.5, 1: 26.8, 2: 24.3, 3: 22.2, 4: 20.5, 5: 16.6, 6: 12.1, 7: 8.0, 8: 4.3, 9: 1.5},
    7: {0: 19.8, 1: 20.3, 2: 18.6, 3: 17.1, 4: 15.9, 5: 12.9, 6: 9.4, 7: 6.2, 8: 3.4, 9: 1.2},
    6: {0: 13.6, 1: 14.2, 2: 13.1, 3: 12.2, 4: 11.4, 5: 9.3, 6: 6.8, 7: 4.5, 8: 2.4, 9: 0.8},
    5: {0: 8.1, 1: 8.8, 2: 8.2, 3: 7.7, 4: 7.3, 5: 6.0, 6: 4.4, 7: 2.9, 8: 1.6, 9: 0.5},
    4: {0: 3.8, 1: 4.3, 2: 4.1, 3: 3.9, 4: 3.7, 5: 3.1, 6: 2.3, 7: 1.5, 8: 0.8, 9: 0.3},
    3: {0: 1.4, 1: 1.7, 2: 1.7, 3: 1.6, 4: 1.6, 5: 1.3, 6: 1.0, 7: 0.7, 8: 0.4, 9: 0.1},
    2: {0: 0.3, 1: 0.4, 2: 0.4, 3: 0.4, 4: 0.4, 5: 0.3, 6: 0.3, 7: 0.2, 8: 0.1, 9: 0.0},
    1: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0},
    0: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}
}

def get_dls_resource(overs_remaining, wickets_lost):
    """Look up the resource percentage for a match state in DLS_RESOURCE_TABLE.

    Args:
        overs_remaining: Overs left in the innings. Truncated to a whole
            number with ``int()`` and clamped to the table range 0-20.
        wickets_lost: Wickets already lost. Truncated with ``int()``;
            negative values are treated as 0.

    Returns:
        The table value as a float. A side that has lost 10 or more wickets
        is all out and therefore has 0.0 resources left.

    Raises:
        ValueError / TypeError: if either argument cannot be converted by
            ``int()`` (for example NaN or ``None``).
    """
    overs_remaining = max(0, min(20, int(overs_remaining)))
    wickets_lost = int(wickets_lost)

    # The table only has columns for 0-9 wickets lost. Clamping 10 down to 9
    # would credit an all-out side with the 9-wickets-lost resources, so the
    # all-out case is answered explicitly instead.
    if wickets_lost >= 10:
        return 0.0

    # After clamping, both keys are guaranteed to exist in the table.
    return DLS_RESOURCE_TABLE[overs_remaining][max(0, wickets_lost)]

def calculate_real_dls_target(team1_score, team1_resources, team2_resources):
    """Scale the first-innings score by the ratio of the two sides' resources.

    Args:
        team1_score: Runs scored by the side batting first.
        team1_resources: Resource percentage that side had available.
        team2_resources: Resource percentage available to the chasing side.

    Returns:
        ``team1_score`` unchanged when the chasing side has at least as many
        resources as the first side; otherwise the score multiplied by
        ``team2_resources / team1_resources`` (a float, not rounded).
    """
    # Written as "not >=" on purpose: any comparison that evaluates false
    # (including one involving NaN) falls through to the scaled branch.
    needs_scaling = not (team2_resources >= team1_resources)
    if needs_scaling:
        return team1_score * (team2_resources / team1_resources)
    return team1_score

# Load model architecture (same as before)
class CricketRNN(nn.Module):
    """Unidirectional LSTM followed by a three-layer fully connected head.

    The head narrows ``hidden_size -> hidden_size // 2 -> hidden_size // 4 -> 1``
    with batch normalisation, ReLU and dropout after each of the first two
    linear layers. Attribute names (``lstm``, ``fc1``, ``bn1``, ...) are the
    keys of the state dict, so they must match the saved checkpoint.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden width; also sets the head widths.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the head, and for the LSTM when it
            has more than one layer.
    """

    def __init__(self, input_size, hidden_size=256, num_layers=3, dropout=0.3):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        half_width = hidden_size // 2
        quarter_width = hidden_size // 4
        # The LSTM is given a dropout value only when layers are stacked.
        lstm_dropout = 0 if num_layers <= 1 else dropout

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=lstm_dropout,
            batch_first=True,
            bidirectional=False,
        )

        self.fc1 = nn.Linear(hidden_size, half_width)
        self.bn1 = nn.BatchNorm1d(half_width)
        self.fc2 = nn.Linear(half_width, quarter_width)
        self.bn2 = nn.BatchNorm1d(quarter_width)
        self.fc3 = nn.Linear(quarter_width, 1)

        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the sequence through the LSTM and regress from its last step.

        ``x`` is indexed as ``[:, -1, :]`` after the batch-first LSTM, i.e.
        only the final time step of each sequence feeds the head.

        Returns:
            The head output with all size-1 dimensions removed. Note that
            ``squeeze()`` without a dim also drops the batch axis when the
            batch holds a single sequence.
        """
        sequence_out, _ = self.lstm(x)
        features = sequence_out[:, -1, :]

        # Two identical stages: linear -> batch norm -> ReLU -> dropout.
        for linear, norm in ((self.fc1, self.bn1), (self.fc2, self.bn2)):
            features = self.dropout(self.relu(norm(linear(features))))

        return self.fc3(features).squeeze()

def load_trained_model():
    """Restore the saved CricketRNN checkpoint and put it in eval mode.

    Returns:
        A ``(model, feature_mean, feature_std, device)`` tuple. The two
        feature statistics come from the checkpoint when present; otherwise
        they default to zeros and ones of length ``input_size``.

    Raises:
        FileNotFoundError: if the checkpoint file does not exist.
    """
    # Use the MPS backend when it is available, otherwise fall back to CPU.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    model_path = '../data/processed/best_rnn_model_enhanced.pth'
    checkpoint_present = os.path.exists(model_path)
    if not checkpoint_present:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    checkpoint = torch.load(model_path, map_location=device)

    # Feature count is fixed here; it has to agree with the saved weights.
    input_size = 13
    model = CricketRNN(input_size=input_size)
    model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device).eval()

    # Normalisation statistics are optional in the checkpoint.
    default_mean = torch.zeros(input_size)
    default_std = torch.ones(input_size)
    feature_mean = checkpoint.get('feature_mean', default_mean)
    feature_std = checkpoint.get('feature_std', default_std)

    print(f"✅ Model loaded successfully on {device}")
    return model, feature_mean, feature_std, device

# DEBUG: Detailed investigation
def debug_data_processing():
    """Debug why no matches are being processed"""
    
    # Check data directory
    data_dir = '../data/processed/match_sequences'
    if not os.path.exists(data_dir):
        print(f"❌ Data directory not found: {data_dir}")
        return
    
    # Load sequence files
    sequence_files = glob(f'{data_dir}/*.csv')
    print(f"📁 Found {len(sequence_files)} sequence files")
    
    # Debug: Check first few files
    print(f"\n🔍 DEBUGGING FIRST 5 FILES:")
    
    for i, seq_file in enumerate(sequence_files[:5]):
        print(f"\n--- File {i+1}: {os.path.basename(seq_file)} ---")
        
        try:
            # Try to load CSV
            df = pd.read_csv(seq_file)
            print(f"✅ CSV loaded: {df.shape}")
            print(f"   Columns: {list(df.columns)}")
            
            # Check for metadata
            meta_file = seq_file.replace('.csv', '_meta.json')
            if os.path.exists(meta_file):
                with open(meta_file, 'r') as f:
                    meta = json.load(f)
                print(f"✅ Metadata loaded: {meta}")
            else:
                print(f"❌ No metadata file: {meta_file}")
                
            # Check data quality
            print(f"   First few rows:")
            print(df.head(2))
            
            # Check specific columns we need
            required_cols = ['cumulative_runs', 'cumulative_wickets', 'overs_remaining']
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                print(f"❌ Missing columns: {missing_cols}")
            else:
                print(f"✅ Required columns present")
                
        except Exception as e:
            print(f"❌ Error processing file: {e}")
    
    print(f"\n🔍 DETAILED ANALYSIS OF FIRST VALID FILE:")
    
    # Find first valid file
    for seq_file in sequence_files:
        try:
            df = pd.read_csv(seq_file)
            meta_file = seq_file.replace('.csv', '_meta.json')
            
            if os.path.exists(meta_file):
                with open(meta_file, 'r') as f:
                    meta = json.load(f)
                
                print(f"\n📊 File: {os.path.basename(seq_file)}")
                print(f"   Shape: {df.shape}")
                print(f"   Columns: {list(df.columns)}")
                print(f"   Metadata: {meta}")
                
                # Check specific values
                if len(df) >= 10:
                    test_row = df.iloc[9]  # Over 10
                    print(f"\n   Test row (over 10):")
                    print(f"   Current runs: {test_row.get('cumulative_runs', 'MISSING')}")
                    print(f"   Current wickets: {test_row.get('cumulative_wickets', 'MISSING')}")
                    print(f"   Overs remaining: {test_row.get('overs_remaining', 'MISSING')}")
                    
                    # Try DLS calculation
                    if all(col in df.columns for col in ['cumulative_runs', 'cumulative_wickets', 'overs_remaining']):
                        current_runs = test_row['cumulative_runs']
                        current_wickets = test_row['cumulative_wickets']
                        overs_remaining = test_row['overs_remaining']
                        
                        team2_resources = get_dls_resource(overs_remaining, current_wickets)
                        final_score = meta.get('final_score', 0)
                        
                        print(f"\n   DLS calculation test:")
                        print(f"   Team 2 resources: {team2_resources}%")
                        print(f"   Final score: {final_score}")
                        
                        if final_score > 0:
                            dls_par = calculate_real_dls_target(final_score, 100.0, team2_resources)
                            dls_remaining = max(0, dls_par - current_runs)
                            actual_remaining = final_score - current_runs
                            
                            print(f"   DLS par score: {dls_par:.1f}")
                            print(f"   DLS remaining: {dls_remaining:.1f}")
                            print(f"   Actual remaining: {actual_remaining}")
                            
                            # Check if this would pass our filters
                            if 0 <= actual_remaining <= 200 and 0 <= dls_remaining <= 300:
                                print(f"   ✅ This match would be VALID for comparison")
                            else:
                                print(f"   ❌ This match would be FILTERED OUT")
                                print(f"       Actual remaining: {actual_remaining} (need 0-200)")
                                print(f"       DLS remaining: {dls_remaining} (need 0-300)")
                else:
                    print(f"   ❌ Too few rows: {len(df)} (need at least 10)")
                
                break  # Stop after first valid file
                
        except Exception as e:
            continue

# Load the model first; the data diagnostics only run once that succeeded.
# The two steps have separate handlers so that a failure inside
# debug_data_processing() is not misreported as a model-loading error.
try:
    model, feature_mean, feature_std, device = load_trained_model()
except Exception as e:
    print(f"❌ Error loading model: {e}")
else:
    try:
        # Run debugging
        debug_data_processing()
    except Exception as e:
        print(f"❌ Error during data debugging: {e}")

# Checklist printed regardless of the outcome above.
print(f"\n🔧 SUGGESTIONS TO FIX:")
print(f"1. Check if column names match expected format")
print(f"2. Verify metadata files exist and are valid")
print(f"3. Check if data values are reasonable")
print(f"4. Ensure sequence files have enough rows")
print(f"5. Verify feature count matches model expectations")

🏏 Real DLS vs Your Model Comparison
🚀 Starting comprehensive comparison with real DLS...
✅ Model loaded successfully on mps
📁 Found 2185 sequence files
🔄 Processing matches for real DLS comparison...
✅ Successfully processed 0 matches
❌ No successful predictions made
❌ Comparison failed. Please check your data and model files.

🏏 Real DLS Comparison Analysis Complete!
