In [4]:
import pandas as pd
import numpy as np
import joblib      #
import xgboost as xgb # Required library
from sklearn.model_selection import train_test_split        #        
from sklearn.preprocessing import OneHotEncoder             #
from sklearn.compose import ColumnTransformer               #
from sklearn.metrics import accuracy_score              #
import os

def preprocess_data(matches_path, deliveries_path):
    """
    Loads, cleans, and preprocesses IPL data for second-inning Win Probability prediction.
    """
    # 1. Load datasets
    try:
        matches = pd.read_csv("./data/matches.csv")
        deliveries = pd.read_csv("./data/deliveries.csv")
    except FileNotFoundError:
        print(f"Error: One or both files not found at the expected path: {matches_path}, {deliveries_path}")
        # Return empty dataframes to prevent downstream errors
        return pd.DataFrame(), pd.DataFrame()
    
    # 2. Standardize Column Names and Team Names
    deliveries.rename(columns={'match_id': 'id'}, inplace=True)
    deliveries.columns = deliveries.columns.str.lower()
    matches.columns = matches.columns.str.lower()
    
    team_mapping = {
        'Delhi Daredevils': 'Delhi Capitals',
        'Deccan Chargers': 'Sunrisers Hyderabad',
        'Kings XI Punjab': 'Punjab Kings',
        'Pune Warriors': 'Rising Pune Supergiants',
        'Rising Pune Supergiant': 'Rising Pune Supergiants',
        'Gujarat Lions': 'Gujarat Titans',
        'Royal Challengers Bangalore': 'Royal Challengers Bengaluru'
    }
    
    for df in [deliveries, matches]:
        for col in ['batting_team', 'bowling_team', 'winner', 'team1', 'team2']:
            if col in df.columns:
                df[col].replace(team_mapping, inplace=True)
                
    # CRITICAL: Impute missing venue before merge
    matches['venue'].fillna('Unknown Venue', inplace=True)
    
    # 3. Feature Engineering
    
    # Calculate Target Score (First Inning Total)
    first_inning_scores = deliveries[deliveries['inning'] == 1].groupby('id')['total_runs'].sum().reset_index()
    first_inning_scores.columns = ['id', 'target']
    
    # Merge all data
    df = deliveries.merge(matches, on='id')
    df = df.merge(first_inning_scores, on='id')

    # Filter for Second Innings and remove Super Overs
    df = df[df['inning'] == 2]
    df = df[df['super_over'] == 'N'].copy()
    
    # FIX: Handle missing 'winner' values (matches with no result)
    df['winner'].fillna('No Result', inplace=True) 
    df = df[df['winner'] != 'No Result'].copy()

    # Calculate Core Features
    df['current_score'] = df.groupby('id')['total_runs'].cumsum()
    df['wickets_taken'] = df.groupby('id')['is_wicket'].cumsum() 
    df['wickets_left'] = 10 - df['wickets_taken']
    df['balls_left'] = 120 - (df['over'] * 6 + df['ball'])
    
    # Calculate Run Rate Features
    balls_bowled = 120 - df['balls_left']
    df['current_run_rate'] = np.where(balls_bowled > 0, (df['current_score'] * 6) / balls_bowled, 0)
    df['required_run_rate'] = np.where(df['balls_left'] > 0, (df['target'] - df['current_score']) / (df['balls_left'] / 6), np.inf)

    # Create Target Variable (1 if chasing team wins, 0 otherwise)
    df['result'] = np.where(df['winner'] == df['batting_team'], 1, 0)
    
    # 4. Final Cleanup
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Drop rows where essential features are still NaN, and remove balls that completed the match (0 balls left)
    df.dropna(subset=['venue', 'batting_team', 'bowling_team', 'current_score', 'wickets_left', 'balls_left', 'target', 'current_run_rate', 'required_run_rate', 'result'], inplace=True)
    df = df[df['balls_left'] > 0]
    
    # 5. Select features and rename
    processed_df = df[[
        'venue', 'batting_team', 'bowling_team', 'current_score',
        'wickets_left', 'balls_left', 'target', 'current_run_rate',
        'required_run_rate', 'result'
    ]]
    
    processed_df.rename(columns={
        'target': 'runs_to_chase',
        'wickets_left': 'wickets_remaining'
    }, inplace=True)
    
    analysis_df = df[[
        'id', 'over', 'ball', 'batting_team', 'bowling_team', 
        'current_score', 'wickets_taken', 'target', 'total_runs', 'result', 'venue'
    ]].copy()
    
    analysis_df.rename(columns={'id': 'ID', 'over': 'overs', 'ball': 'ballnumber', 'total_runs': 'total_run'}, inplace=True)
    
    return processed_df, analysis_df


def train_and_save_model(processed_df, analysis_df):
    """Trains the XGBoost model and saves all necessary files."""
    
    if processed_df.empty:
        print("Training skipped: Processed DataFrame is empty.")
        return
        
    # 1. Define Features (X) and Target (Y)
    X = processed_df.drop(columns=['result'])
    Y = processed_df['result']
    
    # 2. Setup the Column Transformer (One-Hot Encoder)
    categorical_features = ['venue', 'batting_team', 'bowling_team']
    trf = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )
    
    # 3. Apply the transformation and save the fitted transformer
    X_transformed = trf.fit_transform(X)
    joblib.dump(trf, 'column_transformer.joblib')
    
    # 4. Split the data
    X_train, X_test, Y_train, Y_test = train_test_split(X_transformed, Y, test_size=0.2, random_state=42)
    
    # 5. Initialize and Train XGBoost Classifier
    # NOTE: Set objective to binary:logistic for classification
    xgb_classifier = xgb.XGBClassifier(
        n_estimators=100, 
        learning_rate=0.05, 
        max_depth=5, 
        random_state=42,
        objective='binary:logistic',
        use_label_encoder=False, 
        eval_metric='logloss'
    )
    
    print("Starting XGBoost Model Training...")
    xgb_classifier.fit(X_train, Y_train)
    print("Training Complete.")
    
    # 6. Evaluate and Save the model
    Y_pred = xgb_classifier.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    print(f"Model Accuracy on Test Set: {accuracy * 100:.2f}%")
    
    # Save the model with the correct XGBoost name
    joblib.dump(xgb_classifier, 'xgb_ipl_predictor.joblib')
    joblib.dump(analysis_df, 'analysis_data.joblib') 
    
    print("\nAll required files: xgb_ipl_predictor.joblib, column_transformer.joblib, and analysis_data.joblib have been saved.")


if __name__ == '__main__':
    # ASSUME files are in the current working directory
    matches_csv = 'matches.csv'
    deliveries_csv = 'deliveries.csv'
    
    print("--- Starting Data Preprocessing ---")
    processed_df, analysis_df = preprocess_data(matches_csv, deliveries_csv)
    print(f"Preprocessed DataFrame ready for ML (Shape: {processed_df.shape})")
    
    if not processed_df.empty:
        print("--- Starting Model Training ---")
        train_and_save_model(processed_df, analysis_df)
    else:
        print("Training cannot proceed due to empty DataFrame.")
    print("--- Training and Saving Script Finished ---")

--- Starting Data Preprocessing ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace(team_mapping, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  matches['venue'].fillna('Unknown Venue', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

Preprocessed DataFrame ready for ML (Shape: (123428, 10))
--- Starting Model Training ---
Starting XGBoost Model Training...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Complete.
Model Accuracy on Test Set: 85.19%

All required files: xgb_ipl_predictor.joblib, column_transformer.joblib, and analysis_data.joblib have been saved.
--- Training and Saving Script Finished ---


final data processing and testing 