In [1]:
# selecting best ml-algorithm for our regression use-case

import mlflow
import dagshub
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import (r2_score, 
                            mean_absolute_error, 
                            mean_squared_error,
                            mean_absolute_percentage_error)
from datetime import datetime

import warnings
# Silence warnings for the whole session so the run output stays readable.
# NOTE(review): filterwarnings("ignore") with no category already covers every
# warning class, so the UserWarning-specific filter above it is redundant.
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")

# ==================================== CONFIGURE LOGGING ==============================

import sys
import logging
# Send INFO-and-above records to stdout as "<timestamp> - <level> - <message>".
# force=True makes basicConfig replace any handlers already attached to the root
# logger (presumably needed because the notebook environment installs its own).
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# =========================== CONFIGURATION =================================

# Central settings for this run: the input dataset plus the DagsHub / MLflow
# tracking targets used by the setup code and by the entry point.
CONFIG = dict(
    data_path="data.csv",
    mlflow_tracking_uri="https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow",
    dagshub_repo_owner="nayanparvez90",
    dagshub_repo_name="Innings-Score-Predictor",
    experiment_name="Best Model Selection",
)


# ========================== SETUP MLflow & DAGSHUB ==========================

# Point the MLflow client at the tracking server URI configured in CONFIG.
mlflow.set_tracking_uri(CONFIG["mlflow_tracking_uri"])
# Initialise the DagsHub integration for the configured repository with MLflow
# tracking enabled (mlflow=True).
dagshub.init(repo_owner=CONFIG["dagshub_repo_owner"], repo_name=CONFIG["dagshub_repo_name"], mlflow=True)
# Make the configured experiment the active one for every run started below.
mlflow.set_experiment(CONFIG["experiment_name"])



# ========================== Preprocessing Steps ==========================

def remove_unwanted_columns(df):
    """Return ``df`` without the match-id, venue and player-name columns.

    The columns 'mid', 'venue', 'batsman', 'bowler', 'striker' and
    'non-striker' are dropped. The input frame is not modified.

    Raises:
        Exception: whatever ``DataFrame.drop`` raises (e.g. ``KeyError`` when
            one of the columns is absent); the error is printed, then re-raised.
    """
    unwanted = ('mid', 'venue', 'batsman', 'bowler', 'striker', 'non-striker')
    try:
        trimmed = df.drop(columns=list(unwanted))
    except Exception as err:
        print(f"Error removing columns: {err}")
        raise
    return trimmed


def convert_date_column(df):
    """Return a copy of ``df`` whose 'date' column holds parsed datetimes.

    The 'date' strings are expected in ``YYYY-MM-DD`` form and are parsed in a
    single vectorized ``pd.to_datetime`` call. The caller's frame is left
    untouched, and calling the function again on an already-converted frame
    is harmless.

    Args:
        df: DataFrame with a 'date' column.

    Returns:
        A new DataFrame with the same columns, 'date' converted to datetimes.

    Raises:
        KeyError: if ``df`` has no 'date' column.
        ValueError: if a value does not match ``%Y-%m-%d`` or a date is missing.
        The error is printed before being re-raised.
    """
    try:
        parsed = pd.to_datetime(df['date'], format='%Y-%m-%d')
        # Missing dates would become NaT and later fall out of both the train
        # and the test split without any notice, so fail loudly instead.
        if parsed.isna().any():
            raise ValueError("'date' column contains missing values")
    except Exception as e:
        print(f"Error converting 'date' column: {e}")
        raise
    # assign() builds a new frame, so the input is never mutated.
    return df.assign(date=parsed)


def filter_consistent_teams(df):
    """Keep only rows where both the batting and the bowling side are in the
    fixed list of eight teams below.

    Raises:
        Exception: any error from the lookup/filter (e.g. ``KeyError`` when
            'bat_team' or 'bowl_team' is missing) is printed, then re-raised.
    """
    regular_teams = [
        'Kolkata Knight Riders',
        'Chennai Super Kings',
        'Rajasthan Royals',
        'Mumbai Indians',
        'Kings XI Punjab',
        'Royal Challengers Bangalore',
        'Delhi Daredevils',
        'Sunrisers Hyderabad',
    ]
    try:
        batting_ok = df['bat_team'].isin(regular_teams)
        bowling_ok = df['bowl_team'].isin(regular_teams)
        return df[batting_ok & bowling_ok]
    except Exception as err:
        print(f"Error filtering teams: {err}")
        raise


def remove_initial_overs(df, min_overs=4.1):
    """Keep only the rows whose 'overs' value is at least ``min_overs``.

    Args:
        df: DataFrame with an 'overs' column.
        min_overs: inclusive lower bound on 'overs' (default 4.1).

    Raises:
        Exception: any error from the comparison/filter is printed, then
            re-raised.
    """
    try:
        late_enough = df['overs'] >= min_overs
        return df[late_enough]
    except Exception as err:
        print(f"Error removing initial few overs: {err}")
        raise

# ================================== Preprocessing Pipeline =============================

def preprocess_df(df):
    """Run the preprocessing steps in order and return the resulting frame.

    Order: date conversion, removal of unwanted columns, team filtering,
    removal of the initial overs.

    Raises:
        Exception: an error from any step is printed, then re-raised.
    """
    try:
        steps = (
            convert_date_column,
            remove_unwanted_columns,
            filter_consistent_teams,
            remove_initial_overs,
        )
        for step in steps:
            df = step(df)
    except Exception as err:
        print(f"Error occurred in preprocessing step: {err}")
        raise
    return df


# ============================ Feature Engineering Steps ================================= 

# Scaler shared by every scale_numerical_features() call: it is fitted when the
# function is called with fit_scaler=True and reused as-is otherwise.
scaler = StandardScaler()


def scale_numerical_features(df, numerical_columns, fit_scaler=False):
    """Standardise ``numerical_columns`` of ``df`` with the module-level scaler.

    Note that the columns are overwritten on the frame that is passed in; the
    same object is returned.

    Args:
        df: DataFrame containing ``numerical_columns``.
        numerical_columns: names of the columns to scale.
        fit_scaler: when True the scaler is fitted on ``df`` before
            transforming; when False the existing fit is applied.

    Raises:
        Exception: any scaler/pandas error is printed, then re-raised.
    """
    try:
        rescale = scaler.fit_transform if fit_scaler else scaler.transform
        df[numerical_columns] = rescale(df[numerical_columns])
    except Exception as err:
        print(f"Error scaling numerical features: {err}")
        raise
    return df

# encoding categorical-features
def encode_categorical_features(df, categorical_columns):
    """One-hot encode ``categorical_columns`` of ``df`` via ``pd.get_dummies``.

    Returns the new frame produced by ``get_dummies``.

    Raises:
        Exception: any pandas error is printed, then re-raised.
    """
    try:
        encoded = pd.get_dummies(df, columns=categorical_columns)
    except Exception as err:
        print(f"Error encoding categorical feature: {err}")
        raise
    return encoded

# ================================== Feature Engineering Pipeline ===============================

def apply_feature_engineering(df, fit_scaler=False):
    """One-hot encode the team columns, then scale the numerical columns.

    Args:
        df: feature frame containing the team and numerical columns listed
            below.
        fit_scaler: forwarded to ``scale_numerical_features``; True fits the
            scaler on ``df``, False only applies the existing fit.

    Raises:
        Exception: an error from either step is printed, then re-raised.
    """
    categorical_columns = ['bat_team', 'bowl_team']
    numerical_columns = ['overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5']

    try:
        encoded = encode_categorical_features(df, categorical_columns)
        return scale_numerical_features(encoded, numerical_columns, fit_scaler=fit_scaler)
    except Exception as err:
        print(f"Error in feature engg. step: {err}")
        raise


# ========================== LOAD DATA & APPLYING PREPROCESSING STEPS ==========================

def load_data(filepath):
    """Read the CSV at ``filepath`` and return it after ``preprocess_df``.

    Row counts before and after preprocessing are logged, together with the
    number of rows removed and the reason.

    Args:
        filepath: path of the CSV file to load.

    Returns:
        The preprocessed DataFrame.

    Raises:
        Exception: any error from reading or preprocessing is logged once
            (with traceback) and re-raised.
    """
    try:
        df = pd.read_csv(filepath)
        rows_before = len(df)
        logging.info(f"Size of dataframe before preprocessing: {rows_before}")

        df = preprocess_df(df)
        logging.info("Applied all the necessary preprocessing steps")
        logging.info(f"Size of dataframe after preprocessing: {len(df)}")
        # Preprocessing drops rows twice: once when filtering teams and once
        # when cutting the initial overs, so name both causes.
        logging.info(
            f"Removed {rows_before - len(df)} rows: matches involving teams outside the "
            "consistent-team list and the first few overs of each innings"
        )

        return df
    except Exception:
        # One log record with traceback instead of a print plus a second,
        # traceback-less logging.error for the same failure.
        logging.exception(f"Error loading data from {filepath}")
        raise


# =================================== DEFINING Regression ALGORITHMS ================================

# Candidate regressors keyed by the name used for each MLflow child run.
# Dict order is the order in which the models are trained and evaluated.
ALGORITHMS = {
    'LinearRegression': LinearRegression(
        fit_intercept=True,
        n_jobs=5,
    ),
    'RidgeRegression': Ridge(
        alpha=1.0,
        solver='auto',
        random_state=42,
    ),
    'LassoRegression': Lasso(
        alpha=0.1,
        max_iter=1000,
        tol=0.0001,
    ),
    'RandomForest': RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_leaf=1,
        random_state=42,
    ),
    'XGBoost': XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
    ),
}



# ========================= TRAINING & MODEL EVALUATION ==========================

# Trackers for the winner of the most recent train_and_evaluate() call: its
# test-set R2 score, the fitted estimator and that estimator's parameters.
best_score = float('-inf')                                       # Initializing with a very low score
best_model = None
best_params = {}

# Conservative cap for the error text stored as an MLflow param, so a long
# exception message cannot make the error handler itself fail.
# NOTE(review): the real param-value limit depends on the MLflow version — confirm.
_MAX_ERROR_PARAM_CHARS = 250


def train_and_evaluate(df):
    """Train every model in ``ALGORITHMS`` and record the results in MLflow.

    The frame is split by season (year <= 2015 for training, year >= 2016 for
    testing), feature engineering is fitted on the training part only, and
    each algorithm is trained inside its own nested MLflow run where its
    hyperparameters, metrics and model artifact are logged. The model with the
    highest test-set R2 is then logged on the parent run and stored in the
    module-level ``best_score`` / ``best_model`` / ``best_params``.

    NOTE(review): the winner is chosen on the same test set that produces the
    reported metrics, so ``best_r2_score`` is an optimistic estimate.

    Args:
        df: preprocessed DataFrame with a datetime 'date' column, the 'total'
            target and the feature columns expected by
            ``apply_feature_engineering``.

    Raises:
        Exception: errors during splitting or feature engineering propagate.
            A failure while training or logging one algorithm is logged on
            that algorithm's run and does not stop the remaining algorithms.
    """
    global best_score, best_model, best_params                   # specifying the global variables created.

    # Reset the trackers so a previous call cannot leak its winner into this run.
    best_score = float('-inf')
    best_model = None
    best_params = {}

    logging.info("Entered training and evaluation method")

    with mlflow.start_run(run_name="Best Model selector"):

        # Split using date (no random split)
        train_df = df[df['date'].dt.year <= 2015].copy()
        test_df = df[df['date'].dt.year >= 2016].copy()

        # Separate the target; 'date' was only needed for the split, so it is
        # dropped here, before any encoding or scaling happens.
        X_train = train_df.drop(columns=['total', 'date'])
        y_train = train_df['total']

        X_test = test_df.drop(columns=['total', 'date'])
        y_test = test_df['total']
        logging.info("Splitting dataframe into train and test sets")

        # Feature Engineering applied separately to train and test
        X_train = apply_feature_engineering(X_train, fit_scaler=True)               # fits and transforms train data
        X_test = apply_feature_engineering(X_test, fit_scaler=False)                # only transforms test data

        # Align columns of test set to match train set (fixing dummy mismatch)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
        logging.info("Applied Feature Engineering separately to train and test")

        # Same example rows for every model; X_test is always a DataFrame here.
        input_example = X_test.head(5)

        for algo_name, model in ALGORITHMS.items():
            with mlflow.start_run(run_name=f"{algo_name}", nested=True):
                try:
                    # Log the algorithm name explicitly
                    mlflow.log_param("algorithm", algo_name)

                    # Train model
                    model.fit(X_train, y_train)

                    # Log model parameters
                    log_model_params(algo_name, model)

                    # Predictions
                    y_pred = model.predict(X_test)

                    # Model Evaluation
                    metrics = {
                        "r2_score": r2_score(y_test, y_pred),                    # higher is better
                        "mae": mean_absolute_error(y_test, y_pred),              # average absolute error (lower is better)
                        "mse": mean_squared_error(y_test, y_pred),               # penalizes larger errors (lower is better)
                        "mape": mean_absolute_percentage_error(y_test, y_pred),  # relative error (lower is better)
                    }

                    mlflow.log_metrics(metrics)    # Logging metrics

                    # Checks each model for the best one
                    if metrics["r2_score"] > best_score:
                        best_score = metrics["r2_score"]
                        best_model = model
                        best_params = model.get_params()       # Store the best model's params

                    # Logging model
                    mlflow.sklearn.log_model(model, "model", input_example=input_example)

                    # Printing results
                    print(f"\nAlgorithm: {algo_name}")
                    print(f"Metrics: {metrics}")

                except Exception as e:
                    # Best effort: record the failure on this child run and
                    # carry on with the remaining algorithms.
                    logging.exception(f"Error in training {algo_name}")
                    mlflow.log_param("error", str(e)[:_MAX_ERROR_PARAM_CHARS])

        # After all models have been evaluated, log the best model, its params
        # and its score on the parent run. Compare against None explicitly:
        # estimator truthiness is not reliable (ensembles define __len__).
        if best_model is not None:
            logging.info(f"Best model: {best_model.__class__.__name__} with R2 score: {best_score}")
            mlflow.log_param("best_model", best_model.__class__.__name__)
            mlflow.log_params(best_params)
            mlflow.log_metric("best_r2_score", best_score)

            # Save the best model in MLflow
            mlflow.sklearn.log_model(best_model, "best_model")
        else:
            logging.warning("No model was trained successfully; no best model logged.")

    logging.info("Training and evaluation completed.")



# ================================== LOG MODEL PARAMETERS ===========================================

def log_model_params(algo_name, model):
    """Log a hand-picked subset of ``model``'s hyperparameters to MLflow.

    Which hyperparameters are logged depends on ``algo_name``; an unknown name
    results in an empty set of params being logged.

    Args:
        algo_name: one of the keys used in ``ALGORITHMS``.
        model: the estimator whose hyperparameters are read.
    """
    # Hyperparameters read directly as attributes of the estimator.
    attribute_params = {
        'LinearRegression': ("fit_intercept",),
        'RidgeRegression': ("alpha",),
        'LassoRegression': ("alpha",),
        'RandomForest': ("n_estimators", "max_depth", "min_samples_leaf"),
    }
    # Hyperparameters read through get_params() for XGBoost.
    # NOTE(review): the original rationale was that XGBoost models do not always
    # expose parameters as direct attributes (unlike e.g. model.n_estimators on
    # RandomForestRegressor), so they are fetched from the get_params() dict —
    # confirm whether this is still needed.
    xgboost_params = ("n_estimators", "learning_rate", "max_depth")

    if algo_name == 'XGBoost':
        params_to_log = {key: model.get_params().get(key, None) for key in xgboost_params}
    else:
        params_to_log = {key: getattr(model, key) for key in attribute_params.get(algo_name, ())}

    mlflow.log_params(params_to_log)       # Logging parameters


# ========================== EXECUTION ==========================
if __name__ == "__main__":
    # Load and preprocess the dataset named in CONFIG, then train, evaluate and
    # log every candidate model. `df` stays available at module level afterwards.
    df = load_data(CONFIG["data_path"])
    train_and_evaluate(df)


2025-04-11 12:57:23,463 - INFO - Accessing as nayanparvez90


2025-04-11 12:57:33,674 - INFO - Initialized MLflow to track repo "nayanparvez90/Innings-Score-Predictor"


2025-04-11 12:57:33,679 - INFO - Repository nayanparvez90/Innings-Score-Predictor initialized!


2025/04/11 12:57:43 INFO mlflow.tracking.fluent: Experiment with name 'Best Model Selection' does not exist. Creating a new experiment.


2025-04-11 12:58:01,864 - INFO - Size of dataframe before preprocessing: 76014
2025-04-11 12:58:02,664 - INFO - Applied all the necessary preprocessing steps
2025-04-11 12:58:02,664 - INFO - Size of dataframe after preprocessing: 42812
2025-04-11 12:58:02,664 - INFO - This decrease in data size is because we've reduced data for the first few overs of each innings
2025-04-11 12:58:02,664 - INFO - Entered training and evaluation method
2025-04-11 12:58:03,399 - INFO - Splitting dataframe into train and test sets
2025-04-11 12:58:03,436 - INFO - Applied Feature Engineering separately to train and test


Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  3.40it/s]



Algorithm: LinearRegression
Metrics: {'r2_score': 0.6823818232428203, 'mae': 12.75853969035791, 'mse': 283.564297872011, 'mape': 0.08293583582718538}


2025/04/11 12:58:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run LinearRegression at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1/runs/645e4b36beef4df68a74b377a3ca9d16.
2025/04/11 12:58:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:06<00:00,  1.08it/s]



Algorithm: RidgeRegression
Metrics: {'r2_score': 0.6823613616719547, 'mae': 12.758456355064027, 'mse': 283.5825656268199, 'mape': 0.0829356029399067}


2025/04/11 12:58:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run RidgeRegression at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1/runs/be5d1ce233ec4a84a3ab9cacc8423e0d.
2025/04/11 12:58:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:07<00:00,  1.00s/it]



Algorithm: LassoRegression
Metrics: {'r2_score': 0.6837391055684274, 'mae': 12.683223474910529, 'mse': 282.3525384771161, 'mape': 0.08269908038888958}


2025/04/11 12:59:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run LassoRegression at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1/runs/029da3cf19c645aaa9279725429c74b1.
2025/04/11 12:59:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  2.35it/s]



Algorithm: RandomForest
Metrics: {'r2_score': 0.6529555305207221, 'mae': 13.354333326867867, 'mse': 309.83560929351466, 'mape': 0.08527552134076938}


2025/04/11 12:59:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1/runs/ed4e57ca982f49d2987130c426b8f71e.
2025/04/11 12:59:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1.
Downloading artifacts: 100%|██████████| 7/7 [00:07<00:00,  1.01s/it]



Algorithm: XGBoost
Metrics: {'r2_score': 0.6310387849807739, 'mae': 13.933145093933694, 'mse': 329.4025271133268, 'mape': 0.0885137345286565}


2025/04/11 13:00:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1/runs/f236b400b5db4817b9a100d70227b28a.
2025/04/11 13:00:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1.


2025-04-11 13:00:03,036 - INFO - Best model: Lasso with R2 score: 0.6837391055684274


2025/04/11 13:00:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run Best Model selector at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1/runs/4cb4d448db564261a4def8cf19e37fea.
2025/04/11 13:00:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/1.


2025-04-11 13:00:15,634 - INFO - Training and evaluation completed.
