In [1]:
import os
import logging
import pandas as pd
import mlflow
import mlflow.sklearn
import dagshub
import time
import scipy
from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Root logger: show INFO and above, each record prefixed with a timestamp
# and its severity level
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


# =====================================================================================================================

def train_and_tune_model(X_train, y_train, model_name, model, param_grid):
    """Grid-search ``model`` over ``param_grid`` and return the best candidate.

    The search uses ``cv=3``, ``scoring="r2"`` and all available cores
    (``n_jobs=-1``).

    Args:
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training target passed to ``GridSearchCV.fit``.
        model_name: Label used only in the log messages.
        model: Estimator instance to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.

    Returns:
        Tuple ``(best_estimator_, best_score_, best_params_)`` read from the
        fitted search object.
    """
    logging.info(f"Training and tuning {model_name}...")

    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=3,
        scoring="r2",
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # Pull out the winner before reporting on it
    winner = search.best_estimator_
    winner_score = search.best_score_
    winner_params = search.best_params_

    logging.info(f"Best {model_name} R² Score: {winner_score}")
    logging.info(f"Best {model_name} Parameters: {winner_params}")

    return winner, winner_score, winner_params


def evaluate_and_log_model(model, X_test, y_test, model_name):
    """Score ``model`` on the held-out data and record the metrics in MLflow.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values for ``X_test``.
        model_name: Prefix for the MLflow metric keys and the log message.

    Returns:
        Tuple ``(mae, mse, r2)`` computed from the predictions.
    """
    predictions = model.predict(X_test)

    # Same three metrics, computed in the same order as they are reported
    mae, mse, r2 = (
        metric(y_test, predictions)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    logging.info(f"{model_name} Evaluation - MAE: {mae}, MSE: {mse}, R² Score: {r2}")

    # Metrics go to the currently active MLflow run, keyed by model name
    metric_entries = (
        (f"{model_name}_mae", mae),
        (f"{model_name}_mse", mse),
        (f"{model_name}_r2_score", r2),
    )
    for key, value in metric_entries:
        mlflow.log_metric(key, value)

    return mae, mse, r2


# ========================== Defining functions for feature engineering pipeline =================================

# Single module-level scaler shared by every call below: it is fitted on the
# training frame and then reused, unchanged, for any later frame
scaler = StandardScaler()


def scale_numerical_features(df, numerical_columns, fit_scaler=False):
    """Standardise ``numerical_columns`` of ``df`` with the shared ``scaler``.

    Args:
        df: Frame whose columns are overwritten with their scaled values
            (the frame is modified in place and also returned).
        numerical_columns: Column labels to scale.
        fit_scaler: When true the scaler is (re)fitted on ``df`` before
            transforming; otherwise the previously fitted state is applied.

    Returns:
        The same ``df`` object with the selected columns scaled.
    """
    apply_scaler = scaler.fit_transform if fit_scaler else scaler.transform
    df[numerical_columns] = apply_scaler(df[numerical_columns])
    return df

# One-hot encoding of the categorical columns
def encode_categorical_features(df, categorical_columns):
    """Return a copy of ``df`` with ``categorical_columns`` one-hot encoded.

    Args:
        df: Input frame; it is not modified.
        categorical_columns: Column labels to expand into indicator columns.

    Returns:
        New frame produced by ``pd.get_dummies``: the listed columns are
        replaced by ``<column>_<value>`` indicator columns.
    """
    encoded = pd.get_dummies(df, columns=categorical_columns)
    return encoded

# Full feature-engineering pipeline: encode first, then scale
def apply_feature_engineering(df, fit_scaler=False):
    """One-hot encode the team columns and standardise the numeric columns.

    Args:
        df: Frame containing the ``bat_team``/``bowl_team`` columns and the
            numeric columns listed below.
        fit_scaler: Forwarded to ``scale_numerical_features``; pass ``True``
            for the training frame so the scaler is fitted on it.

    Returns:
        The encoded and scaled frame.
    """
    team_columns = ['bat_team', 'bowl_team']
    numeric_columns = ['overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5']

    encoded = encode_categorical_features(df, team_columns)
    return scale_numerical_features(encoded, numeric_columns, fit_scaler=fit_scaler)


#=================== Load the preprocessed data for our ease ========================
''' When the preprocessed DataFrame is saved to CSV and later reloaded with pd.read_csv(),
 the converted [date] column reverts from datetime to string unless we re-parse it.'''

# Read the cached preprocessed frame; 'date' is parsed back into datetimes
df = pd.read_csv('preprocessed_data.csv', parse_dates=['date'])

# Chronological hold-out (no random split): rows dated 2015 or earlier are
# used for training, rows dated 2016 or later for testing
row_year = df['date'].dt.year
train_df = df[row_year <= 2015].copy()
test_df = df[row_year >= 2016].copy()

# 'total' is the target; every other column starts out as a feature
y_train, y_test = train_df['total'], test_df['total']
X_train = train_df.drop(columns=['total'])
X_test = test_df.drop(columns=['total'])
logging.info("Splitting dataframe into train and test sets")

# Remove 'date' from the features before encoding (no-op if already absent)
X_train = X_train.drop(columns='date', errors='ignore')
X_test = X_test.drop(columns='date', errors='ignore')

# Train goes first so the scaler is fitted on it; test is only transformed
X_train = apply_feature_engineering(X_train, fit_scaler=True)
X_test = apply_feature_engineering(X_test, fit_scaler=False)

# The two frames are encoded independently, so their dummy columns can
# differ: force the test frame onto the training column layout, filling
# columns that exist only in train with 0 and dropping test-only columns
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
logging.info("Applied Feature Engineering separately to train and test")


# Initialize MLflow: point the client at the DagsHub-hosted tracking server
# for this repository. This must run before any experiment/run call below.
mlflow.set_tracking_uri('https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow')
# NOTE(review): dagshub.init(..., mlflow=True) presumably handles
# authentication and MLflow configuration for the repo (the recorded output
# shows "Accessing as ..." and "Initialized MLflow to track repo ...") —
# confirm against the dagshub client documentation.
dagshub.init(repo_owner='nayanparvez90', repo_name='Innings-Score-Predictor', mlflow=True)

# Select the experiment that every run started below is recorded under
# (the recorded output shows MLflow creating it when it does not exist yet)
mlflow.set_experiment("Model-Hyper-Parameter Tuning")


# Hyper-parameter grids, one per candidate estimator.
# NOTE(review): the recorded run output reports "3 fits failed out of a total
# of 12" while tuning LinearRegression; rerun the search with
# error_score='raise' to find out which parameter combination fails.
linear_regression_grid = {
    "fit_intercept": [True, False],
    "positive": [True, False],
    "n_jobs": [-1],
}

lasso_grid = {
    "alpha": [0.01, 0.1, 1.0, 10.0],
    "fit_intercept": [True, False],
    "positive": [True, False],
    "max_iter": [1000, 5000],
}

ridge_grid = {
    "alpha": [0.01, 0.1, 1.0, 10.0],
    "fit_intercept": [True, False],
    "solver": ['auto', 'svd', 'cholesky'],
    "max_iter": [1000, 5000],
}

# Candidate models mapped to (estimator, grid). All three linear models are
# tuned because their R² scores are nearly equal to each other.
models = {
    "LinearRegression": (LinearRegression(), linear_regression_grid),
    "Lasso": (Lasso(), lasso_grid),
    "Ridge": (Ridge(), ridge_grid),

    # Enable these (together with the matching imports) if algorithm
    # selection shows that they give better results.

    # "RandomForest": (RandomForestRegressor(), {
    #     "n_estimators": [50, 100, 200],
    #     "max_depth": [None, 10, 20],
    #     "min_samples_split": [2, 5, 10],
    #     "n_jobs": [-1]
    # }),

    # "XGBoost": (XGBRegressor(objective="reg:squarederror", eval_metric="rmse"), {
    #     "n_estimators": [50, 100, 200],
    #     "max_depth": [3, 6, 10],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "n_jobs": [-1]
    # })
}


# State of the best candidate seen so far. Candidates are ranked by the
# cross-validated R² returned from train_and_tune_model, not by test metrics.
best_model = None
best_score = float("-inf")
best_model_name = ""
best_model_metrics = {}
best_model_params = {}      # initialised here so the final run cannot hit a NameError

# The input example is the same for every logged model, so build it once
input_example = X_test[:5] if not scipy.sparse.issparse(X_test) else X_test[:5].toarray()

# Train, evaluate, and log each model in its own MLflow run
for model_name, (model, param_grid) in models.items():
    with mlflow.start_run(run_name=f"{model_name}_Run"):
        best_trained_model, model_score, best_params = train_and_tune_model(
            X_train, y_train, model_name, model, param_grid
        )

        mlflow.log_param("best_model", model_name)
        mlflow.log_params(best_params)                         # tuned hyper-parameters

        # Evaluate the tuned model on the held-out test set
        mae, mse, r2 = evaluate_and_log_model(best_trained_model, X_test, y_test, model_name)

        # Log the fitted model itself
        mlflow.sklearn.log_model(best_trained_model, model_name, input_example=input_example)
        logging.info(f"{model_name} logged successfully in MLflow.")

        # Optional: log feature importances for estimators that expose them.
        # This is best-effort; a failure here only produces a warning.
        if hasattr(best_trained_model, "feature_importances_"):
            try:
                feature_importances = best_trained_model.feature_importances_
                feature_names = (
                    X_train.columns
                    if hasattr(X_train, "columns")
                    else [f"f{i}" for i in range(len(feature_importances))]
                )

                importance_df = pd.DataFrame({
                    "feature": feature_names,
                    "importance": feature_importances
                }).sort_values(by="importance", ascending=False)

                # Write a temporary CSV, upload it, and always clean it up,
                # even when the upload itself fails
                importance_path = f"feature_importances_{model_name}.csv"
                importance_df.to_csv(importance_path, index=False)
                try:
                    mlflow.log_artifact(importance_path)
                finally:
                    os.remove(importance_path)

            except Exception as e:
                logging.warning(f"Could not log feature importances for {model_name}: {e}")

        # Keep the candidate with the highest cross-validated score.
        # A NaN score compares False here, so such a candidate is never selected.
        if model_score > best_score:
            best_score = model_score
            best_model = best_trained_model
            best_model_name = model_name
            best_model_metrics = {"mae": mae, "mse": mse, "r2": r2}
            best_model_params = best_params                    # kept for the final run


# Log the overall winner in a separate run. Skip it when nothing was selected
# (no candidates, or no candidate produced a comparable score) instead of
# failing on missing metrics / a None model.
if best_model is None:
    logging.warning("No model was selected; skipping the Best_Model_Run.")
else:
    with mlflow.start_run(run_name="Best_Model_Run"):
        mlflow.log_param("final_best_model", best_model_name)
        mlflow.log_params(best_model_params)                   # hyper-parameters of the winner
        mlflow.log_metric("final_best_mae", best_model_metrics["mae"])
        mlflow.log_metric("final_best_mse", best_model_metrics["mse"])
        mlflow.log_metric("final_best_r2_score", best_model_metrics["r2"])

        mlflow.sklearn.log_model(best_model, "Best_Model", input_example=input_example)

        logging.info(f"Final best model ({best_model_name}) logged successfully in MLflow.")

    logging.info("All models trained, evaluated, and logged successfully.")


2025-04-11 13:09:28,985 - INFO - Splitting dataframe into train and test sets
2025-04-11 13:09:29,045 - INFO - Applied Feature Engineering separately to train and test


2025-04-11 13:09:30,095 - INFO - Accessing as nayanparvez90


2025-04-11 13:09:31,011 - INFO - Initialized MLflow to track repo "nayanparvez90/Innings-Score-Predictor"


2025-04-11 13:09:31,023 - INFO - Repository nayanparvez90/Innings-Score-Predictor initialized!
2025/04/11 13:09:31 INFO mlflow.tracking.fluent: Experiment with name 'Model-Hyper-Parameter Tuning' does not exist. Creating a new experiment.
2025-04-11 13:09:33,508 - INFO - Training and tuning LinearRegression...
3 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dell\anaconda3\envs\ipl_project\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dell\anaconda3\envs\ipl_project\lib\site-packages\sklearn\base.py", line 

### Why the two R² scores differ (e.g. Best Ridge R² Score: 0.58 vs. Ridge Evaluation R² Score: 0.68)

- **Best R² Score (from GridSearchCV):**

  - This is the cross-validation score computed on the training data (X_train, y_train).

  - It is the average performance across multiple training/validation folds.

  - It is used to choose the best hyperparameters during tuning.

- **Evaluation R² Score:**

  - This is calculated on the test set (X_test, y_test), i.e. data the model has never seen during training or tuning.