In [1]:
import logging
import time
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import dagshub
from datetime import datetime
import warnings
warnings.filterwarnings('ignore', category=UserWarning)


# =========================== Data Preprocessing Functions ====================================

def remove_unwanted_columns(df, columns_to_remove):
    """Return a copy of ``df`` without the columns named in ``columns_to_remove``.

    Parameters:
    - df: input DataFrame (left unmodified)
    - columns_to_remove: column labels to drop

    Raises:
    - KeyError: if any requested label is not a column of ``df``
    """
    trimmed = df.drop(columns=columns_to_remove)
    return trimmed

def filter_consistent_teams(df, consistent_teams):
    """Keep only rows where both the batting and the bowling team are consistent teams.

    Parameters:
    - df: input DataFrame with 'bat_team' and 'bowl_team' columns
    - consistent_teams: collection of team names to keep

    Returns:
    - The subset of ``df`` (original index preserved) whose 'bat_team' and
      'bowl_team' values both appear in ``consistent_teams``
    """
    batting_ok = df['bat_team'].isin(consistent_teams)
    bowling_ok = df['bowl_team'].isin(consistent_teams)
    return df[batting_ok & bowling_ok]

def remove_initial_overs(df, min_overs=4.1):
    """Drop the rows recorded before ``min_overs`` overs have been bowled.

    Parameters:
    - df: input DataFrame with an 'overs' column
    - min_overs: smallest 'overs' value to keep (inclusive)

    Returns:
    - The rows of ``df`` whose 'overs' value is at least ``min_overs``
    """
    enough_overs = df['overs'].ge(min_overs)
    return df.loc[enough_overs]

def convert_date_column(df):
    """Return a copy of ``df`` whose 'date' column is parsed to datetimes.

    The column is parsed with the '%Y-%m-%d' format using the vectorized
    ``pd.to_datetime`` instead of a row-by-row ``strptime``. The input
    DataFrame is not modified, and a column that is already datetime-typed is
    passed through unchanged, so re-running the step is safe.

    Parameters:
    - df: input DataFrame with a 'date' column

    Returns:
    - New DataFrame with the same columns, 'date' converted to datetime

    Raises:
    - KeyError: if ``df`` has no 'date' column
    - ValueError: if a date string does not match '%Y-%m-%d'
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its raw strings.
    return df.assign(date=pd.to_datetime(df['date'], format='%Y-%m-%d'))

# Complete Preprocessing Pipeline
def preprocess_raw_data(df):
    """Run the full cleaning pipeline on the raw dataset.

    Steps, in order: parse the 'date' column, drop the columns that are not
    used, keep only matches between the consistent teams, and discard the
    rows from the initial overs (default threshold of remove_initial_overs).

    Parameters:
    - df: raw input DataFrame

    Returns:
    - Cleaned DataFrame ready for splitting and feature engineering
    """
    dropped_columns = ['mid', 'venue', 'batsman', 'bowler','striker', 'non-striker']

    retained_teams = [
        'Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
        'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
        'Delhi Daredevils', 'Sunrisers Hyderabad',
    ]

    return (
        convert_date_column(df)
        .pipe(remove_unwanted_columns, dropped_columns)
        .pipe(filter_consistent_teams, retained_teams)
        .pipe(remove_initial_overs)
    )


#============================== Feature Engineering Function =====================================

# Module-level StandardScaler shared by scale_numerical_features: it is fitted
# when that function is called with fit_scaler=True and only applied
# (transform) otherwise, so train and test data are scaled with the same
# fitted statistics.
scaler = StandardScaler()

# scaling function
def scale_numerical_features(df, numerical_columns, fit_scaler=False):
    """
    Scale numerical features using the module-level StandardScaler.

    The scaled values are written into a copy of ``df``; the caller's
    DataFrame is left untouched (this also avoids pandas' chained-assignment
    warnings when ``df`` is a slice of another frame).

    Parameters:
    - df: input DataFrame
    - numerical_columns: list of column names to scale
    - fit_scaler: if True, fits the shared scaler on ``df`` and transforms it;
      otherwise, transforms using the already fitted scaler. Call with
      fit_scaler=True (training data) before any call with fit_scaler=False.

    Returns:
    - New DataFrame with ``numerical_columns`` replaced by their scaled values
    """
    scaled_df = df.copy()
    if fit_scaler:
        scaled_values = scaler.fit_transform(scaled_df[numerical_columns])
    else:
        scaled_values = scaler.transform(scaled_df[numerical_columns])
    scaled_df[numerical_columns] = scaled_values
    return scaled_df

# encoding function
def encode_categorical_features(df, categorical_columns):
    """One-hot encode the given categorical columns.

    Parameters:
    - df: input DataFrame (left unmodified)
    - categorical_columns: names of the columns to expand into dummy columns

    Returns:
    - New DataFrame where each listed column is replaced by its
      '<column>_<value>' indicator columns
    """
    encoded = pd.get_dummies(df, columns=categorical_columns)
    return encoded

# Complete Feature Engineering Pipeline
def apply_feature_engineering(df, fit_scaler=False):
    """
    Run the complete feature-engineering pipeline: one-hot encode the team
    columns, then scale the numerical columns.

    Parameters:
    - df: input DataFrame
    - fit_scaler: pass True for training data (fit + transform) and False for
      test data (transform only)

    Returns:
    - Processed DataFrame
    """
    team_columns = ['bat_team', 'bowl_team']
    numeric_columns = ['overs', 'runs', 'wickets','runs_last_5', 'wickets_last_5']

    encoded = encode_categorical_features(df, team_columns)
    return scale_numerical_features(encoded, numeric_columns, fit_scaler=fit_scaler)


#================================== Model Training and Evaluation ================================

def train_model(X_train, y_train):
    """Fit a LinearRegression on the training data and return the fitted model.

    Parameters:
    - X_train: training feature matrix
    - y_train: training target values

    Returns:
    - The fitted LinearRegression estimator
    """
    # fit() returns the estimator itself, so construction and fitting can be chained.
    return LinearRegression().fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """Score the model on the test set and record the metrics in MLflow.

    Logs 'mae', 'mse' and 'r2_score' to the active MLflow run and writes a
    summary line through the logging module.

    Parameters:
    - model: fitted estimator exposing ``predict``
    - X_test: test feature matrix
    - y_test: true target values for the test set

    Returns:
    - Tuple ``(mae, mse, r2)``
    """
    predictions = model.predict(X_test)

    # Insertion order is the order in which the metrics are sent to MLflow.
    metrics = {
        "mae": mean_absolute_error(y_test, predictions),
        "mse": mean_squared_error(y_test, predictions),
        "r2_score": r2_score(y_test, predictions),
    }
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    mae, mse, r2 = metrics["mae"], metrics["mse"], metrics["r2_score"]
    logging.info(f"Model Evaluation: MAE={mae}, MSE={mse}, R2-Score={r2}")
    return mae, mse, r2


# Configure logging once, before the first logging.info(...) call. Without any
# configuration the root logger only emits WARNING and above, so the progress
# messages in this notebook would be silently dropped.
# NOTE(review): basicConfig is a no-op if the root logger already has handlers
# (e.g. installed by the environment) — confirm the messages show up.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# Load the raw data and run the preprocessing pipeline on it.
df = pd.read_csv('data.csv')
df = preprocess_raw_data(df)

# Save the preprocessed data to a new CSV file so it can be reused later,
# e.g. when tuning the best algorithm, without repeating the preprocessing.
df.to_csv("preprocessed_data.csv", index=False)
logging.info("Preprocessed data saved as 'preprocessed_data.csv' for future use.")

# Chronological split (no random split because this is a time-series-like
# use case): matches up to 2015 are used for training, 2016 onwards for testing.
train_df = df[df['date'].dt.year.le(2015)].copy()
test_df = df[df['date'].dt.year.ge(2016)].copy()

# Targets
y_train = train_df['total']
y_test = test_df['total']

# Features: drop the target and the 'date' column. 'date' is only needed for
# the split, so the right moment to drop it is after splitting and before
# feature engineering (train_df/test_df keep it for later reporting).
X_train = train_df.drop(columns=['total', 'date'])
X_test = test_df.drop(columns=['total', 'date'])

# Feature engineering is applied separately: the scaler is fitted on the
# training data only and then reused to transform the test data.
X_train = apply_feature_engineering(X_train, fit_scaler=True)

# Reindexing aligns the test columns with the training columns; dummy columns
# missing from the test set are filled with 0.
X_test = apply_feature_engineering(X_test, fit_scaler=False).reindex(
    columns=X_train.columns, fill_value=0
)


# Initialize MLflow tracking against the DagsHub-hosted server.
# NOTE(review): dagshub.init(..., mlflow=True) presumably configures the
# tracking URI/credentials as well, which would make the explicit
# set_tracking_uri call redundant — confirm before removing either line.
mlflow.set_tracking_uri('https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow')
dagshub.init(repo_owner='nayanparvez90', repo_name='Innings-Score-Predictor', mlflow=True)

# Select the experiment that the run below is recorded under.
mlflow.set_experiment("Base Regression Model")

with mlflow.start_run():
    start_time = time.time()
    logging.info("Starting MLflow run...")

    # Year ranges come from train_df/test_df, which still carry the 'date' column.
    train_years = f"{train_df['date'].dt.year.min()}-{train_df['date'].dt.year.max()}"
    test_years = f"{test_df['date'].dt.year.min()}-{test_df['date'].dt.year.max()}"

    mlflow.log_param("train_years", train_years)
    mlflow.log_param("test_years", test_years)

    mlflow.log_param("model", "Linear Regression")

    # Log the actual test fraction instead of a hand-typed approximation, so
    # the parameter stays correct when the data or the split years change.
    test_fraction = len(X_test) / (len(X_train) + len(X_test))
    mlflow.log_param("test_size", round(test_fraction, 4))

    logging.info("Training the model...")
    model = train_model(X_train, y_train)

    logging.info("Evaluating the model...")
    evaluate_model(model, X_test, y_test)

    logging.info("Logging model to MLflow...")
    mlflow.sklearn.log_model(model, "model")

    # start_time was previously captured but never used; record the elapsed
    # wall-clock time of the run as a metric.
    mlflow.log_metric("run_duration_sec", time.time() - start_time)

    logging.info("MLflow run completed.")



2025/04/11 13:13:40 INFO mlflow.tracking.fluent: Experiment with name 'Base Regression Model' does not exist. Creating a new experiment.
2025/04/11 13:13:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run enchanting-ant-938 at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/3/runs/951853f629844229aac925e41132c104.
2025/04/11 13:13:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/nayanparvez90/Innings-Score-Predictor.mlflow/#/experiments/3.


### After deleting an experiment in MLflow, an error like the following may occur:


- MlflowException: Cannot set a deleted experiment 'Hyperparameter Tuning Experiment' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.

- This `MlflowException` means that the experiment "Hyperparameter Tuning Experiment" was deleted and is still held in a *deleted* state in MLflow's backend, which prevents it from being set as the active experiment.

- Here's how to fix it if this exception occurs.


### Step 1: Check for Deleted Experiments

Use the code below to list any experiments that are currently in the deleted state:

In [2]:
# import mlflow
# from mlflow.entities import ViewType

# client = mlflow.tracking.MlflowClient()

# # Get deleted experiments.
# # NOTE: search_experiments() returns only ACTIVE experiments by default, so
# # deleted ones have to be requested explicitly through view_type.
# deleted_experiments = client.search_experiments(view_type=ViewType.DELETED_ONLY)

# # Print results in the notebook
# if deleted_experiments:
#     print("Deleted Experiments Found:")
#     for exp in deleted_experiments:
#         print(f" Name: {exp.name} | ID: {exp.experiment_id}")
# else:
#     print(" No deleted experiments found.")

If the experiment appears in the output, proceed to Step 2.
### Step 2: Restore the Deleted Experiment
If the experiment exists but is deleted, restore it:

In [3]:
# import mlflow
# client = mlflow.tracking.MlflowClient()

# experiment_id = "3"                         # Replace with the actual experiment ID from Step 1
# client.restore_experiment(experiment_id)