In [1]:
!pip install catboost --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

### Importing Necessary Libraries

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

### Reading the Datasets

In [3]:
# Load the dataset
train_data_path = 'train_data_with_samplefeatures.csv'
# Load additional datasets
batsman_data_path = '/content/batsman_level_scorecard.csv'
bowler_data_path = '/content/bowler_level_scorecard.csv'
match_data_path = '/content/match_level_scorecard.csv'
test_data_path = '/content/r2_data_with_samplefeatures.csv'

match_lvl_data = pd.read_csv(match_data_path)
batsman_lvl_data = pd.read_csv(batsman_data_path)
bowler_lvl_data = pd.read_csv(bowler_data_path)
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

## Data Pre processing and features Creation

In [4]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

# If team1 won the toss and chose to bat or team2 won the toss and chose to bowl, the feature takes the value 1, else 2.
match_lvl_data['team1_bat_inning'] = np.where( ((match_lvl_data['team1']==match_lvl_data['toss winner'])&(match_lvl_data['toss decision']=='bat'))|\
                                               ((match_lvl_data['team2']==match_lvl_data['toss winner'])&(match_lvl_data['toss decision']=='field')) , 1, 2)



In [5]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.

    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}

    Output-None

    Returns- dataframe having bowling/batting stats from last n games of a player before an input date.
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)


### Features Name: team_count_100runs_last15

In [8]:
def no100sLastn(player_list, date, n):
    '''
    Function to get total number of 100s scored by players in the roster of a team in last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.

    Output-None

    Returns- int value denoting sum of 100s scored by all players in the roster.
    '''

    # Split string of ':' separated player ids into a list of ids
    player_list = str(player_list).split(':')
    res_list = [] # Initialize an empty list to store the count of 100s for each player

    # Loop over each player_id in the roster
    for player in player_list:
        # Get batting stats from last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        # Create a binary indicator to denote whether the player scored a 100 in the game (runs>=100)
        df_rel['gte_100runs'] = np.where(df_rel['runs']>=100, 1, 0)
        # Sum up number of 100s for the player and append to the list
        res_list.append(np.nansum(df_rel['gte_100runs']))

    # Sum up values of the list which is the total number of 100s by all players in the roster
    return np.nansum(res_list)

# Computing number of 100 runs in last 15 games for team1 for the train dataset
train_data['team1_count_100runs_last15'] = train_data.progress_apply(lambda x: \
            no100sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Computing number of 100 runs in last 15 games for team2 for the train dataset
train_data['team2_count_100runs_last15'] = train_data.progress_apply(lambda x: \
            no100sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Taking ratio of (number of 100 runs in last 15 games for team1) to (number of 100 runs in last 15 games for team2)
# Adding 1 to the denominator to handle divide by zero exceptions
train_data['team_count_100runs_last15'] = (train_data['team1_count_100runs_last15']+1) / \
                                          (train_data['team2_count_100runs_last15']+1)

# Dropping intermediate columns as they are no longer needed
train_data.drop(columns=['team1_count_100runs_last15','team2_count_100runs_last15'], inplace=True)

# Computing number of 100 runs in last 15 games for team1 for the test dataset
test_data['team1_count_100runs_last15'] = test_data.progress_apply(lambda x: \
            no100sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Computing number of 100 runs in last 15 games for team2 for the test dataset
test_data['team2_count_100runs_last15'] = test_data.progress_apply(lambda x: \
            no100sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Taking ratio of (number of 100 runs in last 15 games for team1) to (number of 100 runs in last 15 games for team2)
# Adding 1 to the denominator to handle divide by zero exceptions
test_data['team_count_100runs_last15'] = (test_data['team1_count_100runs_last15']) / \
                                         (test_data['team2_count_100runs_last15']+1)

# Dropping intermediate columns as they are no longer needed
test_data.drop(columns=['team1_count_100runs_last15','team2_count_100runs_last15'], inplace=True)


100%|██████████| 948/948 [00:34<00:00, 27.60it/s]
100%|██████████| 948/948 [00:35<00:00, 26.97it/s]
100%|██████████| 207/207 [00:08<00:00, 23.07it/s]
100%|██████████| 207/207 [00:08<00:00, 24.77it/s]


### Feature Name: team1_avg_per_total_bat_SR_last15

In [9]:
def team_bat_strike_rate(player_list, date, n):
    '''
    Function to get average strike rate of team in the roster of a team in last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.

    Output-None

    Returns- float value denoting average strike rate of all players in the roster.
    '''

    # Split string of ':' separated player ids into a list of ids
    player_list = str(player_list).split(':')
    runs_list = []  # Initialize an empty list to store the total runs for each player
    balls_list = [] # Initialize an empty list to store the total balls faced for each player

    # Loop over each player_id in the roster
    for player in player_list:
        # Get batting stats from last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        # Sum up the runs for the player and append to the list
        runs_list.append(np.nansum(df_rel['runs']))
        # Sum up the balls faced for the player and append to the list
        balls_list.append(np.nansum(df_rel['balls_faced']))

    # Sum up total runs and total balls faced by all players in the roster
    total_runs = np.nansum(runs_list)
    total_balls = np.nansum(balls_list)

    # Check if total balls faced is zero to avoid division by zero
    if total_balls == 0:
        return float('nan')

    # Calculate and return the average strike rate of the team
    return (total_runs) / (total_balls)


In [10]:
# Calculate team1 batting strike rate for the last 15 games for the train dataset
train_data['team1_bat_strike_rate'] = train_data.progress_apply(lambda x: team_bat_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate team2 batting strike rate for the last 15 games for the train dataset
train_data['team2_bat_strike_rate'] = train_data.progress_apply(lambda x: team_bat_strike_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Function to calculate the batting strike rate impact of team1 relative to team2
def calculate_bat_strike_impact(row):
    team1_score = row['team1_bat_strike_rate']
    team2_score = row['team2_bat_strike_rate']
    if pd.isna(team1_score) or pd.isna(team2_score):
        return 1
    elif team2_score == 0:
        if team1_score == 0:
            return 1
        return np.inf
    return team1_score / team2_score

# Apply the function to create the new column for the train dataset
train_data['team1_avg_per_total_bat_SR_last15'] = train_data.apply(calculate_bat_strike_impact, axis=1)

# Dropping intermediate columns
train_data.drop(columns=['team1_bat_strike_rate', 'team2_bat_strike_rate'], inplace=True)

# Calculate team1 batting strike rate for the last 15 games for the test dataset
test_data['team1_bat_strike_rate'] = test_data.progress_apply(lambda x: team_bat_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate team2 batting strike rate for the last 15 games for the test dataset
test_data['team2_bat_strike_rate'] = test_data.progress_apply(lambda x: team_bat_strike_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Apply the function to create the new column for the test dataset
test_data['team1_avg_per_total_bat_SR_last15'] = test_data.apply(calculate_bat_strike_impact, axis=1)

# Dropping intermediate columns
test_data.drop(columns=['team1_bat_strike_rate', 'team2_bat_strike_rate'], inplace=True)

100%|██████████| 948/948 [00:28<00:00, 32.79it/s]
100%|██████████| 948/948 [00:27<00:00, 34.24it/s]
100%|██████████| 207/207 [00:05<00:00, 35.33it/s]
100%|██████████| 207/207 [00:06<00:00, 31.82it/s]


### Feature Name: team1_avg_per_total_batting_AVG_last15

In [11]:
def team_bat_avg(player_list, date, n):
    '''
    Function to get average batting average of a team in the roster of a team in last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.

    Output-None

    Returns- float value denoting the average batting average of all players in the roster.
    '''

    # Split string of ':' separated player ids into a list of ids
    player_list = str(player_list).split(':')
    runs_list = []  # Initialize an empty list to store the total runs for each player
    match_played_list = [] # Initialize an empty list to store the total matches played for each player

    # Loop over each player_id in the roster
    for player in player_list:
        # Get batting stats from last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        # Sum up the runs for the player and append to the list
        runs_list.append(np.nansum(df_rel['runs']))
        # Sum up the number of matches played for the player and append to the list
        match_played_list.append(len(df_rel))

    # Sum up total runs and total matches played by all players in the roster
    total_runs = np.nansum(runs_list)
    total_matches = np.nansum(match_played_list)

    # Check if total matches played is zero to avoid division by zero
    if total_matches == 0:
        return float('nan')

    # Calculate and return the average batting average of the team
    return (total_runs) / (total_matches)


In [12]:
# Calculate team1 batting average for the last 15 games for the train dataset
train_data['team1_bat_avg'] = train_data.progress_apply(lambda x: team_bat_avg(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate team2 batting average for the last 15 games for the train dataset
train_data['team2_bat_avg'] = train_data.progress_apply(lambda x: team_bat_avg(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Function to calculate the batting average impact of team1 relative to team2
def calculate_bat_avg_impact(row):
    team1_score = row['team1_bat_avg']
    team2_score = row['team2_bat_avg']
    if pd.isna(team1_score) or pd.isna(team2_score):
        return 1
    elif team2_score == 0:
        if team1_score == 0:
            return 1
        return np.inf
    return team1_score / team2_score

# Apply the function to create the new column for the train dataset
train_data['team1_avg_per_total_batting_AVG_last15'] = train_data.apply(calculate_bat_avg_impact, axis=1)

# Dropping intermediate columns
train_data.drop(columns=['team1_bat_avg', 'team2_bat_avg'], inplace=True)

# Calculate team1 batting average for the last 15 games for the test dataset
test_data['team1_bat_avg'] = test_data.progress_apply(lambda x: team_bat_avg(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate team2 batting average for the last 15 games for the test dataset
test_data['team2_bat_avg'] = test_data.progress_apply(lambda x: team_bat_avg(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Apply the function to create the new column for the test dataset
test_data['team1_avg_per_total_batting_AVG_last15'] = test_data.apply(calculate_bat_avg_impact, axis=1)

# Dropping intermediate columns
test_data.drop(columns=['team1_bat_avg', 'team2_bat_avg'], inplace=True)


100%|██████████| 948/948 [00:27<00:00, 35.10it/s]
100%|██████████| 948/948 [00:26<00:00, 35.33it/s]
100%|██████████| 207/207 [00:06<00:00, 30.47it/s]
100%|██████████| 207/207 [00:06<00:00, 33.40it/s]


### Feature Name: team1_per_total_bowling_economy_last15

In [13]:
def team_bowl_economy(player_list, date, n):
    '''
    Function to get average bowling economy of a team in the roster of a team in last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.

    Output-None

    Returns- float value denoting the average bowling economy of all players in the roster.
    '''

    # Split string of ':' separated player ids into a list of ids
    player_list = str(player_list).split(':')
    runs_list = []  # Initialize an empty list to store the total runs conceded for each player
    balls_list = [] # Initialize an empty list to store the total balls bowled for each player

    # Loop over each player_id in the roster
    for player in player_list:
        # Get bowling stats from last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')
        # Sum up the runs conceded for the player and append to the list
        runs_list.append(np.nansum(df_rel['runs']))
        # Sum up the balls bowled for the player and append to the list
        balls_list.append(np.nansum(df_rel['balls_bowled']))

    # Sum up total runs conceded and total balls bowled by all players in the roster
    total_runs = np.nansum(runs_list)
    total_balls = np.nansum(balls_list)

    # Check if total balls bowled is zero to avoid division by zero
    if total_balls == 0:
        return float('nan')

    # Calculate and return the average bowling economy of the team
    return (total_runs) / (total_balls)


In [14]:
# Calculate team1 bowling economy for the last 15 games for the train dataset
train_data['team1_bowl_eco'] = train_data.progress_apply(lambda x: team_bowl_economy(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Calculate team2 bowling economy for the last 15 games for the train dataset
train_data['team2_bowl_eco'] = train_data.progress_apply(lambda x: team_bowl_economy(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Function to calculate the bowling economy impact of team1 relative to team2
def calculate_bowl_eco(row):
    team1_score = row['team1_bowl_eco']
    team2_score = row['team2_bowl_eco']
    if pd.isna(team1_score) or pd.isna(team2_score):
        return 1
    elif team2_score == 0:
        if team1_score == 0:
            return 1
        return np.inf
    return team1_score / team2_score

# Apply the function to create the new column for the train dataset
train_data['team1_per_total_bowling_economy_last15'] = train_data.apply(calculate_bowl_eco, axis=1)

# Dropping intermediate columns
train_data.drop(columns=['team1_bowl_eco', 'team2_bowl_eco'], inplace=True)

# Calculate team1 bowling economy for the last 15 games for the test dataset
test_data['team1_bowl_eco'] = test_data.progress_apply(lambda x: team_bowl_economy(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Calculate team2 bowling economy for the last 15 games for the test dataset
test_data['team2_bowl_eco'] = test_data.progress_apply(lambda x: team_bowl_economy(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Apply the function to create the new column for the test dataset
test_data['team1_per_total_bowling_economy_last15'] = test_data.apply(calculate_bowl_eco, axis=1)

# Dropping intermediate columns
test_data.drop(columns=['team1_bowl_eco', 'team2_bowl_eco'], inplace=True)

100%|██████████| 948/948 [00:33<00:00, 28.25it/s]
100%|██████████| 948/948 [00:33<00:00, 28.38it/s]
100%|██████████| 207/207 [00:08<00:00, 25.31it/s]
100%|██████████| 207/207 [00:06<00:00, 30.39it/s]


### Feature Name: team1_avg_per_total_bowl_AVG_last15

In [15]:
def team_bowl_avg(player_list, date, n):
    '''
    Function to get the average bowling average of a team in the roster of a team in last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.

    Output-None

    Returns- float value denoting the average bowling average of all players in the roster.
    '''

    # Split string of ':' separated player ids into a list of ids
    player_list = str(player_list).split(':')
    runs_list = []  # Initialize an empty list to store the total runs conceded for each player
    wkt_list = []   # Initialize an empty list to store the total wickets taken for each player

    # Loop over each player_id in the roster
    for player in player_list:
        # Get bowling stats from last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')
        # Sum up the runs conceded for the player and append to the list
        runs_list.append(np.nansum(df_rel['runs']))
        # Sum up the wickets taken for the player and append to the list
        wkt_list.append(np.nansum(df_rel['wicket_count']))

    # Sum up total runs conceded and total wickets taken by all players in the roster
    total_runs = np.nansum(runs_list)
    total_wkt = np.nansum(wkt_list)

    # Check if total wickets taken is zero to avoid division by zero
    if total_wkt == 0:
        return float('nan')

    # Calculate and return the average bowling average of the team
    return (total_runs) / (total_wkt)


In [16]:
# Calculate team1 bowling average for the last 15 games for the train dataset
train_data['team1_bowl_avg'] = train_data.progress_apply(lambda x: team_bowl_avg(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Calculate team2 bowling average for the last 15 games for the train dataset
train_data['team2_bowl_avg'] = train_data.progress_apply(lambda x: team_bowl_avg(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Function to calculate the bowling average impact of team1 relative to team2
def calculate_bowl_avg_impact(row):
    team1_score = row['team1_bowl_avg']
    team2_score = row['team2_bowl_avg']
    if pd.isna(team1_score) or pd.isna(team2_score):
        return 1
    elif team2_score == 0:
        if team1_score == 0:
            return 1
        return np.inf
    return team1_score / team2_score

# Apply the function to create the new column for the train dataset
train_data['team1_avg_per_total_bowl_AVG_last15'] = train_data.apply(calculate_bowl_avg_impact, axis=1)

# Dropping intermediate columns
train_data.drop(columns=['team1_bowl_avg', 'team2_bowl_avg'], inplace=True)

# Calculate team1 bowling average for the last 15 games for the test dataset
test_data['team1_bowl_avg'] = test_data.progress_apply(lambda x: team_bowl_avg(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Calculate team2 bowling average for the last 15 games for the test dataset
test_data['team2_bowl_avg'] = test_data.progress_apply(lambda x: team_bowl_avg(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Apply the function to create the new column for the test dataset
test_data['team1_avg_per_total_bowl_AVG_last15'] = test_data.apply(calculate_bowl_avg_impact, axis=1)

# Dropping intermediate columns
test_data.drop(columns=['team1_bowl_avg', 'team2_bowl_avg'], inplace=True)


100%|██████████| 948/948 [00:34<00:00, 27.78it/s]
100%|██████████| 948/948 [00:33<00:00, 28.61it/s]
100%|██████████| 207/207 [00:08<00:00, 25.51it/s]
100%|██████████| 207/207 [00:07<00:00, 28.75it/s]


### Feature Name: team_avg_wicket_last15

In [17]:
def teamAvgWicketLastn(team_id, date, n):
    '''
    Function to calculate a team's average wickets taken in their last n games.

    Input-
    1. team_id: ID of the team to calculate average wickets.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output- None

    Return- Float value denoting average of wickets taken by the team in their last n games.
    '''
    # Filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    # Combine two dataframes - one where input team is bowling first, and another one where input team is bowling second.
    df_rel = pd.concat([ df_rel[df_rel['team1_bat_inning']==1][['inning2_wickets']].rename(columns={'inning2_wickets':'wicket'}), \
                         df_rel[df_rel['team1_bat_inning']==2][['inning1_wickets']].rename(columns={'inning1_wickets':'wicket'}) ] )
    return df_rel['wicket'].mean() # Return mean of the combined dataframe.

# Compute average wickets taken by team1 in their last 15 games for train data.
train_data['team1only_avg_wicket_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgWicketLastn(x['team1_id'], x['match_dt'], 15), axis=1)
# Compute average wickets taken by team2 in their last 15 games for train data.
train_data['team2only_avg_wicket_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgWicketLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute ratio of average wickets taken by team1 to team2 in their last 15 games for train data.
train_data['team_avg_wicket_last15'] = (train_data['team1only_avg_wicket_last15']+1)/(train_data['team2only_avg_wicket_last15']+1)

# Dropping intermediate columns
train_data.drop(columns=['team1only_avg_wicket_last15','team2only_avg_wicket_last15'], inplace=True)

# Compute average wickets taken by team1 in their last 15 games for test data.
test_data['team1only_avg_wicket_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgWicketLastn(x['team1_id'], x['match_dt'], 15), axis=1)
# Compute average wickets taken by team2 in their last 15 games for test data.
test_data['team2only_avg_wicket_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgWicketLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute ratio of average wickets taken by team1 to team2 in their last 15 games for test data.
test_data['team_avg_wicket_last15'] = (test_data['team1only_avg_wicket_last15']+1)/(test_data['team2only_avg_wicket_last15']+1)

# Dropping intermediate columns
test_data.drop(columns=['team1only_avg_wicket_last15','team2only_avg_wicket_last15'], inplace=True)


100%|██████████| 948/948 [00:05<00:00, 164.23it/s]
100%|██████████| 948/948 [00:05<00:00, 186.81it/s]
100%|██████████| 207/207 [00:01<00:00, 181.72it/s]
100%|██████████| 207/207 [00:01<00:00, 136.97it/s]


### Feature Name: team_boundary_rate_last15

In [18]:
def team_boundry_rate(player_list, date, n):
    '''
    Function to get the average boundary rate of a team in the roster of a team in the last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.

    Output-None

    Returns- float value denoting the average number of boundaries (fours and sixes) scored by all players in the roster per match.
    '''
    # Split string of ':' separated player ids into a list of ids
    player_list = str(player_list).split(':')
    fours_list = []  # Initialize an empty list to store the total fours hit by each player
    sixes_list = []  # Initialize an empty list to store the total sixes hit by each player
    match_played_list = []  # Initialize an empty list to store the total matches played by each player

    # Loop over each player_id in the roster
    for player in player_list:
        # Get batting stats from the last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        # Sum up the fours hit by the player and append to the list
        fours_list.append(np.nansum(df_rel['Fours']))
        # Sum up the sixes hit by the player and append to the list
        sixes_list.append(np.nansum(df_rel['Sixes']))
        # Sum up the number of matches played by the player and append to the list
        match_played_list.append(len(df_rel))

    # Sum up total fours, sixes, and matches played by all players in the roster
    total_fours = np.nansum(fours_list)
    total_sixes = np.nansum(sixes_list)
    total_matches = np.nansum(match_played_list)

    # Calculate and return the average boundary rate (fours and sixes per match) of the team
    return (total_fours + total_sixes) / (total_matches + 1)  # Adding 1 to avoid division by zero


In [19]:
# Calculate team1 boundary rate for the last 15 games for the train dataset
train_data['team1_boundry_rate'] = train_data.progress_apply(lambda x: team_boundry_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate team2 boundary rate for the last 15 games for the train dataset
train_data['team2_boundry_rate'] = train_data.progress_apply(lambda x: team_boundry_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Function to calculate the boundary rate impact of team1 relative to team2
def calculate_boundry_impact(row):
    team1_score = row['team1_boundry_rate']
    team2_score = row['team2_boundry_rate']
    return (team1_score + 1) / (team2_score + 1)

# Apply the function to create the new column for the train dataset
train_data['team_boundary_rate_last15'] = train_data.apply(calculate_boundry_impact, axis=1)

# Dropping intermediate columns
train_data.drop(columns=['team1_boundry_rate', 'team2_boundry_rate'], inplace=True)

# Calculate team1 boundary rate for the last 15 games for the test dataset
test_data['team1_boundry_rate'] = test_data.progress_apply(lambda x: team_boundry_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate team2 boundary rate for the last 15 games for the test dataset
test_data['team2_boundry_rate'] = test_data.progress_apply(lambda x: team_boundry_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Apply the function to create the new column for the test dataset
test_data['team_boundary_rate_last15'] = test_data.apply(calculate_boundry_impact, axis=1)

# Dropping intermediate columns
test_data.drop(columns=['team1_boundry_rate', 'team2_boundry_rate'], inplace=True)


100%|██████████| 948/948 [00:31<00:00, 30.45it/s]
100%|██████████| 948/948 [00:27<00:00, 33.95it/s]
100%|██████████| 207/207 [00:05<00:00, 35.90it/s]
100%|██████████| 207/207 [00:07<00:00, 28.78it/s]


### Feature Name: team_run_rate_last15

In [20]:
def teamAvgRunRateLastn(team_id, date, n):
    '''
    Function to calculate a team's average run rate in their last n games.

    Input-
    1. team_id: ID of the team to calculate average run rate.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output- None

    Return- Float value denoting the average run rate of the team in their last n games.
    '''
    # Filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = match_lvl_data[(match_lvl_data['match_dt'] < date) & \
                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))] \
                        .sort_values(by='match_dt', ascending=False).head(n)
    # Combine two dataframes - one where input team is batting first, and another one where input team is batting second
    df_rel = pd.concat([df_rel[df_rel['team1_bat_inning'] == 1][['inning1_runs', 'inning1_balls']].rename(columns={'inning1_runs': 'runs', 'inning1_balls': 'balls'}), \
                         df_rel[df_rel['team1_bat_inning'] == 2][['inning2_runs', 'inning2_balls']].rename(columns={'inning2_runs': 'runs', 'inning2_balls': 'balls'})])
    return (df_rel['runs'].sum() + 1) / (df_rel['balls'].sum() + 1) # Return the average run rate of the combined dataframe

# Compute average run rate of team1 in their last 15 games for train data
train_data['team1_run_rate_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunRateLastn(x['team1_id'], x['match_dt'], 15), axis=1)

# Compute average run rate of team2 in their last 15 games for train data
train_data['team2_run_rate_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunRateLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the ratio of average run rate of team1 to team2 in their last 15 games for train data
train_data['team_run_rate_last15'] = (train_data['team1_run_rate_last15'] + 1) / (train_data['team2_run_rate_last15'] + 1)

# Dropping intermediate columns
train_data.drop(columns=['team1_run_rate_last15', 'team2_run_rate_last15'], inplace=True)

# Compute average run rate of team1 in their last 15 games for test data
test_data['team1_run_rate_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgRunRateLastn(x['team1_id'], x['match_dt'], 15), axis=1)

# Compute average run rate of team2 in their last 15 games for test data
test_data['team2_run_rate_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgRunRateLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the ratio of average run rate of team1 to team2 in their last 15 games for test data
test_data['team_run_rate_last15'] = (test_data['team1_run_rate_last15'] + 1) / (test_data['team2_run_rate_last15'] + 1)

# Dropping intermediate columns
test_data.drop(columns=['team1_run_rate_last15', 'team2_run_rate_last15'], inplace=True)


100%|██████████| 948/948 [00:05<00:00, 173.00it/s]
100%|██████████| 948/948 [00:06<00:00, 149.16it/s]
100%|██████████| 207/207 [00:01<00:00, 175.22it/s]
100%|██████████| 207/207 [00:01<00:00, 174.09it/s]


### Feature Name: team_run_per_wicket_last15

In [21]:
def teamAvgRunPerWicketLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs per wicket in their last n games.

    Input-
    1. team_id: ID of the team to calculate average runs per wicket.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output- None

    Return- Float value denoting the average runs per wicket scored by the team in their last n games.
    '''
    # Filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = match_lvl_data[(match_lvl_data['match_dt'] < date) & \
                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))] \
                        .sort_values(by='match_dt', ascending=False).head(n)
    # Combine two dataframes - one where input team is batting first, and another one where input team is batting second
    df_rel = pd.concat([df_rel[df_rel['team1_bat_inning'] == 1][['inning1_runs', 'inning1_wickets']].rename(columns={'inning1_runs': 'runs', 'inning1_wickets': 'wickets'}), \
                         df_rel[df_rel['team1_bat_inning'] == 2][['inning2_runs', 'inning2_wickets']].rename(columns={'inning2_runs': 'runs', 'inning2_wickets': 'wickets'})])
    return (df_rel['runs'].sum() + 1) / (df_rel['wickets'].sum() + 1) # Return the average runs per wicket of the combined dataframe

# Compute average runs per wicket scored by team1 in their last 15 games for train data
train_data['team1_run_per_wicket_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunPerWicketLastn(x['team1_id'], x['match_dt'], 15), axis=1)

# Compute average runs per wicket scored by team2 in their last 15 games for train data
train_data['team2_run_per_wicket_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunPerWicketLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the ratio of average runs per wicket scored by team1 to team2 in their last 15 games for train data
train_data['team_run_per_wicket_last15'] = (train_data['team1_run_per_wicket_last15'] + 1) / (train_data['team2_run_per_wicket_last15'] + 1)

# Dropping intermediate columns
train_data.drop(columns=['team1_run_per_wicket_last15', 'team2_run_per_wicket_last15'], inplace=True)

# Compute average runs per wicket scored by team1 in their last 15 games for test data
test_data['team1_run_per_wicket_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgRunPerWicketLastn(x['team1_id'], x['match_dt'], 15), axis=1)

# Compute average runs per wicket scored by team2 in their last 15 games for test data
test_data['team2_run_per_wicket_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgRunPerWicketLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the ratio of average runs per wicket scored by team1 to team2 in their last 15 games for test data
test_data['team_run_per_wicket_last15'] = (test_data['team1_run_per_wicket_last15'] + 1) / (test_data['team2_run_per_wicket_last15'] + 1)

# Dropping intermediate columns
test_data.drop(columns=['team1_run_per_wicket_last15', 'team2_run_per_wicket_last15'], inplace=True)


100%|██████████| 948/948 [00:05<00:00, 173.20it/s]
100%|██████████| 948/948 [00:06<00:00, 153.56it/s]
100%|██████████| 207/207 [00:01<00:00, 181.72it/s]
100%|██████████| 207/207 [00:01<00:00, 185.64it/s]


### Feature Name: team_avg_economy_last15

In [22]:
def teamAvgEconomyLastn(team_id, date, n):
    '''
    Function to calculate a team's average economy rate in their last n games.

    Input-
    1. team_id: ID of the team to calculate average economy rate.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output- None

    Return- Float value denoting the average economy rate of the team in their last n games.
    '''
    # Filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = match_lvl_data[(match_lvl_data['match_dt'] < date) & \
                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))] \
                        .sort_values(by='match_dt', ascending=False).head(n)
    # Combine two dataframes - one where input team is bowling first, and another one where input team is bowling second
    df_rel = pd.concat([df_rel[df_rel['team1_bat_inning'] == 1][['inning2_runs', 'inning2_balls']].rename(columns={'inning2_runs': 'runs', 'inning2_balls': 'balls'}), \
                         df_rel[df_rel['team1_bat_inning'] == 2][['inning1_runs', 'inning1_balls']].rename(columns={'inning1_runs': 'runs', 'inning1_balls': 'balls'})])
    return (df_rel['runs'].sum() + 1) / (df_rel['balls'].sum() + 1) # Return the average economy rate of the combined dataframe

# Compute average economy rate of team1 in their last 15 games for train data
train_data['team1_avg_economy_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgEconomyLastn(x['team1_id'], x['match_dt'], 15), axis=1)

# Compute average economy rate of team2 in their last 15 games for train data
train_data['team2_avg_economy_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgEconomyLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the ratio of average economy rate of team1 to team2 in their last 15 games for train data
train_data['team_avg_economy_last15'] = (train_data['team1_avg_economy_last15'] + 1) / (train_data['team2_avg_economy_last15'] + 1)

# Dropping intermediate columns
train_data.drop(columns=['team1_avg_economy_last15', 'team2_avg_economy_last15'], inplace=True)

# Compute average economy rate of team1 in their last 15 games for test data
test_data['team1_avg_economy_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgEconomyLastn(x['team1_id'], x['match_dt'], 15), axis=1)

# Compute average economy rate of team2 in their last 15 games for test data
test_data['team2_avg_economy_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgEconomyLastn(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the ratio of average economy rate of team1 to team2 in their last 15 games for test data
test_data['team_avg_economy_last15'] = (test_data['team1_avg_economy_last15'] + 1) / (test_data['team2_avg_economy_last15'] + 1)

# Dropping intermediate columns
test_data.drop(columns=['team1_avg_economy_last15', 'team2_avg_economy_last15'], inplace=True)


100%|██████████| 948/948 [00:05<00:00, 178.60it/s]
100%|██████████| 948/948 [00:06<00:00, 152.08it/s]
100%|██████████| 207/207 [00:01<00:00, 173.27it/s]
100%|██████████| 207/207 [00:01<00:00, 173.47it/s]


### Feature Name: team_num_fast_bowlers_ratio & team_num_spinners_ratio

In [23]:
def classify_bowler(description):
    '''
    Function to classify bowlers based on their description.

    Input-
    1. description: String describing the bowler's style.

    Output-
    String value denoting the type of bowler ('Spinner' or 'Fast bowler').
    '''
    spinners_keywords = [
        "spin", "googly", "offbreak", "legbreak", "wrist-spin",
        "slow", "orthodox", "left-arm slow", "left-arm orthodox"
    ]

    fast_bowlers_keywords = [
        "medium", "fast", "fast-medium", "right-arm medium",
        "left-arm medium", "left-arm fast", "right-arm fast",
        "right-arm fast-medium", "left-arm fast-medium"
    ]

    # Check for spinner keywords in the description
    for keyword in spinners_keywords:
        if keyword in description.lower():
            return "Spinner"

    # Check for fast bowler keywords in the description
    for keyword in fast_bowlers_keywords:
        if keyword in description.lower():
            return "Fast bowler"

    return "Unknown"

# Apply the function to classify each player
bowler_lvl_data['bowler_type'] = bowler_lvl_data['bowler_details'].apply(classify_bowler)


def count_bowler_types(roster_ids):
    '''
    Function to count the number of spinners and fast bowlers in a team's roster.

    Input-
    1. roster_ids: ':' separated list of player ids in the roster of a team.

    Output-
    Tuple containing the count of spinners and fast bowlers in the roster.
    '''
    player_list = str(roster_ids).split(':')
    bowler_type = []

    # Loop over each player_id in the roster
    for player in player_list:
        types = bowler_lvl_data[bowler_lvl_data['bowler_id'] == float(player)]['bowler_type']
        if not types.empty:
            bowler_type.append(types.iloc[0])
        else:
            bowler_type.append('Unknown')

    # Count the number of spinners and fast bowlers
    num_spinners = bowler_type.count('Spinner')
    num_fast_bowlers = bowler_type.count('Fast bowler')

    return num_spinners, num_fast_bowlers


# Add columns for team1 and team2 spinners and fast bowlers count for train data
train_data[['team1_num_spinners', 'team1_num_fast_bowlers']] = train_data['team1_roster_ids'].apply(lambda x: pd.Series(count_bowler_types(x)))
train_data[['team2_num_spinners', 'team2_num_fast_bowlers']] = train_data['team2_roster_ids'].apply(lambda x: pd.Series(count_bowler_types(x)))

# Calculate the ratio of spinners and fast bowlers between team1 and team2 for train data
train_data['team_num_spinners_ratio'] = (train_data['team1_num_spinners'] + 1) / (train_data['team2_num_spinners'] + 1)
train_data['team_num_fast_bowlers_ratio'] = (train_data['team1_num_fast_bowlers'] + 1) / (train_data['team2_num_fast_bowlers'] + 1)

# Dropping intermediate columns
train_data.drop(columns=['team1_num_fast_bowlers', 'team2_num_fast_bowlers', 'team1_num_spinners', 'team2_num_spinners'], inplace=True)

# Add columns for team1 and team2 spinners and fast bowlers count for test data
test_data[['team1_num_spinners', 'team1_num_fast_bowlers']] = test_data['team1_roster_ids'].apply(lambda x: pd.Series(count_bowler_types(x)))
test_data[['team2_num_spinners', 'team2_num_fast_bowlers']] = test_data['team2_roster_ids'].apply(lambda x: pd.Series(count_bowler_types(x)))

# Calculate the ratio of spinners and fast bowlers between team1 and team2 for test data
test_data['team_num_spinners_ratio'] = (test_data['team1_num_spinners'] + 1) / (test_data['team2_num_spinners'] + 1)
test_data['team_num_fast_bowlers_ratio'] = (test_data['team1_num_fast_bowlers'] + 1) / (test_data['team2_num_fast_bowlers'] + 1)

# Dropping intermediate columns
test_data.drop(columns=['team1_num_fast_bowlers', 'team2_num_fast_bowlers', 'team1_num_spinners', 'team2_num_spinners'], inplace=True)


### Feature Name: captain_impact_team1

In [24]:
# Extract captain information from batsman_lvl_data and bowler_lvl_data
batsman_captains = batsman_lvl_data[batsman_lvl_data['is_batsman_captain'] == 1][['match id', 'batsman_id', 'match_dt']]
batsman_captains = batsman_captains.rename(columns={'batsman_id': 'captain_id'})

bowler_captains = bowler_lvl_data[bowler_lvl_data['is_bowler_captain'] == 1][['match id', 'bowler_id', 'match_dt']]
bowler_captains = bowler_captains.rename(columns={'bowler_id': 'captain_id'})

# Combine batsman and bowler captains into a single DataFrame and remove duplicates
captain_performance = pd.concat([batsman_captains, bowler_captains]).drop_duplicates().reset_index(drop=True)

# Function to check if captain_id is in team1_roster_ids for a given match_id
def is_captain_in_team1_roster(match_id, captain_id):
    roster_ids = match_lvl_data.loc[match_lvl_data['match id'] == match_id, 'team1_roster_ids'].values[0]
    return str(captain_id) in roster_ids.split(':')

# Apply the function to create 'is_in_team1' column indicating if the captain is in team1_roster_ids
captain_performance['is_in_team1'] = captain_performance.apply(lambda row: is_captain_in_team1_roster(row['match id'], row['captain_id']), axis=1)

# Function to determine if the team with the captain won the match
def determine_is_winner(row):
    winner_id = match_lvl_data.loc[match_lvl_data['match id'] == row['match id'], 'winner_id'].values[0]
    team1_id = match_lvl_data.loc[match_lvl_data['match id'] == row['match id'], 'team1_id'].values[0]
    return int(row['is_in_team1']) == int(winner_id == team1_id)

# Apply the function to create 'is_winner' column indicating if the captain's team won the match
captain_performance['is_winner'] = captain_performance.apply(determine_is_winner, axis=1)


In [25]:
def giveLastNgamesPlayerCap(player_id, date, n):
    '''
    Function to get the last n games stats of a captain before an input date.

    Input-
    1. player_id: ID of the captain to get historical data.
    2. date: Date to look back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.

    Output- None

    Returns- DataFrame having stats from the last n games of the captain before an input date.
    The results are sorted by date.
    '''
    df_topick = captain_performance
    id_col = 'captain_id'

    return df_topick[(df_topick['match_dt'] < date) & (df_topick[id_col] == float(player_id))] \
                .sort_values(by='match_dt', ascending=False).head(n)

def team_cap_win_ratio(player_list, date, n):
    '''
    Function to get the win ratio of captains in the roster of a team in the last n games.

    Input-
    1. player_list: ':' separated list of player IDs in the roster of a team.
    2. date: Match date of the game to calculate this feature.
    3. n: Number of games to look back and create this feature.

    Output- None

    Returns- Float value denoting the highest win ratio among captains in the roster.
    '''
    # Split string of ':' separated player IDs into a list of IDs
    player_list = str(player_list).split(':')
    win_per_list = []

    # Loop over each player_id in the roster
    for player in player_list:
        # Get the last n games stats for the captain
        df_rel = giveLastNgamesPlayerCap(player_id=player, date=date, n=n)
        total_games = len(df_rel['match id'])  # Count the total number of games
        games_won = np.nansum(df_rel['is_winner'])  # Count the number of games won

        if total_games > 0:
            # Calculate the win ratio and append to the list
            win_per_list.append(games_won / total_games)
        else:
            win_per_list.append(0)

    return np.max(win_per_list)  # Return the highest win ratio among captains in the roster


In [26]:
# Calculate the best captaincy score for team1 in their last 15 games for train data
train_data['team1_best_captaincy_score'] = train_data.progress_apply(lambda x: team_cap_win_ratio(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate the best captaincy score for team2 in their last 15 games for train data
train_data['team2_best_captaincy_score'] = train_data.progress_apply(lambda x: team_cap_win_ratio(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Function to calculate the captain impact ratio for team1 relative to team2
def calculate_cap_ratio(row):
    team1_score = row['team1_best_captaincy_score']
    team2_score = row['team2_best_captaincy_score']
    return (team1_score + 1) / (team2_score + 1)

# Apply the function to create the new column 'captain_impact_team1' for train data
train_data['captain_impact_team1'] = train_data.apply(calculate_cap_ratio, axis=1)

# Dropping intermediate columns
train_data.drop(columns=['team1_best_captaincy_score', 'team2_best_captaincy_score'], inplace=True)

#-------------------------------------------------#

# Calculate the best captaincy score for team1 in their last 15 games for test data
test_data['team1_best_captaincy_score'] = test_data.progress_apply(lambda x: team_cap_win_ratio(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Calculate the best captaincy score for team2 in their last 15 games for test data
test_data['team2_best_captaincy_score'] = test_data.progress_apply(lambda x: team_cap_win_ratio(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Apply the function to create the new column 'captain_impact_team1' for test data
test_data['captain_impact_team1'] = test_data.apply(calculate_cap_ratio, axis=1)

# Dropping intermediate columns
test_data.drop(columns=['team1_best_captaincy_score', 'team2_best_captaincy_score'], inplace=True)


100%|██████████| 948/948 [00:19<00:00, 48.77it/s]
100%|██████████| 948/948 [00:18<00:00, 50.08it/s]
100%|██████████| 207/207 [00:04<00:00, 44.14it/s]
100%|██████████| 207/207 [00:03<00:00, 54.44it/s]


### Feature Name: ground_favorability_bat_first

In [27]:

match_lvl_data['team1_won'] = match_lvl_data['team1_id'] == match_lvl_data['winner_id']

def winp_ground_bat_first_last15(ground_id, date, n):
    '''
    Function to calculate average runs scored in ground/venue.

    Input-
    1. ground_id: ID of the ground to calculate the feature for.
    2. date: match date of the current game to calculate the feature for.
    3. n: look-back window of games for the ground.

    Output- None

    Returns- Average runs scored in the ground.
    '''
    # filter out games with ground_id being the input ground_id and date earlier than current game's input date. Sort desc by date, and select top n rows (games).
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&(match_lvl_data['ground_id']==ground_id)].sort_values(by='match_dt', ascending=False).head(n)

    total_games = len(df_rel['ground_id'])
    team1_wins = np.nansum(df_rel['team1_won'])
    if total_games > 0 :
      return team1_wins / total_games
    else:
      return np.nan

train_data['ground_favorability_bat_first'] = train_data.progress_apply(lambda x: winp_ground_bat_first_last15(ground_id=x['ground_id'], date=x['match_dt'], n=15), axis=1)
test_data['ground_favorability_bat_first'] = test_data.progress_apply(lambda x: winp_ground_bat_first_last15(ground_id=x['ground_id'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:01<00:00, 602.21it/s]
100%|██████████| 207/207 [00:00<00:00, 592.35it/s]


### Feature Name: ground_favorability_team_last15

In [28]:
def winp_ground_team1_last15(team1_id, ground_id, date, n):
    '''
    Function to calculate win percentage for team1 at a specific ground over the last n matches.

    Input-
    1. team1_id: ID of team1.
    2. ground_id: ID of the ground to calculate the feature for.
    3. date: match date of the current game to calculate the feature for.
    4. n: look-back window of games for the ground.

    Output- None

    Returns- Win percentage of team1 at the specified ground.
    '''
    # Filter out games with ground_id and team1_id being the input values and date earlier than the current game's input date
    df_rel = match_lvl_data[(match_lvl_data['match_dt'] < date) &
                            (match_lvl_data['ground_id'] == ground_id) &
                            ((match_lvl_data['team1_id'] == team1_id) | (match_lvl_data['team2_id'] == team1_id))].sort_values(by='match_dt', ascending=False).head(n)

    total_games = len(df_rel)
    team1_wins = np.nansum(df_rel['winner_id'] == team1_id)

    if total_games > 0:
        return team1_wins / total_games
    else:
        return 0.5

# Apply the function to your training and test data
train_data['ground_favorability_team1_last15'] = train_data.progress_apply(lambda x: winp_ground_team1_last15(team1_id=x['team1_id'], ground_id=x['ground_id'], date=x['match_dt'], n=15), axis=1)
test_data['ground_favorability_team1_last15'] = test_data.progress_apply(lambda x: winp_ground_team1_last15(team1_id=x['team1_id'], ground_id=x['ground_id'], date=x['match_dt'], n=15), axis=1)

# Apply the function to your training and test data
train_data['ground_favorability_team2_last15'] = train_data.progress_apply(lambda x: winp_ground_team1_last15(team1_id=x['team2_id'], ground_id=x['ground_id'], date=x['match_dt'], n=15), axis=1)
test_data['ground_favorability_team2_last15'] = test_data.progress_apply(lambda x: winp_ground_team1_last15(team1_id=x['team2_id'], ground_id=x['ground_id'], date=x['match_dt'], n=15), axis=1)


train_data['ground_favorability_team_last15'] = (train_data['ground_favorability_team1_last15']+1)/(train_data['ground_favorability_team2_last15']+1)

train_data.drop(columns=['ground_favorability_team1_last15','ground_favorability_team2_last15'],inplace=True)


test_data['ground_favorability_team_last15'] = (test_data['ground_favorability_team1_last15']+1)/(test_data['ground_favorability_team2_last15']+1)

test_data.drop(columns=['ground_favorability_team1_last15','ground_favorability_team2_last15'],inplace=True)


100%|██████████| 948/948 [00:02<00:00, 456.48it/s]
100%|██████████| 207/207 [00:00<00:00, 464.69it/s]
100%|██████████| 948/948 [00:02<00:00, 331.57it/s]
100%|██████████| 207/207 [00:00<00:00, 274.66it/s]


### Feature Name: lighting_favorability_bat_first

In [29]:
def winp_lighting_first_last15(lighting_type, date, n):
    '''
    Function to calculate the winning percentage of the first batting team under a specific lighting condition over the last n games.

    Input-
    1. lighting_type: Type of lighting condition to filter the games (e.g., day, night).
    2. date: Match date of the current game to calculate the feature for.
    3. n: Look-back window of games to consider for the calculation.

    Output- None

    Returns- Winning percentage of the first batting team under the given lighting condition.
    '''
    # Filter out games with the given lighting type and a date earlier than the current game's input date. Sort in descending order by date and select the top n rows (games).
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&(match_lvl_data['lighting']==lighting_type)].sort_values(by='match_dt', ascending=False).head(n)

    total_games = len(df_rel['lighting'])
    team1_wins = np.nansum(df_rel['team1_won'])
    if total_games > 0 :
        return team1_wins / total_games
    else:
        return np.nan

train_data['lighting_favorability_bat_first'] = train_data.progress_apply(lambda x: winp_lighting_first_last15(lighting_type=x['lighting'], date=x['match_dt'], n=15), axis=1)
test_data['lighting_favorability_bat_first'] = test_data.progress_apply(lambda x: winp_lighting_first_last15(lighting_type=x['lighting'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:02<00:00, 373.65it/s]
100%|██████████| 207/207 [00:00<00:00, 351.73it/s]


### Feature Name: series_type

In [30]:

series_data = match_lvl_data[['series_name', 'series_type']]
# If duplicates exist, drop them, keeping the first occurrence
series_data = series_data.drop_duplicates(subset=['series_name'], keep='first')

train_data = pd.merge(train_data, series_data, on='series_name', how='left')

# Convert series_type to categorical
train_data['series_type'] = train_data['series_type'].astype('category')


test_data = pd.merge(test_data, series_data, on='series_name', how='left')

# Convert series_type to categorical
test_data['series_type'] = test_data['series_type'].astype('category')

In [31]:
test_data['series_type'].fillna('international', inplace=True)
train_data['series_type'].fillna('international', inplace=True)

### Feature Name: home_ground

In [32]:
# Define home grounds
home_grounds = {
    'Mi Is': ['Mumbai', 'Navi Mumbai', 'Pune'],
    'Rl Cs Be': ['Bengaluru'],
    'Ci Sr Ks': ['Chennai'],
    'Di Cs': ['Delhi'],
    'Ka Kt Rs': ['Kolkata'],
    'Ss Hd': ['Hyderabad'],
    'Gt Ts': ['Ahmedabad'],
    'Lw Sr Gs': ['Lucknow'],
    'Pb Ks': ['Chandigarh', 'Dharamsala'],
    'Rn Rs': ['Jaipur']
}

def determine_home_ground(row):
    if row['series_type'] == 'international':
        series_parts = row['series_name'].split(' tr ')
        if row['team1'] in series_parts[0]:
            return 0
        elif row['team2'] in series_parts[0]:
            return 1
        else:
            return 0.5
    elif row['series_type'] == 'In Pr Le':
        for team, cities in home_grounds.items():
            if row['team1'] == team and row['city'] in cities:
                return 1
            elif row['team2'] == team and row['city'] in cities:
                return 0
        return 0.5
    elif row['series_type'] == 'other_domestic':
        return 0.5

train_data['home_ground'] = train_data.apply(determine_home_ground, axis=1)
test_data['home_ground'] = test_data.apply(determine_home_ground, axis=1)


### Feature Name: team_average_last15_ratio

In [33]:
def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs in their last n games.

    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output- None

    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    # combine two dataframes - one where input team is batting first, and another one where input team is batting second.
    df_rel = pd.concat([ df_rel[df_rel['team1_bat_inning']==1][['inning1_runs']].rename(columns={'inning1_runs':'runs'}), \
                         df_rel[df_rel['team1_bat_inning']==2][['inning2_runs']].rename(columns={'inning2_runs':'runs'}) ] )
    return df_rel['runs'].mean() # return mean of the combined dataframe.

# Compute average runs scored by team1 in their last 15 games for train data.
train_data['team2only_avg_runs_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)

train_data['team_average_last15_ratio'] = (train_data['team1only_avg_runs_last15']+1)/(train_data['team2only_avg_runs_last15']+1)

train_data.drop(columns=['team1only_avg_runs_last15','team2only_avg_runs_last15'],inplace=True)


# Compute average runs scored by team1 in their last 15 games for test data.
test_data['team2only_avg_runs_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)
test_data['team_average_last15_ratio'] = (test_data['team1only_avg_runs_last15']+1)/(test_data['team2only_avg_runs_last15']+1)
test_data.drop(columns=['team1only_avg_runs_last15','team2only_avg_runs_last15'],inplace=True)



100%|██████████| 948/948 [00:05<00:00, 169.62it/s]
100%|██████████| 207/207 [00:01<00:00, 163.45it/s]


### Feature Name: team_count_3W_bowler_last15

In [34]:
def no3WLastn(player_list, date, n):
    '''
    Function to get the total number of instances where players in the roster of a team took 3 or more wickets in the last n games.

    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: Match date of the game to calculate this feature.
    3. n: Number of games to look back and create this feature.

    Output- None

    Returns- int value denoting the sum of instances where all players in the roster took 3 or more wickets.
    '''

    player_list = str(player_list).split(':') # split string of ':' separated ids into a list of ids
    res_list = []
    for player in player_list: # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl') # getting bowling stats from last n games for each player.
        df_rel['gte_3wruns'] = np.where(df_rel['wicket_count'] >= 3, 1, 0) # binary indicator to denote whether the player took 3 or more wickets in the game.
        res_list.append(np.nansum(df_rel['gte_3wruns'])) # Sum up number of 3+ wicket games for the player and append to a list. We will do this for all players.
    return np.nansum(res_list) # Sum up values of the list which is the sum of 3+ wicket games by all players in the roster.


# Computing number of 3+ wicket games in the last 15 games for team1 for the train dataset.
train_data['team1_count_3W_bowler_last15'] = train_data.progress_apply(lambda x: \
            no3WLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 3+ wicket games in the last 15 games for team2 for the train dataset.
train_data['team2_count_3W_bowler_last15'] = train_data.progress_apply(lambda x: \
            no3WLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# Taking the ratio of (number of 3+ wicket games in the last 15 games for team1) to (number of 3+ wicket games in the last 15 games for team2). Adding 1 to handle divide by zero exceptions.
train_data['team_count_3W_bowler_last15'] = (train_data['team1_count_3W_bowler_last15'] + 1) / (train_data['team2_count_3W_bowler_last15'] + 1)
train_data.drop(columns=['team1_count_3W_bowler_last15','team2_count_3W_bowler_last15'], inplace=True) # dropping intermediate columns

test_data['team1_count_3W_bowler_last15'] = test_data.progress_apply(lambda x: \
            no3WLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
test_data['team2_count_3W_bowler_last15'] = test_data.progress_apply(lambda x: \
            no3WLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)
test_data['team_count_3W_bowler_last15'] = (test_data['team1_count_3W_bowler_last15']) / (test_data['team2_count_3W_bowler_last15'] + 1)

test_data.drop(columns=['team1_count_3W_bowler_last15','team2_count_3W_bowler_last15'], inplace=True)


100%|██████████| 948/948 [00:43<00:00, 21.93it/s]
100%|██████████| 948/948 [00:41<00:00, 22.62it/s]
100%|██████████| 207/207 [00:08<00:00, 23.32it/s]
100%|██████████| 207/207 [00:09<00:00, 22.19it/s]


### Feature Name: team_performance_last15

In [35]:
def winpLastn(team_id, date, n, win_or_lose):
    '''
    Get a team's win or loss percentage in the last n games.

    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: Match date from which to get the last n historical games.
    3. n: Look-back window of games.
    4. win_or_lose: Indicator to calculate win percentage (1) or loss percentage (0).

    Output- None

    Returns- Float value denoting win or loss percentage of the team in the last n games.
    '''
    # Filter out games with either team1/2_id as input team_id, match_dt being before current game's date, sort descending by date, and get top n rows (games).
    df_rel = match_lvl_data[(match_lvl_data['match_dt'] < date) & \
                            ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))] \
                            .sort_values(by='match_dt', ascending=False).head(n)

    if win_or_lose == 1:
        win_count = df_rel[df_rel['winner_id'] == team_id].shape[0] # Count number of rows having winner as the input team.
        if win_count == 0:
            return 0
        return round(win_count * 100 / df_rel.shape[0], 2) # Return win percentage rounded to two decimal points.
    else:
        lose_count = df_rel[df_rel['winner_id'] != team_id].shape[0] # Count number of rows having winner not as the input team.
        if lose_count == 0:
            return 0
        return round(lose_count * 100 / df_rel.shape[0], 2) # Return loss percentage rounded to two decimal points.


def winPerformanceLastN(team_id, date, n):
    '''
    Calculate the win performance value for a team based on their win/loss record and the win percentages of their opponents in the last n games.

    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: Match date from which to get the last n historical games.
    3. n: Look-back window of games.

    Output- None

    Returns- Float value denoting the win performance of the team in the last n games.
    '''
    # Filter out games with either team1/2_id as input team_id, match_dt being before current game's date, sort descending by date, and get top n rows (games).
    df_rel = match_lvl_data[(match_lvl_data['match_dt'] < date) & \
                            ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))] \
                            .sort_values(by='match_dt', ascending=False).head(n)

    opponent_ids = []
    win_loss = []

    for _, row in df_rel.iterrows():
        if row['team1_id'] == team_id:
            opponent_ids.append(row['team2_id'])
        else:
            opponent_ids.append(row['team1_id'])

        win_loss.append(row['winner_id'] == team_id)

    value = 0
    for i in range(len(opponent_ids)):
        if win_loss[i]:
            value += winpLastn(opponent_ids[i], date, 10, 1)
        else:
            value -= winpLastn(opponent_ids[i], date, 10, 0)
    return value


# Compute team1's win performance value in the last 15 games for the train dataset.
train_data['team1_performance_last15'] = train_data.progress_apply(lambda x: \
                                  winPerformanceLastN(x['team1_id'], x['match_dt'], 15), axis=1)
# Compute team2's win performance value in the last 15 games for the train dataset.
train_data['team2_performance_last5'] = train_data.progress_apply(lambda x: \
                                  winPerformanceLastN(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the win performance difference between team1 and team2.
train_data['team_performance_last15'] = train_data['team1_performance_last15'] - train_data['team2_performance_last5']
train_data.drop(columns=['team1_performance_last15', 'team2_performance_last5'], inplace=True) # Drop intermediate columns.

# Compute team1's win performance value in the last 15 games for the test dataset.
test_data['team1_performance_last15'] = test_data.progress_apply(lambda x: \
                                  winPerformanceLastN(x['team1_id'], x['match_dt'], 15), axis=1)
# Compute team2's win performance value in the last 15 games for the test dataset.
test_data['team2_performance_last5'] = test_data.progress_apply(lambda x: \
                                  winPerformanceLastN(x['team2_id'], x['match_dt'], 15), axis=1)

# Compute the win performance difference between team1 and team2.
test_data['team_performance_last15'] = test_data['team1_performance_last15'] - test_data['team2_performance_last5']
test_data.drop(columns=['team1_performance_last15', 'team2_performance_last5'], inplace=True) # Drop intermediate columns.


100%|██████████| 948/948 [00:35<00:00, 26.48it/s]
100%|██████████| 948/948 [00:33<00:00, 28.16it/s]
100%|██████████| 207/207 [00:08<00:00, 24.12it/s]
100%|██████████| 207/207 [00:08<00:00, 24.04it/s]


### Keeping the Copy of preprocessed Data

In [36]:
train_copy = train_data.copy(deep=True)
test_copy = test_data.copy(deep=True)

In [37]:
train_data = train_copy.copy(deep=True)
test_data = test_copy.copy(deep=True)

### Handling Null Values

In [38]:
train_data['team_average_last15_ratio'].fillna(train_data['team_average_last15_ratio'].mean(), inplace=True)
train_data['ground_avg_runs_last15'].fillna(train_data['ground_avg_runs_last15'].mean(), inplace=True)
train_data['team_avg_wicket_last15'].fillna(train_data['team_avg_wicket_last15'].mean(), inplace=True)
train_data['ground_favorability_bat_first'].fillna(train_data['ground_favorability_bat_first'].mean(), inplace=True)

In [39]:
test_data['team_average_last15_ratio'].fillna(test_data['team_average_last15_ratio'].mean(), inplace=True)
test_data['ground_avg_runs_last15'].fillna(test_data['ground_avg_runs_last15'].mean(), inplace=True)
test_data['team_avg_wicket_last15'].fillna(test_data['team_avg_wicket_last15'].mean(), inplace=True)
test_data['ground_favorability_bat_first'].fillna(test_data['ground_favorability_bat_first'].mean(), inplace=True)

## Model Training

### Defining X_temp(Independent) and y_temp(Dependent)

In [40]:
# Separate the features and target variable
X_temp = train_data.drop(columns=['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'venue', 'city', 'match_dt', 'lighting', 'series_name','winner_01','series_type','toss decision','season',
       'ground_id'])
y_temp = train_data['winner_01']


Z_temp = test_data.drop(columns=['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'toss winner', 'series_type','toss decision','season',
       'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'ground_id'])


print(X_temp.shape)
print(Z_temp.shape)

(948, 24)
(207, 24)


In [41]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Initialize models with default hyperparameters
gbm_model = GradientBoostingClassifier(random_state=42)
lgbm_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
xgb_model = xgb.XGBClassifier(random_state=42)
cat_model = CatBoostClassifier(random_seed=42, silent=True)

# Train the models
gbm_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

# Make predictions and evaluate each model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy:.4f}")
    return model.predict_proba(X_test)[:, 1]  # Return the probability of class 1

y_pred_proba_gbm = evaluate_model(gbm_model, X_test, y_test)
y_pred_proba_lgbm = evaluate_model(lgbm_model, X_test, y_test)
y_pred_proba_xgb = evaluate_model(xgb_model, X_test, y_test)
y_pred_proba_cat = evaluate_model(cat_model, X_test, y_test)

# Ensemble predictions: weighted average of probabilities
weights = [0.25, 0.25, 0.25, 0.25]  # Equal weights for simplicity, can be adjusted based on performance
y_pred_proba_ensemble = (weights[0] * y_pred_proba_gbm + weights[1] * y_pred_proba_lgbm + weights[2] * y_pred_proba_xgb + weights[3] * y_pred_proba_cat)
y_pred_ensemble = (y_pred_proba_ensemble >= 0.5).astype(int)

# Evaluate the ensemble model
print("Weighted Average Ensemble Model Accuracy:", accuracy_score(y_test, y_pred_ensemble))

GradientBoostingClassifier Accuracy: 0.6663
LGBMClassifier Accuracy: 0.7111
XGBClassifier Accuracy: 0.6558
CatBoostClassifier Accuracy: 0.6642
Weighted Average Ensemble Model Accuracy: 0.671578947368421


### Hyperparameter Tuning and Cross Validation



In [42]:
# Define common hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize models
models = {
    'GBM': GradientBoostingClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_seed=42, silent=True)
}

# Perform cross-validation
cv_results = {}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store best models
best_models = {}

for name, model in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_temp, y_temp)
    best_models[name] = grid_search.best_estimator_
    cv_scores = cross_val_score(grid_search.best_estimator_, X_temp, y_temp, cv=kfold, scoring='accuracy')
    cv_results[name] = {
        'mean_score': np.mean(cv_scores),
        'std_score': np.std(cv_scores),
        'best_params': grid_search.best_params_
    }
    print(f"{name} Best Params: {cv_results[name]['best_params']}")
    print(f"{name} CV Mean Accuracy: {cv_results[name]['mean_score']:.4f} (Std: {cv_results[name]['std_score']:.4f})")


GBM Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
GBM CV Mean Accuracy: 0.7443 (Std: 0.0214)
LightGBM Best Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500}
LightGBM CV Mean Accuracy: 0.7623 (Std: 0.0271)
XGBoost Best Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
XGBoost CV Mean Accuracy: 0.7338 (Std: 0.0324)
CatBoost Best Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}
CatBoost CV Mean Accuracy: 0.7402 (Std: 0.0294)


In [43]:
# Initialize models with the best hyperparameters
gbm_model = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.01, random_state=42)
lgbm_model = lgb.LGBMClassifier(n_estimators=500, max_depth=3, learning_rate=0.05,  random_state=42, verbose=-1)
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.05,  random_state=42)
cat_model = CatBoostClassifier(iterations=200, depth=5, learning_rate=0.01, random_seed=42, silent=True)

# Train the models
gbm_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

# Make predictions and evaluate each model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy:.4f}")
    return model.predict_proba(X_test)[:, 1]  # Return the probability of class 1

y_pred_proba_gbm = evaluate_model(gbm_model, X_test, y_test)
y_pred_proba_lgbm = evaluate_model(lgbm_model, X_test, y_test)
y_pred_proba_xgb = evaluate_model(xgb_model, X_test, y_test)
y_pred_proba_cat = evaluate_model(cat_model, X_test, y_test)

# Ensemble predictions: weighted average of probabilities
weights = [0.25, 0.25, 0.25, 0.25]  # Equal weights for simplicity, can be adjusted based on performance
y_pred_proba_ensemble = (weights[0] * y_pred_proba_gbm + weights[1] * y_pred_proba_lgbm + weights[2] * y_pred_proba_xgb + weights[3] * y_pred_proba_cat)
y_pred_ensemble = (y_pred_proba_ensemble >= 0.5).astype(int)


# Evaluate the ensemble model
print("Weighted Average Ensemble Model Accuracy:", accuracy_score(y_test, y_pred_ensemble))

GradientBoostingClassifier Accuracy: 0.7047
LGBMClassifier Accuracy: 0.7511
XGBClassifier Accuracy: 0.7021
CatBoostClassifier Accuracy: 0.7200
Weighted Average Ensemble Model Accuracy: 0.7489473684210527


### Description of Features

In [44]:
description_mapping = {
    'team_count_50runs_last15': 'Ratio of the number of 50s scored by players in Team 1 to the number of 50s scored by players in Team 2 in the last 15 games',
    'team_winp_last5': 'Ratio of Team 1\'s win percentage to Team 2\'s win percentage in the last 5 games',
    'team1_winp_team2_last15': 'Team 1\'s win percentage against Team 2 in the last 15 games',
    'ground_avg_runs_last15': 'Average runs scored in the ground in the last 15 games',
    'team1_avg_per_total_bat_SR_last15': 'Average Strike rate of Team 1 batsman with respect to team2 in the last 15 games',
    'team1_avg_per_total_batting_AVG_last15': 'Batting Average of Team 1 batsman with respect to team2 in the last 15 games',
    'team1_per_total_bowling_economy_last15': 'Average Bowler Economy of Team 1 bowler with respect to team2 in the last 15 games',
    'team1_avg_per_total_bowl_AVG_last15': 'Bowling Average of Team 1 bowler with respect to team2 in the last 15 games',
    'captain_impact_team1': 'Impact of the captain of Team 1 with respect to team 2 by win ratio in last 15 matches',
    'ground_favorability_bat_first': 'Probability of favourability of batting first on that ground in last 15 matches',
    'lighting_favorability_bat_first': 'winning probability due to lighting for team 1 with respect to team 2',
    'team_num_spinners_ratio':'Ratio of number of spinners in team 1 and team 2',
    'team_num_fast_bowlers_ratio':'Ratio of number of fast bowlers in team 1 and team 2',
    'home_ground':'Weather it home ground or neutral ground ',
    'team_run_per_wicket_last15' : 'Ratio of runs per wicket of team1 by team2 in last 15 matches',
    'team_avg_wicket_last15' : 'Average wickets taken by team1 by team2 in last 15 matches',
    'team_count_100runs_last15' : 'Ratio of number of 100s scored by team1 by team2 in last 15 matches',
    'team_avg_economy_last15' : 'Average economy of team1 by team2 in last 15 matches',
    'team_boundary_rate_last15' : 'Average boundary rate of team1 by team2 in last 15 matches',
    'team_average_last15_ratio' : 'Ratio of average runs of team 1 and team 2 in last 15 matches',
    'ground_favorability_team_last15' : 'Ratio of ground favourabilty of team1 by team2 in last 15 matches',
    'team_run_rate_last15' : 'Ratio of runrate of team1 by team2 in last 15 matches',
    'team_count_3W_bowler_last15':'Ratio of the number of 3W+ taken by players in Team 1 to the number of 3W+ taken by players in Team 2 in the last 15 games',
    'team_performance_last15':'Performance value of team in their last 15 games, considering the strength of their opponents and the outcome of those matches before the current match date.'

}


### Feature Importance Analysis of the Best Hypertuned Model

In [45]:
lgbm_model = lgb.LGBMClassifier(n_estimators=500, max_depth=3, learning_rate=0.05,  random_state=42, verbose=-1)
lgbm_model.fit(X_temp, y_temp)
y_pred_proba_lgbm = lgbm_model.predict_proba(Z_temp)[:, 1]
y_pred_lgbm = lgbm_model.predict(Z_temp)
win_pred_team_id = test_data.loc[Z_temp.index, 'team1_id'].where(y_pred_lgbm == 0, test_data.loc[Z_temp.index, 'team2_id'])
feature_importance_lgbm = lgbm_model.feature_importances_
feature_importance_lgbm_normalized = feature_importance_lgbm / feature_importance_lgbm.sum()


feature_importance_df = pd.DataFrame({
    'feat_name': X_temp.columns,
    'feat_description': [description_mapping.get(feat, "ensemble_xgb_lgbm") for feat in X_temp.columns],
    'model_feat_imp_train': feature_importance_lgbm_normalized*100
})
feature_importance_df['feat_rank_train'] = feature_importance_df['model_feat_imp_train'].rank(ascending=False).astype(int)
feature_importance_df['feat_id'] = feature_importance_df['model_feat_imp_train'].rank(ascending=False).astype(int)

In [46]:
feature_importance_df = feature_importance_df.sort_values(['feat_rank_train']).reset_index(drop =True)
feature_importance_df = feature_importance_df[['feat_id', 'feat_name', 'feat_description', 'model_feat_imp_train', 'feat_rank_train' ]]
feature_importance_df

Unnamed: 0,feat_id,feat_name,feat_description,model_feat_imp_train,feat_rank_train
0,1,team1_per_total_bowling_economy_last15,Average Bowler Economy of Team 1 bowler with respect to team2 in the last 15 games...,7.796218,1
1,2,team_performance_last15,Performance value of team in their last 15 games...,6.175222,2
2,3,team1_avg_per_total_bowl_AVG_last15,Bowling Average of Team 1 bowler with respect to team2 in the last 15 games...,6.098032,3
3,4,ground_avg_runs_last15,Average runs scored in the ground in the last 15 games...,5.943651,4
4,5,team1_avg_per_total_batting_AVG_last15,Batting Average of Team 1 batsman with respect to team2 in the last 15 games...,5.673485,5
5,6,team_run_rate_last15,Ratio of runrate of team1 by team2 in last 15 matches...,5.480509,6
6,6,team_avg_economy_last15,Average economy of team1 by team2 in last 15 matches...,5.480509,6
7,8,team_avg_wicket_last15,Average wickets taken by team1 by team2 in last 15 matches,5.171748,8
8,9,team_run_per_wicket_last15,Ratio of runs per wicket of team1 by team2 in last 15 matches...,4.940178,9
9,9,team_average_last15_ratio,Ratio of average runs of team 1 and team 2 in the last 15 games...,4.940178,9


### Feature Selection Driven by Feature Importance

In [47]:
# Separate the features and target variable
X_train_fs = train_data.drop(columns=['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner', 'lighting_favorability_bat_first',
       'team_num_spinners_ratio', 'team_count_100runs_last15','team_winp_last5','team1_winp_team2_last15',
       'venue', 'city', 'match_dt', 'lighting', 'series_name','winner_01','series_type','toss decision','season',
       'ground_id'])
y_train_fs = train_data['winner_01']


Z_test = test_data.drop(columns=['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'toss winner', 'series_type','toss decision','lighting_favorability_bat_first',
       'team_num_spinners_ratio', 'team_count_100runs_last15','team_winp_last5','team1_winp_team2_last15',
       'venue', 'city', 'match_dt', 'lighting', 'series_name','season',
       'ground_id'])


print(X_train.shape)
print(Z_test.shape)

(758, 24)
(207, 19)


### Final Performance Evaluation

In [48]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_fs, y_train_fs, test_size=0.2, random_state=42)

# Initialize models with the best hyperparameters
gbm_model = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.01, random_state=42)
lgbm_model = lgb.LGBMClassifier(n_estimators=500, max_depth=3, learning_rate=0.05,  random_state=42, verbose=-1)
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.05,  random_state=42)
cat_model = CatBoostClassifier(iterations=200, depth=5, learning_rate=0.01, random_seed=42, silent=True)

# Train the models
gbm_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

# Make predictions and evaluate each model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy:.4f}")
    return model.predict_proba(X_test)[:, 1]  # Return the probability of class 1

y_pred_proba_gbm = evaluate_model(gbm_model, X_test, y_test)
y_pred_proba_lgbm = evaluate_model(lgbm_model, X_test, y_test)
y_pred_proba_xgb = evaluate_model(xgb_model, X_test, y_test)
y_pred_proba_cat = evaluate_model(cat_model, X_test, y_test)

# Ensemble predictions: weighted average of probabilities
weights = [0.25, 0.25, 0.25, 0.25]  # Equal weights for simplicity, can be adjusted based on performance
y_pred_proba_ensemble = (weights[0] * y_pred_proba_gbm + weights[1] * y_pred_proba_lgbm + weights[2] * y_pred_proba_xgb + weights[3] * y_pred_proba_cat)
y_pred_ensemble = (y_pred_proba_ensemble >= 0.5).astype(int)


# Evaluate the ensemble model
print("Weighted Average Ensemble Model Accuracy:", accuracy_score(y_test, y_pred_ensemble))


GradientBoostingClassifier Accuracy: 0.7242
LGBMClassifier Accuracy: 0.7858
XGBClassifier Accuracy: 0.7474
CatBoostClassifier Accuracy: 0.7211
Weighted Average Ensemble Model Accuracy: 0.7631578947368421


In [49]:
lgbm_model = lgb.LGBMClassifier(n_estimators=500, max_depth=3, learning_rate=0.05,  random_state=42, verbose=-1)
lgbm_model.fit(X_train_fs, y_train_fs)
y_pred_proba_lgbm = lgbm_model.predict_proba(Z_test)[:, 1]
y_pred_lgbm = lgbm_model.predict(Z_test)
win_pred_team_id = test_data.loc[Z_test.index, 'team1_id'].where(y_pred_lgbm == 0, test_data.loc[Z_test.index, 'team2_id'])
feature_importance_lgbm = lgbm_model.feature_importances_
feature_importance_lgbm_normalized = feature_importance_lgbm / feature_importance_lgbm.sum()


feature_importance_df = pd.DataFrame({
    'feat_name': X_train_fs.columns,
    'feat_description': [description_mapping.get(feat, "ensemble_xgb_lgbm") for feat in X_train_fs.columns],
    'model_feat_imp_train': feature_importance_lgbm_normalized*100
})
feature_importance_df['feat_rank_train'] = feature_importance_df['model_feat_imp_train'].rank(ascending=False).astype(int)
feature_importance_df['feat_id'] = feature_importance_df['model_feat_imp_train'].rank(ascending=False).astype(int)


feature_importance_df = feature_importance_df.sort_values(['feat_rank_train']).reset_index(drop =True)
feature_importance_df = feature_importance_df[['feat_id', 'feat_name', 'feat_description', 'model_feat_imp_train', 'feat_rank_train' ]]
feature_importance_df

Unnamed: 0,feat_id,feat_name,feat_description,model_feat_imp_train,feat_rank_train
0,1,team1_per_total_bowling_economy_last15,Average Bowler Economy of Team 1 bowler with r...,7.334526,1
1,2,team_performance_last15,Performance value of team in their last 15 gam...,7.24508,2
2,3,team1_avg_per_total_bowl_AVG_last15,Bowling Average of Team 1 bowler with respect ...,7.155635,3
3,4,ground_avg_runs_last15,Average runs scored in the ground in the last ...,6.708408,4
4,5,team1_avg_per_total_batting_AVG_last15,Batting Average of Team 1 batsman with respect...,6.529517,5
5,6,team_run_rate_last15,Ratio of runrate of team1 by team2 in last 15 ...,6.529517,6
6,7,team_avg_economy_last15,Average economy of team1 by team2 in last 15 m...,6.440071,7
7,8,team_avg_wicket_last15,Average wickets taken by team1 by team2 in las...,6.350626,8
8,9,team_run_per_wicket_last15,Ratio of runs per wicket of team1 by team2 in ...,6.171735,9
9,10,team_average_last15_ratio,Ratio of average runs of team 1 and team 2 in ...,5.903399,10
