In [1]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
match_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/664389efa0868_match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b548c98c_batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b2c60743_bowler_level_scorecard.csv')
train_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')
test_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [3]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

## Feature Functions

In [4]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.

    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}

    Output-None

    Returns- dataframe having bowling/batting stats from last n games of a player before an input date.
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [5]:
def calculate_batsman_strike_rate(player_list, date, n):
    player_list = str(player_list).split(':')  # Split string of ':' separated ids into a list of ids
    res_list = []

    for player in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        runs = recent_matches['runs'].sum()
        balls = recent_matches['balls_faced'].sum()

        # Calculate strike rate and avoid division by zero
        if balls > 0:
            strike_rate = (runs / balls) * 100
        else:
            strike_rate = 0

        res_list.append(strike_rate)

    # Calculate the average strike rate across all players in the list
    if res_list:  # Ensure the list is not empty to avoid division by zero
        average_strike_rate = sum(res_list) / len(res_list)
    else:
        average_strike_rate = 0

    return average_strike_rate


def calculate_bowler_economy_rate(player_list, date, n):
    player_list = player_list.split(':')
    economy_rates = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        runs_conceded = recent_matches['runs'].sum()
        overs = (recent_matches['balls_bowled'].sum()) / 6
        economy_rate = (runs_conceded / overs) if overs > 0 else 0
        economy_rates.append(economy_rate)
    return sum(economy_rates) / len(economy_rates) if economy_rates else 0


def average_wickets_taken(player_list, date, n):
    player_list = player_list.split(':')
    wickets_list = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_wickets = recent_matches['wicket_count'].sum()
        average_wickets = total_wickets / n if n > 0 else 0
        wickets_list.append(average_wickets)
    return sum(wickets_list) / len(wickets_list) if wickets_list else 0


def team_batting_strength(player_list, date, n):
    players = player_list.split(':')
    total_runs = 0
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        total_runs += recent_matches['runs'].sum()
    return total_runs / len(players) if players else 0

In [6]:
train_data['team1_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
train_data['team2_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

train_data['team1_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
train_data['team2_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

train_data['team1_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
train_data['team2_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

train_data['team1_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
train_data['team2_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:15<00:00, 59.91it/s]
100%|██████████| 948/948 [00:16<00:00, 58.92it/s]
100%|██████████| 948/948 [00:12<00:00, 76.29it/s]
100%|██████████| 948/948 [00:12<00:00, 76.52it/s]
100%|██████████| 948/948 [00:12<00:00, 78.03it/s]
100%|██████████| 948/948 [00:12<00:00, 78.64it/s]
100%|██████████| 948/948 [00:15<00:00, 61.00it/s]
100%|██████████| 948/948 [00:15<00:00, 60.92it/s]


## More features

In [7]:
def player_form_factor(player_list, date, n):
    players = player_list.split(':')
    form_factors = []
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            weights = np.linspace(1, 2, len(recent_matches))  # Linear weight increase from 1 to 2
            form_factors.append(np.average(recent_matches['runs'], weights=weights))
        else:
            form_factors.append(0)
    return np.mean(form_factors) if players else 0

In [8]:
train_data['team1_form_factor'] = train_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
train_data['team2_form_factor'] = train_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)


100%|██████████| 948/948 [00:15<00:00, 59.79it/s]
100%|██████████| 948/948 [00:16<00:00, 56.12it/s]


In [9]:
def head_to_head_performance(team1_id, team2_id, date, match_lvl_data):
    # Filter matches between the two teams up to the specified date
    relevant_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
         (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id))
    ]

    # Calculate win percentage for team1
    team1_wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
    win_percentage = team1_wins / len(relevant_matches) if len(relevant_matches) > 0 else 0

    # Calculate average score for team1 in these matches
    # Assuming scores are in 'inning1_runs' or 'inning2_runs' based on which team was batting first
    team1_scores = relevant_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team1_id else x['inning2_runs'], axis=1
    )
    team1_avg_score = team1_scores.mean() if len(team1_scores) > 0 else 0

    return win_percentage, team1_avg_score

In [10]:
# Applying the function to the train_data DataFrame
train_data[['team1_win_percentage', 'team1_avg_score']] = train_data.apply(
    lambda x: head_to_head_performance(
        team1_id=x['team1_id'],
        team2_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1, result_type='expand'
)

In [11]:
def recent_performance(team_id, date, n, match_lvl_data):
    recent_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ].tail(n)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / len(recent_matches) if len(recent_matches) > 0 else 0

def average_winning_margin(team_id, date, match_lvl_data):
    winning_matches = match_lvl_data[
        (match_lvl_data['winner_id'] == team_id) & (match_lvl_data['match_dt'] < date)
    ]
    if len(winning_matches) > 0:
        runs_wins = winning_matches[winning_matches['by'] == 'runs']['win amount']
        wickets_wins = winning_matches[winning_matches['by'] == 'wickets']['win amount']
        average_margin = pd.concat([runs_wins, wickets_wins]).mean()
        return average_margin
    return 0

def team_scoring_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_scores = team_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1
    )
    return team_scores.mean() if len(team_scores) > 0 else 0

def team_wicket_loss_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_wickets = team_matches.apply(
        lambda x: x['inning1_wickets'] if x['team1_id'] == team_id else x['inning2_wickets'], axis=1
    )
    return team_wickets.mean() if len(team_wickets) > 0 else 0

In [12]:
n_matches = 15  # Number of recent matches to consider
train_data['team1_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=n_matches,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_wicket_loss_average'] = train_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

In [13]:
n_matches = 15  # Number of recent matches to consider
train_data['team2_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team2_id'],
        date=x['match_dt'],
        n=n_matches,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_wicket_loss_average'] = train_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

## More Features

In [14]:
def average_score_by_venue(venue_id, date, match_lvl_data):
    # Filter matches based on the venue and date
    relevant_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) & (match_lvl_data['match_dt'] < date)]
    average_score = (relevant_matches['inning1_runs'].sum() + relevant_matches['inning2_runs'].sum()) / (2 * len(relevant_matches))
    return average_score if not pd.isna(average_score) else 0

def team_win_rate_at_venue(team_id, venue_id, date, match_lvl_data):
    # Filter matches where the team played at the given venue up to the specified date
    relevant_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < date)]
    wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    win_rate = wins / len(relevant_matches) if len(relevant_matches) > 0 else 0
    return win_rate

def most_frequent_matchups(team_id, match_lvl_data):
    # Filter matches involving the team
    relevant_matches = match_lvl_data[(match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)]
    opponent_counts = relevant_matches.apply(lambda x: x['team2_id'] if x['team1_id'] == team_id else x['team1_id'], axis=1).value_counts()
    return opponent_counts

More features new

In [15]:
def batsman_performance_index(bat_df, match_id, date):
    relevant_batsmen = bat_df[(bat_df['match id'] == match_id) & (bat_df['match_dt'] < date)]
    relevant_batsmen['performance_index'] = (relevant_batsmen['runs'] * 0.5 +
                                             relevant_batsmen['strike_rate'] * 0.3 +
                                             (relevant_batsmen['Fours'] + relevant_batsmen['Sixes'] * 2) * 0.2) * \
                                            (1 + 0.1 * relevant_batsmen['is_batsman_captain'] + 0.05 * relevant_batsmen['is_batsman_keeper'])
    return relevant_batsmen['performance_index'].sum()



def team_batsman_performance_index(bat_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_performance_index = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            player_performance = (recent_matches['runs'] * 0.5 +
                                  recent_matches['strike_rate'] * 0.3 +
                                  (recent_matches['Fours'] + recent_matches['Sixes'] * 2) * 0.2) * \
                                  (1 + 0.1 * recent_matches['is_batsman_captain'].iloc[0] +
                                   0.05 * recent_matches['is_batsman_keeper'].iloc[0])
            total_performance_index += player_performance.sum()

    return total_performance_index


def team_bowler_impact_score(bowler_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_impact_score = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        if not recent_matches.empty:
            player_impact = (recent_matches['wicket_count'] * 2 +
                             (120 / recent_matches['economy']) * 0.5 +
                             recent_matches['maiden'] * 1) * \
                            (1 + 0.1 * recent_matches['is_bowler_captain'].iloc[0] +
                             0.05 * recent_matches['is_bowler_keeper'].iloc[0])
            total_impact_score += player_impact.sum()

    return total_impact_score


In [17]:
def venue_performance(match_lvl_data, venue_id, date):
    historical_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) & (match_lvl_data['match_dt'] < date)]
    average_runs = historical_matches[['inning1_runs', 'inning2_runs']].mean().mean()
    win_rate = historical_matches['winner'].value_counts(normalize=True)
    return average_runs, win_rate

train_data['venue_avg_runs'], train_data['venue_win_rate'] = zip(*train_data.apply(
    lambda x: venue_performance(match_lvl_data, x['ground_id'], x['match_dt']),
    axis=1
))


# def lighting_performance(match_lvl_data, lighting_type, date):
#     matches = match_lvl_data[(match_lvl_data['lighting'] == lighting_type) & (match_lvl_data['match_dt'] < date)]
#     win_rate = matches['winner'].value_counts(normalize=True)
#     return win_rate

# train_data['lighting_win_rate'] = train_data.apply(
#     lambda x: lighting_performance(match_lvl_data, x['lighting'], x['match_dt']),
#     axis=1
# )


def lighting_performance(match_lvl_data, lighting_type, team_id, date):
    matches = match_lvl_data[(match_lvl_data['lighting'] == lighting_type) & (match_lvl_data['match_dt'] < date)]
    if len(matches) > 0:
        wins = matches[matches['winner_id'] == team_id].shape[0]
        return wins / len(matches)
    return 0  # Return 0 if no matches found under this condition

# Applying the function to each row and lighting condition
train_data['team1_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team2_id'], x['match_dt']),
    axis=1
)
train_data['team1_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team2_id'], x['match_dt']),
    axis=1
)
train_data['team1_day_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day/night match', x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_day_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day/night match', x['team2_id'], x['match_dt']),
    axis=1
)



def toss_advantage(match_lvl_data, team_id, date):             #( WRONG IMPLEMENTATION )
    matches = match_lvl_data[(match_lvl_data['toss winner'] == team_id) & (match_lvl_data['match_dt'] < date)]
    wins_after_toss_win = matches[matches['winner'] == team_id].shape[0]
    if matches.shape[0] > 0:
        return wins_after_toss_win / matches.shape[0]
    return 0

train_data['team1_toss_advantage'] = train_data.apply(
    lambda x: toss_advantage(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_toss_advantage'] = train_data.apply(
    lambda x: toss_advantage(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)
def series_type_performance(match_lvl_data, team_id, series_type, date):
    relevant_matches = match_lvl_data[(match_lvl_data['series_type'] == series_type) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < date)]
    wins = relevant_matches['winner'].value_counts().get(team_id, 0)
    total = relevant_matches.shape[0]
    if total > 0:
        return wins / total
    return 0

# train_data['team1_series_type_perf'] = train_data.apply(
#     lambda x: series_type_performance(match_lvl_data, x['team1_id'], x['series_type'], x['match_dt']),
#     axis=1
# )
# train_data['team2_series_type_perf'] = train_data.apply(
#     lambda x: series_type_performance(match_lvl_data, x['team2_id'], x['series_type'], x['match_dt']),
#     axis=1
# )


New new new features

In [18]:
def adjusted_team_venue_win_rate(match_lvl_data, team_id, venue_id, date):
    # Filter matches at the venue for the specific team before the given date
    venue_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                   ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                   (match_lvl_data['match_dt'] < date)]
    # Calculate basic win rate
    wins = venue_matches[venue_matches['winner_id'] == team_id].shape[0]
    total = venue_matches.shape[0]
    win_rate = wins / total if total > 0 else 0

    # Adjust win rate by recent form (last 5 matches)
    recent_form = venue_matches.tail(5)
    recent_wins = recent_form[recent_form['winner_id'] == team_id].shape[0]
    recent_total = recent_form.shape[0]
    recent_win_rate = recent_wins / recent_total if recent_total > 0 else 0

    # Combine basic and recent win rates
    if total > 0:
        adjusted_win_rate = (win_rate * 0.75) + (recent_win_rate * 0.25)
    else:
        adjusted_win_rate = 0

    return adjusted_win_rate

# Application
train_data['team1_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_team_venue_win_rate(match_lvl_data, x['team1_id'], x['ground_id'], x['match_dt']),
    axis=1
)
train_data['team2_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_team_venue_win_rate(match_lvl_data, x['team2_id'], x['ground_id'], x['match_dt']),
    axis=1
)


In [19]:
# def venue_lighting_performance(match_lvl_data, venue_id, lighting_condition, date):
#     filtered_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
#                                       (match_lvl_data['lighting'] == lighting_condition) &
#                                       (match_lvl_data['match_dt'] < date)]
#     wins = filtered_matches['winner'].value_counts(normalize=True)
#     return wins

# # Application example (assuming binary outcomes for simplicity)
# train_data['venue_night_win_rate'] = train_data.apply(
#     lambda x: venue_lighting_performance(match_lvl_data, x['ground_id'], 'night match', x['match_dt']),
#     axis=1
# )


In [20]:
def toss_strategy_impact(match_lvl_data, team_id, toss_decision, venue_id, date):  #( WRONG IMPLEMENTATION )
    matches = match_lvl_data[(match_lvl_data['toss winner'] == team_id) &
                             (match_lvl_data['toss decision'] == toss_decision) &
                             (match_lvl_data['ground_id'] == venue_id) &
                             (match_lvl_data['match_dt'] < date)]
    wins = matches[matches['winner_id'] == team_id].shape[0]
    total = matches.shape[0]
    return wins / total if total > 0 else 0

# Application
train_data['team1_toss_strategy_at_venue'] = train_data.apply(
    lambda x: toss_strategy_impact(match_lvl_data, x['team1_id'], x['toss decision'], x['ground_id'], x['match_dt']),
    axis=1
)
train_data['team2_toss_strategy_at_venue'] = train_data.apply(
    lambda x: toss_strategy_impact(match_lvl_data, x['team2_id'], x['toss decision'], x['ground_id'], x['match_dt']),
    axis=1
)


In [21]:
def overall_historical_win_rate(match_lvl_data, team_id, date):
    historical_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                        (match_lvl_data['match_dt'] < date)]
    wins = historical_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    total_matches = len(historical_matches)
    return wins / total_matches if total_matches > 0 else 0

train_data['team1_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)


In [22]:
def recent_form(match_lvl_data, team_id, date):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(5)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / 5 if recent_matches.shape[0] > 0 else 0

train_data['team1_recent_form'] = train_data.apply(
    lambda x: recent_form(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_recent_form'] = train_data.apply(
    lambda x: recent_form(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)


In [23]:
def head_to_head_win_rate(match_lvl_data, team1_id, team2_id, date):
    head_to_head_matches = match_lvl_data[((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
                                           (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
                                          (match_lvl_data['match_dt'] < date)]
    wins = head_to_head_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
    total = len(head_to_head_matches)
    return wins / total if total > 0 else 0

train_data['team1_head_to_head_win_rate'] = train_data.apply(
    lambda x: head_to_head_win_rate(match_lvl_data, x['team1_id'], x['team2_id'], x['match_dt']),
    axis=1
)


Applying on train data

In [24]:
# Example for integrating one of the functions
train_data['batsman_performance_index'] = train_data.apply(
    lambda x: batsman_performance_index(batsman_lvl_data, x['match id'], x['match_dt']),
    axis=1
)

# Similar integration can be done for other functions


In [25]:
train_data['team1_batsman_performance_index'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)
train_data['team2_batsman_performance_index'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team1_bowler_impact_score'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)
train_data['team2_bowler_impact_score'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)


In [26]:
def adjusted_venue_win_rate_with_recent_form(match_lvl_data, team_id, venue_id, date):
    # Historical win rate at the venue
    venue_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                   ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                   (match_lvl_data['match_dt'] < date)]
    venue_wins = venue_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    venue_total = len(venue_matches)

    # Recent form (last 5 matches overall)
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(5)
    recent_wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    recent_total = len(recent_matches)

    # Combine both metrics
    venue_win_rate = venue_wins / venue_total if venue_total > 0 else 0
    recent_win_rate = recent_wins / recent_total if recent_total > 0 else 0
    adjusted_win_rate = 0.7 * venue_win_rate + 0.3 * recent_win_rate  # Weighted average

    return adjusted_win_rate

train_data['team1_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate_with_recent_form(match_lvl_data, x['team1_id'], x['ground_id'], x['match_dt']),
    axis=1
)
train_data['team2_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate_with_recent_form(match_lvl_data, x['team2_id'], x['ground_id'], x['match_dt']),
    axis=1
)


In [27]:
train_data['average_score_at_venue'] = train_data.apply(
    lambda x: average_score_by_venue(
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team1_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team2_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

  average_score = (relevant_matches['inning1_runs'].sum() + relevant_matches['inning2_runs'].sum()) / (2 * len(relevant_matches))


In [28]:
def scoring_consistency(match_lvl_data, team_id, date):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(10)
    runs_scored = recent_matches.apply(lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1)
    if len(runs_scored) > 0:
        mean_runs = runs_scored.mean()
        variance = runs_scored.var()
        consistency_index = mean_runs / variance if variance != 0 else mean_runs
    else:
        consistency_index = 0
    return consistency_index

train_data['team1_scoring_consistency'] = train_data.apply(
    lambda x: scoring_consistency(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_scoring_consistency'] = train_data.apply(
    lambda x: scoring_consistency(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)


In [29]:
def wicket_loss_variance(match_lvl_data, team_id, date):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(10)
    wickets_lost = recent_matches.apply(lambda x: x['inning1_wickets'] if x['team1_id'] == team_id else x['inning2_wickets'], axis=1)
    variance = wickets_lost.var() if len(wickets_lost) > 0 else 0
    return variance

train_data['team1_wicket_loss_variance'] = train_data.apply(
    lambda x: wicket_loss_variance(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_wicket_loss_variance'] = train_data.apply(
    lambda x: wicket_loss_variance(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)


In [30]:
import numpy as np

def calculate_exponential_momentum(match_lvl_data, team_id, date, alpha=0.1):
    matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                             (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False)
    wins = matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).to_numpy()
    weights = np.exp(-alpha * np.arange(len(wins)))
    if np.sum(weights) > 0:
        momentum = np.dot(wins, weights) / np.sum(weights)
    else:
        momentum = 0  # Handle case with no matches
    return momentum


train_data['team1_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)


In [31]:
# from sklearn.cluster import KMeans

# def create_performance_clusters(match_lvl_data, n_clusters=4):
#     # Feature engineering for clustering
#     teams_stats = match_lvl_data.groupby('team1_id').agg(
#         average_score=('inning1_runs', 'mean'),
#         win_rate=('winner_id', lambda x: np.mean(x == match_lvl_data['team1_id']))
#     ).fillna(0)

#     # K-means clustering
#     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
#     clusters = kmeans.fit_predict(teams_stats)
#     teams_stats['cluster'] = clusters

#     return teams_stats['cluster'].to_dict()

# team_clusters = create_performance_clusters(match_lvl_data)

# train_data['team1_cluster'] = train_data['team1_id'].map(team_clusters)
# train_data['team2_cluster'] = train_data['team2_id'].map(team_clusters)


In [32]:
train_data.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'season', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'winner_01', 'team1_strike_rate',
       'team2_strike_rate', 'team1_bowler_eco', 'team2_bowler_eco',
       'team1_avg_wicket', 'team2_avg_wicket', 'team1_batting_strength',
       'team2_batting_strength', 'team1_form_factor', 'team2_form_factor',
       'team1_win_percentage', 'team1_avg_score', 'team1_recent_win_rate',
       'team1_average_winning_margin', 'team1_scoring_average',
       'team1_wicket_loss_average', 'team2_recent_win_rate',
       'team2_average_winning_margin', 'team2_scoring_average',
       'team2_wicket_loss_average', 'venue_avg_runs', 'venue_win_rate',
       'team1_n

In [33]:
def win_rate_against_recent_form(team_id, venue_id, date, match_lvl_data, n=5):  # ( WRONG IMPLEMENTATION )
    # Filter relevant matches
    relevant_matches = match_lvl_data[(match_lvl_data['venue'] == venue_id) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < date)]

    # Calculate opponent's recent form
    relevant_matches['opponent_id'] = relevant_matches.apply(lambda x: x['team1_id'] if x['team2_id'] == team_id else x['team2_id'], axis=1)
    recent_forms = {}
    for opponent in relevant_matches['opponent_id'].unique():
        opponent_matches = match_lvl_data[(match_lvl_data['team1_id'] == opponent) | (match_lvl_data['team2_id'] == opponent)]
        recent_wins = opponent_matches.tail(n)['winner'].apply(lambda x: 1 if x == opponent else 0).sum()
        recent_forms[opponent] = recent_wins / n

    relevant_matches['opponent_form'] = relevant_matches['opponent_id'].map(recent_forms)
    weighted_wins = (relevant_matches['winner'].apply(lambda x: 1 if x == team_id else 0) * relevant_matches['opponent_form']).sum()
    win_rate = weighted_wins / relevant_matches['opponent_form'].sum() if relevant_matches['opponent_form'].sum() > 0 else 0
    return win_rate

train_data['team1_win_rate_against_form'] = train_data.apply(
    lambda x: win_rate_against_recent_form(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
train_data['team2_win_rate_against_form'] = train_data.apply(
    lambda x: win_rate_against_recent_form(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)


In [34]:
def adjusted_win_rate_at_venue(match_lvl_data, team_id, venue, match_dt):
    # Filter matches for the team at this venue before this date
    relevant_matches = match_lvl_data[(match_lvl_data['venue'] == venue) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < match_dt)]
    opponent_strengths = relevant_matches.apply(
        lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'] if x['team1_id'] == team_id else x['team1_id'], x['match_dt']),
        axis=1
    )
    wins = (relevant_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0) * opponent_strengths).sum()
    return wins / opponent_strengths.sum() if opponent_strengths.sum() > 0 else 0

train_data['team1_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_win_rate_at_venue(match_lvl_data, x['team1_id'], x['venue'], x['match_dt']),
    axis=1
)
train_data['team2_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_win_rate_at_venue(match_lvl_data, x['team2_id'], x['venue'], x['match_dt']),
    axis=1
)


In [35]:
def performance_after_loss(match_lvl_data, team_id, match_dt):
    past_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                  (match_lvl_data['match_dt'] < match_dt)].sort_values(by='match_dt', ascending=False)
    if not past_matches.empty and past_matches.iloc[0]['winner_id'] != team_id:
        # Find next match result
        if past_matches.shape[0] > 1:
            return 1 if past_matches.iloc[1]['winner_id'] == team_id else 0
    return None  # Not applicable if no match after a loss

train_data['team1_post_loss_performance'] = train_data.apply(
    lambda x: performance_after_loss(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_post_loss_performance'] = train_data.apply(
    lambda x: performance_after_loss(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)


In [36]:
def scoring_variance_at_venue(match_lvl_data, team_id, venue, match_dt):
    scores = match_lvl_data[((match_lvl_data['venue'] == venue) &
                             ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                             (match_lvl_data['match_dt'] < match_dt))].apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'],
        axis=1
    )
    if len(scores) > 1:
        return np.var(scores)
    return None  # Not applicable if fewer than 2 scores

train_data['team1_scoring_variance_at_venue'] = train_data.apply(
    lambda x: scoring_variance_at_venue(match_lvl_data, x['team1_id'], x['venue'], x['match_dt']),
    axis=1
)
train_data['team2_scoring_variance_at_venue'] = train_data.apply(
    lambda x: scoring_variance_at_venue(match_lvl_data, x['team2_id'], x['venue'], x['match_dt']),
    axis=1
)


In [37]:
# def player_impact_score(player_roster, match_lvl_data, date, n):
#     player_ids = player_roster.split(':')  # Splitting player IDs
#     total_impact_score = 0

#     for player_id in player_ids:
#         # Fetch the last n matches before the given date for each player
#         player_matches = match_lvl_data[(match_lvl_data['player_id'] == player_id) &
#                                         (match_lvl_data['date'] < date)].sort_values('date', ascending=False).head(n)
#         if not player_matches.empty:
#             # Summing up an example impact score, could be runs, wickets, etc.
#             impact_score = player_matches['impact_metric'].sum()  # Assuming 'impact_metric' is a column
#             total_impact_score += impact_score

#     return total_impact_score

# # Apply to DataFrame
# train_data['team1_impact_score'] = train_data.apply(
#     lambda x: player_impact_score(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 10), axis=1)
# train_data['team2_impact_score'] = train_data.apply(
#     lambda x: player_impact_score(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 10), axis=1)


In [38]:
# def player_impact_score(player_ids, match_lvl_data, date, n):
#     player_ids = player_ids.split(':')
#     total_impact_score = 0

#     for player_id in player_ids:
#         player_matches = match_lvl_data[(match_lvl_data['match_dt'] < date) &
#                                         (match_lvl_data['player_id'] == player_id)].sort_values(by='date', ascending=False).head(n)
#         if not player_matches.empty:
#             # Example calculation, modify based on your criteria
#             impact_score = player_matches['impact_score'].sum()
#             total_impact_score += impact_score

#     return total_impact_score

# train_data['team1_impact_score'] = train_data.progress_apply(
#     lambda x: player_impact_score(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 10), axis=1)
# train_data['team2_impact_score'] = train_data.progress_apply(
#     lambda x: player_impact_score(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 10), axis=1)


In [39]:
def player_impact_score(player_roster, batsman_data, bowler_data, date, n):  # ( WRONG IMPLEMENTATION )
    player_ids = player_roster.split(':')
    total_impact_score = 0

    for player_id in player_ids:
        batsman_matches = batsman_data[(batsman_data['batsman_id'] == player_id) &
                                       (batsman_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        bowler_matches = bowler_data[(bowler_data['bowler_id'] == player_id) &
                                     (bowler_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

        batsman_score = batsman_matches['runs'].sum()  # Example metric
        bowler_score = bowler_matches['wicket_count'].sum()  # Example metric
        total_impact_score += (batsman_score + bowler_score)

    return total_impact_score  # Ensures a single scalar is returned

def player_form_factor(player_roster, batsman_data, date, n):    # ( WRONG IMPLEMENTATION )
    player_ids = player_roster.split(':')
    total_form_factor = 0
    players_counted = 0

    for player_id in player_ids:
        player_matches = batsman_data[(batsman_data['batsman_id'] == player_id) &
                                      (batsman_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        if not player_matches.empty:
            runs_scored = player_matches['runs'].sum()
            balls_faced = player_matches['balls_faced'].sum()
            strike_rate = (runs_scored / balls_faced * 100) if balls_faced else 0
            total_form_factor += strike_rate
            players_counted += 1

    return total_form_factor / players_counted if players_counted else 0


In [40]:
train_data['team1_impact_score'] = train_data.apply(
    lambda x: player_impact_score(x['team1_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)
train_data['team2_impact_score'] = train_data.apply(
    lambda x: player_impact_score(x['team2_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)

train_data['team1_form_factor'] = train_data.apply(
    lambda x: player_form_factor(x['team1_roster_ids'], batsman_lvl_data, x['match_dt'], 15), axis=1)
train_data['team2_form_factor'] = train_data.apply(
    lambda x: player_form_factor(x['team2_roster_ids'], batsman_lvl_data, x['match_dt'], 15), axis=1)

# Repeat for test_data if needed


In [41]:
def historical_matchup(team1_id, team2_id, match_dt, match_lvl_data):
    matchups = match_lvl_data[((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
                              (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
                              (match_lvl_data['match_dt'] < match_dt)]
    wins = matchups['winner_id'].apply(lambda winner_id: 1 if winner_id == team1_id else 0).sum()
    total_matches = len(matchups)
    return wins / total_matches if total_matches > 0 else 0

train_data['team1_historical_matchup_win_pct'] = train_data.apply(
    lambda x: historical_matchup(x['team1_id'], x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)


In [42]:
def recent_team_form(team_id, match_dt, match_lvl_data, last_n=5):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < match_dt)].sort_values('match_dt', ascending=False).head(last_n)
    wins = recent_matches['winner_id'].apply(lambda winner_id: 1 if winner_id == team_id else 0).sum()
    return wins / last_n if last_n > 0 else 0

train_data['team1_recent_form'] = train_data.apply(
    lambda x: recent_team_form(x['team1_id'], x['match_dt'], match_lvl_data),
    axis=1)
train_data['team2_recent_form'] = train_data.apply(
    lambda x: recent_team_form(x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)


In [43]:
def venue_winning_percentage(team_id, venue_id, match_dt, match_lvl_data):
    historical_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                        (match_lvl_data['match_dt'] < match_dt)]
    total_matches = len(historical_matches)
    wins = len(historical_matches[historical_matches['winner_id'] == team_id])
    return wins / total_matches if total_matches > 0 else 0

train_data['team1_venue_win_pct'] = train_data.apply(
    lambda x: venue_winning_percentage(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1)
train_data['team2_venue_win_pct'] = train_data.apply(
    lambda x: venue_winning_percentage(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1)


In [44]:
def team_win_rate_under_condition(team_id, condition_col, condition_val, date, match_lvl_data):
    # Filter matches based on the given condition up to the specified date
    relevant_matches = match_lvl_data[(match_lvl_data[condition_col] == condition_val) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < date)]
    wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    win_rate = wins / len(relevant_matches) if len(relevant_matches) > 0 else 0
    return win_rate

# Example usage: Calculate team win rate for 'day' lighting condition
train_data['team1_day_light_win_rate'] = train_data.apply(
    lambda x: team_win_rate_under_condition(x['team1_id'], 'lighting', 'day', x['match_dt'], match_lvl_data),
    axis=1)
train_data['team2_day_light_win_rate'] = train_data.apply(
    lambda x: team_win_rate_under_condition(x['team2_id'], 'lighting', 'day', x['match_dt'], match_lvl_data),
    axis=1)


In [45]:
def team_venue_scoring(team_id, venue_id, date, match_lvl_data):  # ( WRONG IMPLEMENTATION )
    matches_at_venue = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < date)]
    total_runs_scored = matches_at_venue.apply(lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1).sum()
    total_runs_conceded = matches_at_venue.apply(lambda x: x['inning2_runs'] if x['team1_id'] == team_id else x['inning1_runs'], axis=1).sum()
    matches_count = len(matches_at_venue)
    avg_runs_scored = total_runs_scored / matches_count if matches_count > 0 else 0
    avg_runs_conceded = total_runs_conceded / matches_count if matches_count > 0 else 0
    return avg_runs_scored, avg_runs_conceded

train_data['team1_avg_runs_scored_at_venue'], train_data['team1_avg_runs_conceded_at_venue'] = zip(*train_data.apply(
    lambda x: team_venue_scoring(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1))
train_data['team2_avg_runs_scored_at_venue'], train_data['team2_avg_runs_conceded_at_venue'] = zip(*train_data.apply(
    lambda x: team_venue_scoring(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1))


In [46]:
def adjusted_venue_win_rate(team_id, venue_id, date, match_lvl_data):
    relevant_matches = match_lvl_data[
        (match_lvl_data['ground_id'] == venue_id) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
        (match_lvl_data['match_dt'] < date)
    ]

    # Calculate wins considering toss and lighting as factors
    if not relevant_matches.empty:
        wins = relevant_matches.apply(
            lambda x: 1 if (x['winner_id'] == team_id and x['toss_winner'] == team_id and x['lighting'] == 'day') else 0, axis=1
        ).sum()
        total_matches = len(relevant_matches)
        win_rate = wins / total_matches if total_matches > 0 else 0
        return win_rate
    return 0

# Applying the feature to the DataFrame
train_data['team1_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
train_data['team2_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)


In [47]:
def historical_matchup_at_venue(team1_id, team2_id, venue_id, date, match_lvl_data):
    head_to_head_matches = match_lvl_data[
        ((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
         (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
        (match_lvl_data['ground_id'] == venue_id) &
        (match_lvl_data['match_dt'] < date)
    ]

    if not head_to_head_matches.empty:
        wins = head_to_head_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
        total_matches = len(head_to_head_matches)
        return wins / total_matches if total_matches > 0 else 0
    return 0

train_data['team1_head_to_head_venue_win_rate'] = train_data.apply(
    lambda x: historical_matchup_at_venue(x['team1_id'], x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)


In [48]:
def head_to_head_win_rate(team1_id, team2_id, match_lvl_data):
    team1_matches = match_lvl_data[(match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id)]
    team2_matches = match_lvl_data[(match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)]
    total_matches = pd.concat([team1_matches, team2_matches])
    team1_wins = total_matches[total_matches['winner'] == team1_id].shape[0]
    total_matches_played = total_matches.shape[0]
    if total_matches_played > 0:
        return team1_wins / total_matches_played
    else:
        return 0
def toss_win_advantage(team_id, match_lvl_data):
    team_toss_wins = match_lvl_data[(match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] == team_id)]
    team_toss_losses = match_lvl_data[(match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] != team_id)]
    total_toss_wins = team_toss_wins.shape[0] + team_toss_losses.shape[0]
    if total_toss_wins > 0:
        return team_toss_wins.shape[0] / total_toss_wins
    else:
        return 0
def batting_order_advantage(team_id, match_lvl_data):
    team_wins_batting_first = match_lvl_data[(match_lvl_data['toss decision'] == 'bat') & (match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] == team_id)]
    team_wins_batting_second = match_lvl_data[(match_lvl_data['toss decision'] == 'field') & (match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] == team_id)]
    total_wins = team_wins_batting_first.shape[0] + team_wins_batting_second.shape[0]
    if total_wins > 0:
        return team_wins_batting_first.shape[0] / total_wins
    else:
        return 0
def avg_runs_conceded(team_id, match_lvl_data, batsman_lvl_data, bowler_lvl_data):
    team_matches = match_lvl_data[match_lvl_data['team1_id'] == team_id]
    team_bowlers = []
    for roster_ids in team_matches['team1_roster_ids']:
        team_bowlers.extend(roster_ids.split(','))
    team_bowlers = list(set(team_bowlers))
    runs_conceded = batsman_lvl_data[batsman_lvl_data['out_by_bowler'].isin(team_bowlers)]['runs'].sum()
    balls_bowled = bowler_lvl_data[bowler_lvl_data['bowler_id'].isin(team_bowlers)]['balls_bowled'].sum()
    if balls_bowled > 0:
        return runs_conceded / balls_bowled
    else:
        return 0

def avg_batting_strike_rate(team_id, match_lvl_data, batsman_lvl_data):
    team_matches = match_lvl_data[match_lvl_data['team1_id'] == team_id]
    team_batsmen = []
    for roster_ids in team_matches['team1_roster_ids']:
        team_batsmen.extend(roster_ids.split(','))
    team_batsmen = list(set(team_batsmen))
    total_runs = batsman_lvl_data[batsman_lvl_data['batsman_id'].isin(team_batsmen)]['runs'].sum()
    total_balls_faced = batsman_lvl_data[batsman_lvl_data['batsman_id'].isin(team_batsmen)]['balls_faced'].sum()
    if total_balls_faced > 0:
        return total_runs / total_balls_faced * 100
    else:
        return 0


In [49]:
# Head-to-Head Win Rate
train_data['team1_head_to_head_win_rate'] = train_data.apply(lambda x: head_to_head_win_rate(x['team1_id'], x['team2_id'], match_lvl_data), axis=1)
train_data['team2_head_to_head_win_rate'] = train_data.apply(lambda x: head_to_head_win_rate(x['team2_id'], x['team1_id'], match_lvl_data), axis=1)

# Toss Win Advantage
train_data['team1_toss_win_advantage'] = train_data.apply(lambda x: toss_win_advantage(x['team1_id'], match_lvl_data), axis=1)
train_data['team2_toss_win_advantage'] = train_data.apply(lambda x: toss_win_advantage(x['team2_id'], match_lvl_data), axis=1)

# Batting Order Advantage
train_data['team1_batting_first_advantage'] = train_data.apply(lambda x: batting_order_advantage(x['team1_id'], match_lvl_data), axis=1)
train_data['team2_batting_first_advantage'] = train_data.apply(lambda x: batting_order_advantage(x['team2_id'], match_lvl_data), axis=1)

# Average Runs Conceded
train_data['team1_avg_runs_conceded'] = train_data.apply(lambda x: avg_runs_conceded(x['team1_id'], match_lvl_data, batsman_lvl_data, bowler_lvl_data), axis=1)
train_data['team2_avg_runs_conceded'] = train_data.apply(lambda x: avg_runs_conceded(x['team2_id'], match_lvl_data, batsman_lvl_data, bowler_lvl_data), axis=1)

# Average Batting Strike Rate
train_data['team1_avg_batting_strike_rate'] = train_data.apply(lambda x: avg_batting_strike_rate(x['team1_id'], match_lvl_data, batsman_lvl_data), axis=1)
train_data['team2_avg_batting_strike_rate'] = train_data.apply(lambda x: avg_batting_strike_rate(x['team2_id'], match_lvl_data, batsman_lvl_data), axis=1)

In [51]:
# train_data['team2_recent_mom_count_10matches'].unique()

In [52]:
match_lvl_data['player_of_the_match_id'] = match_lvl_data['player_of_the_match_id'].astype(str)


In [53]:
import numpy as np

def count_recent_mom_awards(team_roster, match_lvl_data, date, n):
    '''
    Function to get the total number of Man of the Match awards won by players in the roster of a team in the last n games.

    Input-
    1. team_roster: ':' separated list of player ids in the roster of a team.
    2. match_lvl_data: DataFrame containing match level data.
    3. date: match date to calculate this feature before.
    4. n: Number of games to look-back and create this feature.

    Returns-
    int value denoting the sum of MoM awards won by all players in the roster.
    '''

    player_ids = str(team_roster).split(':')  # Split string of ':' separated ids into a list of ids
    total_mom_awards = 0

    for player_id in player_ids:  # Loop over each player_id in roster
        # Get relevant matches for the player up to the specified date
        player_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'] == player_id) &
                                        (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

        # Count the number of matches where this player was MoM
        mom_count = len(player_matches)
        total_mom_awards += mom_count  # Accumulate MoM awards

    return total_mom_awards

# Applying the function to train_data for an example usage
train_data['team1_recent_mom_count'] = train_data.apply(
    lambda x: count_recent_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)
train_data['team2_recent_mom_count'] = train_data.apply(
    lambda x: count_recent_mom_awards(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)


In [54]:
train_data['team2_recent_mom_count']

0       0
1      15
2      35
3      27
4      24
       ..
943    14
944    18
945     3
946    21
947     3
Name: team2_recent_mom_count, Length: 948, dtype: int64

In [55]:
print(match_lvl_data.columns)


Index(['match id', 'team1', 'team2', 'winner', 'by', 'win amount',
       'toss winner', 'toss decision', 'venue', 'city', 'match_dt', 'lighting',
       'series_name', 'season', 'ground_id', 'umpire1', 'umpire2',
       'inning1_runs', 'inning1_wickets', 'inning1_balls', 'inning2_runs',
       'inning2_wickets', 'inning2_balls', 'team1_id', 'team1_roster_ids',
       'team2_id', 'team2_roster_ids', 'series_type', 'winner_id',
       'player_of_the_match_id'],
      dtype='object')


In [56]:
def avg_runs_at_venue_by_conditions(ground_id, lighting, inning, match_lvl_data):
    condition_matches = match_lvl_data[(match_lvl_data['ground_id'] == ground_id) &
                                       (match_lvl_data['lighting'] == lighting)]
    if inning == 1:
        avg_runs = condition_matches['inning1_runs'].mean() if not condition_matches.empty else 0
    else:
        avg_runs = condition_matches['inning2_runs'].mean() if not condition_matches.empty else 0
    return avg_runs

# Applying this feature for each lighting condition and inning to train_data
train_data['ground_avg_runs_inning1_day'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day', 1, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning1_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'night', 1, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning1_day_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day/night', 1, match_lvl_data), axis=1)

train_data['ground_avg_runs_inning2_day'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day', 2, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning2_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'night', 2, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning2_day_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day/night', 2, match_lvl_data), axis=1)


In [57]:
def average_margin_of_victory(team_id, match_lvl_data, date):
    team_wins = match_lvl_data[(match_lvl_data['winner_id'] == team_id) &
                               (match_lvl_data['match_dt'] < date)]
    if team_wins.empty:
        return 0
    avg_runs_margin = team_wins[team_wins['by'] == 'runs']['win amount'].mean()
    avg_wickets_margin = team_wins[team_wins['by'] == 'wickets']['win amount'].mean()
    return {'avg_runs_margin': avg_runs_margin, 'avg_wickets_margin': avg_wickets_margin}

# Applying this feature to train_data
train_data['team1_dominance'] = train_data.apply(
    lambda x: average_margin_of_victory(x['team1_id'], match_lvl_data, x['match_dt']), axis=1)
train_data['team2_dominance'] = train_data.apply(
    lambda x: average_margin_of_victory(x['team2_id'], match_lvl_data, x['match_dt']), axis=1)


In [59]:
# train_data['team1_weighted_mom'][:10]

In [60]:
def weighted_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = team_roster.split(':')
    total_weighted_awards = 0

    # Fetch recent matches up to 'n' for all players in the roster before the specified date
    recent_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'].isin(player_ids)) &
                                    (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

    # Assigning weights - newer matches get higher weight
    for i, match in enumerate(recent_matches.itertuples(), 1):
        weight = 1 + (n - i) * 0.1  # Example weight formula: 1 + (total_matches - position) * 0.1
        total_weighted_awards += weight

    return total_weighted_awards

# Applying the feature to train_data
train_data['team1_weighted_mom'] = train_data.apply(
    lambda x: weighted_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)
train_data['team2_weighted_mom'] = train_data.apply(
    lambda x: weighted_mom_awards(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)


In [61]:
def winpLastn(team_id, date, n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.

    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.

    Output- None

    Returns- Float value denoting win% of the team in last n games.
    '''
    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team_id].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points


# In[21]:


# Compute team1's win% in last 5 games
train_data['team1_winp_last25'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 25), axis=1)
# Compute team2's win% in last 5 games
train_data['team2_winp_last25'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 25), axis=1)


# In[22]:


# Take the ratio of (team1's win% in their last 5 games)/(team2's win% in their last 5 games). Adding 1 to avoid divide by zero error
# train_data['team_winp_last25'] = (train_data['team1_winp_last5']+1)/(train_data['team2_winp_last5']+1)
# train_data.drop(columns=['team1_winp_last5', 'team2_winp_last5'], inplace=True) # drop intermediate columns



100%|██████████| 948/948 [00:00<00:00, 1515.00it/s]
100%|██████████| 948/948 [00:00<00:00, 1554.11it/s]


In [62]:
match_lvl_data.columns

Index(['match id', 'team1', 'team2', 'winner', 'by', 'win amount',
       'toss winner', 'toss decision', 'venue', 'city', 'match_dt', 'lighting',
       'series_name', 'season', 'ground_id', 'umpire1', 'umpire2',
       'inning1_runs', 'inning1_wickets', 'inning1_balls', 'inning2_runs',
       'inning2_wickets', 'inning2_balls', 'team1_id', 'team1_roster_ids',
       'team2_id', 'team2_roster_ids', 'series_type', 'winner_id',
       'player_of_the_match_id'],
      dtype='object')

In [63]:
def head_to_head_win_rate(team1_id, team2_id, match_lvl_data, date):
    head_to_head_matches = match_lvl_data[((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
                                           (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
                                          (match_lvl_data['match_dt'] < date)]

    if head_to_head_matches.empty:
        return 0.5  # Neutral value if no historical matches

    wins = head_to_head_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
    win_rate = wins / len(head_to_head_matches)
    return win_rate

train_data['team1_h2h_win_rate'] = train_data.apply(
    lambda x: head_to_head_win_rate(x['team1_id'], x['team2_id'], match_lvl_data, x['match_dt']), axis=1)


  train_data['team1_h2h_win_rate'] = train_data.apply(


In [64]:
def team_consistency_score(team_roster, batsman_data, bowler_data, date, n):
    player_ids = team_roster.split(':')
    consistency_scores = []

    for player_id in player_ids:
        # Evaluate batting consistency
        player_batting = batsman_data[(batsman_data['batsman_id'] == player_id) &
                                      (batsman_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        if not player_batting.empty:
            avg_runs = player_batting['runs'].mean()
            std_dev_runs = player_batting['runs'].std()
            batting_consistency = avg_runs / std_dev_runs if std_dev_runs else avg_runs

            consistency_scores.append(batting_consistency)

        # Evaluate bowling consistency
        player_bowling = bowler_data[(bowler_data['bowler_id'] == player_id) &
                                     (bowler_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        if not player_bowling.empty:
            avg_wickets = player_bowling['wicket_count'].mean()
            std_dev_wickets = player_bowling['wicket_count'].std()
            bowling_consistency = avg_wickets / std_dev_wickets if std_dev_wickets else avg_wickets

            consistency_scores.append(bowling_consistency)

    if consistency_scores:
        team_score = np.mean(consistency_scores)
    else:
        team_score = 0

    return team_score

train_data['team1_consistency_score'] = train_data.apply(
    lambda x: team_consistency_score(x['team1_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)
train_data['team2_consistency_score'] = train_data.apply(
    lambda x: team_consistency_score(x['team2_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)

  train_data['team1_consistency_score'] = train_data.apply(
  train_data['team2_consistency_score'] = train_data.apply(


In [65]:
# def strategic_success_rate(team_id, match_lvl_data, date, n):
#     recent_matches = match_lvl_data[(match_lvl_data['match_dt'] < date) &
#                                     ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))].sort_values('match_dt', ascending=False).head(n)

#     successful_strategies = 0
#     for match in recent_matches.itertuples():
#         if ((match.toss winner == team_id and match.toss decision == 'bat' and match.winner_id == team_id and match.inning1_runs > match.inning2_runs) or
#             (match.toss winner == team_id and match.toss decision == 'field' and match.winner_id == team_id and match.inning2_runs > match.inning1_runs)):
#             successful_strategies += 1

#     return successful_strategies / n if n else 0

# train_data['team1_strategic_success_rate'] = train_data.apply(
#     lambda x: strategic_success_rate(x['team1_id'], match_lvl_data, x['match_dt'], 10), axis=1)
# train_data['team2_strategic_success_rate'] = train_data.apply(
#     lambda x: strategic_success_rate(x['team2_id'], match_lvl_data, x['match_dt'], 10), axis=1)


In [66]:
# train_data['team2_consistency_score'].unique()

In [67]:
train_data.select_dtypes(include=['number']).corr()['winner_01'].sort_values(ascending = False)[:25]

winner_01                           1.000000
team2_recent_form                   0.105046
team2_momentum                      0.083399
team2_average_winning_margin        0.074013
team2_day_match_win_rate            0.073309
team2_overall_win_rate              0.072307
team2_recent_win_rate               0.069638
team2_winp_last25                   0.069452
match id                            0.038028
team2_win_rate_at_venue             0.034948
team2_adjusted_win_rate_at_venue    0.025747
team1_wicket_loss_average           0.020005
team2_post_loss_performance         0.019303
team1_day_match_win_rate            0.018755
team2_bowler_impact_score           0.018734
ground_id                           0.015227
team2_night_match_win_rate          0.012764
team2_avg_wicket                    0.005451
team2_scoring_variance_at_venue     0.003014
team2_scoring_average               0.002221
team1_average_winning_margin        0.001749
team2_batting_strength             -0.003665
venue_avg_

In [68]:
train_data.select_dtypes(include=['number']).corr()['winner_01'].sort_values(ascending = True)[:25]

team1_weighted_mom                 -0.151858
team1_recent_mom_count             -0.141026
team1_batting_strength             -0.140372
team1_batsman_performance_index    -0.139209
team1_avg_wicket                   -0.133810
team1_strike_rate                  -0.129016
team1_bowler_impact_score          -0.121397
team_count_50runs_last15           -0.110761
team1_overall_win_rate             -0.091137
team1_bowler_eco                   -0.086270
team1_winp_last25                  -0.085741
team1_night_match_win_rate         -0.084950
team1_recent_win_rate              -0.082662
team1_scoring_average              -0.080175
team1_momentum                     -0.079657
team1_post_loss_performance        -0.077811
team1_recent_form                  -0.075800
team2_id                           -0.068129
winner_id                          -0.065381
team1_win_rate_at_venue            -0.064772
team1_day_night_match_win_rate     -0.063411
team1_historical_matchup_win_pct   -0.060605
team1_win_