In [None]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [None]:
match_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/664389efa0868_match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b548c98c_batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b2c60743_bowler_level_scorecard.csv')
train_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')
test_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [None]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

In [None]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [None]:
def no50sLastn(player_list, date, n):

    player_list = str(player_list).split(':')
    res_list = []
    for player in player_list:
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        df_rel['gte_50runs'] = np.where(df_rel['runs']>=50, 1, 0) # binary indicator to denote whether the player scored a 50 in the game (runs>=50).
        res_list.append(np.nansum(df_rel['gte_50runs']))# Sum up number of 50s for the player and append to a list. We will do this for all players.
    return np.nansum(res_list)# Sum up values of the list which is sum of 50s by all players in the roster.

In [None]:
def calculate_batsman_strike_rate(player_list, date, n):
    player_list = str(player_list).split(':')  # Split string of ':' separated ids into a list of ids
    res_list = []

    for player in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        runs = recent_matches['runs'].sum()
        balls = recent_matches['balls_faced'].sum()

        # Calculate strike rate and avoid division by zero
        if balls > 0:
            strike_rate = (runs / balls) * 100
        else:
            strike_rate = 0

        res_list.append(strike_rate)

    # Calculate the average strike rate across all players in the list
    if res_list:  # Ensure the list is not empty to avoid division by zero
        average_strike_rate = sum(res_list) / len(res_list)
    else:
        average_strike_rate = 0

    return average_strike_rate


def calculate_bowler_economy_rate(player_list, date, n):
    player_list = player_list.split(':')
    economy_rates = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        runs_conceded = recent_matches['runs'].sum()
        overs = (recent_matches['balls_bowled'].sum()) / 6
        economy_rate = (runs_conceded / overs) if overs > 0 else 0
        economy_rates.append(economy_rate)
    return sum(economy_rates) / len(economy_rates) if economy_rates else 0


def average_wickets_taken(player_list, date, n):
    player_list = player_list.split(':')
    wickets_list = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_wickets = recent_matches['wicket_count'].sum()
        average_wickets = total_wickets / n if n > 0 else 0
        wickets_list.append(average_wickets)
    return sum(wickets_list) / len(wickets_list) if wickets_list else 0


def team_batting_strength(player_list, date, n):
    players = player_list.split(':')
    total_runs = 0
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        total_runs += recent_matches['runs'].sum()
    return total_runs / len(players) if players else 0

In [None]:
def player_form_factor(player_list, date, n):
    players = player_list.split(':')
    form_factors = []
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            weights = np.linspace(1, 2, len(recent_matches))  # Linear weight increase from 1 to 2
            form_factors.append(np.average(recent_matches['runs'], weights=weights))
        else:
            form_factors.append(0)
    return np.mean(form_factors) if players else 0

def head_to_head_performance(team1_id, team2_id, date, match_lvl_data):
    # Filter matches between the two teams up to the specified date
    relevant_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
         (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id))
    ]

    # Calculate win percentage for team1
    team1_wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
    win_percentage = team1_wins / len(relevant_matches) if len(relevant_matches) > 0 else 0

    # Calculate average score for team1 in these matches
    # Assuming scores are in 'inning1_runs' or 'inning2_runs' based on which team was batting first
    team1_scores = relevant_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team1_id else x['inning2_runs'], axis=1
    )
    team1_avg_score = team1_scores.mean() if len(team1_scores) > 0 else 0

    return win_percentage, team1_avg_score

In [None]:
def recent_performance(team_id, date, n, match_lvl_data):
    recent_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ].tail(n)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / len(recent_matches) if len(recent_matches) > 0 else 0

def average_winning_margin(team_id, date, match_lvl_data):
    winning_matches = match_lvl_data[
        (match_lvl_data['winner_id'] == team_id) & (match_lvl_data['match_dt'] < date)
    ]
    if len(winning_matches) > 0:
        runs_wins = winning_matches[winning_matches['by'] == 'runs']['win amount']
        wickets_wins = winning_matches[winning_matches['by'] == 'wickets']['win amount']
        average_margin = pd.concat([runs_wins, wickets_wins]).mean()
        return average_margin
    return 0

def team_scoring_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_scores = team_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1
    )
    return team_scores.mean() if len(team_scores) > 0 else 0

def team_wicket_loss_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_wickets = team_matches.apply(
        lambda x: x['inning1_wickets'] if x['team1_id'] == team_id else x['inning2_wickets'], axis=1
    )
    return team_wickets.mean() if len(team_wickets) > 0 else 0

In [None]:
def average_score_by_venue(venue_id, date, match_lvl_data):
    # Filter matches based on the venue and date
    relevant_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) & (match_lvl_data['match_dt'] < date)]
    average_score = (relevant_matches['inning1_runs'].sum() + relevant_matches['inning2_runs'].sum()) / (2 * len(relevant_matches))
    return average_score if not pd.isna(average_score) else 0

def team_win_rate_at_venue(team_id, venue_id, date, match_lvl_data):
    # Filter matches where the team played at the given venue up to the specified date
    relevant_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < date)]
    wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    win_rate = wins / len(relevant_matches) if len(relevant_matches) > 0 else 0
    return win_rate

def most_frequent_matchups(team_id, match_lvl_data):
    # Filter matches involving the team
    relevant_matches = match_lvl_data[(match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)]
    opponent_counts = relevant_matches.apply(lambda x: x['team2_id'] if x['team1_id'] == team_id else x['team1_id'], axis=1).value_counts()
    return opponent_counts

## Harshit

In [None]:
def batsman_performance_index(bat_df, match_id, date):
    relevant_batsmen = bat_df[(bat_df['match id'] == match_id) & (bat_df['match_dt'] <= date)]
    relevant_batsmen['performance_index'] = (relevant_batsmen['runs'] * 0.5 +
                                             relevant_batsmen['strike_rate'] * 0.3 +
                                             (relevant_batsmen['Fours'] + relevant_batsmen['Sixes'] * 2) * 0.2) * \
                                            (1 + 0.1 * relevant_batsmen['is_batsman_captain'] + 0.05 * relevant_batsmen['is_batsman_keeper'])
    return relevant_batsmen['performance_index'].sum()



def team_batsman_performance_index(bat_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_performance_index = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            player_performance = (recent_matches['runs'] * 0.5 +
                                  recent_matches['strike_rate'] * 0.3 +
                                  (recent_matches['Fours'] + recent_matches['Sixes'] * 2) * 0.2) * \
                                  (1 + 0.1 * recent_matches['is_batsman_captain'].iloc[0] +
                                   0.05 * recent_matches['is_batsman_keeper'].iloc[0])
            total_performance_index += player_performance.sum()

    return total_performance_index


def team_bowler_impact_score(bowler_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_impact_score = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        if not recent_matches.empty:
            player_impact = (recent_matches['wicket_count'] * 2 +
                             (120 / recent_matches['economy']) * 0.5 +
                             recent_matches['maiden'] * 1) * \
                            (1 + 0.1 * recent_matches['is_bowler_captain'].iloc[0] +
                             0.05 * recent_matches['is_bowler_keeper'].iloc[0])
            total_impact_score += player_impact.sum()

    return total_impact_score

In [None]:
def venue_performance(match_lvl_data, venue_id, date):
    historical_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) & (match_lvl_data['match_dt'] < date)]
    average_runs = historical_matches[['inning1_runs', 'inning2_runs']].mean().mean()
    win_rate = historical_matches['winner'].value_counts(normalize=True)
    return average_runs, win_rate

train_data['venue_avg_runs'], train_data['venue_win_rate'] = zip(*train_data.apply(
    lambda x: venue_performance(match_lvl_data, x['ground_id'], x['match_dt']),
    axis=1
))

def lighting_performance(match_lvl_data, lighting_type, team_id, date):
    matches = match_lvl_data[(match_lvl_data['lighting'] == lighting_type) & (match_lvl_data['match_dt'] < date)]
    if len(matches) > 0:
        wins = matches[matches['winner_id'] == team_id].shape[0]
        return wins / len(matches)
    return 0  # Return 0 if no matches found under this condition


def toss_advantage(match_lvl_data, team_id, date):
    matches = match_lvl_data[(match_lvl_data['toss winner'] == team_id) & (match_lvl_data['match_dt'] < date)]
    wins_after_toss_win = matches[matches['winner'] == team_id].shape[0]
    if matches.shape[0] > 0:
        return wins_after_toss_win / matches.shape[0]
    return 0

In [None]:
def adjusted_team_venue_win_rate(match_lvl_data, team_id, venue_id, date):
    # Filter matches at the venue for the specific team before the given date
    venue_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                   ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                   (match_lvl_data['match_dt'] < date)]
    # Calculate basic win rate
    wins = venue_matches[venue_matches['winner_id'] == team_id].shape[0]
    total = venue_matches.shape[0]
    win_rate = wins / total if total > 0 else 0

    # Adjust win rate by recent form (last 5 matches)
    recent_form = venue_matches.tail(5)
    recent_wins = recent_form[recent_form['winner_id'] == team_id].shape[0]
    recent_total = recent_form.shape[0]
    recent_win_rate = recent_wins / recent_total if recent_total > 0 else 0

    # Combine basic and recent win rates
    if total > 0:
        adjusted_win_rate = (win_rate * 0.75) + (recent_win_rate * 0.25)
    else:
        adjusted_win_rate = 0

    return adjusted_win_rate

def toss_strategy_impact(match_lvl_data, team_id, toss_decision, venue_id, date):
    matches = match_lvl_data[(match_lvl_data['toss winner'] == team_id) &
                             (match_lvl_data['toss decision'] == toss_decision) &
                             (match_lvl_data['ground_id'] == venue_id) &
                             (match_lvl_data['match_dt'] < date)]
    wins = matches[matches['winner_id'] == team_id].shape[0]
    total = matches.shape[0]
    return wins / total if total > 0 else 0


def overall_historical_win_rate(match_lvl_data, team_id, date):
    historical_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                        (match_lvl_data['match_dt'] < date)]
    wins = historical_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    total_matches = len(historical_matches)
    return wins / total_matches if total_matches > 0 else 0

def recent_form(match_lvl_data, team_id, date):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(5)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / 5 if recent_matches.shape[0] > 0 else 0

def head_to_head_win_rate(match_lvl_data, team1_id, team2_id, date):
    head_to_head_matches = match_lvl_data[((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
                                           (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
                                          (match_lvl_data['match_dt'] < date)]
    wins = head_to_head_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
    total = len(head_to_head_matches)
    return wins / total if total > 0 else 0


def adjusted_venue_win_rate_with_recent_form(match_lvl_data, team_id, venue_id, date):
    # Historical win rate at the venue
    venue_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                   ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                   (match_lvl_data['match_dt'] < date)]
    venue_wins = venue_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    venue_total = len(venue_matches)

    # Recent form (last 5 matches overall)
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(5)
    recent_wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    recent_total = len(recent_matches)

    # Combine both metrics
    venue_win_rate = venue_wins / venue_total if venue_total > 0 else 0
    recent_win_rate = recent_wins / recent_total if recent_total > 0 else 0
    adjusted_win_rate = 0.7 * venue_win_rate + 0.3 * recent_win_rate  # Weighted average

    return adjusted_win_rate

def scoring_consistency(match_lvl_data, team_id, date):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(10)
    runs_scored = recent_matches.apply(lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1)
    if len(runs_scored) > 0:
        mean_runs = runs_scored.mean()
        variance = runs_scored.var()
        consistency_index = mean_runs / variance if variance != 0 else mean_runs
    else:
        consistency_index = 0
    return consistency_index

def wicket_loss_variance(match_lvl_data, team_id, date):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(10)
    wickets_lost = recent_matches.apply(lambda x: x['inning1_wickets'] if x['team1_id'] == team_id else x['inning2_wickets'], axis=1)
    variance = wickets_lost.var() if len(wickets_lost) > 0 else 0
    return variance

def calculate_exponential_momentum(match_lvl_data, team_id, date, alpha=0.1):
    matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                             (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False)
    wins = matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).to_numpy()
    weights = np.exp(-alpha * np.arange(len(wins)))
    if np.sum(weights) > 0:
        momentum = np.dot(wins, weights) / np.sum(weights)
    else:
        momentum = 0  # Handle case with no matches
    return momentum

In [None]:
def win_rate_against_recent_form(team_id, venue_id, date, match_lvl_data, n=5):
    # Filter relevant matches
    relevant_matches = match_lvl_data[(match_lvl_data['venue'] == venue_id) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] <= date)]

    # Calculate opponent's recent form
    relevant_matches['opponent_id'] = relevant_matches.apply(lambda x: x['team1_id'] if x['team2_id'] == team_id else x['team2_id'], axis=1)
    recent_forms = {}
    for opponent in relevant_matches['opponent_id'].unique():
        opponent_matches = match_lvl_data[(match_lvl_data['team1_id'] == opponent) | (match_lvl_data['team2_id'] == opponent)]
        recent_wins = opponent_matches.tail(n)['winner'].apply(lambda x: 1 if x == opponent else 0).sum()
        recent_forms[opponent] = recent_wins / n

    relevant_matches['opponent_form'] = relevant_matches['opponent_id'].map(recent_forms)
    weighted_wins = (relevant_matches['winner'].apply(lambda x: 1 if x == team_id else 0) * relevant_matches['opponent_form']).sum()
    win_rate = weighted_wins / relevant_matches['opponent_form'].sum() if relevant_matches['opponent_form'].sum() > 0 else 0
    return win_rate

In [None]:
def adjusted_win_rate_at_venue(match_lvl_data, team_id, venue, match_dt):
    # Filter matches for the team at this venue before this date
    relevant_matches = match_lvl_data[(match_lvl_data['venue'] == venue) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < match_dt)]
    opponent_strengths = relevant_matches.apply(
        lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'] if x['team1_id'] == team_id else x['team1_id'], x['match_dt']),
        axis=1
    )
    wins = (relevant_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0) * opponent_strengths).sum()
    return wins / opponent_strengths.sum() if opponent_strengths.sum() > 0 else 0

def performance_after_loss(match_lvl_data, team_id, match_dt):
    past_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                  (match_lvl_data['match_dt'] < match_dt)].sort_values(by='match_dt', ascending=False)
    if not past_matches.empty and past_matches.iloc[0]['winner_id'] != team_id:
        # Find next match result
        if past_matches.shape[0] > 1:
            return 1 if past_matches.iloc[1]['winner_id'] == team_id else 0
    return None  # Not applicable if no match after a loss

def scoring_variance_at_venue(match_lvl_data, team_id, venue, match_dt):
    scores = match_lvl_data[((match_lvl_data['venue'] == venue) &
                             ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                             (match_lvl_data['match_dt'] < match_dt))].apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'],
        axis=1
    )
    if len(scores) > 1:
        return np.var(scores)
    return None  # Not applicable if fewer than 2 scores



In [None]:
def player_impact_score(player_roster, batsman_data, bowler_data, date, n):
    player_ids = player_roster.split(':')
    total_impact_score = 0

    for player_id in player_ids:
        batsman_matches = batsman_data[(batsman_data['batsman_id'] == player_id) &
                                       (batsman_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        bowler_matches = bowler_data[(bowler_data['bowler_id'] == player_id) &
                                     (bowler_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

        batsman_score = batsman_matches['runs'].sum()  # Example metric
        bowler_score = bowler_matches['wicket_count'].sum()  # Example metric
        total_impact_score += (batsman_score + bowler_score)

    return total_impact_score  # Ensures a single scalar is returned

def player_form_factor(player_roster, batsman_data, date, n):
    player_ids = player_roster.split(':')
    total_form_factor = 0
    players_counted = 0

    for player_id in player_ids:
        player_matches = batsman_data[(batsman_data['batsman_id'] == player_id) &
                                      (batsman_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        if not player_matches.empty:
            runs_scored = player_matches['runs'].sum()
            balls_faced = player_matches['balls_faced'].sum()
            strike_rate = (runs_scored / balls_faced * 100) if balls_faced else 0
            total_form_factor += strike_rate
            players_counted += 1

    return total_form_factor / players_counted if players_counted else 0


In [None]:
def historical_matchup(team1_id, team2_id, match_dt, match_lvl_data):
    matchups = match_lvl_data[((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
                              (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
                              (match_lvl_data['match_dt'] < match_dt)]
    wins = matchups['winner_id'].apply(lambda winner_id: 1 if winner_id == team1_id else 0).sum()
    total_matches = len(matchups)
    return wins / total_matches if total_matches > 0 else 0

In [None]:
def recent_team_form(team_id, match_dt, match_lvl_data, last_n=5):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < match_dt)].sort_values('match_dt', ascending=False).head(last_n)
    wins = recent_matches['winner_id'].apply(lambda winner_id: 1 if winner_id == team_id else 0).sum()
    return wins / last_n if last_n > 0 else 0

def venue_winning_percentage(team_id, venue_id, match_dt, match_lvl_data):
    historical_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                        (match_lvl_data['match_dt'] < match_dt)]
    total_matches = len(historical_matches)
    wins = len(historical_matches[historical_matches['winner_id'] == team_id])
    return wins / total_matches if total_matches > 0 else 0

def team_win_rate_under_condition(team_id, condition_col, condition_val, date, match_lvl_data):
    # Filter matches based on the given condition up to the specified date
    relevant_matches = match_lvl_data[(match_lvl_data[condition_col] == condition_val) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] <= date)]
    wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    win_rate = wins / len(relevant_matches) if len(relevant_matches) > 0 else 0
    return win_rate

def team_venue_scoring(team_id, venue_id, date, match_lvl_data):
    matches_at_venue = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] <= date)]
    total_runs_scored = matches_at_venue.apply(lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1).sum()
    total_runs_conceded = matches_at_venue.apply(lambda x: x['inning2_runs'] if x['team1_id'] == team_id else x['inning1_runs'], axis=1).sum()
    matches_count = len(matches_at_venue)
    avg_runs_scored = total_runs_scored / matches_count if matches_count > 0 else 0
    avg_runs_conceded = total_runs_conceded / matches_count if matches_count > 0 else 0
    return avg_runs_scored, avg_runs_conceded

def adjusted_venue_win_rate(team_id, venue_id, date, match_lvl_data):
    relevant_matches = match_lvl_data[
        (match_lvl_data['ground_id'] == venue_id) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
        (match_lvl_data['match_dt'] < date)
    ]

    # Calculate wins considering toss and lighting as factors
    if not relevant_matches.empty:
        wins = relevant_matches.apply(
            lambda x: 1 if (x['winner_id'] == team_id and x['toss_winner'] == team_id and x['lighting'] == 'day') else 0, axis=1
        ).sum()
        total_matches = len(relevant_matches)
        win_rate = wins / total_matches if total_matches > 0 else 0
        return win_rate
    return 0

def historical_matchup_at_venue(team1_id, team2_id, venue_id, date, match_lvl_data):
    head_to_head_matches = match_lvl_data[
        ((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
         (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
        (match_lvl_data['ground_id'] == venue_id) &
        (match_lvl_data['match_dt'] < date)
    ]

    if not head_to_head_matches.empty:
        wins = head_to_head_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
        total_matches = len(head_to_head_matches)
        return wins / total_matches if total_matches > 0 else 0
    return 0

def head_to_head_win_rate(team1_id, team2_id, match_lvl_data):
    team1_matches = match_lvl_data[(match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id)]
    team2_matches = match_lvl_data[(match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)]
    total_matches = pd.concat([team1_matches, team2_matches])
    team1_wins = total_matches[total_matches['winner'] == team1_id].shape[0]
    total_matches_played = total_matches.shape[0]
    if total_matches_played > 0:
        return team1_wins / total_matches_played
    else:
        return 0
def toss_win_advantage(team_id, match_lvl_data):
    team_toss_wins = match_lvl_data[(match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] == team_id)]
    team_toss_losses = match_lvl_data[(match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] != team_id)]
    total_toss_wins = team_toss_wins.shape[0] + team_toss_losses.shape[0]
    if total_toss_wins > 0:
        return team_toss_wins.shape[0] / total_toss_wins
    else:
        return 0
def batting_order_advantage(team_id, match_lvl_data):
    team_wins_batting_first = match_lvl_data[(match_lvl_data['toss decision'] == 'bat') & (match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] == team_id)]
    team_wins_batting_second = match_lvl_data[(match_lvl_data['toss decision'] == 'field') & (match_lvl_data['toss winner'] == team_id) & (match_lvl_data['winner'] == team_id)]
    total_wins = team_wins_batting_first.shape[0] + team_wins_batting_second.shape[0]
    if total_wins > 0:
        return team_wins_batting_first.shape[0] / total_wins
    else:
        return 0
def avg_runs_conceded(team_id, match_lvl_data, batsman_lvl_data, bowler_lvl_data):
    team_matches = match_lvl_data[match_lvl_data['team1_id'] == team_id]
    team_bowlers = []
    for roster_ids in team_matches['team1_roster_ids']:
        team_bowlers.extend(roster_ids.split(','))
    team_bowlers = list(set(team_bowlers))
    runs_conceded = batsman_lvl_data[batsman_lvl_data['out_by_bowler'].isin(team_bowlers)]['runs'].sum()
    balls_bowled = bowler_lvl_data[bowler_lvl_data['bowler_id'].isin(team_bowlers)]['balls_bowled'].sum()
    if balls_bowled > 0:
        return runs_conceded / balls_bowled
    else:
        return 0

def avg_batting_strike_rate(team_id, match_lvl_data, batsman_lvl_data):
    team_matches = match_lvl_data[match_lvl_data['team1_id'] == team_id]
    team_batsmen = []
    for roster_ids in team_matches['team1_roster_ids']:
        team_batsmen.extend(roster_ids.split(','))
    team_batsmen = list(set(team_batsmen))
    total_runs = batsman_lvl_data[batsman_lvl_data['batsman_id'].isin(team_batsmen)]['runs'].sum()
    total_balls_faced = batsman_lvl_data[batsman_lvl_data['batsman_id'].isin(team_batsmen)]['balls_faced'].sum()
    if total_balls_faced > 0:
        return total_runs / total_balls_faced * 100
    else:
        return 0


In [None]:
def count_recent_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = str(team_roster).split(':')  # Split string of ':' separated ids into a list of ids
    total_mom_awards = 0

    for player_id in player_ids:  # Loop over each player_id in roster
        # Get relevant matches for the player up to the specified date
        player_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'] == player_id) &
                                        (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

        # Count the number of matches where this player was MoM
        mom_count = len(player_matches)
        total_mom_awards += mom_count  # Accumulate MoM awards

    return total_mom_awards

def avg_runs_at_venue_by_conditions(ground_id, lighting, inning, match_lvl_data):
    condition_matches = match_lvl_data[(match_lvl_data['ground_id'] == ground_id) &
                                       (match_lvl_data['lighting'] == lighting)]
    if inning == 1:
        avg_runs = condition_matches['inning1_runs'].mean() if not condition_matches.empty else 0
    else:
        avg_runs = condition_matches['inning2_runs'].mean() if not condition_matches.empty else 0
    return avg_runs

def average_margin_of_victory(team_id, match_lvl_data, date):
    team_wins = match_lvl_data[(match_lvl_data['winner_id'] == team_id) &
                               (match_lvl_data['match_dt'] < date)]
    if team_wins.empty:
        return 0
    avg_runs_margin = team_wins[team_wins['by'] == 'runs']['win amount'].mean()
    avg_wickets_margin = team_wins[team_wins['by'] == 'wickets']['win amount'].mean()
    return {'avg_runs_margin': avg_runs_margin, 'avg_wickets_margin': avg_wickets_margin}

def weighted_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = team_roster.split(':')
    total_weighted_awards = 0

    # Fetch recent matches up to 'n' for all players in the roster before the specified date
    recent_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'].isin(player_ids)) &
                                    (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

    # Assigning weights - newer matches get higher weight
    for i, match in enumerate(recent_matches.itertuples(), 1):
        weight = 1 + (n - i) * 0.1  # Example weight formula: 1 + (total_matches - position) * 0.1
        total_weighted_awards += weight

    return total_weighted_awards

def head_to_head_win_rate(team1_id, team2_id, match_lvl_data, date):
    head_to_head_matches = match_lvl_data[((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
                                           (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id)) &
                                          (match_lvl_data['match_dt'] < date)]

    if head_to_head_matches.empty:
        return 0.5  # Neutral value if no historical matches

    wins = head_to_head_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
    win_rate = wins / len(head_to_head_matches)
    return win_rate

def team_consistency_score(team_roster, batsman_data, bowler_data, date, n):
    player_ids = team_roster.split(':')
    consistency_scores = []

    for player_id in player_ids:
        # Evaluate batting consistency
        player_batting = batsman_data[(batsman_data['batsman_id'] == player_id) &
                                      (batsman_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        if not player_batting.empty:
            avg_runs = player_batting['runs'].mean()
            std_dev_runs = player_batting['runs'].std()
            batting_consistency = avg_runs / std_dev_runs if std_dev_runs else avg_runs

            consistency_scores.append(batting_consistency)

        # Evaluate bowling consistency
        player_bowling = bowler_data[(bowler_data['bowler_id'] == player_id) &
                                     (bowler_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)
        if not player_bowling.empty:
            avg_wickets = player_bowling['wicket_count'].mean()
            std_dev_wickets = player_bowling['wicket_count'].std()
            bowling_consistency = avg_wickets / std_dev_wickets if std_dev_wickets else avg_wickets

            consistency_scores.append(bowling_consistency)

    if consistency_scores:
        team_score = np.mean(consistency_scores)
    else:
        team_score = 0

    return team_score

## Function Call

In [None]:
num_match = 15

In [20]:
# Computing number of 50 runs in last 15 games for team1 for train dataset.
train_data['team1_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
train_data['team2_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)


# Applying the function to the train_data DataFrame
train_data[['team1_win_percentage', 'team1_avg_score']] = train_data.apply(
    lambda x: head_to_head_performance(
        team1_id=x['team1_id'],
        team2_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1, result_type='expand'
)

train_data['team1_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_wicket_loss_average'] = train_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

n_matches = 15  # Number of recent matches to consider
train_data['team2_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team2_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_wicket_loss_average'] = train_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team1_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team2_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['venue_avg_runs'], train_data['venue_win_rate'] = zip(*train_data.apply(
    lambda x: venue_performance(match_lvl_data, x['ground_id'], x['match_dt']),
    axis=1
))
train_data['team1_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team2_id'], x['match_dt']),
    axis=1
)
train_data['team1_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team2_id'], x['match_dt']),
    axis=1
)
train_data['team1_day_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day/night match', x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_day_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day/night match', x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team1_toss_advantage'] = train_data.apply(
    lambda x: toss_advantage(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_toss_advantage'] = train_data.apply(
    lambda x: toss_advantage(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team1_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_team_venue_win_rate(match_lvl_data, x['team1_id'], x['ground_id'], x['match_dt']),
    axis=1
)
train_data['team2_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_team_venue_win_rate(match_lvl_data, x['team2_id'], x['ground_id'], x['match_dt']),
    axis=1
)

train_data['team1_toss_strategy_at_venue'] = train_data.apply(
    lambda x: toss_strategy_impact(match_lvl_data, x['team1_id'], x['toss decision'], x['ground_id'], x['match_dt']),
    axis=1
)
train_data['team2_toss_strategy_at_venue'] = train_data.apply(
    lambda x: toss_strategy_impact(match_lvl_data, x['team2_id'], x['toss decision'], x['ground_id'], x['match_dt']),
    axis=1
)

train_data['team1_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team1_recent_form'] = train_data.apply(
    lambda x: recent_form(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_recent_form'] = train_data.apply(
    lambda x: recent_form(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)
# train_data['team1_head_to_head_win_rate'] = train_data.apply(
#     lambda x: head_to_head_win_rate(match_lvl_data, x['team1_id'], x['team2_id'], x['match_dt']),
#     axis=1
# )
train_data['batsman_performance_index'] = train_data.apply(
    lambda x: batsman_performance_index(batsman_lvl_data, x['match id'], x['match_dt']),
    axis=1
)
train_data['team1_batsman_performance_index'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)
train_data['team2_batsman_performance_index'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team1_bowler_impact_score'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)
train_data['team2_bowler_impact_score'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team1_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate_with_recent_form(match_lvl_data, x['team1_id'], x['ground_id'], x['match_dt']),
    axis=1
)
train_data['team2_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate_with_recent_form(match_lvl_data, x['team2_id'], x['ground_id'], x['match_dt']),
    axis=1
)

train_data['average_score_at_venue'] = train_data.apply(
    lambda x: average_score_by_venue(
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team1_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team2_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_scoring_consistency'] = train_data.apply(
    lambda x: scoring_consistency(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_scoring_consistency'] = train_data.apply(
    lambda x: scoring_consistency(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)
train_data['team1_wicket_loss_variance'] = train_data.apply(
    lambda x: wicket_loss_variance(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_wicket_loss_variance'] = train_data.apply(
    lambda x: wicket_loss_variance(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team1_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team1_win_rate_against_form'] = train_data.apply(
    lambda x: win_rate_against_recent_form(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
train_data['team2_win_rate_against_form'] = train_data.apply(
    lambda x: win_rate_against_recent_form(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
train_data['team1_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_win_rate_at_venue(match_lvl_data, x['team1_id'], x['venue'], x['match_dt']),
    axis=1
)
train_data['team2_adjusted_win_rate_at_venue'] = train_data.apply(
    lambda x: adjusted_win_rate_at_venue(match_lvl_data, x['team2_id'], x['venue'], x['match_dt']),
    axis=1
)

train_data['team1_post_loss_performance'] = train_data.apply(
    lambda x: performance_after_loss(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_post_loss_performance'] = train_data.apply(
    lambda x: performance_after_loss(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team1_scoring_variance_at_venue'] = train_data.apply(
    lambda x: scoring_variance_at_venue(match_lvl_data, x['team1_id'], x['venue'], x['match_dt']),
    axis=1
)
train_data['team2_scoring_variance_at_venue'] = train_data.apply(
    lambda x: scoring_variance_at_venue(match_lvl_data, x['team2_id'], x['venue'], x['match_dt']),
    axis=1
)

train_data['team1_impact_score'] = train_data.apply(
    lambda x: player_impact_score(x['team1_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)
train_data['team2_impact_score'] = train_data.apply(
    lambda x: player_impact_score(x['team2_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)

train_data['team1_form_factor'] = train_data.apply(
    lambda x: player_form_factor(x['team1_roster_ids'], batsman_lvl_data, x['match_dt'], 15), axis=1)
train_data['team2_form_factor'] = train_data.apply(
    lambda x: player_form_factor(x['team2_roster_ids'], batsman_lvl_data, x['match_dt'], 15), axis=1)

train_data['team1_historical_matchup_win_pct'] = train_data.apply(
    lambda x: historical_matchup(x['team1_id'], x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)

train_data['team1_recent_form'] = train_data.apply(
    lambda x: recent_team_form(x['team1_id'], x['match_dt'], match_lvl_data),
    axis=1)
train_data['team2_recent_form'] = train_data.apply(
    lambda x: recent_team_form(x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)

train_data['team1_venue_win_pct'] = train_data.apply(
    lambda x: venue_winning_percentage(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1)
train_data['team2_venue_win_pct'] = train_data.apply(
    lambda x: venue_winning_percentage(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1)

train_data['team1_day_light_win_rate'] = train_data.apply(
    lambda x: team_win_rate_under_condition(x['team1_id'], 'lighting', 'day', x['match_dt'], match_lvl_data),
    axis=1)
train_data['team2_day_light_win_rate'] = train_data.apply(
    lambda x: team_win_rate_under_condition(x['team2_id'], 'lighting', 'day', x['match_dt'], match_lvl_data),
    axis=1)

train_data['team1_avg_runs_scored_at_venue'], train_data['team1_avg_runs_conceded_at_venue'] = zip(*train_data.apply(
    lambda x: team_venue_scoring(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1))
train_data['team2_avg_runs_scored_at_venue'], train_data['team2_avg_runs_conceded_at_venue'] = zip(*train_data.apply(
    lambda x: team_venue_scoring(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1))

# Head-to-Head Win Rate
# train_data['team1_head_to_head_win_rate'] = train_data.apply(lambda x: head_to_head_win_rate(x['team1_id'], x['team2_id'], match_lvl_data), axis=1)
# train_data['team2_head_to_head_win_rate'] = train_data.apply(lambda x: head_to_head_win_rate(x['team2_id'], x['team1_id'], match_lvl_data), axis=1)

# Applying the feature to the DataFrame
train_data['team1_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
train_data['team2_adjusted_venue_win_rate'] = train_data.apply(
    lambda x: adjusted_venue_win_rate(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
train_data['team1_head_to_head_venue_win_rate'] = train_data.apply(
    lambda x: historical_matchup_at_venue(x['team1_id'], x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
# Toss Win Advantage
train_data['team1_toss_win_advantage'] = train_data.apply(lambda x: toss_win_advantage(x['team1_id'], match_lvl_data), axis=1)
train_data['team2_toss_win_advantage'] = train_data.apply(lambda x: toss_win_advantage(x['team2_id'], match_lvl_data), axis=1)

# Batting Order Advantage
train_data['team1_batting_first_advantage'] = train_data.apply(lambda x: batting_order_advantage(x['team1_id'], match_lvl_data), axis=1)
train_data['team2_batting_first_advantage'] = train_data.apply(lambda x: batting_order_advantage(x['team2_id'], match_lvl_data), axis=1)

# Average Runs Conceded
train_data['team1_avg_runs_conceded'] = train_data.apply(lambda x: avg_runs_conceded(x['team1_id'], match_lvl_data, batsman_lvl_data, bowler_lvl_data), axis=1)
train_data['team2_avg_runs_conceded'] = train_data.apply(lambda x: avg_runs_conceded(x['team2_id'], match_lvl_data, batsman_lvl_data, bowler_lvl_data), axis=1)

# Average Batting Strike Rate
train_data['team1_avg_batting_strike_rate'] = train_data.apply(lambda x: avg_batting_strike_rate(x['team1_id'], match_lvl_data, batsman_lvl_data), axis=1)
train_data['team2_avg_batting_strike_rate'] = train_data.apply(lambda x: avg_batting_strike_rate(x['team2_id'], match_lvl_data, batsman_lvl_data), axis=1)

# Applying the function to train_data for an example usage
train_data['team1_recent_mom_count'] = train_data.apply(
    lambda x: count_recent_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)
train_data['team2_recent_mom_count'] = train_data.apply(
    lambda x: count_recent_mom_awards(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)

train_data['ground_avg_runs_inning1_day'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day', 1, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning1_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'night', 1, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning1_day_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day/night', 1, match_lvl_data), axis=1)

train_data['ground_avg_runs_inning2_day'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day', 2, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning2_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'night', 2, match_lvl_data), axis=1)
train_data['ground_avg_runs_inning2_day_night'] = train_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day/night', 2, match_lvl_data), axis=1)

# Applying this feature to train_data
train_data['team1_dominance'] = train_data.apply(
    lambda x: average_margin_of_victory(x['team1_id'], match_lvl_data, x['match_dt']), axis=1)
train_data['team2_dominance'] = train_data.apply(
    lambda x: average_margin_of_victory(x['team2_id'], match_lvl_data, x['match_dt']), axis=1)

train_data['team1_weighted_mom'] = train_data.apply(
    lambda x: weighted_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)
train_data['team2_weighted_mom'] = train_data.apply(
    lambda x: weighted_mom_awards(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)

train_data['team1_consistency_score'] = train_data.apply(
    lambda x: team_consistency_score(x['team1_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)
train_data['team2_consistency_score'] = train_data.apply(
    lambda x: team_consistency_score(x['team2_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)

  average_score = (relevant_matches['inning1_runs'].sum() + relevant_matches['inning2_runs'].sum()) / (2 * len(relevant_matches))


In [21]:
# Computing number of 50 runs in last 15 games for team1 for train dataset.
test_data['team1_count_50runs_last15'] = test_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
test_data['team2_count_50runs_last15'] = test_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_strike_rate'] = test_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
test_data['team2_strike_rate'] = test_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_bowler_eco'] = test_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
test_data['team2_bowler_eco'] = test_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_avg_wicket'] = test_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
test_data['team2_avg_wicket'] = test_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_batting_strength'] = test_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
test_data['team2_batting_strength'] = test_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)


# Applying the function to the test_data DataFrame
test_data[['team1_win_percentage', 'team1_avg_score']] = test_data.apply(
    lambda x: head_to_head_performance(
        team1_id=x['team1_id'],
        team2_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1, result_type='expand'
)

test_data['team1_recent_win_rate'] = test_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_average_winning_margin'] = test_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_scoring_average'] = test_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_wicket_loss_average'] = test_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

n_matches = 15  # Number of recent matches to consider
test_data['team2_recent_win_rate'] = test_data.apply(
    lambda x: recent_performance(
        team_id=x['team2_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_average_winning_margin'] = test_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_scoring_average'] = test_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_wicket_loss_average'] = test_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_win_rate_at_venue'] = test_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team1_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_win_rate_at_venue'] = test_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team2_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['venue_avg_runs'], test_data['venue_win_rate'] = zip(*test_data.apply(
    lambda x: venue_performance(match_lvl_data, x['ground_id'], x['match_dt']),
    axis=1
))
test_data['team1_night_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_night_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team2_id'], x['match_dt']),
    axis=1
)
test_data['team1_day_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_day_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team2_id'], x['match_dt']),
    axis=1
)
test_data['team1_day_night_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day/night match', x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_day_night_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day/night match', x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team1_toss_advantage'] = test_data.apply(
    lambda x: toss_advantage(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_toss_advantage'] = test_data.apply(
    lambda x: toss_advantage(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team1_adjusted_win_rate_at_venue'] = test_data.apply(
    lambda x: adjusted_team_venue_win_rate(match_lvl_data, x['team1_id'], x['ground_id'], x['match_dt']),
    axis=1
)
test_data['team2_adjusted_win_rate_at_venue'] = test_data.apply(
    lambda x: adjusted_team_venue_win_rate(match_lvl_data, x['team2_id'], x['ground_id'], x['match_dt']),
    axis=1
)

test_data['team1_toss_strategy_at_venue'] = test_data.apply(
    lambda x: toss_strategy_impact(match_lvl_data, x['team1_id'], x['toss decision'], x['ground_id'], x['match_dt']),
    axis=1
)
test_data['team2_toss_strategy_at_venue'] = test_data.apply(
    lambda x: toss_strategy_impact(match_lvl_data, x['team2_id'], x['toss decision'], x['ground_id'], x['match_dt']),
    axis=1
)

test_data['team1_overall_win_rate'] = test_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_overall_win_rate'] = test_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team1_recent_form'] = test_data.apply(
    lambda x: recent_form(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_recent_form'] = test_data.apply(
    lambda x: recent_form(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)
# test_data['team1_head_to_head_win_rate'] = test_data.apply(
#     lambda x: head_to_head_win_rate(match_lvl_data, x['team1_id'], x['team2_id'], x['match_dt']),
#     axis=1
# )
test_data['batsman_performance_index'] = test_data.apply(
    lambda x: batsman_performance_index(batsman_lvl_data, x['match id'], x['match_dt']),
    axis=1
)
test_data['team1_batsman_performance_index'] = test_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)
test_data['team2_batsman_performance_index'] = test_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team1_bowler_impact_score'] = test_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)
test_data['team2_bowler_impact_score'] = test_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team1_adjusted_venue_win_rate'] = test_data.apply(
    lambda x: adjusted_venue_win_rate_with_recent_form(match_lvl_data, x['team1_id'], x['ground_id'], x['match_dt']),
    axis=1
)
test_data['team2_adjusted_venue_win_rate'] = test_data.apply(
    lambda x: adjusted_venue_win_rate_with_recent_form(match_lvl_data, x['team2_id'], x['ground_id'], x['match_dt']),
    axis=1
)

test_data['average_score_at_venue'] = test_data.apply(
    lambda x: average_score_by_venue(
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_win_rate_at_venue'] = test_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team1_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_win_rate_at_venue'] = test_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team2_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_scoring_consistency'] = test_data.apply(
    lambda x: scoring_consistency(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_scoring_consistency'] = test_data.apply(
    lambda x: scoring_consistency(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)
test_data['team1_wicket_loss_variance'] = test_data.apply(
    lambda x: wicket_loss_variance(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_wicket_loss_variance'] = test_data.apply(
    lambda x: wicket_loss_variance(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team1_momentum'] = test_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_momentum'] = test_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team1_win_rate_against_form'] = test_data.apply(
    lambda x: win_rate_against_recent_form(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
test_data['team2_win_rate_against_form'] = test_data.apply(
    lambda x: win_rate_against_recent_form(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
test_data['team1_adjusted_win_rate_at_venue'] = test_data.apply(
    lambda x: adjusted_win_rate_at_venue(match_lvl_data, x['team1_id'], x['venue'], x['match_dt']),
    axis=1
)
test_data['team2_adjusted_win_rate_at_venue'] = test_data.apply(
    lambda x: adjusted_win_rate_at_venue(match_lvl_data, x['team2_id'], x['venue'], x['match_dt']),
    axis=1
)

test_data['team1_post_loss_performance'] = test_data.apply(
    lambda x: performance_after_loss(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
test_data['team2_post_loss_performance'] = test_data.apply(
    lambda x: performance_after_loss(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team1_scoring_variance_at_venue'] = test_data.apply(
    lambda x: scoring_variance_at_venue(match_lvl_data, x['team1_id'], x['venue'], x['match_dt']),
    axis=1
)
test_data['team2_scoring_variance_at_venue'] = test_data.apply(
    lambda x: scoring_variance_at_venue(match_lvl_data, x['team2_id'], x['venue'], x['match_dt']),
    axis=1
)

test_data['team1_impact_score'] = test_data.apply(
    lambda x: player_impact_score(x['team1_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)
test_data['team2_impact_score'] = test_data.apply(
    lambda x: player_impact_score(x['team2_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)

test_data['team1_form_factor'] = test_data.apply(
    lambda x: player_form_factor(x['team1_roster_ids'], batsman_lvl_data, x['match_dt'], 15), axis=1)
test_data['team2_form_factor'] = test_data.apply(
    lambda x: player_form_factor(x['team2_roster_ids'], batsman_lvl_data, x['match_dt'], 15), axis=1)

test_data['team1_historical_matchup_win_pct'] = test_data.apply(
    lambda x: historical_matchup(x['team1_id'], x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)

test_data['team1_recent_form'] = test_data.apply(
    lambda x: recent_team_form(x['team1_id'], x['match_dt'], match_lvl_data),
    axis=1)
test_data['team2_recent_form'] = test_data.apply(
    lambda x: recent_team_form(x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)

test_data['team1_venue_win_pct'] = test_data.apply(
    lambda x: venue_winning_percentage(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1)
test_data['team2_venue_win_pct'] = test_data.apply(
    lambda x: venue_winning_percentage(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1)

test_data['team1_day_light_win_rate'] = test_data.apply(
    lambda x: team_win_rate_under_condition(x['team1_id'], 'lighting', 'day', x['match_dt'], match_lvl_data),
    axis=1)
test_data['team2_day_light_win_rate'] = test_data.apply(
    lambda x: team_win_rate_under_condition(x['team2_id'], 'lighting', 'day', x['match_dt'], match_lvl_data),
    axis=1)

test_data['team1_avg_runs_scored_at_venue'], test_data['team1_avg_runs_conceded_at_venue'] = zip(*test_data.apply(
    lambda x: team_venue_scoring(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1))
test_data['team2_avg_runs_scored_at_venue'], test_data['team2_avg_runs_conceded_at_venue'] = zip(*test_data.apply(
    lambda x: team_venue_scoring(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1))

# Head-to-Head Win Rate
# test_data['team1_head_to_head_win_rate'] = test_data.apply(lambda x: head_to_head_win_rate(x['team1_id'], x['team2_id'], match_lvl_data), axis=1)
# test_data['team2_head_to_head_win_rate'] = test_data.apply(lambda x: head_to_head_win_rate(x['team2_id'], x['team1_id'], match_lvl_data), axis=1)

# Applying the feature to the DataFrame
test_data['team1_adjusted_venue_win_rate'] = test_data.apply(
    lambda x: adjusted_venue_win_rate(x['team1_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
test_data['team2_adjusted_venue_win_rate'] = test_data.apply(
    lambda x: adjusted_venue_win_rate(x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
test_data['team1_head_to_head_venue_win_rate'] = test_data.apply(
    lambda x: historical_matchup_at_venue(x['team1_id'], x['team2_id'], x['venue'], x['match_dt'], match_lvl_data),
    axis=1
)
# Toss Win Advantage
test_data['team1_toss_win_advantage'] = test_data.apply(lambda x: toss_win_advantage(x['team1_id'], match_lvl_data), axis=1)
test_data['team2_toss_win_advantage'] = test_data.apply(lambda x: toss_win_advantage(x['team2_id'], match_lvl_data), axis=1)

# Batting Order Advantage
test_data['team1_batting_first_advantage'] = test_data.apply(lambda x: batting_order_advantage(x['team1_id'], match_lvl_data), axis=1)
test_data['team2_batting_first_advantage'] = test_data.apply(lambda x: batting_order_advantage(x['team2_id'], match_lvl_data), axis=1)

# Average Runs Conceded
test_data['team1_avg_runs_conceded'] = test_data.apply(lambda x: avg_runs_conceded(x['team1_id'], match_lvl_data, batsman_lvl_data, bowler_lvl_data), axis=1)
test_data['team2_avg_runs_conceded'] = test_data.apply(lambda x: avg_runs_conceded(x['team2_id'], match_lvl_data, batsman_lvl_data, bowler_lvl_data), axis=1)

# Average Batting Strike Rate
test_data['team1_avg_batting_strike_rate'] = test_data.apply(lambda x: avg_batting_strike_rate(x['team1_id'], match_lvl_data, batsman_lvl_data), axis=1)
test_data['team2_avg_batting_strike_rate'] = test_data.apply(lambda x: avg_batting_strike_rate(x['team2_id'], match_lvl_data, batsman_lvl_data), axis=1)

# Applying the function to test_data for an example usage
test_data['team1_recent_mom_count'] = test_data.apply(
    lambda x: count_recent_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)
test_data['team2_recent_mom_count'] = test_data.apply(
    lambda x: count_recent_mom_awards(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)

test_data['ground_avg_runs_inning1_day'] = test_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day', 1, match_lvl_data), axis=1)
test_data['ground_avg_runs_inning1_night'] = test_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'night', 1, match_lvl_data), axis=1)
test_data['ground_avg_runs_inning1_day_night'] = test_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day/night', 1, match_lvl_data), axis=1)

test_data['ground_avg_runs_inning2_day'] = test_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day', 2, match_lvl_data), axis=1)
test_data['ground_avg_runs_inning2_night'] = test_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'night', 2, match_lvl_data), axis=1)
test_data['ground_avg_runs_inning2_day_night'] = test_data.apply(
    lambda x: avg_runs_at_venue_by_conditions(x['ground_id'], 'day/night', 2, match_lvl_data), axis=1)

# Applying this feature to test_data
test_data['team1_dominance'] = test_data.apply(
    lambda x: average_margin_of_victory(x['team1_id'], match_lvl_data, x['match_dt']), axis=1)
test_data['team2_dominance'] = test_data.apply(
    lambda x: average_margin_of_victory(x['team2_id'], match_lvl_data, x['match_dt']), axis=1)

test_data['team1_weighted_mom'] = test_data.apply(
    lambda x: weighted_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)
test_data['team2_weighted_mom'] = test_data.apply(
    lambda x: weighted_mom_awards(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)

test_data['team1_consistency_score'] = test_data.apply(
    lambda x: team_consistency_score(x['team1_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)
test_data['team2_consistency_score'] = test_data.apply(
    lambda x: team_consistency_score(x['team2_roster_ids'], batsman_lvl_data, bowler_lvl_data, x['match_dt'], 10), axis=1)

100%|██████████| 271/271 [00:02<00:00, 91.04it/s]
100%|██████████| 271/271 [00:02<00:00, 91.01it/s]
100%|██████████| 271/271 [00:02<00:00, 100.18it/s]
100%|██████████| 271/271 [00:02<00:00, 98.15it/s] 
100%|██████████| 271/271 [00:02<00:00, 98.34it/s] 
100%|██████████| 271/271 [00:02<00:00, 112.84it/s]
100%|██████████| 271/271 [00:02<00:00, 106.57it/s]
100%|██████████| 271/271 [00:02<00:00, 122.59it/s]
100%|██████████| 271/271 [00:02<00:00, 97.11it/s]
100%|██████████| 271/271 [00:02<00:00, 96.28it/s]
  average_score = (relevant_matches['inning1_runs'].sum() + relevant_matches['inning2_runs'].sum()) / (2 * len(relevant_matches))


In [22]:
train_data.to_csv('train_data_combined.csv', index=False)
test_data.to_csv('test_data_combined.csv', index=False)

## Check

In [None]:
train_data.columns

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
col = list(abs(train_data.select_dtypes(include=['number']).corr()['winner_01']).sort_values(ascending = False)[:20].index)

In [None]:
if 'winner_id' in col:
    col.remove('winner_id')

if 'team1_id' in col:
    col.remove('team1_id')

if 'team2_id' in col:
    col.remove('team2_id')

if 'match id' in col:
    col.remove('match id')

In [None]:
# train_data.to_csv('combined.csv',index=False)

## Model

In [None]:
df = train_data.select_dtypes(include=['number'])

In [None]:
# df.drop(['match id', 'team1_id', 'team2_id', 'ground_id','winner_id'], axis=1, inplace=True)
df = df[col]

In [None]:
# df.fillna(0,inplace=True)
# df.replace([np.inf, -np.inf], 0, inplace=True)


# Replace infinite values with NaN to handle them similarly
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(df.mean(), inplace=True)

In [None]:
df.shape

In [None]:
df.corr()['winner_01'].sort_values(ascending = True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

X = df.drop('winner_01', axis=1)
y = df['winner_01']

poly = PolynomialFeatures(degree=2, include_bias=True)
X = poly.fit_transform(X)
print(X.shape)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Select number of components to explain desired variance (e.g., 95%)
desired_variance = 0.95
n_components = np.argmax(cumulative_explained_variance >= desired_variance) + 1
print(n_components)

pca = PCA(n_components=n_components)
X = pca.fit_transform(X_scaled)
print(X.shape)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [None]:
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train, y_train)
LGBM_model.fit(X_train, y_train)
XGB_model.fit(X_train, y_train)
CatBoost_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

In [None]:
import optuna
from catboost import Pool
train_pool = Pool(data=X_train, label=y_train)
valid_pool = Pool(data=X_test, label=y_test)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': 0  # Suppress output for tuning
    }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    
    preds = model.predict(valid_pool)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model_cat = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    od_type=best_params['od_type'],
    od_wait=best_params['od_wait'],
    verbose=100  # To monitor the training process
)

best_model_cat.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model_cat.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming X_train, X_test, y_train, y_test are already defined
# If not, you can split your data as follows:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 10.0),
        'gamma': trial.suggest_loguniform('gamma', 1e-3, 10.0),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'random_state': 42,
    }
    
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=0)
    
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model_xgb = xgb.XGBClassifier(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    reg_alpha=best_params['reg_alpha'],
    reg_lambda=best_params['reg_lambda'],
    random_state=42,
    verbose=1  # To monitor the training process
)

best_model_xgb.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model_xgb.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)


In [None]:
import optuna
import lightgbm as lgb

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'num_iterations': trial.suggest_int('num_iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 200),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'verbose': -1  # Suppress output for tuning
    }
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
    
    model = lgb.train(params, train_data, valid_sets=[valid_data])
    
    preds = model.predict(X_test)
    pred_labels = [1 if p > 0.5 else 0 for p in preds]
    accuracy = accuracy_score(y_test, pred_labels)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model_lgbm = lgb.LGBMClassifier(
    num_iterations=best_params['num_iterations'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    num_leaves=best_params['num_leaves'],
    min_data_in_leaf=best_params['min_data_in_leaf'],
    lambda_l1=best_params['lambda_l1'],
    lambda_l2=best_params['lambda_l2'],
    bagging_fraction=best_params['bagging_fraction'],
    bagging_freq=best_params['bagging_freq'],
    feature_fraction=best_params['feature_fraction'],
    verbose=100  # To monitor the training process
)

best_model_lgbm.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model_lgbm.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(f"Final accuracy: {final_accuracy}")

## Test

In [None]:
len(col)

In [None]:
df = test_data.select_dtypes(include=['number'])

In [None]:
col.remove('winner_01')
df = df[col]
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(df.mean(), inplace=True)

In [None]:
df.shape

In [None]:
# Transform test data using the previously fitted PolynomialFeatures
X_test_poly = poly.transform(df)  # Note: Use transform, not fit_transform

# Scale the test data using the previously fitted StandardScaler
X_test_scaled = scaler.transform(X_test_poly)  # Note: Use transform, not fit_transform

# Apply PCA to the test data using the previously fitted PCA
X_test_reduced = pca.transform(X_test_scaled)  # Note: Use transform, not fit_transform

print("Test shape after Polynomial Features:", X_test_poly.shape)
print("Test shape after Standard Scaling:", X_test_scaled.shape)
print("Test shape after PCA:", X_test_reduced.shape)

# X_test_reduced = df

In [None]:
best_model_cat.fit(X, y)
best_model_xgb.fit(X, y)
best_model_lgbm.fit(X, y)

In [None]:
# pred = best_model.predict(X_test_reduced)

final_preds_cat = best_model_cat.predict(X_test_reduced)
final_preds_xgb = best_model_xgb.predict(X_test_reduced)
final_preds_lgbm = best_model_lgbm.predict(X_test_reduced)

In [None]:
def count_zeros(y_test):
    total_elements = len(y_test)
    count_zeros = np.sum(y_test == 0)
    percentage_zeros = (count_zeros / total_elements) * 100
    return percentage_zeros

In [None]:
print("% zeros in Catboost_model ",count_zeros(final_preds_cat))
print("% zeros in LGBM_model ",count_zeros(final_preds_lgbm))
print("% zeros in XGB_model ",count_zeros(final_preds_xgb))