In [1]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
match_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/664389efa0868_match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b548c98c_batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b2c60743_bowler_level_scorecard.csv')
train_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')
test_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [3]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

In [4]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.

    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}

    Output-None

    Returns- dataframe having bowling/batting stats from last n games of a player before an input date.
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [5]:
bowler_lvl_data.columns

Index(['match id', 'bowler', 'bowler_id', 'bowler_details',
       'is_bowler_captain', 'is_bowler_keeper', 'inning', 'runs',
       'wicket_count', 'balls_bowled', 'economy', 'maiden', 'dots', 'Fours',
       'Sixes', 'wides', 'noballs', 'match_dt'],
      dtype='object')

Bowler's features

In [6]:
#new feature 1 bowler's performance of teams in recent mathes
def team_bowler_performance_recent(bowler_df, match_id, team_roster_ids, date, n=15):
    team_ids = str(team_roster_ids).split(':')
    total_impact_score = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        if not recent_matches.empty:
            player_impact = (recent_matches['wicket_count']) * 5 + recent_matches['dots']*1.5+(recent_matches['maiden']) * 1.5 -(recent_matches['Fours'] * 1) - recent_matches['Sixes']*1.5 - recent_matches['wides'] - recent_matches['noballs']                             
            total_impact_score += player_impact.sum()

    return total_impact_score

train_data['team1_bowler_performance_recent'] = train_data.apply(
    lambda x: team_bowler_performance_recent(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team2_bowler_performance_recent'] = train_data.apply(
    lambda x: team_bowler_performance_recent(bowler_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

Batsmen features

In [7]:
def team_batsman_performance_recent(bat_df, match_id, team_roster_ids, date, n=15):
    team_ids = str(team_roster_ids).split(':')
    total_performance_index = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            player_performance = (recent_matches['runs'] * 4.5 +
                                  recent_matches['strike_rate'] * 2.5 +
                                  (recent_matches['Fours'] + recent_matches['Sixes'] ) * 1.5) 
                                  
            total_performance_index += player_performance.sum()

    return total_performance_index

train_data['team1_batsman_performance_recent'] = train_data.apply(
    lambda x: team_batsman_performance_recent(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team2_batsman_performance_recent'] = train_data.apply(
    lambda x: team_batsman_performance_recent(batsman_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

Match features

In [8]:
#new feature 
def runs_conceded_teams_recent(player_list, date, n=15):
    player_list = str(player_list).split(':')
    total_runs_conceded = 0

    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_runs_conceded += recent_matches['runs'].sum()
        
    return total_runs_conceded


train_data['runs_conceded_team1_recent'] = train_data.progress_apply(
    lambda x: runs_conceded_teams_recent(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1
)
train_data['runs_conceded_team2_recent'] = train_data.progress_apply(
    lambda x: runs_conceded_teams_recent(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1
)


100%|██████████| 948/948 [00:07<00:00, 126.09it/s]
100%|██████████| 948/948 [00:07<00:00, 127.21it/s]


In [9]:
#note this contains 350 zeros , have to be considered 
def headToHeadRatio(team1_id, team2_id, date):
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                             (((match_lvl_data['team1_id']==team1_id)&(match_lvl_data['team2_id']==team2_id))|\
                              ((match_lvl_data['team1_id']==team2_id)&(match_lvl_data['team2_id']==team1_id)))]
    team1_wins = df_rel[df_rel['winner_id']==team1_id].shape[0]
    team2_wins = df_rel[df_rel['winner_id']==team2_id].shape[0]
    total_matches = df_rel.shape[0]
    if total_matches == 0:
        return 0
    return team1_wins / total_matches

train_data['head_to_head_ratio'] = train_data.progress_apply(lambda x: \
                                  headToHeadRatio(x['team1_id'], x['team2_id'], x['match_dt']), axis=1)

100%|██████████| 948/948 [00:00<00:00, 2367.77it/s]


In [10]:
match_lvl_data['team1_bat_inning'] = np.where( ((match_lvl_data['team1']==match_lvl_data['toss winner'])&(match_lvl_data['toss decision']=='bat'))|\
                                               ((match_lvl_data['team2']==match_lvl_data['toss winner'])&(match_lvl_data['toss decision']=='field')) , 1, 2)



In [11]:
def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    
    Output- None
    
    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    # combine two dataframes - one where input team is batting first, and another one where input team is batting second.
    df_rel = pd.concat([ df_rel[df_rel['team1_bat_inning']==1][['inning1_runs']].rename(columns={'inning1_runs':'runs'}), \
                         df_rel[df_rel['team1_bat_inning']==2][['inning2_runs']].rename(columns={'inning2_runs':'runs'}) ] )
    return df_rel['runs'].mean() # return mean of the combined dataframe.

train_data['team2only_avg_runs_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)




100%|██████████| 948/948 [00:00<00:00, 1066.03it/s]


In [12]:
train_data['ground-teamavg1'] = train_data['ground_avg_runs_last15']-train_data['team1only_avg_runs_last15']

In [13]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other.
    '''
    # filter out games where either team1_id is input team1 and team2_id is input team2, or where team2_id is input team1 and team1_id is input team2.
    # Also, match date is less than current games's input date, sort desc by date and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      (((match_lvl_data['team1_id']==team1_id)&(match_lvl_data['team2_id']==team2_id))|((match_lvl_data['team1_id']==team2_id)&(match_lvl_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0] # Counting number of rows (games) where winner is input team1.
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return Float denoting team1's win% against team2 in past n games rounded to 2 decimal places.


# In[34]:


# Compute team1 win% against team2 in their past 15 encounters for train data.
train_data['team1_winp_team2_last15'] = train_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)

100%|██████████| 948/948 [00:00<00:00, 2165.45it/s]


In [14]:
def winpLastn(team_id, date, n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']==team_id].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

train_data['team1_winp_last25'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 25), axis=1)
# Compute team2's win% in last 5 games
train_data['team2_winp_last25'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 25), axis=1)

100%|██████████| 948/948 [00:00<00:00, 2501.90it/s]
100%|██████████| 948/948 [00:00<00:00, 2564.80it/s]


In [15]:
def count_recent_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = str(team_roster).split(':')  # Split string of ':' separated ids into a list of ids
    total_mom_awards = 0

    for player_id in player_ids:  # Loop over each player_id in roster
        # Get relevant matches for the player up to the specified date
        player_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'] == float(player_id)) &
                                        (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

        # Count the number of matches where this player was MoM
        mom_count = len(player_matches)
        total_mom_awards += mom_count  # Accumulate MoM awards

    return total_mom_awards

train_data['team1_recent_mom_count'] = train_data.apply(
    lambda x: count_recent_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)

train_data['team2_recent_mom_count'] = train_data.apply(
    lambda x: count_recent_mom_awards(x['team2_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)



Note while making test data , just uncomment the below comment

In [16]:
def assign_toss_winner_flag(row):
    if row['team1'] == row['toss winner']:
        return row['team1_id']
    elif row['team2'] == row['toss winner']:
        return row['team2_id']
train_data['toss_winner_id'] = train_data.apply(assign_toss_winner_flag, axis=1)
match_lvl_data['toss_winner_id'] = match_lvl_data.apply(assign_toss_winner_flag, axis=1)
# test_lvl_data['toss_winner_id']= test_lvl_data.apply(assign_toss_winner_flag, axis=1)

In [17]:
def assign_bat(row):
    if (row['toss_winner_id'] == row['team1_id']) & (row['toss decision']=='bat'):
        return row['team1_id']
    if (row['toss_winner_id'] != row['team1_id']) & (row['toss decision']=='field'):
        return row['team1_id']
    else:
        return row['team2_id']
train_data['bat_id'] = train_data.apply(assign_bat, axis=1) 
match_lvl_data['bat_id'] = match_lvl_data.apply(assign_bat, axis=1)
# test_lvl_data['bat_id'] = test_lvl_data.apply(assign_bat, axis=1)

In [18]:
def winpLastn_batndchase(team_id, bat_id,date,n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    if(bat_id==team_id):
        df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id) )]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
        win_count = df_rel[(df_rel['winner_id' ]==team_id )& (df_rel['bat_id']==team_id)].shape[0] # count number of rows having winner as the input team
        if win_count == 0:
          return 0
        return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points
    else:
       df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
       
       win_count = df_rel[(df_rel['winner_id']==team_id) & (df_rel['bat_id']!=team_id)].shape[0] # count number of rows having winner as the input team
       if win_count == 0:
          return 0
       return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points
       

train_data['team1_winp_bat\chase'] = train_data.progress_apply(lambda x: \
                                  winpLastn_batndchase(x['team1_id'], x['bat_id'], x['match_dt'], 15), axis=1)
train_data['team2_winp_bat\chase'] = train_data.progress_apply(lambda x: \
                                  winpLastn_batndchase(x['team2_id'], x['bat_id'], x['match_dt'], 15), axis=1)


  train_data['team1_winp_bat\chase'] = train_data.progress_apply(lambda x: \
  train_data['team2_winp_bat\chase'] = train_data.progress_apply(lambda x: \
100%|██████████| 948/948 [00:00<00:00, 2063.03it/s]
100%|██████████| 948/948 [00:00<00:00, 2016.70it/s]


Note that the corelation decreased , and the below function generates more zeroes, so can be ignored

In [19]:

# Function to calculate the percentage of victories for team 1 batting first
def calculate_victory_percentage(row, match_lvl_data):
    team1_id = row['team1_id']
    match_date = row['match_dt']
    
    # Filter matches before the current match date
    past_matches = match_lvl_data[match_lvl_data['match_dt'] < match_date]
    
    # Total matches where team1 batted first
    matches_team1_bat_first = past_matches[(past_matches['team1_id'] == team1_id) & (past_matches['by'] == 'runs')]
    
    # Victories of team1 when batting first
    victories_team1_bat_first = matches_team1_bat_first[matches_team1_bat_first['winner_id'] == team1_id]
    
    if len(matches_team1_bat_first) > 0:
        pct_vic_team1_bat_first = len(victories_team1_bat_first) / len(matches_team1_bat_first) * 100
    else:
        pct_vic_team1_bat_first = 0

    return pct_vic_team1_bat_first

# Calculate the feature and append it to the train data
train_data['pct_vic_team1_bat_first'] = train_data.progress_apply(
    lambda row: calculate_victory_percentage(row, match_lvl_data), axis=1
)

# Similarly, create other features such as `pct_vic_team1_bowl_first`, `pct_vic_team2_bat_first`, etc.
def calculate_victory_percentage_other(row, match_lvl_data, team_column, by_condition):
    team_id = row[team_column]
    match_date = row['match_dt']
    
    # Filter matches before the current match date
    past_matches = match_lvl_data[match_lvl_data['match_dt'] < match_date]
    
    # Total matches where the team meets the condition (bat first or bowl first)
    matches_team_condition = past_matches[(past_matches[team_column] == team_id) & (past_matches['by'] == by_condition)]
    
    # Victories of the team under the given condition
    victories_team_condition = matches_team_condition[matches_team_condition['winner_id'] == team_id]
    
    if len(matches_team_condition) > 0:
        pct_vic_team_condition = len(victories_team_condition) / len(matches_team_condition) * 100
    else:
        pct_vic_team_condition = 0
    
    return pct_vic_team_condition

# Calculate additional features
train_data['pct_vic_team1_bat_first'] = train_data.progress_apply(
    lambda row: calculate_victory_percentage_other(row, match_lvl_data, 'team1_id', 'runs'), axis=1
)

train_data['pct_vic_team1_bowl_first'] = train_data.progress_apply(
    lambda row: calculate_victory_percentage_other(row, match_lvl_data, 'team1_id', 'wickets'), axis=1
)

train_data['pct_vic_team2_bat_first'] = train_data.progress_apply(
    lambda row: calculate_victory_percentage_other(row, match_lvl_data, 'team2_id', 'runs'), axis=1
)

train_data['pct_vic_team2_bowl_first'] = train_data.progress_apply(
    lambda row: calculate_victory_percentage_other(row, match_lvl_data, 'team2_id', 'wickets'), axis=1
)

# Saving the enriched dataset
# train_data_with_sample_features.to_csv('enriched_train_data.csv', index=False)


100%|██████████| 948/948 [00:00<00:00, 1784.40it/s]
100%|██████████| 948/948 [00:00<00:00, 1790.01it/s]
100%|██████████| 948/948 [00:00<00:00, 1812.81it/s]
100%|██████████| 948/948 [00:00<00:00, 1819.75it/s]
100%|██████████| 948/948 [00:00<00:00, 1712.84it/s]


## Adding positive corr functions

In [20]:
def recent_form(match_lvl_data, team_id, date):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < date)].tail(5)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / 5 if recent_matches.shape[0] > 0 else 0

train_data['team1_recent_form'] = train_data.apply(
    lambda x: recent_form(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_recent_form'] = train_data.apply(
    lambda x: recent_form(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

In [21]:
def calculate_exponential_momentum(match_lvl_data, team_id, date, alpha=0.1):
    matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                             (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False)
    wins = matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).to_numpy()
    weights = np.exp(-alpha * np.arange(len(wins)))
    if np.sum(weights) > 0:
        momentum = np.dot(wins, weights) / np.sum(weights)
    else:
        momentum = 0  # Handle case with no matches
    return momentum


train_data['team1_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

In [22]:
def average_margin_of_victory(matches, team_id, date):
    past_matches = matches[(matches['match_dt'] < date) & ((matches['team1_id'] == team_id) | (matches['team2_id'] == team_id))]
    victory_margins = past_matches.apply(lambda x: x['win amount'] if x['winner_id'] == team_id else -x['win amount'], axis=1)
    return victory_margins.mean() if not victory_margins.empty else 0

train_data['team1_avg_margin_of_victory'] = train_data.apply(lambda x: average_margin_of_victory(match_lvl_data, x['team1_id'], x['match_dt']), axis=1)
train_data['team2_avg_margin_of_victory'] = train_data.apply(lambda x: average_margin_of_victory(match_lvl_data, x['team2_id'], x['match_dt']), axis=1)

In [23]:
def lighting_performance(match_lvl_data, lighting_type, team_id, date):
    matches = match_lvl_data[(match_lvl_data['lighting'] == lighting_type) & (match_lvl_data['match_dt'] < date)]
    if len(matches) > 0:
        wins = matches[matches['winner_id'] == team_id].shape[0]
        return wins / len(matches)
    return 0  # Return 0 if no matches found under this condition

train_data['team1_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team2_id'], x['match_dt']),
    axis=1
)

In [24]:
def overall_historical_win_rate(match_lvl_data, team_id, date):
    historical_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                        (match_lvl_data['match_dt'] < date)]
    wins = historical_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    total_matches = len(historical_matches)
    return wins / total_matches if total_matches > 0 else 0

train_data['team1_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)
train_data['team2_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

In [25]:
def batting_first_win_rate(matches, team_id, date):
    first_batting_matches = matches[(matches['match_dt'] < date) & (matches['team1_id'] == team_id)]
    wins = first_batting_matches[first_batting_matches['winner_id'] == team_id].shape[0]
    total = first_batting_matches.shape[0]
    return wins / total if total > 0 else 0

train_data['team1_batting_first_win_rate'] = train_data.apply(lambda x: batting_first_win_rate(match_lvl_data, x['team1_id'], x['match_dt']), axis=1)
train_data['team2_batting_first_win_rate'] = train_data.apply(lambda x: batting_first_win_rate(match_lvl_data, x['team2_id'], x['match_dt']), axis=1)

## Adding some more features

In [26]:
def player_of_match_frequency(matches, team_roster_ids, date):
    team_ids = str(team_roster_ids).split(':')
    pom_awards = 0
    total_matches = 0

    for player_id in team_ids:
        past_matches = matches[(matches['match_dt'] < date) & (matches['player_of_the_match_id'] == float(player_id))]
        pom_awards += past_matches.shape[0]
        total_matches += matches[(matches['match_dt'] < date) & ((matches['team1_roster_ids'].str.contains(str(player_id))) | (matches['team2_roster_ids'].str.contains(str(player_id))))].shape[0]

    return pom_awards / total_matches if total_matches > 0 else 0

train_data['team1_pom_frequency'] = train_data.apply(lambda x: player_of_match_frequency(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
train_data['team2_pom_frequency'] = train_data.apply(lambda x: player_of_match_frequency(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)

In [27]:
def team_count_100runs_last15(matches, team_roster, date, n=15):
    def count_100s(team_roster):
        total_100s = 0
        for player_id in team_roster.split(':'):
            player_matches = giveLastNgamesPlayer(player_id, date, n, 'bat')
            total_100s += (player_matches['runs'] >= 75).sum()
        return total_100s

    return count_100s(team_roster)

train_data['team1_count_100runs_last15'] = train_data.apply(lambda x: team_count_100runs_last15(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
train_data['team2_count_100runs_last15'] = train_data.apply(lambda x: team_count_100runs_last15(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)

In [28]:
train_data.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'season', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'winner_01',
       'team1_bowler_performance_recent', 'team2_bowler_performance_recent',
       'team1_batsman_performance_recent', 'team2_batsman_performance_recent',
       'runs_conceded_team1_recent', 'runs_conceded_team2_recent',
       'head_to_head_ratio', 'team2only_avg_runs_last15', 'ground-teamavg1',
       'team1_winp_last25', 'team2_winp_last25', 'team1_recent_mom_count',
       'team2_recent_mom_count', 'toss_winner_id', 'bat_id',
       'team1_winp_bat\chase', 'team2_winp_bat\chase',
       'pct_vic_team1_bat_first', 'pct_vic_team1_bowl_first',
       'pct_vic_team2_bat_first', '

In [29]:
count_zeros = train_data.apply(lambda x: (x == 0).sum())

# Count NaN values in each column
count_nans = train_data.isna().sum()

counts = pd.DataFrame({
    'zeros': count_zeros,
    'nans': count_nans
})

# Filter columns that have either zeros or NaNs
filtered_counts = counts[(counts['zeros'] > 0) | (counts['nans'] > 0)]

print(filtered_counts)

# print("Count of zeros in each column:")
# print(count_zeros)

# print("\nCount of NaNs in each column:")
# print(count_nans)

                                  zeros  nans
team1only_avg_runs_last15             0    21
team1_winp_team2_last15             356     0
ground_avg_runs_last15                0    53
winner_01                           471     0
team1_bowler_performance_recent      12     0
team2_bowler_performance_recent      11     0
team1_batsman_performance_recent     20     0
team2_batsman_performance_recent     12     0
runs_conceded_team1_recent           12     0
runs_conceded_team2_recent           11     0
head_to_head_ratio                  356     0
team2only_avg_runs_last15             0    22
ground-teamavg1                       0    68
team1_winp_last25                    39     0
team2_winp_last25                    42     0
team1_recent_mom_count               76     0
team2_recent_mom_count               56     0
team1_winp_bat\chase                 90     0
team2_winp_bat\chase                 85     0
pct_vic_team1_bat_first              82     0
pct_vic_team1_bowl_first          

In [142]:
train_data.drop(columns=['team1_id','team2_id','match id','ground_id','winner_id'], inplace=True)

KeyError: "['winner_id'] not found in axis"

In [None]:
abs(train_data.select_dtypes(include=['number']).corr()['winner_01']).sort_values(ascending = False)

winner_01                           1.000000
team1_pom_frequency                 0.164982
team1_bowler_performance_recent     0.147883
team1_recent_mom_count              0.141026
team1_batsman_performance_recent    0.134133
runs_conceded_team1_recent          0.125668
team1_count_100runs_last15          0.118201
team_count_50runs_last15            0.110761
team2_avg_margin_of_victory         0.101463
team1_batting_first_win_rate        0.094481
team2_recent_form                   0.091641
team1_overall_win_rate              0.091137
team1_winp_last25                   0.085741
team2_momentum                      0.083399
team1_momentum                      0.079657
team2_batting_first_win_rate        0.078320
team1_winp_bat\chase                0.077597
team1_avg_margin_of_victory         0.075163
team2_day_match_win_rate            0.073309
team2_overall_win_rate              0.072307
team2_winp_last25                   0.069452
toss_winner_id                      0.061980
head_to_he

In [None]:
train_data.select_dtypes(include=['number']).corr()['winner_01'].sort_values(ascending = False)

winner_01                           1.000000
team2_avg_margin_of_victory         0.101463
team2_recent_form                   0.091641
team2_momentum                      0.083399
team2_batting_first_win_rate        0.078320
team2_day_match_win_rate            0.073309
team2_overall_win_rate              0.072307
team2_winp_last25                   0.069452
team2_winp_bat\chase                0.031357
ground-teamavg1                     0.018932
team1_day_match_win_rate            0.018755
pct_vic_team1_bowl_first            0.018044
pct_vic_team2_bowl_first            0.016128
team2_bowler_performance_recent     0.015306
team2_count_100runs_last15          0.004007
runs_conceded_team2_recent         -0.005665
team_winp_last5                    -0.011712
ground_avg_runs_last15             -0.012359
team2_batsman_performance_recent   -0.014706
team2_recent_mom_count             -0.020679
team2_pom_frequency                -0.023783
team1_recent_form                  -0.041517
team1only_

## Feature Selection

In [None]:
df = train_data.select_dtypes(include=['number'])

In [None]:
df.head()

Unnamed: 0,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,winner_01,team1_bowler_performance_recent,team2_bowler_performance_recent,team1_batsman_performance_recent,team2_batsman_performance_recent,...,team1_day_match_win_rate,team2_day_match_win_rate,team1_overall_win_rate,team2_overall_win_rate,team1_batting_first_win_rate,team2_batting_first_win_rate,team1_pom_frequency,team2_pom_frequency,team1_count_100runs_last15,team2_count_100runs_last15
0,1.666667,0.672131,139.0,100.0,157.178571,1,807.5,464.0,3988.8,3913.725,...,0.016736,0.012552,0.6,0.636364,0.5,0.555556,0.0,0.0,1,0
1,1.285714,1.952381,156.0,50.0,103.5,0,1069.5,1266.5,17636.225,24919.6,...,0.008746,0.008746,0.611111,0.434783,0.5,0.555556,0.076705,0.060484,0,1
2,0.857143,0.672131,173.266667,0.0,154.333333,0,883.5,704.5,18055.0,21504.45,...,0.007353,0.003676,0.428571,0.590909,0.555556,0.777778,0.030568,0.075107,2,4
3,2.166667,1.97561,164.266667,50.0,144.25,0,906.0,1078.5,35780.3,24493.75,...,0.008361,0.011706,0.55,0.5,0.636364,0.5,0.034642,0.051331,3,2
4,0.818182,1.327869,164.666667,0.0,189.0,1,1346.5,1187.0,26730.175,23252.3,...,0.001855,0.0,0.48,0.692308,0.444444,0.571429,0.051522,0.046875,4,2


In [None]:
df.describe()

Unnamed: 0,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,winner_01,team1_bowler_performance_recent,team2_bowler_performance_recent,team1_batsman_performance_recent,team2_batsman_performance_recent,...,team1_day_match_win_rate,team2_day_match_win_rate,team1_overall_win_rate,team2_overall_win_rate,team1_batting_first_win_rate,team2_batting_first_win_rate,team1_pom_frequency,team2_pom_frequency,team1_count_100runs_last15,team2_count_100runs_last15
count,948.0,948.0,927.0,948.0,895.0,948.0,948.0,948.0,948.0,948.0,...,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0,948.0
mean,1.202602,4.904243,161.578491,40.90519,153.822446,0.503165,836.135549,832.69884,18731.210812,18842.874552,...,0.00591,0.00593,0.503097,0.501136,0.492261,0.485259,0.042784,0.043344,2.352321,2.360759
std,1.008793,15.83962,14.449746,37.796855,13.971631,0.500254,331.483656,324.879764,8804.268466,8601.510917,...,0.005946,0.005927,0.181283,0.185252,0.237273,0.245822,0.021512,0.021623,2.090438,2.064079
min,0.083333,0.009901,103.0,0.0,81.5,0.0,-6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.686678,0.60396,153.4,0.0,145.381818,0.0,656.0,650.0,13735.35625,13520.925,...,0.001832,0.001691,0.407118,0.4,0.333333,0.333333,0.029575,0.028571,1.0,1.0
50%,1.0,1.0,163.125,40.0,154.666667,1.0,871.25,871.5,19237.3,19913.0625,...,0.003868,0.00415,0.5,0.5,0.5,0.5,0.04529,0.045455,2.0,2.0
75%,1.4,1.952381,171.27619,66.67,162.426768,1.0,1076.125,1061.125,25067.75,24977.40625,...,0.008641,0.008818,0.615385,0.625,0.636364,0.636364,0.056755,0.058098,4.0,4.0
max,11.0,101.0,218.5,100.0,209.5,1.0,1621.5,1722.0,41413.375,45281.575,...,0.025478,0.025157,1.0,1.0,1.0,1.0,0.111111,0.125,12.0,11.0


In [None]:
df.columns

Index(['team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'winner_01',
       'team1_bowler_performance_recent', 'team2_bowler_performance_recent',
       'team1_batsman_performance_recent', 'team2_batsman_performance_recent',
       'runs_conceded_team1_recent', 'runs_conceded_team2_recent',
       'head_to_head_ratio', 'team2only_avg_runs_last15', 'ground-teamavg1',
       'team1_winp_last25', 'team2_winp_last25', 'team1_recent_mom_count',
       'team2_recent_mom_count', 'toss_winner_id', 'bat_id',
       'team1_winp_bat\chase', 'team2_winp_bat\chase',
       'pct_vic_team1_bat_first', 'pct_vic_team1_bowl_first',
       'pct_vic_team2_bat_first', 'pct_vic_team2_bowl_first',
       'team1_recent_form', 'team2_recent_form', 'team1_momentum',
       'team2_momentum', 'team1_avg_margin_of_victory',
       'team2_avg_margin_of_victory', 'team1_day_match_win_rate',
       'team2_day_match_win_rate'

In [None]:
df.isna().sum()

team_count_50runs_last15             0
team_winp_last5                      0
team1only_avg_runs_last15           21
team1_winp_team2_last15              0
ground_avg_runs_last15              53
winner_01                            0
team1_bowler_performance_recent      0
team2_bowler_performance_recent      0
team1_batsman_performance_recent     0
team2_batsman_performance_recent     0
runs_conceded_team1_recent           0
runs_conceded_team2_recent           0
head_to_head_ratio                   0
team2only_avg_runs_last15           22
ground-teamavg1                     68
team1_winp_last25                    0
team2_winp_last25                    0
team1_recent_mom_count               0
team2_recent_mom_count               0
toss_winner_id                       0
bat_id                               0
team1_winp_bat\chase                 0
team2_winp_bat\chase                 0
pct_vic_team1_bat_first              0
pct_vic_team1_bowl_first             0
pct_vic_team2_bat_first  

In [None]:
df.fillna(0, inplace=True)

In [None]:
selected_columns = []

for col in train_data.select_dtypes(include=['number']).columns:
    if train_data[col].corr(train_data['winner_01']) <= -0.1:
        selected_columns.append(col)

for col in train_data.select_dtypes(include=['number']).columns:
    if train_data[col].corr(train_data['winner_01']) >=0.05:
        selected_columns.append(col)

df = df[selected_columns]

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, RobustScaler

def standard_scale(df):
    """ Standardize features by removing the mean and scaling to unit variance """
    scaler = StandardScaler()
    df_scaled = df.copy()
    df_scaled = scaler.fit_transform(df)
    return df_scaled

def log_transform(df):
    """ Apply log transformation to specified columns """
    df_transformed = df.copy()
    for column in df.columns:
        df_transformed[column + '_log'] = np.log1p(df[column])
    return df_transformed

def min_max_scale(df):
    """ Scale features to a given range, typically [0, 1] """
    min_max_scaler = MinMaxScaler()
    df_minmax = df.copy()
    df_minmax = min_max_scaler.fit_transform(df)
    return df_minmax

def power_transform(df, method='yeo-johnson'):
    """ Apply a power transformation to each feature to make the data more Gaussian-like """
    pt = PowerTransformer(method=method, standardize=True)
    df_power = df.copy()
    df_power = pt.fit_transform(df)
    return df_power

def robust_scale(df):
    """ Scale features using statistics that are robust to outliers """
    robust_scaler = RobustScaler()
    df_robust = df.copy()
    df_robust = robust_scaler.fit_transform(df)
    return df_robust

# Applying transformations
df_standard_scaled = standard_scale(df.drop(['winner_01'],axis=1))
df_log_transformed = log_transform(df.drop(['winner_01'],axis=1))
df_minmax_scaled = min_max_scale(df.drop(['winner_01'],axis=1))
df_power_transformed = power_transform(df.drop(['winner_01'],axis=1))
df_robust_scaled = robust_scale(df.drop(['winner_01'],axis=1))

# print("Standard Scaled:\n", df_standard_scaled)
# print("Log Transformed:\n", df_log_transformed)
# print("Min-Max Scaled:\n", df_minmax_scaled)
# print("Power Transformed:\n", df_power_transformed)
# print("Robust Scaled:\n", df_robust_scaled)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
len(selected_columns)

15

In [None]:
df_log_transformed.fillna(0, inplace=True)
df_log_transformed.replace([np.inf, -np.inf], 0, inplace=True)

## Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

X = df_robust_scaled
y = df['winner_01']

# scaler = StandardScaler()
# X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train, y_train)
LGBM_model.fit(X_train, y_train)
XGB_model.fit(X_train, y_train)
CatBoost_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

[LightGBM] [Info] Number of positive: 386, number of negative: 372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2112
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.509235 -> initscore=0.036944
[LightGBM] [Info] Start training from score 0.036944
Learning rate set to 0.009153
0:	learn: 0.6918592	total: 917us	remaining: 916ms
1:	learn: 0.6906964	total: 1.77ms	remaining: 884ms
2:	learn: 0.6898884	total: 2.57ms	remaining: 853ms
3:	learn: 0.6892215	total: 3.4ms	remaining: 848ms
4:	learn: 0.6884864	total: 4.35ms	remaining: 866ms
5:	learn: 0.6876356	total: 5.14ms	remaining: 851ms
6:	learn: 0.6860745	total: 5.92ms	remaining: 840ms
7:	learn: 0.6850162	total: 6.75ms	remaining: 837ms
8:	learn: 0.6837152	total: 7.76ms	remaining: 854ms
9:	learn: 0.6826567	total: 

In [None]:
import optuna
from catboost import Pool
train_pool = Pool(data=X_train, label=y_train)
valid_pool = Pool(data=X_test, label=y_test)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': 0  # Suppress output for tuning
    }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    
    preds = model.predict(valid_pool)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    od_type=best_params['od_type'],
    od_wait=best_params['od_wait'],
    verbose=100  # To monitor the training process
)

best_model.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

[I 2024-06-06 23:06:29,792] A new study created in memory with name: no-name-72cc0b03-042a-4ecf-ae2d-9523aae0647d
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
[I 2024-06-06 23:06:31,669] Trial 0 finished with value: -0.5894736842105263 and parameters: {'iterations': 940, 'learning_rate': 0.050485559577550784, 'depth': 9, 'l2_leaf_reg': 5.76682327128443, 'border_count': 86, 'bagging_temperature': 0.2742476720289039, 'random_strength': 0.5320921904279122, 'od_type': 'Iter', 'od_wait': 44}. Best is trial 0 with value: -0.5894736842105263.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('b

Best parameters: {'iterations': 505, 'learning_rate': 0.00012954115188296315, 'depth': 5, 'l2_leaf_reg': 0.028340223578644713, 'border_count': 165, 'bagging_temperature': 0.01741752232968475, 'random_strength': 0.028378346562326785, 'od_type': 'IncToDec', 'od_wait': 43}
0:	learn: 0.6931239	total: 707us	remaining: 357ms
100:	learn: 0.6910651	total: 65.2ms	remaining: 261ms
200:	learn: 0.6890427	total: 134ms	remaining: 203ms
300:	learn: 0.6870704	total: 275ms	remaining: 187ms
400:	learn: 0.6851216	total: 353ms	remaining: 91.6ms
500:	learn: 0.6831719	total: 428ms	remaining: 3.42ms
504:	learn: 0.6831060	total: 431ms	remaining: 0us
0.6368421052631579


## Combinations

In [None]:
# import itertools

# # Generate all possible combinations of features
# features = X.columns.tolist()
# all_combinations = []
# for r in range(1, len(features) + 1):
#     combinations_object = itertools.combinations(features, r)
#     combinations_list = list(combinations_object)
#     all_combinations.extend(combinations_list)

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# # Store results
# results = []

# # Loop through each combination of features
# for combination in all_combinations:
#     # Select the features for this combination
#     X_subset = X[list(combination)]
    
#     # Split the data into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.2, random_state=42)
    
#     # Train a model (using Logistic Regression here as an example)
#     model = CatBoostClassifier()
#     model.fit(X_train, y_train)
    
#     # Make predictions
#     y_pred = model.predict(X_test)
    
#     # Evaluate the model
#     accuracy = accuracy_score(y_test, y_pred)
    
#     # Store the result
#     results.append({
#         'features': combination,
#         'accuracy': accuracy
#     })

In [None]:
# from catboost import CatBoostClassifier, Pool, cv
# from sklearn.model_selection import train_test_split
# import pandas as pd

# # Define categorical features indices
# # cat_features = [index for index, col in enumerate(X.columns) if X[col].dtype == 'object']

# # Initialize a CatBoost Classifier
# model = CatBoostClassifier(
#     iterations=1000,
#     learning_rate=0.1,
#     depth=6,
#     eval_metric='Accuracy',
#     # cat_features=cat_features,
#     verbose=200
# )

# # Fit model
# model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, early_stopping_rounds=50)

# # Feature importance
# feature_importances = model.get_feature_importance(prettified=True)
# print(feature_importances)

# # You might choose to retrain with selected features based on importance
# important_features = feature_importances['Feature Id'][:10]  # top 10 features
# model.fit(X_train[important_features], y_train, eval_set=(X_test[important_features], y_test))

# # Final evaluation
# print("Model performance:", model.score(X_test[important_features], y_test))

In [None]:
# important_features

In [None]:
# import pandas as pd
# from catboost import CatBoostClassifier
# from itertools import combinations
# import numpy as np

# results = []

# # Iterate over all non-empty combinations of features
# for i in range(1, 27):  # Change this range for all 27 features if feasible
#     for combo in combinations(X_train.columns, i):
#         # Select only the current combination of features
#         X_train_subset = X_train[list(combo)]
#         X_test_subset = X_test[list(combo)]

#         # Train CatBoost
#         model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3, verbose=False)
#         model.fit(X_train_subset, y_train, eval_set=(X_test_subset, y_test), early_stopping_rounds=10, use_best_model=True)
        
#         # Store the performance
#         score = model.get_best_score()['validation']['Logloss']
#         results.append({'combo': combo, 'score': score})

# # Find the best combination
# best_combo = min(results, key=lambda x: x['score'])
# print("Best Combination:", best_combo['combo'])
# print("Best Logloss:", best_combo['score'])