In [1]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
match_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/664389efa0868_match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b548c98c_batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b2c60743_bowler_level_scorecard.csv')
train_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')
test_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [3]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

## Making Features

In [4]:
num_match = 15

In [5]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.

    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}

    Output-None

    Returns- dataframe having bowling/batting stats from last n games of a player before an input date.
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [6]:
def player_form_factor(player_list, date, n):
    players = player_list.split(':')
    form_factors = []
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            weights = np.linspace(1, 2, len(recent_matches))  # Linear weight increase from 1 to 2
            form_factors.append(np.average(recent_matches['runs'], weights=weights))
        else:
            form_factors.append(0)
    return np.mean(form_factors) if players else 0

In [7]:
def team_batting_strength(player_list, date, n):
    players = player_list.split(':')
    total_runs = 0
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        total_runs += recent_matches['runs'].sum()
    return total_runs / len(players) if players else 0

In [8]:
def average_wickets_taken(player_list, date, n):
    player_list = player_list.split(':')
    wickets_list = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_wickets = recent_matches['wicket_count'].sum()
        average_wickets = total_wickets / n if n > 0 else 0
        wickets_list.append(average_wickets)
    return sum(wickets_list) / len(wickets_list) if wickets_list else 0

In [9]:
def no50sLastn(player_list, date, n):

    player_list = str(player_list).split(':') # split string of ':' separated ids into a list of ids
    res_list = []
    for player in player_list: # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat') # getting batting stats from last n games for each player.
        df_rel['gte_50runs'] = np.where(df_rel['runs']>=50, 1, 0) # binary indicator to denote whether the player scored a 50 in the game (runs>=50).
        res_list.append(np.nansum(df_rel['gte_50runs']))# Sum up number of 50s for the player and append to a list. We will do this for all players.
    return np.nansum(res_list)# Sum up values of the list which is sum of 50s by all players in the roster.

In [10]:
def calculate_batsman_strike_rate(player_list, date, n):
    player_list = str(player_list).split(':')  # Split string of ':' separated ids into a list of ids
    res_list = []

    for player in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        runs = recent_matches['runs'].sum()
        balls = recent_matches['balls_faced'].sum()

        # Calculate strike rate and avoid division by zero
        if balls > 0:
            strike_rate = (runs / balls) * 100
        else:
            strike_rate = 0

        res_list.append(strike_rate)

    # Calculate the average strike rate across all players in the list
    if res_list:  # Ensure the list is not empty to avoid division by zero
        average_strike_rate = sum(res_list) / len(res_list)
    else:
        average_strike_rate = 0

    return average_strike_rate

# Harshit

In [11]:
def recent_team_form(team_id, match_dt, match_lvl_data, last_n=5):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < match_dt)].sort_values('match_dt', ascending=False).head(last_n)
    wins = recent_matches['winner_id'].apply(lambda winner_id: 1 if winner_id == team_id else 0).sum()
    return wins / last_n if last_n > 0 else 0

In [12]:
def calculate_exponential_momentum(match_lvl_data, team_id, date, alpha=0.1):
    matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                             (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False)
    wins = matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).to_numpy()
    weights = np.exp(-alpha * np.arange(len(wins)))
    if np.sum(weights) > 0:
        momentum = np.dot(wins, weights) / np.sum(weights)
    else:
        momentum = 0  # Handle case with no matches
    return momentum

In [13]:
def average_winning_margin(team_id, date, match_lvl_data):
    winning_matches = match_lvl_data[
        (match_lvl_data['winner_id'] == team_id) & (match_lvl_data['match_dt'] < date)
    ]
    if len(winning_matches) > 0:
        runs_wins = winning_matches[winning_matches['by'] == 'runs']['win amount']
        wickets_wins = winning_matches[winning_matches['by'] == 'wickets']['win amount']
        average_margin = pd.concat([runs_wins, wickets_wins]).mean()
        return average_margin
    return 0

In [14]:
def lighting_performance(match_lvl_data, lighting_type, team_id, date):
    matches = match_lvl_data[(match_lvl_data['lighting'] == lighting_type) & (match_lvl_data['match_dt'] < date)]
    if len(matches) > 0:
        wins = matches[matches['winner_id'] == team_id].shape[0]
        return wins / len(matches)
    return 0  # Return 0 if no matches found under this condition

In [15]:
def overall_historical_win_rate(match_lvl_data, team_id, date):
    historical_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                        (match_lvl_data['match_dt'] < date)]
    wins = historical_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    total_matches = len(historical_matches)
    return wins / total_matches if total_matches > 0 else 0

In [16]:
def recent_performance(team_id, date, n, match_lvl_data):
    recent_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ].tail(n)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / len(recent_matches) if len(recent_matches) > 0 else 0

In [17]:
def weighted_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = team_roster.split(':')
    total_weighted_awards = 0

    # Fetch recent matches up to 'n' for all players in the roster before the specified date
    recent_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'].isin(player_ids)) &
                                    (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

    # Assigning weights - newer matches get higher weight
    for i, match in enumerate(recent_matches.itertuples(), 1):
        weight = 1 + (n - i) * 0.1  # Example weight formula: 1 + (total_matches - position) * 0.1
        total_weighted_awards += weight

    return total_weighted_awards

In [18]:
def count_recent_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = str(team_roster).split(':')  # Split string of ':' separated ids into a list of ids
    total_mom_awards = 0

    for player_id in player_ids:  # Loop over each player_id in roster
        # Get relevant matches for the player up to the specified date
        player_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'] == player_id) &
                                        (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

        # Count the number of matches where this player was MoM
        mom_count = len(player_matches)
        total_mom_awards += mom_count  # Accumulate MoM awards

    return total_mom_awards

In [19]:
def team_batsman_performance_index(bat_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_performance_index = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            player_performance = (recent_matches['runs'] * 0.5 +
                                  recent_matches['strike_rate'] * 0.3 +
                                  (recent_matches['Fours'] + recent_matches['Sixes'] * 2) * 0.2) * \
                                  (1 + 0.1 * recent_matches['is_batsman_captain'].iloc[0] +
                                   0.05 * recent_matches['is_batsman_keeper'].iloc[0])
            total_performance_index += player_performance.sum()

    return total_performance_index

In [20]:
def team_bowler_impact_score(bowler_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_impact_score = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        if not recent_matches.empty:
            player_impact = (recent_matches['wicket_count'] * 2 +
                             (120 / recent_matches['economy']) * 0.5 +
                             recent_matches['maiden'] * 1) * \
                            (1 + 0.1 * recent_matches['is_bowler_captain'].iloc[0] +
                             0.05 * recent_matches['is_bowler_keeper'].iloc[0])
            total_impact_score += player_impact.sum()

    return total_impact_score

In [21]:
bowler_lvl_data.columns

Index(['match id', 'bowler', 'bowler_id', 'bowler_details',
       'is_bowler_captain', 'is_bowler_keeper', 'inning', 'runs',
       'wicket_count', 'balls_bowled', 'economy', 'maiden', 'dots', 'Fours',
       'Sixes', 'wides', 'noballs', 'match_dt'],
      dtype='object')

New feature added self

In [22]:
#new feature 1 bowler's performance of teams in recent mathes
def team_bowler_performance_recent(bowler_df, match_id, team_roster_ids, date, n=15):
    team_ids = str(team_roster_ids).split(':')
    total_impact_score = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        if not recent_matches.empty:
            player_impact = (recent_matches['wicket_count']) * 5 + recent_matches['dots']*1.5+(recent_matches['maiden']) * 1.5 -(recent_matches['fours'] * 1) - recent_matches['sixes']*1.5 - recent_matches['wides'] - recent_matches['noballs']                          
                           
                            
                             
            total_impact_score += player_impact.sum()

    return total_impact_score

In [23]:
train_data['team1_bowler_performance_recent'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

In [24]:
train_data['team2_bowler_performance_recent'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

New feature self 2

In [25]:
#new feature 2 
def runs_conceded_teams_recent(player_list, date, n=15):
    player_list = str(player_list).split(':')
    total_runs_conceded = 0

    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_runs_conceded += recent_matches['runs'].sum()
        
    return total_runs_conceded




In [26]:
train_data['runs_conceded_team1_recent'] = train_data.progress_apply(
    lambda x: runs_conceded_teams_recent(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1
)


100%|██████████| 948/948 [00:07<00:00, 126.37it/s]


In [27]:
train_data['runs_conceded_team2_recent'] = train_data.progress_apply(
    lambda x: runs_conceded_teams_recent(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1
)


100%|██████████| 948/948 [00:07<00:00, 126.48it/s]


In [28]:
batsman_lvl_data.columns

Index(['match id', 'batsman', 'batsman_id', 'batsman_details',
       'is_batsman_captain', 'is_batsman_keeper', 'inning', 'runs',
       'balls_faced', 'over_faced_first', 'wicket kind', 'out_by_bowler',
       'out_by_fielder', 'bowler_id', 'bowler_details', 'is_bowler_keeper',
       'is_bowler_captain', 'strike_rate', 'Fours', 'Sixes', 'match_dt'],
      dtype='object')

new feature self 4

In [29]:
def team_batsman_performance_recent(bat_df, match_id, team_roster_ids, date, n=15):
    team_ids = str(team_roster_ids).split(':')
    total_performance_index = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            player_performance = (recent_matches['runs'] * 4.5 +
                                  recent_matches['strike_rate'] * 2.5 +
                                  (recent_matches['Fours'] + recent_matches['Sixes'] ) * 1.5) 
                                  
            total_performance_index += player_performance.sum()

    return total_performance_index

In [30]:
train_data['team1_batsman_performance_recent'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

In [31]:
train_data['team2_batsman_performance_recent'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

New self features 5

In [32]:
def headToHeadRatio(team1_id, team2_id, date):
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                             (((match_lvl_data['team1_id']==team1_id)&(match_lvl_data['team2_id']==team2_id))|\
                              ((match_lvl_data['team1_id']==team2_id)&(match_lvl_data['team2_id']==team1_id)))]
    team1_wins = df_rel[df_rel['winner_id']==team1_id].shape[0]
    team2_wins = df_rel[df_rel['winner_id']==team2_id].shape[0]
    total_matches = df_rel.shape[0]
    if total_matches == 0:
        return 0
    return team1_wins / total_matches

train_data['head_to_head_ratio'] = train_data.progress_apply(lambda x: \
                                  headToHeadRatio(x['team1_id'], x['team2_id'], x['match_dt']), axis=1)

100%|██████████| 948/948 [00:00<00:00, 2323.77it/s]


In [33]:
def calculate_bowler_economy_rate(player_list, date, n):
    player_list = player_list.split(':')
    economy_rates = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        runs_conceded = recent_matches['runs'].sum()
        overs = (recent_matches['balls_bowled'].sum()) / 6
        economy_rate = (runs_conceded / overs) if overs > 0 else 0
        economy_rates.append(economy_rate)
    return sum(economy_rates) / len(economy_rates) if economy_rates else 0

In [34]:
def team_scoring_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_scores = team_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1
    )
    return team_scores.mean() if len(team_scores) > 0 else 0

In [35]:
train_data['team1_form_factor'] = train_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team2_recent_form'] = train_data.apply(
    lambda x: recent_team_form(x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)

train_data['team2_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team2_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team2_id'],
        date=x['match_dt'],
        n=15,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team2_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

# train_data['team1_weighted_mom'] = train_data.apply(
#     lambda x: weighted_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)

# train_data['team1_recent_mom_count'] = train_data.apply(
#     lambda x: count_recent_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)

train_data['team1_batsman_performance_index'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team1_bowler_impact_score'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team1_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)

train_data['team1_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)


train_data['team1_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team1_id'], x['match_dt']),
    axis=1
)

train_data['team1_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=15,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)

100%|██████████| 948/948 [00:09<00:00, 95.69it/s]
100%|██████████| 948/948 [00:09<00:00, 97.17it/s] 
100%|██████████| 948/948 [00:08<00:00, 115.86it/s]
100%|██████████| 948/948 [00:11<00:00, 79.84it/s]
100%|██████████| 948/948 [00:09<00:00, 96.93it/s] 
100%|██████████| 948/948 [00:07<00:00, 123.56it/s]


In [36]:
train_data['team2_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)


100%|██████████| 948/948 [00:07<00:00, 123.36it/s]


In [37]:
train_data.select_dtypes(include=['number']).corr()['winner_01'].sort_values(ascending = True)

team1_form_factor                  -0.147925
team1_batting_strength             -0.140372
team1_batsman_performance_index    -0.139209
team1_batsman_performance_recent   -0.139209
team1_avg_wicket                   -0.133810
team1_count_50runs_last15          -0.132258
team1_strike_rate                  -0.129016
runs_conceded_team1_recent         -0.125668
team1_bowler_impact_score          -0.121397
team1_bowler_performance_recent    -0.121397
team_count_50runs_last15           -0.110761
team1_overall_win_rate             -0.091137
team1_bowler_eco                   -0.086270
team1_night_match_win_rate         -0.084950
team1_recent_win_rate              -0.082662
team1_scoring_average              -0.080175
team1_momentum                     -0.079657
team2_id                           -0.068129
winner_id                          -0.065381
head_to_head_ratio                 -0.060605
team1_winp_team2_last15            -0.055788
team1_id                           -0.053952
team1only_

In [38]:
abs(train_data.select_dtypes(include=['number']).corr()['winner_01']).sort_values(ascending = False)

winner_01                           1.000000
team1_form_factor                   0.147925
team1_batting_strength              0.140372
team1_batsman_performance_recent    0.139209
team1_batsman_performance_index     0.139209
team1_avg_wicket                    0.133810
team1_count_50runs_last15           0.132258
team1_strike_rate                   0.129016
runs_conceded_team1_recent          0.125668
team1_bowler_performance_recent     0.121397
team1_bowler_impact_score           0.121397
team_count_50runs_last15            0.110761
team2_recent_form                   0.105046
team1_overall_win_rate              0.091137
team1_bowler_eco                    0.086270
team1_night_match_win_rate          0.084950
team2_momentum                      0.083399
team1_recent_win_rate               0.082662
team1_scoring_average               0.080175
team1_momentum                      0.079657
team2_average_winning_margin        0.074013
team2_day_match_win_rate            0.073309
team2_over

In [91]:
numeric_features = train_data.select_dtypes(include=[np.number])

# Calculate the correlation matrix
corr_matrix = numeric_features.corr()
corr_matrix

# Set a threshold for considering features as highly correlated
correlation_threshold = 0.2

# Identify pairs of highly correlated features
high_corr_pairs = [(i, j) for i in corr_matrix.columns for j in corr_matrix.columns 
                   if i != j and abs(corr_matrix.loc[i, j]) > correlation_threshold]

print("Highly correlated feature pairs:")
for pair in high_corr_pairs:
    print(pair)



Highly correlated feature pairs:
('team1_id', 'team2_id')
('team1_id', 'winner_id')
('team1_id', 'team2_day_match_win_rate')
('team2_id', 'team1_id')
('team2_id', 'winner_id')
('team2_id', 'team2_day_match_win_rate')
('winner_id', 'team1_id')
('winner_id', 'team2_id')
('winner_id', 'team2_day_match_win_rate')
('ground_id', 'ground_avg_runs_last15')
('ground_id', 'runs_conceded_team1_recent')
('ground_id', 'runs_conceded_team2_recent')
('ground_id', 'team1_batting_strength')
('ground_id', 'team1_strike_rate')
('ground_id', 'team1_bowler_eco')
('ground_id', 'team2_bowler_eco')
('team_count_50runs_last15', 'team2_bowler_performance_recent')
('team_count_50runs_last15', 'runs_conceded_team2_recent')
('team_count_50runs_last15', 'team2_batsman_performance_recent')
('team_count_50runs_last15', 'team1_count_50runs_last15')
('team_count_50runs_last15', 'team2_recent_form')
('team_count_50runs_last15', 'team2_momentum')
('team_count_50runs_last15', 'team2_recent_win_rate')
('team_count_50runs_l

In [92]:
# Create a set to keep track of features to be removed
features_to_remove = set()

for i, j in high_corr_pairs:
    if i not in features_to_remove and j not in features_to_remove:
        # Add one of the features to the removal set
        features_to_remove.add(j)  # You can choose to add either i or j

# Drop the features
train_data_reduced = numeric_features.drop(columns=features_to_remove)

print("Features removed:")
print(features_to_remove)
print("Remaining features:")
print(train_data_reduced.columns)


Features removed:
{'team2_batsman_performance_recent', 'team1_recent_win_rate', 'team1_night_match_win_rate', 'team1_bowler_impact_score', 'runs_conceded_team1_recent', 'ground_avg_runs_last15', 'team2_recent_win_rate', 'team2_bowler_performance_recent', 'team1_strike_rate', 'team2_id', 'team1_bowler_eco', 'team1_batsman_performance_recent', 'head_to_head_ratio', 'team2_average_winning_margin', 'team1_avg_wicket', 'team1_momentum', 'team2_bowler_eco', 'team1_overall_win_rate', 'team1_scoring_average', 'team1_form_factor', 'team1_batsman_performance_index', 'team2_day_match_win_rate', 'runs_conceded_team2_recent', 'team2_overall_win_rate', 'winner_id', 'team2_recent_form', 'team2_momentum', 'team1_batting_strength', 'team1_count_50runs_last15'}
Remaining features:
Index(['match id', 'team1_id', 'ground_id', 'team_count_50runs_last15',
       'team_winp_last5', 'team1only_avg_runs_last15',
       'team1_winp_team2_last15', 'winner_01',
       'team1_bowler_performance_recent'],
      dty

In [93]:
print(train_data.shape)
print(train_data_reduced.shape)

(948, 51)
(948, 9)


In [94]:
abs(train_data_reduced.select_dtypes(include=['number']).corr()['winner_01']).sort_values(ascending = False)

winner_01                          1.000000
team1_bowler_performance_recent    0.121397
team_count_50runs_last15           0.110761
team1_winp_team2_last15            0.055788
team1_id                           0.053952
team1only_avg_runs_last15          0.047838
match id                           0.038028
ground_id                          0.015227
team_winp_last5                    0.011712
Name: winner_01, dtype: float64

In [95]:
train_data_reduced.select_dtypes(include=['number']).corr()['winner_01'].sort_values(ascending = False)

winner_01                          1.000000
match id                           0.038028
ground_id                          0.015227
team_winp_last5                   -0.011712
team1only_avg_runs_last15         -0.047838
team1_id                          -0.053952
team1_winp_team2_last15           -0.055788
team_count_50runs_last15          -0.110761
team1_bowler_performance_recent   -0.121397
Name: winner_01, dtype: float64

In [96]:
# columns_to_drop = ['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
#                    'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
#                    'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
#                    'season', 'ground_id']

# train_data_scaled = train_data_scaled.drop(columns=columns_to_drop)

## Model

In [97]:
df_train = train_data_reduced.select_dtypes(include=['number'])

In [98]:
# df_train.fillna(0,inplace=True)
df_train.replace([np.inf, -np.inf], np.nan, inplace=True)
df_train.fillna(df_train.mean(), inplace=True)

In [99]:
df_train.columns

Index(['match id', 'team1_id', 'ground_id', 'team_count_50runs_last15',
       'team_winp_last5', 'team1only_avg_runs_last15',
       'team1_winp_team2_last15', 'winner_01',
       'team1_bowler_performance_recent'],
      dtype='object')

In [100]:
df_train.drop(['match id','team1_id','ground_id'], axis=1, inplace=True)

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

X = df_train.drop('winner_01', axis=1)
y = df_train['winner_01']

# poly = PolynomialFeatures(degree=2, include_bias=True)
# X = poly.fit_transform(X)
# print(X.shape)

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Apply PCA
# pca = PCA()
# X_pca = pca.fit_transform(X_scaled)

# # Explained variance ratio
# explained_variance_ratio = pca.explained_variance_ratio_
# cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# # Select number of components to explain desired variance (e.g., 95%)
# desired_variance = 0.95
# n_components = np.argmax(cumulative_explained_variance >= desired_variance) + 1
# print(n_components)

# pca = PCA(n_components=n_components)
# X = pca.fit_transform(X_scaled)
# print(X.shape)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [102]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [103]:
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train, y_train)
LGBM_model.fit(X_train, y_train)
XGB_model.fit(X_train, y_train)
CatBoost_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

[LightGBM] [Info] Number of positive: 386, number of negative: 372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.509235 -> initscore=0.036944
[LightGBM] [Info] Start training from score 0.036944
Learning rate set to 0.009153
0:	learn: 0.6927092	total: 1.32ms	remaining: 1.32s
1:	learn: 0.6918770	total: 2.53ms	remaining: 1.26s
2:	learn: 0.6911144	total: 2.97ms	remaining: 987ms
3:	learn: 0.6906463	total: 3.42ms	remaining: 851ms
4:	learn: 0.6900615	total: 3.83ms	remaining: 763ms
5:	learn: 0.6893031	total: 4.27ms	remaining: 708ms
6:	learn: 0.6886253	total: 4.69ms	remaining: 666ms
7:	learn: 0.6875492	total: 5.13ms	remaining: 636ms
8:	learn: 0.6867733	total: 5.61ms	remaining: 618ms
9:	learn: 0.6861872	total: 

In [104]:
import optuna
from catboost import Pool
train_pool = Pool(data=X_train, label=y_train)
valid_pool = Pool(data=X_test, label=y_test)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': 0  # Suppress output for tuning
    }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    
    preds = model.predict(valid_pool)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    od_type=best_params['od_type'],
    od_wait=best_params['od_wait'],
    verbose=100  # To monitor the training process
)

best_model.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

[I 2024-06-05 12:23:58,623] A new study created in memory with name: no-name-0f40beb4-1d20-4cd6-ae48-2d9ef9f68b77
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
[I 2024-06-05 12:24:00,110] Trial 0 finished with value: -0.5421052631578948 and parameters: {'iterations': 947, 'learning_rate': 0.0001398940833733022, 'depth': 9, 'l2_leaf_reg': 0.01973863159795463, 'border_count': 212, 'bagging_temperature': 0.3017168299091237, 'random_strength': 0.05683313539512148, 'od_type': 'IncToDec', 'od_wait': 29}. Best is trial 0 with value: -0.5421052631578948.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_log

Best parameters: {'iterations': 100, 'learning_rate': 0.0013964022711170261, 'depth': 4, 'l2_leaf_reg': 1.1102753299747634, 'border_count': 201, 'bagging_temperature': 0.042053179274844164, 'random_strength': 4.106802768178087, 'od_type': 'Iter', 'od_wait': 38}
0:	learn: 0.6931242	total: 773us	remaining: 76.6ms
99:	learn: 0.6901823	total: 18.3ms	remaining: 0us
0.6263157894736842
