In [1]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
match_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/664389efa0868_match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b548c98c_batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b2c60743_bowler_level_scorecard.csv')
train_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')
test_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [3]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

In [4]:
# from matplotlib import pyplot as plt
# plt.style.use('seaborn');
# import re

# def createRnP(X_12, feature, N=5, ylim_lb=0.3, ylim_ub=0.7):
#     '''
#     Rank and Plot of input feature on the input data. The y-axis shows %team1 wins in each bucket.

#     Parameters-
#     1. X_12: dataset to build the RnP on.
#     2. feature: Feature to build RnP of.
#     3. N: number of bins on x-axis. Default 5.
#     4. ylim_lb: lower bound of y axis on plot.
#     5. ylim_ub: upper bound of y axis on plot.

#     Output-
#     1. Rank and Plot

#     Returns- None
#     '''
#     df = X_12.copy()
#     df[f'{feature}_bin'] = df[feature].rank(pct=True)//(1/N) # divide feature values for all games in 5 equi-volume buckets.
#     df['count'] = 1
#     df['team1_win%'] = df['winner_01'].apply(lambda x: 1-x) # invert winner_01 to get team1 winner indicator
#     df['team2_win%'] = df['winner_01'].copy()
#     df[f'{feature}_min'] = df[feature].copy()
#     df[f'{feature}_max'] = df[feature].copy()
#     df_g = df.groupby(f'{feature}_bin').agg({'team1_win%':'mean', 'team2_win%':'mean', 'count':'sum', f'{feature}_min':'min',\
#                                             f'{feature}_max':'max'}).reset_index()
#     N = min(N,df_g.shape[0])
#     blue_bar = df_g['team1_win%'].values.tolist()
#     ind = np.arange(N)
#     # plotting starts
#     plt.figure(figsize=(10,5));
#     plt.bar(ind, blue_bar, label='Team 1 win%');
#     plt.axhline(y=0.5, linewidth=0.5, color='k', linestyle = '--')
#     xlabel = re.sub('team_','ratio_',feature)
#     plt.xlabel(f'{xlabel} (team1 / team2) bins');
#     plt.ylabel('Win %');
#     plt.title(f'RnP - {feature} vs win');
#     df_g['xticks'] = df_g.apply(lambda x: str(round(x[f'{feature}_min'],2)) + ' - ' + str(round(x[f'{feature}_max'],2)), axis=1)
#     plt.xticks(ind, df_g['xticks']);
#     plt.ylim([ylim_lb,ylim_ub]);
#     plt.legend(loc='best');
#     x2,x1 = blue_bar[-1],blue_bar[0]
#     slope = x2/x1
#     if slope < 1:
#         slope = 1/slope
#         x1,x2 = x2,x1
#     print('slope:', round(x2,2),'/',round(x1,2), '= ',round(slope,2))
#     plt.show();

## Feature Functions

In [5]:
num_match = 15

In [6]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.

    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}

    Output-None

    Returns- dataframe having bowling/batting stats from last n games of a player before an input date.
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [7]:
def no50sLastn(player_list, date, n):

    player_list = str(player_list).split(':') # split string of ':' separated ids into a list of ids
    res_list = []
    for player in player_list: # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat') # getting batting stats from last n games for each player.
        df_rel['gte_50runs'] = np.where(df_rel['runs']>=50, 1, 0) # binary indicator to denote whether the player scored a 50 in the game (runs>=50).
        res_list.append(np.nansum(df_rel['gte_50runs']))# Sum up number of 50s for the player and append to a list. We will do this for all players.
    return np.nansum(res_list)# Sum up values of the list which is sum of 50s by all players in the roster.

In [8]:
# Computing number of 50 runs in last 15 games for team1 for train dataset.
train_data['team1_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
train_data['team2_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

100%|██████████| 948/948 [00:11<00:00, 81.22it/s]
100%|██████████| 948/948 [00:10<00:00, 86.33it/s]


In [9]:
train_data.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'season', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'winner_01', 'team1_count_50runs_last15',
       'team2_count_50runs_last15'],
      dtype='object')

In [10]:
train_data.select_dtypes(include=['number']).corr()['winner_01']

match id                     0.038028
team1_id                    -0.053952
team2_id                    -0.068129
winner_id                   -0.065381
ground_id                    0.015227
team_count_50runs_last15    -0.110761
team_winp_last5             -0.011712
team1only_avg_runs_last15   -0.047838
team1_winp_team2_last15     -0.055788
ground_avg_runs_last15      -0.012359
winner_01                    1.000000
team1_count_50runs_last15   -0.132258
team2_count_50runs_last15   -0.002135
Name: winner_01, dtype: float64

In [11]:
# createRnP(train_data, 'team1_count_50runs_last15')

In [12]:
def team_win_rate_at_ground(team_id, ground_id, date, n):
    # Filter matches with specified team and ground, before the given date
    relevant_games = match_lvl_data[(match_lvl_data['match_dt'] < date) &
                                    ((match_lvl_data['team1_id'] == team_id) |
                                     (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['ground_id'] == ground_id)].sort_values(by='match_dt', ascending=False).head(n)

    # Calculate wins
    wins = relevant_games.apply(lambda x: 1 if x['winner_id'] == team_id else 0, axis=1).sum()

    # Calculate win rate
    if len(relevant_games) > 0:
        win_rate = wins / len(relevant_games)
    else:
        win_rate = 0  # No games to calculate from

    return win_rate

# Add to train_data
train_data['team1_win_rate_at_ground'] = train_data.progress_apply(lambda x:
                            team_win_rate_at_ground(x['team1_id'], x['ground_id'], x['match_dt'], num_match), axis=1)

100%|██████████| 948/948 [00:00<00:00, 2297.33it/s]


In [13]:
def calculate_batsman_strike_rate(player_list, date, n):
    player_list = str(player_list).split(':')  # Split string of ':' separated ids into a list of ids
    res_list = []

    for player in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        runs = recent_matches['runs'].sum()
        balls = recent_matches['balls_faced'].sum()

        # Calculate strike rate and avoid division by zero
        if balls > 0:
            strike_rate = (runs / balls) * 100
        else:
            strike_rate = 0

        res_list.append(strike_rate)

    # Calculate the average strike rate across all players in the list
    if res_list:  # Ensure the list is not empty to avoid division by zero
        average_strike_rate = sum(res_list) / len(res_list)
    else:
        average_strike_rate = 0

    return average_strike_rate


def calculate_bowler_economy_rate(player_list, date, n):
    player_list = player_list.split(':')
    economy_rates = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        runs_conceded = recent_matches['runs'].sum()
        overs = (recent_matches['balls_bowled'].sum()) / 6
        economy_rate = (runs_conceded / overs) if overs > 0 else 0
        economy_rates.append(economy_rate)
    return sum(economy_rates) / len(economy_rates) if economy_rates else 0


def average_wickets_taken(player_list, date, n):
    player_list = player_list.split(':')
    wickets_list = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_wickets = recent_matches['wicket_count'].sum()
        average_wickets = total_wickets / n if n > 0 else 0
        wickets_list.append(average_wickets)
    return sum(wickets_list) / len(wickets_list) if wickets_list else 0


def team_batting_strength(player_list, date, n):
    players = player_list.split(':')
    total_runs = 0
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        total_runs += recent_matches['runs'].sum()
    return total_runs / len(players) if players else 0

In [14]:
train_data['team1_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

100%|██████████| 948/948 [00:09<00:00, 95.12it/s] 
100%|██████████| 948/948 [00:09<00:00, 96.04it/s] 


In [15]:
test_data['team1_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

100%|██████████| 948/948 [00:08<00:00, 117.34it/s]
100%|██████████| 948/948 [00:08<00:00, 115.63it/s]


In [16]:
train_data['team1_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

100%|██████████| 948/948 [00:07<00:00, 119.98it/s]
100%|██████████| 948/948 [00:07<00:00, 123.74it/s]


In [17]:
train_data['team1_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

100%|██████████| 948/948 [00:09<00:00, 96.22it/s] 
100%|██████████| 948/948 [00:10<00:00, 91.82it/s]


## More features

In [18]:
match_lvl_data.head()

Unnamed: 0,match id,team1,team2,winner,by,win amount,toss winner,toss decision,venue,city,...,inning2_runs,inning2_wickets,inning2_balls,team1_id,team1_roster_ids,team2_id,team2_roster_ids,series_type,winner_id,player_of_the_match_id
0,8638034,Nn Ds,Wn,Wn,wickets,9.0,Wn,field,By Ol,Mount Maunganui,...,152.0,1.0,97.0,17982,7907451.0:4381761.0:31464.0:258649.0:4949790.0...,18570,2653993.0:6718326.0:6718382.0:2486896.0:228878...,other_domestic,18570,
1,8588005,Me Rs,Sy Tr,Sy Tr,runs,7.0,Sy Tr,field,Ca Ol,Carrara,...,117.0,2.0,74.0,33942,37351.0:46794.0:5406540.0:2231928.0:181404.0:1...,33963,1506098.0:1749075.0:36665.0:2083409.0:7534652....,other_domestic,33963,1749075.0
2,8587837,Sy Ss,Be Ht,Be Ht,wickets,4.0,Sy Ss,bat,Be Ct Gd,Brisbane,...,171.0,6.0,119.0,33956,7869987.0:7620283.0:2076192.0:4002340.0:306369...,33921,7620269.0:2286437.0:87191.0:5786766.0:3114803....,other_domestic,33921,3890984.0
3,8638041,Nn Ds,Oo,Oo,wickets,2.0,Oo,field,By Ol,Mount Maunganui,...,156.0,8.0,126.0,17982,7907451.0:4381761.0:31464.0:4949790.0:258649.0...,18360,2319638.0:256080.0:7918280.0:3913447.0:2690498...,other_domestic,18360,
4,8587921,Ht Hs,Me Ss,Ht Hs,runs,21.0,Me Ss,field,Be Ol,Hobart,...,143.0,9.0,123.0,33928,4223883.0:2161599.0:1655436.0:5788418.0:319948...,33949,363047.0:2275097.0:3901078.0:2275195.0:4230127...,other_domestic,33928,3125849.0


In [19]:
def player_form_factor(player_list, date, n):
    players = player_list.split(':')
    form_factors = []
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            weights = np.linspace(1, 2, len(recent_matches))  # Linear weight increase from 1 to 2
            form_factors.append(np.average(recent_matches['runs'], weights=weights))
        else:
            form_factors.append(0)
    return np.mean(form_factors) if players else 0

# def team_momentum(player_list, date, n):
#     teams = player_list.split(':')
#     momenta = []
#     for team_id in teams:
#         recent_results = giveLastNgamesPlayer(player_id=team_id, date=date, n=n, bat_or_bowl='bat')
#         wins = recent_results['winner'].apply(lambda x: 1 if x == team_id else 0).sum()
#         momenta.append(wins / n if n > 0 else 0)
#     return np.mean(momenta) if teams else 0

# def head_to_head_performance(player_list1, player_list2, date):
#     team1s = player_list1.split(':')
#     team2s = player_list2.split(':')
#     performances = []
#     for team1_id, team2_id in zip(team1s, team2s):
#         matches = giveLastNgamesPlayer(team1_id, team2_id, date)
#         team1_wins = matches['winner'].apply(lambda x: 1 if x == team1_id else 0).sum()
#         team1_avg_score = matches[matches['batting_team'] == team1_id]['score'].mean()
#         performances.append((team1_wins / len(matches) if matches else 0, team1_avg_score))
#     return performances

# def impact_player_identification(player_list, date, n):
#     teams = player_list.split(':')
#     impact_players = []
#     for team_id in teams:
#         players = giveLastNgamesPlayer(player_list, date, n)
#         impact_players.extend(players['player_id'][players['impact_score'] == players['impact_score'].max()].tolist())
#     return impact_players

In [20]:
#Apply the Player Form Factor function
train_data['team1_form_factor'] = train_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)
train_data['team2_form_factor'] = train_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team2_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

# Apply the Team Momentum function
# train_data['team1_momentum'] = train_data.progress_apply(lambda x: \
#             calculate_team_momentum(team_id=x['team1_id'], date=x['match_dt'], n=15), axis=1)
# train_data['team2_momentum'] = train_data.progress_apply(lambda x: \
#             calculate_team_momentum(team_id=x['team2_id'], date=x['match_dt'], n=15), axis=1)

# # Apply the Head-to-Head Performance function (assuming you have a way to pair teams)
# train_data[['team1_h2h_win_pct', 'team1_h2h_avg_score']] = train_data.progress_apply(lambda x: \
#             head_to_head_performance(team1_list=x['team1_roster_ids'], team2_list=x['team2_roster_ids'], date=x['match_dt']), axis=1).apply(pd.Series)

# # Apply the Impact Player Identification function
# train_data['impact_players_team1'] = train_data.progress_apply(lambda x: \
#             impact_player_identification(team_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# train_data['impact_players_team2'] = train_data.progress_apply(lambda x: \
#             impact_player_identification(team_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

# # Apply the Environmental Influence function (assuming a field for match IDs)
# train_data[['weather_condition', 'venue_score', 'day_night_match']] = train_data.progress_apply(lambda x: \
#             environmental_influence(match_list=x['match_ids']), axis=1).apply(pd.Series)


100%|██████████| 948/948 [00:09<00:00, 95.57it/s]
100%|██████████| 948/948 [00:10<00:00, 88.78it/s]


In [21]:
def head_to_head_performance(team1_id, team2_id, date, match_lvl_data):
    # Filter matches between the two teams up to the specified date
    relevant_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) |
         (match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id))
    ]

    # Calculate win percentage for team1
    team1_wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team1_id else 0).sum()
    win_percentage = team1_wins / len(relevant_matches) if len(relevant_matches) > 0 else 0

    # Calculate average score for team1 in these matches
    # Assuming scores are in 'inning1_runs' or 'inning2_runs' based on which team was batting first
    team1_scores = relevant_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team1_id else x['inning2_runs'], axis=1
    )
    team1_avg_score = team1_scores.mean() if len(team1_scores) > 0 else 0

    return win_percentage, team1_avg_score

In [22]:
# Applying the function to the train_data DataFrame
train_data[['team1_win_percentage', 'team1_avg_score']] = train_data.apply(
    lambda x: head_to_head_performance(
        team1_id=x['team1_id'],
        team2_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1, result_type='expand'
)

In [23]:
def recent_performance(team_id, date, n, match_lvl_data):
    recent_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ].tail(n)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / len(recent_matches) if len(recent_matches) > 0 else 0

def average_winning_margin(team_id, date, match_lvl_data):
    winning_matches = match_lvl_data[
        (match_lvl_data['winner_id'] == team_id) & (match_lvl_data['match_dt'] < date)
    ]
    if len(winning_matches) > 0:
        runs_wins = winning_matches[winning_matches['by'] == 'runs']['win amount']
        wickets_wins = winning_matches[winning_matches['by'] == 'wickets']['win amount']
        average_margin = pd.concat([runs_wins, wickets_wins]).mean()
        return average_margin
    return 0

def team_scoring_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_scores = team_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1
    )
    return team_scores.mean() if len(team_scores) > 0 else 0

def team_wicket_loss_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_wickets = team_matches.apply(
        lambda x: x['inning1_wickets'] if x['team1_id'] == team_id else x['inning2_wickets'], axis=1
    )
    return team_wickets.mean() if len(team_wickets) > 0 else 0

In [24]:
train_data['team1_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_wicket_loss_average'] = train_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

In [25]:
n_matches = 15  # Number of recent matches to consider
train_data['team2_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team2_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_wicket_loss_average'] = train_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

## Some More Features

In [26]:
def average_score_by_venue(venue_id, date, match_lvl_data):
    # Filter matches based on the venue and date
    relevant_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) & (match_lvl_data['match_dt'] < date)]
    average_score = (relevant_matches['inning1_runs'].sum() + relevant_matches['inning2_runs'].sum()) / (2 * len(relevant_matches))
    return average_score if not pd.isna(average_score) else 0

def team_win_rate_at_venue(team_id, venue_id, date, match_lvl_data):
    # Filter matches where the team played at the given venue up to the specified date
    relevant_matches = match_lvl_data[(match_lvl_data['ground_id'] == venue_id) &
                                      ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                      (match_lvl_data['match_dt'] < date)]
    wins = relevant_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    win_rate = wins / len(relevant_matches) if len(relevant_matches) > 0 else 0
    return win_rate

def most_frequent_matchups(team_id, match_lvl_data):
    # Filter matches involving the team
    relevant_matches = match_lvl_data[(match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)]
    opponent_counts = relevant_matches.apply(lambda x: x['team2_id'] if x['team1_id'] == team_id else x['team1_id'], axis=1).value_counts()
    return opponent_counts

In [27]:
# train_data['average_score_at_venue'] = train_data.apply(
#     lambda x: average_score_by_venue(
#         venue_id=x['ground_id'],
#         date=x['match_dt'],
#         match_lvl_data=match_lvl_data
#     ), axis=1
# )

train_data['team1_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team1_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_win_rate_at_venue'] = train_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team2_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

In [28]:
abs(train_data.select_dtypes(include=['number']).corr()['winner_01']).sort_values(ascending=False)

winner_01                       1.000000
team1_form_factor               0.147925
team1_batting_strength          0.140372
team1_avg_wicket                0.133810
team1_count_50runs_last15       0.132258
team1_strike_rate               0.129016
team_count_50runs_last15        0.110761
team1_recent_win_rate           0.082662
team1_scoring_average           0.080175
team2_average_winning_margin    0.074013
team2_recent_win_rate           0.069638
team2_id                        0.068129
winner_id                       0.065381
team1_win_rate_at_venue         0.064772
team1_win_rate_at_ground        0.064548
team1_win_percentage            0.060605
team1_winp_team2_last15         0.055788
team1_id                        0.053952
team1_avg_score                 0.051135
team1only_avg_runs_last15       0.047838
team2_bowler_eco                0.046352
match id                        0.038028
team2_win_rate_at_venue         0.034948
team2_form_factor               0.029926
team2_wicket_los

match id                        0.038028
team1_id                       -0.053952
team2_id                       -0.068129
winner_id                      -0.065381
ground_id                       0.015227
team_count_50runs_last15       -0.110761
team_winp_last5                -0.011712
team1only_avg_runs_last15      -0.047838
team1_winp_team2_last15        -0.055788
ground_avg_runs_last15         -0.012359
winner_01                       1.000000
team1_count_50runs_last15      -0.132258
team2_count_50runs_last15      -0.002135
team1_win_rate_at_ground       -0.064548
team1_strike_rate              -0.129016
team2_strike_rate              -0.020588
team2_bowler_eco               -0.046352
team1_avg_wicket               -0.133810
team2_avg_wicket                0.005451
team1_batting_strength         -0.140372
team2_batting_strength         -0.003665
team1_form_factor              -0.147925
team2_form_factor              -0.029926
team1_win_percentage           -0.701922
team1_avg_score                -0.363386
team1_recent_win_rate          -0.372045
team1_average_winning_margin   -0.140836
team1_scoring_average          -0.209769
team1_wicket_loss_average       0.152225
team2_recent_win_rate           0.365294
team2_average_winning_margin    0.009973
team2_scoring_average           0.016226
team2_wicket_loss_average      -0.274065
team1_win_rate_at_venue        -0.735297
team2_win_rate_at_venue         0.729234
Name: winner_01, dtype: float64

## Feature Selection

In [46]:
selected_columns = list(train_data.select_dtypes(include=['number']).columns)

# for col in train_data.select_dtypes(include=['number']).columns:
#     if train_data[col].corr(train_data['winner_01']) <= -0.07:
#         selected_columns.append(col)

# for col in train_data.select_dtypes(include=['number']).columns:
#     if train_data[col].corr(train_data['winner_01']) >=0.05:
#         selected_columns.append(col)

In [47]:
selected_columns.remove('winner_id')

In [48]:
df_new = train_data[selected_columns]

In [49]:
df_new.corr()['winner_01']

match id                        0.038028
team1_id                       -0.053952
team2_id                       -0.068129
ground_id                       0.015227
team_count_50runs_last15       -0.110761
team_winp_last5                -0.011712
team1only_avg_runs_last15      -0.047838
team1_winp_team2_last15        -0.055788
ground_avg_runs_last15         -0.012359
winner_01                       1.000000
team1_count_50runs_last15      -0.132258
team2_count_50runs_last15      -0.002135
team1_win_rate_at_ground       -0.064548
team1_strike_rate              -0.129016
team2_strike_rate              -0.020588
team2_bowler_eco               -0.046352
team1_avg_wicket               -0.133810
team2_avg_wicket                0.005451
team1_batting_strength         -0.140372
team2_batting_strength         -0.003665
team1_form_factor              -0.147925
team2_form_factor              -0.029926
team1_win_percentage           -0.060605
team1_avg_score                -0.051135
team1_recent_win

In [50]:
df_new['team1only_avg_runs_last15'].fillna(df_new['team1only_avg_runs_last15'].median(), inplace=True)
df_new['ground_avg_runs_last15'].fillna(df_new['ground_avg_runs_last15'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_new['team1only_avg_runs_last15'].fillna(df_new['team1only_avg_runs_last15'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['team1only_avg_runs_last15'].fillna(df_new['team1only_avg_runs_last15'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].met

In [51]:
df_new.isna().sum()

match id                        0
team1_id                        0
team2_id                        0
ground_id                       0
team_count_50runs_last15        0
team_winp_last5                 0
team1only_avg_runs_last15       0
team1_winp_team2_last15         0
ground_avg_runs_last15          0
winner_01                       0
team1_count_50runs_last15       0
team2_count_50runs_last15       0
team1_win_rate_at_ground        0
team1_strike_rate               0
team2_strike_rate               0
team2_bowler_eco                0
team1_avg_wicket                0
team2_avg_wicket                0
team1_batting_strength          0
team2_batting_strength          0
team1_form_factor               0
team2_form_factor               0
team1_win_percentage            0
team1_avg_score                 0
team1_recent_win_rate           0
team1_average_winning_margin    0
team1_scoring_average           0
team1_wicket_loss_average       0
team2_recent_win_rate           0
team2_average_

## Training

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# from sklearn.preprocessing import StandardScaler

X = df_new.drop('winner_01', axis=1)
y = df_new['winner_01']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [70]:
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train, y_train)
LGBM_model.fit(X_train, y_train)
XGB_model.fit(X_train, y_train)
CatBoost_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

[LightGBM] [Info] Number of positive: 386, number of negative: 372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5122
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.509235 -> initscore=0.036944
[LightGBM] [Info] Start training from score 0.036944
Learning rate set to 0.009153
0:	learn: 0.6920831	total: 2.97ms	remaining: 2.97s
1:	learn: 0.6909905	total: 4.43ms	remaining: 2.21s
2:	learn: 0.6899332	total: 5.66ms	remaining: 1.88s
3:	learn: 0.6890486	total: 7.12ms	remaining: 1.77s
4:	learn: 0.6879436	total: 8.34ms	remaining: 1.66s
5:	learn: 0.6868082	total: 9.66ms	remaining: 1.6s
6:	learn: 0.6856182	total: 11.4ms	remaining: 1.62s
7:	learn: 0.6846927	total: 12.9ms	remaining: 1.59s
8:	learn: 0.6835523	total: 14.5ms	remaining: 1.6s
9:	learn: 0.6822837	total: 

In [71]:
from sklearn.ensemble import VotingClassifier

# Initialize the models
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Create the ensemble
ensemble_model = VotingClassifier(estimators=[
    ('GBM', GBM_model),
    ('LGBM', LGBM_model),
    ('XGB', XGB_model),
    ('CatBoost', CatBoost_model)
], voting='hard')  # You can choose 'hard' or 'soft' voting

# Train the ensemble
ensemble_model.fit(X_train, y_train)

# Make predictions
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluate the ensemble
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print("Accuracy for ensemble model:", accuracy_ensemble)

[LightGBM] [Info] Number of positive: 386, number of negative: 372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5122
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.509235 -> initscore=0.036944
[LightGBM] [Info] Start training from score 0.036944
Learning rate set to 0.009153
0:	learn: 0.6920831	total: 1.54ms	remaining: 1.54s
1:	learn: 0.6909905	total: 2.69ms	remaining: 1.34s
2:	learn: 0.6899332	total: 3.86ms	remaining: 1.28s
3:	learn: 0.6890486	total: 4.99ms	remaining: 1.24s
4:	learn: 0.6879436	total: 6.1ms	remaining: 1.21s
5:	learn: 0.6868082	total: 7.31ms	remaining: 1.21s
6:	learn: 0.6856182	total: 8.7ms	remaining: 1.23s
7:	learn: 0.6846927	total: 10.1ms	remaining: 1.25s
8:	learn: 0.6835523	total: 11.3ms	remaining: 1.24s
9:	learn: 0.6822837	total: 

In [72]:
# from catboost import CatBoostClassifier, Pool, cv
# from sklearn.model_selection import train_test_split
# import pandas as pd

# # Define categorical features indices
# cat_features = [index for index, col in enumerate(X.columns) if X[col].dtype == 'object']

# # Initialize a CatBoost Classifier
# model = CatBoostClassifier(
#     iterations=1000,
#     learning_rate=0.1,
#     depth=6,
#     eval_metric='Accuracy',
#     cat_features=cat_features,
#     verbose=200
# )

# # Fit model
# model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, early_stopping_rounds=50)

# # Feature importance
# feature_importances = model.get_feature_importance(prettified=True)
# print(feature_importances)

# # You might choose to retrain with selected features based on importance
# important_features = feature_importances['Feature Id'][:20]  # top 10 features
# model.fit(X_train[important_features], y_train, eval_set=(X_test[important_features], y_test))

# # Final evaluation
# print("Model performance:", model.score(X_test[important_features], y_test))

In [73]:
import optuna
from catboost import Pool
train_pool = Pool(data=X_train, label=y_train)
valid_pool = Pool(data=X_test, label=y_test)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': 0  # Suppress output for tuning
    }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    
    preds = model.predict(valid_pool)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    od_type=best_params['od_type'],
    od_wait=best_params['od_wait'],
    verbose=100  # To monitor the training process
)

best_model.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

[I 2024-06-06 23:07:25,257] A new study created in memory with name: no-name-872b375b-3bc2-455d-affe-f9d04af40f4d
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
[I 2024-06-06 23:07:29,530] Trial 0 finished with value: -0.6 and parameters: {'iterations': 678, 'learning_rate': 0.001152477363233324, 'depth': 10, 'l2_leaf_reg': 0.04767237586565451, 'border_count': 100, 'bagging_temperature': 0.04162373618035626, 'random_strength': 3.3624982530319683, 'od_type': 'Iter', 'od_wait': 37}. Best is trial 0 with value: -0.6.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.0

Best parameters: {'iterations': 301, 'learning_rate': 0.000769442566983082, 'depth': 6, 'l2_leaf_reg': 0.25114147269836834, 'border_count': 34, 'bagging_temperature': 0.03628302914224566, 'random_strength': 2.4867656988826887, 'od_type': 'IncToDec', 'od_wait': 26}
0:	learn: 0.6930603	total: 759us	remaining: 228ms
100:	learn: 0.6845141	total: 70.4ms	remaining: 139ms
200:	learn: 0.6766147	total: 138ms	remaining: 68.9ms
300:	learn: 0.6692475	total: 210ms	remaining: 0us
0.6421052631578947


## Test Data

In [74]:
selected_columns

['match id',
 'team1_id',
 'team2_id',
 'ground_id',
 'team_count_50runs_last15',
 'team_winp_last5',
 'team1only_avg_runs_last15',
 'team1_winp_team2_last15',
 'ground_avg_runs_last15',
 'team1_count_50runs_last15',
 'team2_count_50runs_last15',
 'team1_win_rate_at_ground',
 'team1_strike_rate',
 'team2_strike_rate',
 'team2_bowler_eco',
 'team1_avg_wicket',
 'team2_avg_wicket',
 'team1_batting_strength',
 'team2_batting_strength',
 'team1_form_factor',
 'team2_form_factor',
 'team1_win_percentage',
 'team1_avg_score',
 'team1_recent_win_rate',
 'team1_average_winning_margin',
 'team1_scoring_average',
 'team1_wicket_loss_average',
 'team2_recent_win_rate',
 'team2_average_winning_margin',
 'team2_scoring_average',
 'team2_wicket_loss_average',
 'team1_win_rate_at_venue',
 'team2_win_rate_at_venue']

In [75]:
test_data['team1_count_50runs_last15'] = test_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_strike_rate'] = test_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_avg_wicket'] = test_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_batting_strength'] = test_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_form_factor'] = test_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data[['team1_win_percentage', 'team1_avg_score']] = test_data.apply(
    lambda x: head_to_head_performance(
        team1_id=x['team1_id'],
        team2_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1, result_type='expand'
)

n_matches = 15  # Number of recent matches to consider
test_data['team1_recent_win_rate'] = test_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_average_winning_margin'] = test_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_scoring_average'] = test_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_wicket_loss_average'] = test_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_wicket_loss_average'] = test_data.apply(
    lambda x: team_wicket_loss_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_recent_win_rate'] = test_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=num_match,
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_win_rate_at_venue'] = test_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team1_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_win_rate_at_venue'] = test_data.apply(
    lambda x: team_win_rate_at_venue(
        team_id=x['team2_id'],
        venue_id=x['ground_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

100%|██████████| 271/271 [00:03<00:00, 67.92it/s]
100%|██████████| 271/271 [00:03<00:00, 84.34it/s]
100%|██████████| 271/271 [00:02<00:00, 108.87it/s]
100%|██████████| 271/271 [00:03<00:00, 88.51it/s]
100%|██████████| 271/271 [00:02<00:00, 93.93it/s]


In [76]:
test_data.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'toss winner', 'toss decision', 'venue',
       'city', 'match_dt', 'lighting', 'series_name', 'season', 'ground_id',
       'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'team1_bowler_eco',
       'team1_count_50runs_last15', 'team1_strike_rate', 'team1_avg_wicket',
       'team1_batting_strength', 'team1_form_factor', 'team1_win_percentage',
       'team1_avg_score', 'team1_recent_win_rate',
       'team1_average_winning_margin', 'team1_scoring_average',
       'team2_wicket_loss_average', 'team1_wicket_loss_average',
       'team2_recent_win_rate', 'team1_win_rate_at_venue',
       'team2_win_rate_at_venue'],
      dtype='object')

In [77]:
selected_columns.remove("winner_01")

ValueError: list.remove(x): x not in list

In [None]:
test_df = test_data[selected_columns]

KeyError: "['team2_count_50runs_last15', 'team1_win_rate_at_ground', 'team2_strike_rate', 'team2_bowler_eco', 'team2_avg_wicket', 'team2_batting_strength', 'team2_form_factor', 'team2_average_winning_margin', 'team2_scoring_average'] not in index"

In [None]:
test_df.isna().sum()

In [None]:
GBM_model = GradientBoostingClassifier()
LGBM_model= LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

ensemble_model = VotingClassifier(estimators=[
    ('GBM', GBM_model),
    ('LGBM', LGBM_model),
    ('XGB', XGB_model),
    ('CatBoost', CatBoost_model)
], voting='hard')  

# Train the models
GBM_model.fit(X, y)
LGBM_model.fit(X, y)
XGB_model.fit(X, y)
CatBoost_model.fit(X, y)
ensemble_model.fit(X, y)

In [None]:
# Train the models
# GBM_model.fit(X_train, y_train)
# LGBM_model.fit(X_train, y_train)
# XGB_model.fit(X_train, y_train)
# CatBoost_model.fit(X_train, y_train)
# ensemble_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)
print("Accuracy for ensemble model:", accuracy_ensemble)

In [None]:
# Make predictions
y_pred_GBM_test = GBM_model.predict(test_df)
y_pred_LGBM_test = LGBM_model.predict(test_df)
y_pred_XGB_test = XGB_model.predict(test_df)
y_pred_CatBoost_test = CatBoost_model.predict(test_df)
y_pred_ensemble_test = ensemble_model.predict(test_df)

total_elements = len(y_pred_GBM_test)

def count_zeros(y_test):
    count_zeros = np.sum(y_test == 0)
    percentage_zeros = (count_zeros / total_elements) * 100
    return percentage_zeros

In [None]:
y_pred_CatBoost_test

In [None]:
print("% zeros in GBM_model ",count_zeros(y_pred_GBM_test))
print("% zeros in LGBM_model ",count_zeros(y_pred_LGBM_test))
print("% zeros in XGB_model ",count_zeros(y_pred_XGB_test))
print("% zeros in CatBoost_model ",count_zeros(y_pred_CatBoost_test))
print("% zeros in ensemble_model ",count_zeros(y_pred_ensemble_test))

## Making Submission File

Test

In [None]:
pred_proba = CatBoost_model.predict_proba(test_df)

In [None]:
df_test = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [None]:
df_test['winner'] = y_pred_CatBoost_test
df_test['win_pred_score'] = np.max(pred_proba, axis=1)

In [None]:
winner_id = []
for i in range(len(df_test)):
  if df_test['winner'][i] == 0:
    winner_id.append(df_test['team1_id'][i])
  else:
    winner_id.append(df_test['team2_id'][i])

df_test['winner_id'] = winner_id

In [None]:
df_test.head()

Train

In [None]:
df_train = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')

In [None]:
y_pred = CatBoost_model.predict(X)
pred_proba = CatBoost_model.predict_proba(X)

In [None]:
df_train['winner_new'] = y_pred
df_train['win_pred_score'] = np.max(pred_proba, axis=1)

In [None]:
winner_id = []
for i in range(len(df_train)):
  if df_train['winner'][i] == 0:
    winner_id.append(df_train['team1_id'][i])
  else:
    winner_id.append(df_train['team2_id'][i])

df_train['winner_id_new'] = winner_id

In [None]:
df_train.head()

## Making Submission file

In [None]:
params = CatBoost_model.get_all_params()
print(params)

In [None]:
# Extracting the relevant parameters
train_hps_trees = params.get('iterations')
train_hps_depth = params.get('depth')
train_hps_lr = params.get('learning_rate')

print(f"train_hps_trees: {train_hps_trees}")
print(f"train_hps_depth: {train_hps_depth}")
print(f"train_hps_lr: {train_hps_lr}")

In [None]:
df_sub = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/submission_template_file1.csv')

In [None]:
df_sub.columns

In [None]:
abs(df_new.corr()['winner_01']).sort_values(ascending=False)

In [None]:
# df_sub['win_pred_team_id'] = pd.concat([df_test['winner_id'], df_train['winner_id_new']], ignore_index=True)
# df_sub['win_pred_score'] = pd.concat([df_test['win_pred_score'], df_train['win_pred_score']], ignore_index=True)
# df_sub['train_algorithm'] = 'catboost'
# df_sub['is_ensemble'] = 'no'
# df_sub['train_hps_trees'] = train_hps_trees
# df_sub['train_hps_depth'] = train_hps_depth
# df_sub['train_hps_lr'] = train_hps_lr
# df_sub['indep_feat_id1'] = pd.concat([test_df['team1_win_percentage'], X['team1_win_percentage']], ignore_index=True)
# df_sub['indep_feat_id2'] = pd.concat([test_df['team1_recent_win_rate'], X['team1_recent_win_rate']], ignore_index=True)
# df_sub['indep_feat_id3'] = pd.concat([test_df['team2_recent_win_rate'], X['team2_recent_win_rate']], ignore_index=True)
# df_sub['indep_feat_id4'] = pd.concat([test_df['team1_avg_score'], X['team1_avg_score']], ignore_index=True)
# df_sub['indep_feat_id5'] = pd.concat([test_df['team2_wicket_loss_average'], X['team2_wicket_loss_average']], ignore_index=True)
# df_sub['indep_feat_id6'] = pd.concat([test_df['team1_scoring_average'], X['team1_scoring_average']], ignore_index=True)
# df_sub['indep_feat_id7'] = pd.concat([test_df['team1_form_factor'], X['team1_form_factor']], ignore_index=True)
# df_sub['indep_feat_id8'] = pd.concat([test_df['team1_average_winning_margin'], X['team1_average_winning_margin']], ignore_index=True)
# df_sub['indep_feat_id9'] = pd.concat([test_df['team1_average_winning_margin'], X['team1_average_winning_margin']], ignore_index=True)
# df_sub['indep_feat_id10'] = pd.concat([test_df['team1_avg_wicket'], X['team1_avg_wicket']], ignore_index=True)

In [None]:
# df_sub.isna().sum()

In [None]:
# df_sub.to_csv('submission_file.csv', index=False)