In [18]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [19]:
match_lvl_data = pd.read_csv('match_level_latest.csv')
batsman_lvl_data = pd.read_csv('batsman_level_latest.csv')
bowler_lvl_data = pd.read_csv('bowler_level_latest.csv')
train_data = pd.read_csv('train_features_latest.csv')
test_data = pd.read_csv('test_data_latest.csv')

In [20]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

In [21]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.

    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}

    Output-None

    Returns- dataframe having bowling/batting stats from last n games of a player before an input date.
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [22]:
#new feature 1 bowler's performance of teams in recent mathes
def team_bowler_performance_recent(bowler_df, match_id, team_roster_ids, date, n=15):
    team_ids = str(team_roster_ids).split(':')
    total_impact_score = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        if not recent_matches.empty:
            player_impact = (recent_matches['wicket_count']) * 5 + recent_matches['dots']*1.5+(recent_matches['maiden']) * 1.5 -(recent_matches['Fours'] * 1) - recent_matches['Sixes']*1.5 - recent_matches['wides'] - recent_matches['noballs']                          
                           
                            
                             
            total_impact_score += player_impact.sum()

    return total_impact_score

train_data['team1_bowler_performance_recent'] = train_data.apply(
    lambda x: team_bowler_performance_recent(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team2_bowler_performance_recent'] = train_data.apply(
    lambda x: team_bowler_performance_recent(bowler_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)



In [23]:
# def calculate_ratio(row):
#     team1_value = row['team1_bowler_performance_recent']
#     team2_value = row['team2_bowler_performance_recent']
#     if team2_value == 0:
#         return team1_value
#     else:
#         return team1_value / team2_value

# # Add the new column to the DataFrame
# train_data['team_bowler_performance_ratio'] = train_data.apply(calculate_ratio, axis=1)

# # Drop the initial individual features
# train_data = train_data.drop(columns=['team1_bowler_performance_recent', 'team2_bowler_performance_recent'])


In [24]:
def calculate_ratio(row, col1, col2):
    value1 = row[col1]
    value2 = row[col2]
    if value2 == 0:
        return value1
    else:
        return value1 / value2

# Function to add a ratio column for any two columns
def add_ratio_column(df, col1, col2, new_col_name):
    df[new_col_name] = df.apply(lambda row: calculate_ratio(row, col1, col2), axis=1)
    return df

# Add the new column to the DataFrame
train_data = add_ratio_column(train_data, 'team1_bowler_performance_recent', 'team2_bowler_performance_recent', 'team_bowler_performance_ratio')

# Drop the initial individual features
train_data = train_data.drop(columns=['team1_bowler_performance_recent', 'team2_bowler_performance_recent'])


In [25]:
test_data['team1_bowler_performance_recent'] = test_data.apply(
    lambda x: team_bowler_performance_recent(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team2_bowler_performance_recent'] = test_data.apply(
    lambda x: team_bowler_performance_recent(bowler_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)
test_data = add_ratio_column(test_data, 'team1_bowler_performance_recent', 'team2_bowler_performance_recent', 'team_bowler_performance_ratio')

# Drop the initial individual features
test_data = test_data.drop(columns=['team1_bowler_performance_recent', 'team2_bowler_performance_recent'])


In [26]:
def team_batsman_performance_recent(bat_df, match_id, team_roster_ids, date, n=15):
    team_ids = str(team_roster_ids).split(':')
    total_performance_index = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            player_performance = (recent_matches['runs'] * 4.5 +
                                  recent_matches['strike_rate'] * 2.5 +
                                  (recent_matches['Fours'] + recent_matches['Sixes'] ) * 1.5) 
                                  
            total_performance_index += player_performance.sum()

    return total_performance_index

train_data['team1_batsman_performance_recent'] = train_data.apply(
    lambda x: team_batsman_performance_recent(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team2_batsman_performance_recent'] = train_data.apply(
    lambda x: team_batsman_performance_recent(batsman_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team1_batsman_performance_recent'] = test_data.apply(
    lambda x: team_batsman_performance_recent(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team2_batsman_performance_recent'] = test_data.apply(
    lambda x: team_batsman_performance_recent(batsman_lvl_data, x['match id'], x['team2_roster_ids'], x['match_dt']),
    axis=1
)

# Add the new column to the DataFrame
train_data = add_ratio_column(train_data, 'team1_batsman_performance_recent', 'team2_batsman_performance_recent', 'team_batsmen_performance_ratio')

# Drop the initial individual features
train_data = train_data.drop(columns=['team1_batsman_performance_recent', 'team2_batsman_performance_recent'])


test_data = add_ratio_column(test_data, 'team1_batsman_performance_recent', 'team2_batsman_performance_recent', 'team_batsmen_performance_ratio')

# Drop the initial individual features
test_data = test_data.drop(columns=['team1_batsman_performance_recent', 'team2_batsman_performance_recent'])





In [27]:
a.

SyntaxError: invalid syntax (3905452595.py, line 1)

In [29]:
match_lvl_data['team1_bat_inning'] = np.where( ((match_lvl_data['team1']==match_lvl_data['toss winner'])&(match_lvl_data['toss decision']=='bat'))|\
                                               ((match_lvl_data['team2']==match_lvl_data['toss winner'])&(match_lvl_data['toss decision']=='field')) , 1, 2)



In [30]:
def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    
    Output- None
    
    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    # combine two dataframes - one where input team is batting first, and another one where input team is batting second.
    df_rel = pd.concat([ df_rel[df_rel['team1_bat_inning']==1][['inning1_runs']].rename(columns={'inning1_runs':'runs'}), \
                         df_rel[df_rel['team1_bat_inning']==2][['inning2_runs']].rename(columns={'inning2_runs':'runs'}) ] )
    return df_rel['runs'].mean() # return mean of the combined dataframe.

train_data['team2only_avg_runs_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)


train_data['team1only_avg_runs_last15'] = train_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team1_id'], x['match_dt'], 15), axis=1)


train_data['team1run-ground_avg_runs'] = train_data['team1only_avg_runs_last15'] - train_data['ground_avg_runs_last15']
train_data['team2run-ground_avg_runs'] = train_data['team2only_avg_runs_last15'] - train_data['ground_avg_runs_last15']

train_data = add_ratio_column(train_data, 'team1run-ground_avg_runs', 'team2run-ground_avg_runs', 'team_ground_factor_ratio')

# Drop the initial individual features
train_data = train_data.drop(columns=['team1run-ground_avg_runs', 'team2run-ground_avg_runs','team2only_avg_runs_last15','team1only_avg_runs_last15'])



test_data['team2only_avg_runs_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)


test_data['team1only_avg_runs_last15'] = test_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn(x['team1_id'], x['match_dt'], 15), axis=1)


test_data['team1run-ground_avg_runs'] = test_data['team1only_avg_runs_last15'] - test_data['ground_avg_runs_last15']
test_data['team2run-ground_avg_runs'] = test_data['team2only_avg_runs_last15'] - test_data['ground_avg_runs_last15']

test_data = add_ratio_column(test_data, 'team1run-ground_avg_runs', 'team2run-ground_avg_runs', 'team_ground_factor_ratio')

# Drop the initial individual features
test_data = test_data.drop(columns=['team1run-ground_avg_runs', 'team2run-ground_avg_runs','team2only_avg_runs_last15','team1only_avg_runs_last15'])


100%|██████████| 948/948 [00:01<00:00, 487.23it/s]
100%|██████████| 948/948 [00:01<00:00, 484.28it/s]
100%|██████████| 271/271 [00:00<00:00, 574.33it/s]
100%|██████████| 271/271 [00:00<00:00, 582.25it/s]


In [33]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other.
    '''
    # filter out games where either team1_id is input team1 and team2_id is input team2, or where team2_id is input team1 and team1_id is input team2.
    # Also, match date is less than current games's input date, sort desc by date and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      (((match_lvl_data['team1_id']==team1_id)&(match_lvl_data['team2_id']==team2_id))|((match_lvl_data['team1_id']==team2_id)&(match_lvl_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0] # Counting number of rows (games) where winner is input team1.
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return Float denoting team1's win% against team2 in past n games rounded to 2 decimal places.


# In[34]:


# Compute team1 win% against team2 in their past 15 encounters for train data.
train_data['team1_winp_team2_last15'] = train_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)
train_data['team2_winp_team2_last15'] = train_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team2_id'], x['team1_id'], x['match_dt'], 15), axis=1)

test_data['team1_winp_team2_last15'] = test_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)
test_data['team2_winp_team2_last15'] = test_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team2_id'], x['team1_id'], x['match_dt'], 15), axis=1)



100%|██████████| 948/948 [00:00<00:00, 1176.49it/s]
100%|██████████| 948/948 [00:00<00:00, 1135.08it/s]
100%|██████████| 271/271 [00:00<00:00, 852.58it/s]
100%|██████████| 271/271 [00:00<00:00, 810.29it/s]


In [34]:
def winpLastn(team_id, date, n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']==team_id].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

train_data['team1_winp_last10'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 10), axis=1)
# Compute team2's win% in last 5 games
train_data['team2_winp_last10'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 10), axis=1)

train_data['diff_win_p_teams'] = train_data['team1_winp_last10'] - train_data['team2_winp_last10']
train_data = train_data.drop(columns=['team1_winp_last10', 'team2_winp_last10'])


test_data['team1_winp_last10'] = test_data.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 10), axis=1)
# Compute team2's win% in last 5 games
test_data['team2_winp_last10'] = test_data.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 10), axis=1)

test_data['diff_win_p_teams'] = test_data['team1_winp_last10'] - test_data['team2_winp_last10']
test_data = test_data.drop(columns=['team1_winp_last10', 'team2_winp_last10'])



100%|██████████| 948/948 [00:00<00:00, 1304.54it/s]
100%|██████████| 948/948 [00:00<00:00, 986.42it/s] 
100%|██████████| 271/271 [00:00<00:00, 1112.62it/s]
100%|██████████| 271/271 [00:00<00:00, 1007.09it/s]


In [36]:
def team_economy_rate_recent(bowler_df, team_roster_ids, date, n=10):
    team_ids = str(team_roster_ids).split(':')
    total_runs_given = 0
    total_balls_bowled = 0
    
    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_runs_given += recent_matches['runs'].sum()
        total_balls_bowled += recent_matches['balls_bowled'].sum()

    if total_balls_bowled > 0:
        return (total_runs_given / total_balls_bowled) * 6
    return 0

# Apply this function
train_data['team1_economy_rate_recent'] = train_data.apply(
    lambda x: team_economy_rate_recent(bowler_lvl_data, x['team1_roster_ids'], x['match_dt']),
    axis=1
)
train_data['team2_economy_rate_recent'] = train_data.apply(
    lambda x: team_economy_rate_recent(bowler_lvl_data, x['team2_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team1_economy_rate_recent'] = test_data.apply(
    lambda x: team_economy_rate_recent(bowler_lvl_data, x['team1_roster_ids'], x['match_dt']),
    axis=1
)
test_data['team2_economy_rate_recent'] = test_data.apply(
    lambda x: team_economy_rate_recent(bowler_lvl_data, x['team2_roster_ids'], x['match_dt']),
    axis=1
)

train_data = add_ratio_column(train_data, 'team1_economy_rate_recent', 'team2_economy_rate_recent', 'team_economy_ratio')

# Drop the initial individual features
train_data = train_data.drop(columns=['team1_economy_rate_recent', 'team2_economy_rate_recent'])


test_data = add_ratio_column(test_data, 'team1_economy_rate_recent', 'team2_economy_rate_recent', 'team_economy_ratio')

# Drop the initial individual features
test_data = test_data.drop(columns=['team1_economy_rate_recent', 'team2_economy_rate_recent'])


In [37]:
def player_of_match_frequency(matches, team_roster_ids, date):
    team_ids = str(team_roster_ids).split(':')
    pom_awards = 0
    total_matches = 0

    for player_id in team_ids:
        past_matches = matches[(matches['match_dt'] < date) & (matches['player_of_the_match_id'] == float(player_id))]
        pom_awards += past_matches.shape[0]
        total_matches += matches[(matches['match_dt'] < date) & ((matches['team1_roster_ids'].str.contains(str(player_id))) | (matches['team2_roster_ids'].str.contains(str(player_id))))].shape[0]

    return pom_awards / total_matches if total_matches > 0 else 0

train_data['team1_pom_frequency'] = train_data.apply(lambda x: player_of_match_frequency(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
train_data['team2_pom_frequency'] = train_data.apply(lambda x: player_of_match_frequency(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)

test_data['team1_pom_frequency'] = test_data.apply(lambda x: player_of_match_frequency(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
test_data['team2_pom_frequency'] = test_data.apply(lambda x: player_of_match_frequency(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)


train_data = add_ratio_column(train_data, 'team1_pom_frequency', 'team2_pom_frequency', 'team_playerOfmatch_ratio')

# Drop the initial individual features
train_data = train_data.drop(columns=['team1_pom_frequency', 'team2_pom_frequency'])


test_data = add_ratio_column(test_data, 'team1_pom_frequency', 'team2_pom_frequency', 'team_playerOfmatch_ratio')

# Drop the initial individual features
test_data = test_data.drop(columns=['team1_pom_frequency', 'team2_pom_frequency'])





In [38]:
def team_count_100runs_last15(matches, team1_roster, team2_roster, date, n=25):
    def count_100s(team_roster):
        total_100s = 0
        for player_id in team_roster.split(':'):
            player_matches = giveLastNgamesPlayer(player_id, date, n, 'bat')
            total_100s += (player_matches['runs'] >= 75).sum()
        return total_100s

    team1_100s = count_100s(team1_roster)
    team2_100s = count_100s(team2_roster)

    return team1_100s / team2_100s if team2_100s > 0 else team1_100s

train_data['team_count_100runs_ratio_last15'] = train_data.apply(lambda x: team_count_100runs_last15(match_lvl_data, x['team1_roster_ids'], x['team2_roster_ids'], x['match_dt']), axis=1)
test_data['team_count_100runs_ratio_last15'] = test_data.apply(lambda x: team_count_100runs_last15(match_lvl_data, x['team1_roster_ids'], x['team2_roster_ids'], x['match_dt']), axis=1)


In [39]:
def avg_extras_conceded(matches, team_roster_ids, date, n=10):
    team_ids = team_roster_ids.split(':')
    total_extras = 0
    total_matches = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id, date, n, 'bowl')
        total_extras += recent_matches['wides'].sum() + recent_matches['noballs'].sum()
        total_matches += recent_matches.shape[0]

    return total_extras / total_matches if total_matches > 0 else 0

train_data['team1_avg_extras_conceded'] = train_data.apply(lambda x: avg_extras_conceded(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
train_data['team2_avg_extras_conceded'] = train_data.apply(lambda x: avg_extras_conceded(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)

test_data['team1_avg_extras_conceded'] = test_data.apply(lambda x: avg_extras_conceded(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
test_data['team2_avg_extras_conceded'] = test_data.apply(lambda x: avg_extras_conceded(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)


#add ratio column 
train_data = add_ratio_column(train_data, 'team1_avg_extras_conceded', 'team2_avg_extras_conceded', 'teams_avg_extras_conceded_ratio')

# Drop the initial individual features
train_data = train_data.drop(columns=['team1_avg_extras_conceded', 'team2_avg_extras_conceded'])

#add ratio column 
test_data = add_ratio_column(test_data, 'team1_avg_extras_conceded', 'team2_avg_extras_conceded', 'teams_avg_extras_conceded_ratio')

# Drop the initial individual features
test_data = test_data.drop(columns=['team1_avg_extras_conceded', 'team2_avg_extras_conceded'])


In [40]:
def avg_partnership_length(matches, team_roster_ids, date, n=10):
    team_ids = team_roster_ids.split(':')
    total_partnership_runs = 0
    total_partnerships = 0

    for player_id in team_ids[:6]:  # Assuming top 4 are the main batsmen
        recent_matches = giveLastNgamesPlayer(player_id, date, n, 'bat')
        total_partnership_runs += recent_matches['runs'].sum()
        total_partnerships += recent_matches.shape[0]

    return total_partnership_runs / total_partnerships if total_partnerships > 0 else 0

train_data['team1_avg_partnership_length'] = train_data.apply(lambda x: avg_partnership_length(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
train_data['team2_avg_partnership_length'] = train_data.apply(lambda x: avg_partnership_length(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)
test_data['team1_avg_partnership_length'] = test_data.apply(lambda x: avg_partnership_length(match_lvl_data, x['team1_roster_ids'], x['match_dt']), axis=1)
test_data['team2_avg_partnership_length'] = test_data.apply(lambda x: avg_partnership_length(match_lvl_data, x['team2_roster_ids'], x['match_dt']), axis=1)



#add ratio column 
train_data = add_ratio_column(train_data, 'team1_avg_partnership_length', 'team2_avg_partnership_length', 'team_avg_partnership_length_ratio')

# Drop the initial individual features
train_data = train_data.drop(columns=['team1_avg_partnership_length', 'team2_avg_partnership_length'])


#add ratio column 
test_data = add_ratio_column(test_data, 'team1_avg_partnership_length', 'team2_avg_partnership_length', 'team_avg_partnership_length_ratio')

# Drop the initial individual features
test_data = test_data.drop(columns=['team1_avg_partnership_length', 'team2_avg_partnership_length'])


In [42]:
new_train1  = train_data.copy()
new_train2 = train_data.copy()

In [43]:
test_data1 = test_data.copy()
test_data2 = test_data.copy()

In [46]:
new_test_data = test_data.copy()

In [44]:
new_train = train_data.copy()

In [45]:
new_train1['winner_01'] = new_train1.apply(lambda x: 1 if (x['team2']==x['winner']) else 0, axis=1)

In [47]:
new_train1['toss_winner_01'] = np.where(new_train1['toss winner']==new_train1['team2'], 1, 0)
new_test_data['toss_winner_01'] = np.where(new_test_data['toss winner']==new_test_data['team2'], 1, 0)
new_train1['toss_decision_01'] = np.where(new_train1['toss decision']=='bat', 1, 0)
new_test_data['toss_decision_01'] = np.where(new_test_data['toss decision']=='bat', 1, 0)

In [48]:
drops = ['match id', 'team1','venue', 'team1_id', 'team1_roster_ids', 'team2','winner','winner_id','toss winner',
       'team2_id','toss decision', 'team2_roster_ids','city','match_dt','lighting','series_name','season','ground_id']

new_train1 = new_train1.drop(drops,axis = 1)

drops_test = ['match id', 'team1','venue', 'team1_id', 'team1_roster_ids', 'team2','toss winner',
       'team2_id','toss decision', 'team2_roster_ids','city','match_dt','lighting','series_name','season','ground_id']

new_test_data = new_test_data.drop(drops_test,axis = 1)


In [52]:
new_train1 = new_train1.drop('team_count_50runs_last15',axis = 1)
new_test_data = new_test_data.drop('team_count_50runs_last15',axis = 1)

In [56]:
cols = ['team_winp_last5', 'team1_winp_team2_last15', 'ground_avg_runs_last15',
       'team_bowler_performance_ratio', 'team_batsmen_performance_ratio',
       'team_ground_factor_ratio', 'team2_winp_team2_last15',
       'diff_win_p_teams', 'team_economy_ratio', 'team_playerOfmatch_ratio',
       'team_count_100runs_ratio_last15', 'teams_avg_extras_conceded_ratio',
       'team_avg_partnership_length_ratio', 'toss_winner_01',
       'toss_decision_01']

X,y = new_train1[cols],new_train1['winner_01']

In [61]:
X_sub = new_test_data.copy()

In [62]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [None]:
X.fillna(0,inplace=True)
X_sub.fillna(0,inplace=True)
X.isna().sum()

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [65]:
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train, y_train)
LGBM_model.fit(X_train, y_train)
XGB_model.fit(X_train, y_train)
CatBoost_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

[LightGBM] [Info] Number of positive: 389, number of negative: 369
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2123
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513193 -> initscore=0.052783
[LightGBM] [Info] Start training from score 0.052783
Learning rate set to 0.009153
0:	learn: 0.6921483	total: 129ms	remaining: 2m 8s
1:	learn: 0.6912870	total: 134ms	remaining: 1m 6s
2:	learn: 0.6900827	total: 139ms	remaining: 46.1s
3:	learn: 0.6890003	total: 143ms	remaining: 35.5s
4:	learn: 0.6876456	total: 146ms	remaining: 29s
5:	learn: 0.6869177	total: 148ms	remaining: 24.5s
6:	learn: 0.6857564	total: 150ms	remaining: 21.3s
7:	learn: 0.6846032	total: 153ms	remaining: 19s
8:	learn: 0.6835759	total: 156ms	remaining: 17.1s
9:	learn: 0.6826211	total: 159ms	remai

In [67]:
X_test.head()

Unnamed: 0,team_winp_last5,team1_winp_team2_last15,ground_avg_runs_last15,team_bowler_performance_ratio,team_batsmen_performance_ratio,team_ground_factor_ratio,team2_winp_team2_last15,diff_win_p_teams,team_economy_ratio,team_playerOfmatch_ratio,team_count_100runs_ratio_last15,teams_avg_extras_conceded_ratio,team_avg_partnership_length_ratio,toss_winner_01,toss_decision_01
589,1.327869,33.33,167.766667,0.700738,0.88983,-1.675737,66.67,20.0,0.977914,0.900495,2.0,1.076658,1.232819,1,0
933,1.952381,33.33,174.366667,0.766174,0.811055,0.502075,66.67,0.0,1.117902,0.962977,0.6,1.3,1.256371,1,0
139,0.405941,0.0,161.785714,1.394122,0.58204,0.325237,100.0,-60.0,1.077486,1.090604,0.25,1.186441,0.537983,0,1
86,1.97561,60.0,160.625,1.277745,1.621427,-0.239243,40.0,30.0,0.93026,0.654785,0.833333,1.033333,0.968054,1,0
39,1.0,100.0,171.7,0.646642,1.052608,0.005076,0.0,-20.0,0.989601,0.905172,1.6,0.880952,0.973447,1,0


Scaling

In [68]:


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Select the columns to be scaled
columns_to_scale = X_train.columns[:-2]  # All columns except the last 2
columns_to_exclude = X_train.columns[-2:]  # The last 2 columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data for selected columns
scaler.fit(X_train[columns_to_scale])

# Transform the training data
X_train_scaled = scaler.transform(X_train[columns_to_scale])

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test[columns_to_scale])

# Convert the scaled arrays back to DataFrames
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=columns_to_scale, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=columns_to_scale, index=X_test.index)

# Recombine the scaled columns with the columns that were not scaled
X_train_final = pd.concat([X_train_scaled_df, X_train[columns_to_exclude]], axis=1)
X_test_final = pd.concat([X_test_scaled_df, X_test[columns_to_exclude]], axis=1)


In [77]:
X_train_no_toss = X_train_final.iloc[:, :-2]
X_test_no_toss = X_test_final.iloc[:, :-2]

In [78]:
X_test_no_toss.head()

Unnamed: 0,team_winp_last5,team1_winp_team2_last15,ground_avg_runs_last15,team_bowler_performance_ratio,team_batsmen_performance_ratio,team_ground_factor_ratio,team2_winp_team2_last15,diff_win_p_teams,team_economy_ratio,team_playerOfmatch_ratio,team_count_100runs_ratio_last15,teams_avg_extras_conceded_ratio,team_avg_partnership_length_ratio
589,-0.220621,-0.173003,0.604392,-0.085683,-0.072443,-0.001648,0.786524,0.66846,-0.113003,-0.162943,0.525872,-0.028255,-0.014209
933,-0.180531,-0.173003,0.780923,-0.08436,-0.072554,0.02634,0.786524,0.020746,0.092532,-0.089644,-0.440744,0.328538,-0.003558
139,-0.279802,-1.058499,0.444419,-0.071673,-0.072875,0.024068,1.68827,-1.922395,0.033192,0.060076,-0.682398,0.147125,-0.328425
86,-0.17904,0.535553,0.413374,-0.074024,-0.071417,0.016813,0.064965,0.992317,-0.18297,-0.451188,-0.279641,-0.097467,-0.13394
39,-0.241668,1.598254,0.709597,-0.086776,-0.072215,0.019953,-1.017239,-0.626967,-0.095843,-0.157456,0.249696,-0.340899,-0.131501


In [79]:
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train_no_toss, y_train)
LGBM_model.fit(X_train_no_toss, y_train)
XGB_model.fit(X_train_no_toss, y_train)
CatBoost_model.fit(X_train_no_toss, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test_no_toss)
y_pred_LGBM = LGBM_model.predict(X_test_no_toss)
y_pred_XGB = XGB_model.predict(X_test_no_toss)
y_pred_CatBoost = CatBoost_model.predict(X_test_no_toss)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

[LightGBM] [Info] Number of positive: 389, number of negative: 369
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2198
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513193 -> initscore=0.052783
[LightGBM] [Info] Start training from score 0.052783
Learning rate set to 0.009153
0:	learn: 0.6923612	total: 2.71ms	remaining: 2.71s
1:	learn: 0.6912479	total: 4.71ms	remaining: 2.35s
2:	learn: 0.6904639	total: 6.41ms	remaining: 2.13s
3:	learn: 0.6890321	total: 7.83ms	remaining: 1.95s
4:	learn: 0.6883440	total: 9.34ms	remaining: 1.86s
5:	learn: 0.6875121	total: 11.1ms	remaining: 1.84s
6:	learn: 0.6865745	total: 12.7ms	remaining: 1.79s
7:	learn: 0.6858406	total: 14.4ms	remaining: 1.79s
8:	learn: 0.6850032	total: 16.4ms	remaining: 1.81s
9:	learn: 0.6843032	total

In [71]:
from sklearn.ensemble import VotingClassifier

# Initialize the models
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Create the ensemble
ensemble_model = VotingClassifier(estimators=[
    ('GBM', GBM_model),
    ('LGBM', LGBM_model),
    ('XGB', XGB_model),
    ('CatBoost', CatBoost_model)
], voting='hard')  # You can choose 'hard' or 'soft' voting

# Train the ensemble
ensemble_model.fit(X_train_final, y_train)

# Make predictions
y_pred_ensemble = ensemble_model.predict(X_test_final)

# Evaluate the ensemble
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print("Accuracy for ensemble model:", accuracy_ensemble)


[LightGBM] [Info] Number of positive: 389, number of negative: 369
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2202
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513193 -> initscore=0.052783
[LightGBM] [Info] Start training from score 0.052783
Learning rate set to 0.009153
0:	learn: 0.6921483	total: 3.38ms	remaining: 3.38s
1:	learn: 0.6912870	total: 7.28ms	remaining: 3.63s
2:	learn: 0.6900827	total: 10.9ms	remaining: 3.62s
3:	learn: 0.6890003	total: 14.2ms	remaining: 3.54s
4:	learn: 0.6876456	total: 18.3ms	remaining: 3.64s
5:	learn: 0.6869177	total: 22.6ms	remaining: 3.75s
6:	learn: 0.6857564	total: 25.9ms	remaining: 3.68s
7:	learn: 0.6846032	total: 29.5ms	remaining: 3.66s
8:	learn: 0.6835759	total: 32.1ms	remaining: 3.54s
9:	learn: 0.6826211	total

In [72]:
import optuna
from catboost import Pool
train_pool = Pool(data=X_train_final, label=y_train)
valid_pool = Pool(data=X_test_final, label=y_test)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': 0  # Suppress output for tuning
    }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    
    preds = model.predict(valid_pool)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    od_type=best_params['od_type'],
    od_wait=best_params['od_wait'],
    verbose=100  # To monitor the training process
)

best_model.fit(X_train_scaled, y_train)

# Evaluate the final model on the validation set
final_preds = best_model.predict(X_test_scaled)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-06-07 12:59:44,734] A new study created in memory with name: no-name-6c4fe1cf-2e04-441c-b4b1-5b77624815e7
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
[I 2024-06-07 12:59:45,878] Trial 0 finished with value: -0.6 and parameters: {'iterations': 735, 'learning_rate': 0.0018523736787599944, 'depth': 4, 'l2_leaf_reg': 0.021094580988984632, 'border_count': 74, 'bagging_temperature': 0.08310470075768395, 'random_strength': 0.04369825428792477, 'od_type': 'Iter', 'od_wait': 49}. Best is trial 0 with value: -0.6.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': 

Best parameters: {'iterations': 709, 'learning_rate': 0.00038388965176806156, 'depth': 8, 'l2_leaf_reg': 0.20971836057109908, 'border_count': 133, 'bagging_temperature': 0.7308076216152887, 'random_strength': 0.051534723938352914, 'od_type': 'Iter', 'od_wait': 27}
0:	learn: 0.6929324	total: 12.7ms	remaining: 9.03s
100:	learn: 0.6735833	total: 479ms	remaining: 2.88s
200:	learn: 0.6554681	total: 917ms	remaining: 2.32s
300:	learn: 0.6384163	total: 1.37s	remaining: 1.86s
400:	learn: 0.6218944	total: 1.8s	remaining: 1.38s
500:	learn: 0.6062354	total: 2.25s	remaining: 934ms
600:	learn: 0.5913905	total: 2.7s	remaining: 485ms
700:	learn: 0.5771547	total: 3.14s	remaining: 35.9ms
708:	learn: 0.5760195	total: 3.18s	remaining: 0us


In [73]:
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

0.6052631578947368


In [35]:
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,team_count_50runs_last15,team_winp_last5,team1_winp_team2_last15,ground_avg_runs_last15,winner_01,team_bowler_performance_ratio,team_batsmen_performance_ratio,team_ground_factor_ratio,team2_winp_team2_last15,diff_win_p_teams
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,1.666667,0.672131,100.0,157.178571,1,1.740302,1.019182,1.069532,0.0,-20.0
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,1.285714,1.952381,50.0,103.5,0,0.844453,0.707725,0.934718,50.0,40.0
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,0.857143,0.672131,0.0,154.333333,0,1.254081,0.839594,8.875,100.0,-10.0
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,2.166667,1.97561,50.0,144.25,0,0.840056,1.460793,0.740901,50.0,10.0
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,0.818182,1.327869,0.0,189.0,1,1.134372,1.149571,1.054444,0.0,-10.0
