In [None]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [None]:
match_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/664389efa0868_match_level_scorecard.csv')
bat_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b548c98c_batsman_level_scorecard.csv')
bowl_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b2c60743_bowler_level_scorecard.csv')
train_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')
test_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [None]:
bat_lvl_data['Fours'].fillna(0, inplace=True)
bat_lvl_data['Sixes'].fillna(0, inplace=True)
bat_lvl_data['strike_rate'].fillna(0, inplace=True)

In [None]:
def assign_toss_winner_flag(row):
    if row['team1'] == row['toss winner']:
        return row['team1_id']
    elif row['team2'] == row['toss winner']:
        return row['team2_id']
train_data['toss_winner_id'] = train_data.apply(assign_toss_winner_flag, axis=1)
match_lvl_data['toss_winner_id'] = match_lvl_data.apply(assign_toss_winner_flag, axis=1)
test_lvl_data['toss_winner_id']= test_lvl_data.apply(assign_toss_winner_flag, axis=1)

In [None]:
def assign_bat(row):
    if (row['toss_winner_id'] == row['team1_id']) & (row['toss decision']=='bat'):
        return row['team1_id']
    if (row['toss_winner_id'] != row['team1_id']) & (row['toss decision']=='field'):
        return row['team1_id']
    else:
        return row['team2_id']
train_data['bat_id'] = train_data.apply(assign_bat, axis=1) 
match_lvl_data['bat_id'] = match_lvl_data.apply(assign_bat, axis=1)
test_lvl_data['bat_id'] = test_lvl_data.apply(assign_bat, axis=1)

In [None]:
(test_lvl_data['bat_id']== test_lvl_data['team1_id']).sum()

In [None]:
def winpLastn(team_id, date, n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']==team_id].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

In [None]:
# Compute team1's win% in last 5 games
train_data['team1_winp_last5'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 15), axis=1)
# Compute team2's win% in last 5 games
train_data['team2_winp_last5'] = train_data.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 15), axis=1)


test_lvl_data['team1_winp_last5'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn(x['team1_id'], x['match_dt'], 15), axis=1)
# Compute team2's win% in last 5 games
test_lvl_data['team2_winp_last5'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn(x['team2_id'], x['match_dt'], 15), axis=1)

In [None]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other.
    '''
    # filter out games where either team1_id is input team1 and team2_id is input team2, or where team2_id is input team1 and team1_id is input team2.
    # Also, match date is less than current games's input date, sort desc by date and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      (((match_lvl_data['team1_id']==team1_id)&(match_lvl_data['team2_id']==team2_id))|((match_lvl_data['team1_id']==team2_id)&(match_lvl_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0] # Counting number of rows (games) where winner is input team1.
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return Float denoting team1's win% against team2 in past n games rounded to 2 decimal places.

In [None]:
train_data['team1_winp_team2_last15'] = train_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)
train_data['team2_winp_team1_last15'] = train_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team2_id'], x['team1_id'], x['match_dt'], 15), axis=1)

test_lvl_data['team1_winp_team2_last15'] = test_lvl_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)
test_lvl_data['team2_winp_team1_last15'] = test_lvl_data.progress_apply(lambda x: \
                                  winpCrossLastn(x['team2_id'], x['team1_id'], x['match_dt'], 15), axis=1)

In [None]:
train_data.drop(['team_count_50runs_last15','team1only_avg_runs_last15','team_winp_last5','ground_avg_runs_last15'],inplace=True, axis=1)

In [None]:
def winpLastn_venue(team_id, date, ground_id,n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id) & (match_lvl_data['ground_id']==ground_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']==team_id].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

In [None]:
# Compute team1's win% in last 5 games
train_data['team1_winp_venue'] = train_data.progress_apply(lambda x: \
                                  winpLastn_venue(x['team1_id'], x['match_dt'],x['ground_id'], 15), axis=1)
# Compute team2's win% in last 5 games
train_data['team2_winp_venue'] = train_data.progress_apply(lambda x: \
                                  winpLastn_venue(x['team2_id'], x['match_dt'],x['ground_id'], 15), axis=1)

# Compute team1's win% in last 5 games
test_lvl_data['team1_winp_venue'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn_venue(x['team1_id'], x['match_dt'],x['ground_id'], 15), axis=1)
# Compute team2's win% in last 5 games
test_lvl_data['team2_winp_venue'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn_venue(x['team2_id'], x['match_dt'],x['ground_id'], 15), axis=1)

In [None]:
def winpCrossLastn_venue(team1_id, team2_id, date, ground_id, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters at a specific ground.
    
    Input-
    1. match_lvl_data: DataFrame containing match-level data.
    2. team1_id: ID of team1 to calculate win% of.
    3. team2_id: ID of team2 to calculate win% against.
    4. date: match date of the current game for which the feature is to be calculated.
    5. ground_id: ID of the ground where matches were played.
    6. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other at the specified ground.
    '''
    # Filter matches based on the criteria
    df_rel = match_lvl_data[(match_lvl_data['match_dt'] < date) &
                            (((match_lvl_data['team1_id'] == team1_id) & (match_lvl_data['team2_id'] == team2_id) & (match_lvl_data['ground_id'] == ground_id)) |
                             ((match_lvl_data['team1_id'] == team2_id) & (match_lvl_data['team2_id'] == team1_id) & (match_lvl_data['ground_id'] == ground_id)))]\
                            .sort_values(by='match_dt', ascending=False).head(n)

    if df_rel.empty:
        return np.nan

    win_count = df_rel[df_rel['winner_id'] == team1_id].shape[0]  # Counting number of rows (games) where winner is input team1.
    
    return round(win_count * 100 / df_rel.shape[0], 2)  # Return float denoting team1's win% against team2 in past n games rounded to 2 decimal places.

In [None]:
def winpLastn_batndchase(team_id, bat_id,date,n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    if(bat_id==team_id):
        df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id) )]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
        win_count = df_rel[(df_rel['winner_id' ]==team_id )& (df_rel['bat_id']==team_id)].shape[0] # count number of rows having winner as the input team
        if win_count == 0:
          return 0
        return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points
    else:
       df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
       
       win_count = df_rel[(df_rel['winner_id']==team_id) & (df_rel['bat_id']!=team_id)].shape[0] # count number of rows having winner as the input team
       if win_count == 0:
          return 0
       return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points
       

In [None]:
train_data['team1_winp_bat\chase'] = train_data.progress_apply(lambda x: \
                                  winpLastn_batndchase(x['team1_id'], x['bat_id'], x['match_dt'], 15), axis=1)
train_data['team2_winp_bat\chase'] = train_data.progress_apply(lambda x: \
                                  winpLastn_batndchase(x['team2_id'], x['bat_id'], x['match_dt'], 15), axis=1)
test_lvl_data['team1_winp_bat\chase'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn_batndchase(x['team1_id'], x['bat_id'], x['match_dt'], 15), axis=1)
test_lvl_data['team2_winp_bat\chase'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn_batndchase(x['team2_id'], x['bat_id'], x['match_dt'], 15), axis=1)

In [None]:
def crosswinpLastn_batndchase(team1_id,team2_id, bat_id,date,n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    if(bat_id==team1_id):
        df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      (((match_lvl_data['team1_id']==team1_id)&(match_lvl_data['team2_id']==team2_id))|((match_lvl_data['team1_id']==team2_id)&(match_lvl_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
        win_count = df_rel[(df_rel['winner_id']==team1_id) & (df_rel['bat_id']==team1_id)].shape[0] # count number of rows having winner as the input team
        if win_count == 0:
          return 0
        return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points
    else:
       df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      (((match_lvl_data['team1_id']==team1_id)&(match_lvl_data['team2_id']==team2_id))|((match_lvl_data['team1_id']==team2_id)&(match_lvl_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
       win_count = df_rel[(df_rel['winner_id']==team1_id ) & (df_rel['bat_id']!=team1_id )].shape[0] # count number of rows having winner as the input team
       if win_count == 0:
          return 0
       return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

In [None]:
def winpLastn_venue_bat(ground_id, date,n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''

    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                    ( (match_lvl_data['ground_id']==ground_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']==df_rel['bat_id']].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

In [None]:
train_data['bat_venue_win'] = train_data.progress_apply(lambda x: \
                                  winpLastn_venue_bat(x['ground_id'], x['match_dt'], 15), axis=1)
test_lvl_data['bat_venue_win'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn_venue_bat(x['ground_id'], x['match_dt'], 15), axis=1)

In [None]:
def winpLastn_venue_bowl(ground_id, date,n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''

    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                    ( (match_lvl_data['ground_id']==ground_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']!=df_rel['bat_id']].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

In [None]:
train_data['bowl_venue_win'] = train_data.progress_apply(lambda x: \
                                  winpLastn_venue_bowl(x['ground_id'], x['match_dt'], 15), axis=1)

test_lvl_data['bowl_venue_win'] = test_lvl_data.progress_apply(lambda x: \
                                  winpLastn_venue_bowl(x['ground_id'], x['match_dt'], 15), axis=1)

In [None]:
def teamAvgRunsLastn_innings1(team_id, bat_id,date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    
    Output- None
    
    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                    ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id) & (match_lvl_data['bat_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    return df_rel['inning1_runs'].mean()

In [None]:
train_data['runs_team1_1st_inning']= train_data.progress_apply(lambda x: \
                                 teamAvgRunsLastn_innings1(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)
train_data['runs_team2_1st_inning']= train_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn_innings1(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)
test_lvl_data['runs_team1_1st_inning']= test_lvl_data.progress_apply(lambda x: \
                                 teamAvgRunsLastn_innings1(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)
test_lvl_data['runs_team2_1st_inning']= test_lvl_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn_innings1(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)

In [None]:
def teamAvgRunsLastn_innings2(team_id, bat_id,date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    
    Output- None
    
    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                    ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id) & (match_lvl_data['bat_id']!=team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    return df_rel['inning2_runs'].mean()

In [None]:
train_data['runs_team1_2nd_inning']= train_data.progress_apply(lambda x: \
                                 teamAvgRunsLastn_innings2(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)
train_data['runs_team2_2nd_inning']= train_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn_innings2(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)

test_lvl_data['runs_team1_2nd_inning']= test_lvl_data.progress_apply(lambda x: \
                                 teamAvgRunsLastn_innings2(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)
test_lvl_data['runs_team2_2nd_inning']= test_lvl_data.progress_apply(lambda x: \
                                  teamAvgRunsLastn_innings2(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)

In [None]:
def teamAvgwicketsLastn_innings1(team_id, bat_id,date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    
    Output- None
    
    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                    ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id) & (match_lvl_data['bat_id']!=team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    return df_rel['inning1_wickets'].mean()

In [None]:
train_data['wickets_team2_inning1']= train_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings1(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)
train_data['wickets_team1_inning1']= train_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings1(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)
test_lvl_data['wickets_team2_inning1']= test_lvl_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings1(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)
test_lvl_data['wickets_team1_inning1']= test_lvl_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings1(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)

In [None]:
def teamAvgwicketsLastn_innings2(team_id, bat_id,date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    
    Output- None
    
    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                    ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id) & (match_lvl_data['bat_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    return df_rel['inning2_wickets'].mean()

In [29]:
train_data['wickets_team1_inning2']= train_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings2(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)
train_data['wickets_team2_inning2']= train_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings2(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)

test_lvl_data['wickets_team1_inning2']= test_lvl_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings2(x['team1_id'],x['bat_id'], x['match_dt'], 15), axis=1)


test_lvl_data['wickets_team2_inning2']= test_lvl_data.progress_apply(lambda x: \
                                 teamAvgwicketsLastn_innings2(x['team2_id'],x['bat_id'], x['match_dt'], 15), axis=1)

100%|██████████| 271/271 [00:00<00:00, 2377.72it/s]
100%|██████████| 271/271 [00:00<00:00, 2540.64it/s]


In [30]:
#Batsman level statistics

In [31]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.
    
    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}
    
    Output-None
    
    Returns- dataframe having bowling/batting stats from last n games of a player before an input date. 
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = bat_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowl_lvl_data
        id_col = 'bowler_id'
        
    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [32]:
def no50sLastn(player_list, date, n):
    '''
    Function to get total number of 50s scored by players in the roster of a team in last n games.
    
    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.
    
    Output-None
    
    Returns- int value denoting sum of 50s scored by all players in the roster.
    '''
    
    player_list = str(player_list).split(':') # split string of ':' separated ids into a list of ids
    res_list = []
    for player in player_list: # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat') # getting batting stats from last n games for each player.
        df_rel['gte_50runs'] = np.where(df_rel['runs']>=50, 1, 0) # binary indicator to denote whether the player scored a 50 in the game (runs>=50).
        res_list.append(np.nansum(df_rel['gte_50runs']))# Sum up number of 50s for the player and append to a list. We will do this for all players.
    return np.nansum(res_list)# Sum up values of the list which is sum of 50s by all players in the roster

In [33]:
def SRLastn(player_list, date, n):
    '''
    Function to get the average strike rate of players in the roster of a team in the last n games.
    
    Input:
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.
    
    Output: None
    
    Returns: float value denoting the average strike rate of all players in the roster.
    '''
    
    # Split the player_list into individual player IDs
    player_list = str(player_list).split(':')
    res_list = []
    
    # Loop through each player ID
    for player in player_list:
        # Get batting stats from the last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        # Calculate the mean strike rate for the player and append it to res_list
        if not df_rel.empty:
            res_list.append(np.nanmean(df_rel['strike_rate']))
    
    # Return the mean of strike rates from res_list if it contains any values
    if res_list:
        return np.nanmean(res_list)
    else:
        return np.nan  # Return NaN if res_list is empty

In [34]:
train_data['team1_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
train_data['team2_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

test_lvl_data['team1_count_50runs_last15'] = test_lvl_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
test_lvl_data['team2_count_50runs_last15'] = test_lvl_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:11<00:00, 81.99it/s]
100%|██████████| 948/948 [00:10<00:00, 90.55it/s]
100%|██████████| 271/271 [00:02<00:00, 91.09it/s]
100%|██████████| 271/271 [00:02<00:00, 91.06it/s]


In [35]:
train_data['team1_SR'] = train_data.progress_apply(lambda x: \
            SRLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
train_data['team2_SR'] = train_data.progress_apply(lambda x: \
            SRLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

test_lvl_data['team1_SR'] = test_lvl_data.progress_apply(lambda x: \
            SRLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
test_lvl_data['team2_SR'] = test_lvl_data.progress_apply(lambda x: \
            SRLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:09<00:00, 98.05it/s] 
100%|██████████| 948/948 [00:10<00:00, 93.60it/s] 
100%|██████████| 271/271 [00:02<00:00, 98.59it/s]
100%|██████████| 271/271 [00:02<00:00, 94.82it/s]


In [36]:
def RunsLastn(player_list, date, n):
    '''
    Function to get the average strike rate of players in the roster of a team in the last n games.
    
    Input:
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.
    
    Output: None
    
    Returns: float value denoting the average strike rate of all players in the roster.
    '''
    
    # Split the player_list into individual player IDs
    player_list = str(player_list).split(':')
    res_list = []
    
    # Loop through each player ID
    for player in player_list:
        # Get batting stats from the last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        # Calculate the mean strike rate for the player and append it to res_list
        if not df_rel.empty:
            res_list.append(np.nanmean(df_rel['runs']))
    
    # Return the mean of strike rates from res_list if it contains any values
    if res_list:
        return np.nanmean(res_list)
    else:
        return np.nan  # Return NaN if res_list is empty

In [37]:
train_data['team1_avg'] = train_data.progress_apply(lambda x: \
            RunsLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
train_data['team2_avg'] = train_data.progress_apply(lambda x: \
            RunsLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

test_lvl_data['team1_avg'] = test_lvl_data.progress_apply(lambda x: \
            RunsLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
test_lvl_data['team2_avg'] = test_lvl_data.progress_apply(lambda x: \
            RunsLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:09<00:00, 98.80it/s] 
100%|██████████| 948/948 [00:09<00:00, 97.98it/s] 
100%|██████████| 271/271 [00:02<00:00, 95.61it/s] 
100%|██████████| 271/271 [00:02<00:00, 95.27it/s] 


In [38]:
#Bowler leve

In [39]:
#
def WicketsLastn(player_list, date, n):
    '''
    Function to get the average strike rate of players in the roster of a team in the last n games.
    
    Input:
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.
    
    Output: None
    
    Returns: float value denoting the average strike rate of all players in the roster.
    '''
    
    # Split the player_list into individual player IDs
    player_list = str(player_list).split(':')
    res_list = []
    
    # Loop through each player ID
    for player in player_list:
        # Get batting stats from the last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')
        # Calculate the mean strike rate for the player and append it to res_list
        if not df_rel.empty:
            res_list.append(np.nanmean(df_rel['wicket_count']))
    
    # Return the mean of strike rates from res_list if it contains any values
    if res_list:
        return np.nanmean(res_list)
    else:
        return np.nan  # Return NaN if res_list is empty

In [40]:
train_data['team1_avg_wkt'] = train_data.progress_apply(lambda x: \
            WicketsLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
train_data['team2_avg_wkt'] = train_data.progress_apply(lambda x: \
            WicketsLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

test_lvl_data['team1_avg_wkt'] = test_lvl_data.progress_apply(lambda x: \
            WicketsLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
test_lvl_data['team2_avg_wkt'] = test_lvl_data.progress_apply(lambda x: \
            WicketsLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:07<00:00, 122.86it/s]
100%|██████████| 948/948 [00:07<00:00, 121.12it/s]
100%|██████████| 271/271 [00:02<00:00, 124.96it/s]
100%|██████████| 271/271 [00:02<00:00, 122.53it/s]


In [41]:
def EcoLastn(player_list, date, n):
    '''
    Function to get the average strike rate of players in the roster of a team in the last n games.
    
    Input:
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.
    
    Output: None
    
    Returns: float value denoting the average strike rate of all players in the roster.
    '''
    
    # Split the player_list into individual player IDs
    player_list = str(player_list).split(':')
    res_list = []
    
    # Loop through each player ID
    for player in player_list:
        # Get batting stats from the last n games for each player
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')
        # Calculate the mean strike rate for the player and append it to res_list
        if not df_rel.empty:
            res_list.append(np.nanmean(df_rel['economy']))
    
    # Return the mean of strike rates from res_list if it contains any values
    if res_list:
        return np.nanmean(res_list)
    else:
        return np.nan  # Return NaN if res_list is empty

In [42]:
train_data['team1_avg_eco'] = train_data.progress_apply(lambda x: \
            EcoLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
train_data['team2_avg_eco'] = train_data.progress_apply(lambda x: \
            EcoLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

test_lvl_data['team1_avg_eco'] = test_lvl_data.progress_apply(lambda x: \
            EcoLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
# Computing number of 50 runs in last 15 games for team2 for train dataset.
test_lvl_data['team2_avg_eco'] = test_lvl_data.progress_apply(lambda x: \
            EcoLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

100%|██████████| 948/948 [00:07<00:00, 121.91it/s]
100%|██████████| 948/948 [00:09<00:00, 104.38it/s]
100%|██████████| 271/271 [00:02<00:00, 97.76it/s] 
100%|██████████| 271/271 [00:02<00:00, 96.07it/s] 


In [43]:
train_data.fillna(0,inplace=True)
test_lvl_data.fillna(0, inplace=True)

In [44]:
train_data.isnull().sum()
test_lvl_data.isnull().sum()

match id                     0
team1                        0
team1_id                     0
team1_roster_ids             0
team2                        0
team2_id                     0
team2_roster_ids             0
toss winner                  0
toss decision                0
venue                        0
city                         0
match_dt                     0
lighting                     0
series_name                  0
season                       0
ground_id                    0
team_count_50runs_last15     0
team_winp_last5              0
team1only_avg_runs_last15    0
team1_winp_team2_last15      0
ground_avg_runs_last15       0
toss_winner_id               0
bat_id                       0
team1_winp_last5             0
team2_winp_last5             0
team2_winp_team1_last15      0
team1_winp_venue             0
team2_winp_venue             0
team1_winp_bat\chase         0
team2_winp_bat\chase         0
bat_venue_win                0
bowl_venue_win               0
runs_tea

In [45]:
train_data['winner_01'] = train_data.apply(lambda x: 1 if (x['team2']==x['winner']) else 0, axis=1)

In [46]:
train_data['toss_winner_01'] = np.where(train_data['toss winner']==train_data['team2'], 1, 0)
test_lvl_data['toss_winner_01'] = np.where(test_lvl_data['toss winner']==test_lvl_data['team2'], 1, 0)

In [47]:
train_data['toss_decision_01'] = np.where(train_data['toss decision']=='bat', 1, 0)
test_lvl_data['toss_decision_01'] = np.where(test_lvl_data['toss decision']=='bat', 1, 0)

In [48]:
train_data.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner',
       'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name',
       'season', 'ground_id', 'team1_winp_team2_last15', 'toss_winner_id',
       'bat_id', 'team1_winp_last5', 'team2_winp_last5',
       'team2_winp_team1_last15', 'team1_winp_venue', 'team2_winp_venue',
       'team1_winp_bat\chase', 'team2_winp_bat\chase', 'bat_venue_win',
       'bowl_venue_win', 'runs_team1_1st_inning', 'runs_team2_1st_inning',
       'runs_team1_2nd_inning', 'runs_team2_2nd_inning',
       'wickets_team2_inning1', 'wickets_team1_inning1',
       'wickets_team1_inning2', 'wickets_team2_inning2',
       'team1_count_50runs_last15', 'team2_count_50runs_last15', 'team1_SR',
       'team2_SR', 'team1_avg', 'team2_avg', 'team1_avg_wkt', 'team2_avg_wkt',
       'team1_avg_eco', 'team2_avg_eco', 'winner_01', 'toss_winner_01',
       'toss_decision_

In [49]:
test_lvl_data.columns

Index(['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2',
       'team2_id', 'team2_roster_ids', 'toss winner', 'toss decision', 'venue',
       'city', 'match_dt', 'lighting', 'series_name', 'season', 'ground_id',
       'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'toss_winner_id', 'bat_id',
       'team1_winp_last5', 'team2_winp_last5', 'team2_winp_team1_last15',
       'team1_winp_venue', 'team2_winp_venue', 'team1_winp_bat\chase',
       'team2_winp_bat\chase', 'bat_venue_win', 'bowl_venue_win',
       'runs_team1_1st_inning', 'runs_team2_1st_inning',
       'runs_team1_2nd_inning', 'runs_team2_2nd_inning',
       'wickets_team2_inning1', 'wickets_team1_inning1',
       'wickets_team1_inning2', 'wickets_team2_inning2',
       'team1_count_50runs_last15', 'team2_count_50runs_last15', 'team1_SR',
       'team2_SR', 'team1_avg', 'team2_avg', 'team1_avg_wkt', 'team2_avg_wkt',
 

In [50]:
Selected_cols_train=['team1_winp_last5', 'team2_winp_last5','team1_winp_team2_last15', 'team2_winp_team1_last15',
       'team1_winp_venue', 'team2_winp_venue', 'team1_winp_bat\chase',
       'team2_winp_bat\chase', 'bat_venue_win', 'bowl_venue_win',
       'runs_team1_1st_inning', 'runs_team2_1st_inning',
       'runs_team1_2nd_inning', 'runs_team2_2nd_inning',
       'wickets_team2_inning1', 'wickets_team1_inning2',
       'team1_count_50runs_last15', 'team2_count_50runs_last15', 'team1_SR',
       'team2_SR', 'team1_avg', 'team2_avg', 'team1_avg_wkt', 'team2_avg_wkt',
       'team1_avg_eco', 'team2_avg_eco', 'toss_winner_01','toss_decision_01','wickets_team2_inning2', 'wickets_team1_inning1']

Selected_cols_test=['team1_winp_last5', 'team2_winp_last5','team1_winp_team2_last15', 'team2_winp_team1_last15',
       'team1_winp_venue', 'team2_winp_venue', 'team1_winp_bat\chase',
       'team2_winp_bat\chase', 'bat_venue_win', 'bowl_venue_win',
       'runs_team1_1st_inning', 'runs_team2_1st_inning',
       'runs_team1_2nd_inning', 'runs_team2_2nd_inning',
       'wickets_team2_inning1', 'wickets_team1_inning2',
       'team1_count_50runs_last15', 'team2_count_50runs_last15', 'team1_SR',
       'team2_SR', 'team1_avg', 'team2_avg', 'team1_avg_wkt', 'team2_avg_wkt',
       'team1_avg_eco', 'team2_avg_eco', 'toss_winner_01', 'toss_decision_01','wickets_team2_inning2', 'wickets_team1_inning1']

  'team1_winp_venue', 'team2_winp_venue', 'team1_winp_bat\chase',
  'team2_winp_bat\chase', 'bat_venue_win', 'bowl_venue_win',
  'team1_winp_venue', 'team2_winp_venue', 'team1_winp_bat\chase',
  'team2_winp_bat\chase', 'bat_venue_win', 'bowl_venue_win',


In [51]:
train_data.select_dtypes(include=['number']).corr()['winner_01'].sort_values(ascending = True)

team1_avg                   -0.142492
team1_count_50runs_last15   -0.132540
team1_SR                    -0.130171
team1_winp_venue            -0.098093
wickets_team1_inning2       -0.086395
team1_winp_bat\chase        -0.077241
team1_avg_wkt               -0.075659
team1_winp_last5            -0.074258
team2_id                    -0.072937
runs_team1_2nd_inning       -0.071637
winner_id                   -0.070142
toss_winner_id              -0.066803
team1_winp_team2_last15     -0.064668
bowl_venue_win              -0.062750
bat_id                      -0.058767
team1_id                    -0.058767
runs_team1_1st_inning       -0.056216
team1_avg_eco               -0.041679
toss_winner_01              -0.035146
runs_team2_2nd_inning       -0.018389
team2_avg_eco               -0.013403
team2_avg                   -0.005347
team2_count_50runs_last15   -0.002153
ground_id                    0.009766
runs_team2_1st_inning        0.012391
wickets_team1_inning1        0.016792
bat_venue_wi

In [None]:
X,y = train_data[Selected_cols_train], train_data['winner_01']

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train, y_train)
LGBM_model.fit(X_train, y_train)
XGB_model.fit(X_train, y_train)
CatBoost_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

In [None]:
import optuna
from catboost import Pool
train_pool = Pool(data=X_train, label=y_train)
valid_pool = Pool(data=X_test, label=y_test)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': 0  # Suppress output for tuning
    }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    
    preds = model.predict(valid_pool)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    od_type=best_params['od_type'],
    od_wait=best_params['od_wait'],
    verbose=100  # To monitor the training process
)

best_model.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

In [None]:
best_model.fit(X, y)

In [None]:
test= test_lvl_data[Selected_cols_test]

In [None]:
train_data['y_pred_01'] = best_model.predict(X)
test_lvl_data['y_pred_01'] = best_model.predict(test)

In [None]:
test_lvl_data['y_pred_01'].value_counts()

In [None]:
142/271