In [1]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
match_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/664389efa0868_match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b548c98c_batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b2c60743_bowler_level_scorecard.csv')
train_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')
test_data = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')

In [3]:
## Creating a binary winner column - 0 if team1 wins, else 1
train_data['winner_01'] = train_data.apply(lambda x: 0 if (x['team1']==x['winner']) else 1, axis=1)

## Making Features

In [4]:
num_match = 15

In [5]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.

    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}

    Output-None

    Returns- dataframe having bowling/batting stats from last n games of a player before an input date.
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'

    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [6]:
def player_form_factor(player_list, date, n):
    players = player_list.split(':')
    form_factors = []
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            weights = np.linspace(1, 2, len(recent_matches))  # Linear weight increase from 1 to 2
            form_factors.append(np.average(recent_matches['runs'], weights=weights))
        else:
            form_factors.append(0)
    return np.mean(form_factors) if players else 0

In [7]:
def team_batting_strength(player_list, date, n):
    players = player_list.split(':')
    total_runs = 0
    for player_id in players:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        total_runs += recent_matches['runs'].sum()
    return total_runs / len(players) if players else 0

In [8]:
def average_wickets_taken(player_list, date, n):
    player_list = player_list.split(':')
    wickets_list = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        total_wickets = recent_matches['wicket_count'].sum()
        average_wickets = total_wickets / n if n > 0 else 0
        wickets_list.append(average_wickets)
    return sum(wickets_list) / len(wickets_list) if wickets_list else 0

In [9]:
def no50sLastn(player_list, date, n):

    player_list = str(player_list).split(':') # split string of ':' separated ids into a list of ids
    res_list = []
    for player in player_list: # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat') # getting batting stats from last n games for each player.
        df_rel['gte_50runs'] = np.where(df_rel['runs']>=50, 1, 0) # binary indicator to denote whether the player scored a 50 in the game (runs>=50).
        res_list.append(np.nansum(df_rel['gte_50runs']))# Sum up number of 50s for the player and append to a list. We will do this for all players.
    return np.nansum(res_list)# Sum up values of the list which is sum of 50s by all players in the roster.

In [10]:
def calculate_batsman_strike_rate(player_list, date, n):
    player_list = str(player_list).split(':')  # Split string of ':' separated ids into a list of ids
    res_list = []

    for player in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        runs = recent_matches['runs'].sum()
        balls = recent_matches['balls_faced'].sum()

        # Calculate strike rate and avoid division by zero
        if balls > 0:
            strike_rate = (runs / balls) * 100
        else:
            strike_rate = 0

        res_list.append(strike_rate)

    # Calculate the average strike rate across all players in the list
    if res_list:  # Ensure the list is not empty to avoid division by zero
        average_strike_rate = sum(res_list) / len(res_list)
    else:
        average_strike_rate = 0

    return average_strike_rate

# Harshit

In [11]:
def recent_team_form(team_id, match_dt, match_lvl_data, last_n=5):
    recent_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                    (match_lvl_data['match_dt'] < match_dt)].sort_values('match_dt', ascending=False).head(last_n)
    wins = recent_matches['winner_id'].apply(lambda winner_id: 1 if winner_id == team_id else 0).sum()
    return wins / last_n if last_n > 0 else 0

In [12]:
def calculate_exponential_momentum(match_lvl_data, team_id, date, alpha=0.1):
    matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                             (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False)
    wins = matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).to_numpy()
    weights = np.exp(-alpha * np.arange(len(wins)))
    if np.sum(weights) > 0:
        momentum = np.dot(wins, weights) / np.sum(weights)
    else:
        momentum = 0  # Handle case with no matches
    return momentum

In [13]:
def average_winning_margin(team_id, date, match_lvl_data):
    winning_matches = match_lvl_data[
        (match_lvl_data['winner_id'] == team_id) & (match_lvl_data['match_dt'] < date)
    ]
    if len(winning_matches) > 0:
        runs_wins = winning_matches[winning_matches['by'] == 'runs']['win amount']
        wickets_wins = winning_matches[winning_matches['by'] == 'wickets']['win amount']
        average_margin = pd.concat([runs_wins, wickets_wins]).mean()
        return average_margin
    return 0

In [14]:
def lighting_performance(match_lvl_data, lighting_type, team_id, date):
    matches = match_lvl_data[(match_lvl_data['lighting'] == lighting_type) & (match_lvl_data['match_dt'] < date)]
    if len(matches) > 0:
        wins = matches[matches['winner_id'] == team_id].shape[0]
        return wins / len(matches)
    return 0  # Return 0 if no matches found under this condition

In [15]:
def overall_historical_win_rate(match_lvl_data, team_id, date):
    historical_matches = match_lvl_data[((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id)) &
                                        (match_lvl_data['match_dt'] < date)]
    wins = historical_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    total_matches = len(historical_matches)
    return wins / total_matches if total_matches > 0 else 0

In [16]:
def recent_performance(team_id, date, n, match_lvl_data):
    recent_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ].tail(n)
    wins = recent_matches['winner_id'].apply(lambda x: 1 if x == team_id else 0).sum()
    return wins / len(recent_matches) if len(recent_matches) > 0 else 0

In [17]:
def weighted_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = team_roster.split(':')
    total_weighted_awards = 0

    # Fetch recent matches up to 'n' for all players in the roster before the specified date
    recent_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'].isin(player_ids)) &
                                    (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

    # Assigning weights - newer matches get higher weight
    for i, match in enumerate(recent_matches.itertuples(), 1):
        weight = 1 + (n - i) * 0.1  # Example weight formula: 1 + (total_matches - position) * 0.1
        total_weighted_awards += weight

    return total_weighted_awards

In [18]:
def count_recent_mom_awards(team_roster, match_lvl_data, date, n):
    player_ids = str(team_roster).split(':')  # Split string of ':' separated ids into a list of ids
    total_mom_awards = 0

    for player_id in player_ids:  # Loop over each player_id in roster
        # Get relevant matches for the player up to the specified date
        player_matches = match_lvl_data[(match_lvl_data['player_of_the_match_id'] == player_id) &
                                        (match_lvl_data['match_dt'] < date)].sort_values('match_dt', ascending=False).head(n)

        # Count the number of matches where this player was MoM
        mom_count = len(player_matches)
        total_mom_awards += mom_count  # Accumulate MoM awards

    return total_mom_awards

In [19]:
def team_batsman_performance_index(bat_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_performance_index = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bat')
        if not recent_matches.empty:
            player_performance = (recent_matches['runs'] * 0.5 +
                                  recent_matches['strike_rate'] * 0.3 +
                                  (recent_matches['Fours'] + recent_matches['Sixes'] * 2) * 0.2) * \
                                  (1 + 0.1 * recent_matches['is_batsman_captain'].iloc[0] +
                                   0.05 * recent_matches['is_batsman_keeper'].iloc[0])
            total_performance_index += player_performance.sum()

    return total_performance_index

In [20]:
def team_bowler_impact_score(bowler_df, match_id, team_roster_ids, date, n=5):
    team_ids = str(team_roster_ids).split(':')
    total_impact_score = 0

    for player_id in team_ids:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        if not recent_matches.empty:
            player_impact = (recent_matches['wicket_count'] * 2 +
                             (120 / recent_matches['economy']) * 0.5 +
                             recent_matches['maiden'] * 1) * \
                            (1 + 0.1 * recent_matches['is_bowler_captain'].iloc[0] +
                             0.05 * recent_matches['is_bowler_keeper'].iloc[0])
            total_impact_score += player_impact.sum()

    return total_impact_score

In [21]:
def calculate_bowler_economy_rate(player_list, date, n):
    player_list = player_list.split(':')
    economy_rates = []
    for player_id in player_list:
        recent_matches = giveLastNgamesPlayer(player_id=player_id, date=date, n=n, bat_or_bowl='bowl')
        runs_conceded = recent_matches['runs'].sum()
        overs = (recent_matches['balls_bowled'].sum()) / 6
        economy_rate = (runs_conceded / overs) if overs > 0 else 0
        economy_rates.append(economy_rate)
    return sum(economy_rates) / len(economy_rates) if economy_rates else 0

In [22]:
def team_scoring_average(team_id, date, match_lvl_data):
    team_matches = match_lvl_data[
        (match_lvl_data['match_dt'] < date) &
        ((match_lvl_data['team1_id'] == team_id) | (match_lvl_data['team2_id'] == team_id))
    ]
    team_scores = team_matches.apply(
        lambda x: x['inning1_runs'] if x['team1_id'] == team_id else x['inning2_runs'], axis=1
    )
    return team_scores.mean() if len(team_scores) > 0 else 0

In [23]:
train_data['team1_form_factor'] = train_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_batting_strength'] = train_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_avg_wicket'] = train_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_count_50runs_last15'] = train_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team1_strike_rate'] = train_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

train_data['team2_recent_form'] = train_data.apply(
    lambda x: recent_team_form(x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)

train_data['team2_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team2_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team2_id'],
        date=x['match_dt'],
        n=15,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_average_winning_margin'] = train_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team2_day_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team2_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

train_data['team1_weighted_mom'] = train_data.apply(
    lambda x: weighted_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)

train_data['team1_recent_mom_count'] = train_data.apply(
    lambda x: count_recent_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)

train_data['team1_batsman_performance_index'] = train_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team1_bowler_impact_score'] = train_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

train_data['team1_overall_win_rate'] = train_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)

train_data['team1_bowler_eco'] = train_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)


train_data['team1_night_match_win_rate'] = train_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team1_id'], x['match_dt']),
    axis=1
)

train_data['team1_recent_win_rate'] = train_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=15,
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_scoring_average'] = train_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

train_data['team1_momentum'] = train_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)

100%|██████████| 948/948 [00:16<00:00, 57.09it/s]
100%|██████████| 948/948 [00:16<00:00, 56.47it/s]
100%|██████████| 948/948 [00:12<00:00, 76.25it/s]
100%|██████████| 948/948 [00:17<00:00, 55.52it/s]
100%|██████████| 948/948 [00:15<00:00, 61.82it/s]
100%|██████████| 948/948 [00:13<00:00, 72.62it/s]


In [24]:
test_data['team1_form_factor'] = test_data.progress_apply(lambda x: \
            player_form_factor(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_batting_strength'] = test_data.progress_apply(lambda x: \
            team_batting_strength(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_avg_wicket'] = test_data.progress_apply(lambda x: \
            average_wickets_taken(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_count_50runs_last15'] = test_data.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team1_strike_rate'] = test_data.progress_apply(lambda x: \
            calculate_batsman_strike_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=num_match), axis=1)

test_data['team2_recent_form'] = test_data.apply(
    lambda x: recent_team_form(x['team2_id'], x['match_dt'], match_lvl_data),
    axis=1)

test_data['team2_momentum'] = test_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team2_recent_win_rate'] = test_data.apply(
    lambda x: recent_performance(
        team_id=x['team2_id'],
        date=x['match_dt'],
        n=15,
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_average_winning_margin'] = test_data.apply(
    lambda x: average_winning_margin(
        team_id=x['team2_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team2_day_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'day match', x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team2_overall_win_rate'] = test_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team2_id'], x['match_dt']),
    axis=1
)

test_data['team1_weighted_mom'] = test_data.apply(
    lambda x: weighted_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 22), axis=1)

test_data['team1_recent_mom_count'] = test_data.apply(
    lambda x: count_recent_mom_awards(x['team1_roster_ids'], match_lvl_data, x['match_dt'], 15), axis=1)

test_data['team1_batsman_performance_index'] = test_data.apply(
    lambda x: team_batsman_performance_index(batsman_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team1_bowler_impact_score'] = test_data.apply(
    lambda x: team_bowler_impact_score(bowler_lvl_data, x['match id'], x['team1_roster_ids'], x['match_dt']),
    axis=1
)

test_data['team1_overall_win_rate'] = test_data.apply(
    lambda x: overall_historical_win_rate(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)

test_data['team1_bowler_eco'] = test_data.progress_apply(lambda x: \
            calculate_bowler_economy_rate(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)


test_data['team1_night_match_win_rate'] = test_data.apply(
    lambda x: lighting_performance(match_lvl_data, 'night match', x['team1_id'], x['match_dt']),
    axis=1
)

test_data['team1_recent_win_rate'] = test_data.apply(
    lambda x: recent_performance(
        team_id=x['team1_id'],
        date=x['match_dt'],
        n=15,
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_scoring_average'] = test_data.apply(
    lambda x: team_scoring_average(
        team_id=x['team1_id'],
        date=x['match_dt'],
        match_lvl_data=match_lvl_data
    ), axis=1
)

test_data['team1_momentum'] = test_data.apply(
    lambda x: calculate_exponential_momentum(match_lvl_data, x['team1_id'], x['match_dt']),
    axis=1
)

100%|██████████| 271/271 [00:05<00:00, 48.93it/s]
100%|██████████| 271/271 [00:04<00:00, 59.06it/s]
100%|██████████| 271/271 [00:03<00:00, 74.11it/s]
100%|██████████| 271/271 [00:04<00:00, 54.47it/s]
100%|██████████| 271/271 [00:04<00:00, 60.58it/s]
100%|██████████| 271/271 [00:03<00:00, 76.36it/s]


In [25]:
train_data.select_dtypes(include=['number']).corr()['winner_01'].sort_values(ascending = True)

team1_form_factor                 -0.147925
team1_batting_strength            -0.140372
team1_batsman_performance_index   -0.139209
team1_avg_wicket                  -0.133810
team1_count_50runs_last15         -0.132258
team1_strike_rate                 -0.129016
team1_bowler_impact_score         -0.121397
team_count_50runs_last15          -0.110761
team1_overall_win_rate            -0.091137
team1_bowler_eco                  -0.086270
team1_night_match_win_rate        -0.084950
team1_recent_win_rate             -0.082662
team1_scoring_average             -0.080175
team1_momentum                    -0.079657
team2_id                          -0.068129
winner_id                         -0.065381
team1_winp_team2_last15           -0.055788
team1_id                          -0.053952
team1only_avg_runs_last15         -0.047838
ground_avg_runs_last15            -0.012359
team_winp_last5                   -0.011712
ground_id                          0.015227
match id                        

## Model

In [26]:
df = train_data.select_dtypes(include=['number'])

df.drop(['match id', 'team1_id', 'team2_id', 'ground_id','winner_id','team1_weighted_mom','team1_recent_mom_count'], axis=1, inplace=True)

df.fillna(0,inplace=True)
df.replace([np.inf, -np.inf], 0, inplace=True)
# Replace infinite values with NaN to handle them similarly
# df.replace([np.inf, -np.inf], np.nan, inplace=True)
# df.fillna(df.mean(), inplace=True)

In [27]:
df.corr()['winner_01'].sort_values(ascending = True)

team1_form_factor                 -0.147925
team1_batting_strength            -0.140372
team1_batsman_performance_index   -0.139209
team1_avg_wicket                  -0.133810
team1_count_50runs_last15         -0.132258
team1_bowler_impact_score         -0.131984
team1_strike_rate                 -0.129016
team_count_50runs_last15          -0.110761
team1_overall_win_rate            -0.091137
team1_bowler_eco                  -0.086270
team1_night_match_win_rate        -0.084950
team1_recent_win_rate             -0.082662
team1_scoring_average             -0.080175
team1_momentum                    -0.079657
team1_winp_team2_last15           -0.055788
team1only_avg_runs_last15         -0.054261
ground_avg_runs_last15            -0.050018
team_winp_last5                   -0.011712
team2_recent_win_rate              0.069638
team2_overall_win_rate             0.072307
team2_day_match_win_rate           0.073309
team2_average_winning_margin       0.074013
team2_momentum                  

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

X = df.drop('winner_01', axis=1)
y = df['winner_01']

# poly = PolynomialFeatures(degree=2, include_bias=True)
# X = poly.fit_transform(X)
# print(X.shape)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Select number of components to explain desired variance (e.g., 95%)
desired_variance = 0.95
n_components = np.argmax(cumulative_explained_variance >= desired_variance) + 1
print(n_components)

pca = PCA(n_components=n_components)
X = pca.fit_transform(X_scaled)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

14
(948, 14)


In [29]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [30]:
GBM_model = GradientBoostingClassifier()
LGBM_model = LGBMClassifier()
XGB_model = XGBClassifier()
CatBoost_model = CatBoostClassifier()

# Train the models
GBM_model.fit(X_train, y_train)
LGBM_model.fit(X_train, y_train)
XGB_model.fit(X_train, y_train)
CatBoost_model.fit(X_train, y_train)

# Make predictions
y_pred_GBM = GBM_model.predict(X_test)
y_pred_LGBM = LGBM_model.predict(X_test)
y_pred_XGB = XGB_model.predict(X_test)
y_pred_CatBoost = CatBoost_model.predict(X_test)

# Evaluate models
accuracy_GBM = accuracy_score(y_test, y_pred_GBM)
accuracy_LGBM = accuracy_score(y_test, y_pred_LGBM)
accuracy_XGB = accuracy_score(y_test, y_pred_XGB)
accuracy_CatBoost = accuracy_score(y_test, y_pred_CatBoost)

print("Accuracy for GBM model:", accuracy_GBM)
print("Accuracy for LGBM model:", accuracy_LGBM)
print("Accuracy for XGB model:", accuracy_XGB)
print("Accuracy for CatBoost model:", accuracy_CatBoost)

[LightGBM] [Info] Number of positive: 386, number of negative: 372
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3542
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.509235 -> initscore=0.036944
[LightGBM] [Info] Start training from score 0.036944
Learning rate set to 0.009153
0:	learn: 0.6922000	total: 58.5ms	remaining: 58.5s
1:	learn: 0.6913035	total: 59.8ms	remaining: 29.8s
2:	learn: 0.6904658	total: 61ms	remaining: 20.3s
3:	learn: 0.6891433	total: 62.3ms	remaining: 15.5s
4:	learn: 0.6879225	total: 63.7ms	remaining: 12.7s
5:	learn: 0.6871086	total: 65ms	remaining: 10.8s
6:	learn: 0.6861269	total: 66.2ms	remaining: 9.39s
7:	learn: 0.6852442	total: 67.5ms	remaining: 8.37s
8:	learn: 0.6843936	total: 68.7ms	remaining: 7.56s
9:	learn: 0.6832240	total: 69

In [31]:
import optuna
from catboost import Pool
train_pool = Pool(data=X_train, label=y_train)
valid_pool = Pool(data=X_test, label=y_test)

# Define the objective function for hyperparameter tuning
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'verbose': 0  # Suppress output for tuning
    }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    
    preds = model.predict(valid_pool)
    accuracy = accuracy_score(y_test, preds)
    
    return -accuracy  # Minimize the negative accuracy

# Create and run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_model = CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    od_type=best_params['od_type'],
    od_wait=best_params['od_wait'],
    verbose=100  # To monitor the training process
)

best_model.fit(X_train, y_train)

# Evaluate the final model on the validation set
final_preds = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(final_accuracy)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-06-05 10:57:01,036] A new study created in memory with name: no-name-0694d9f6-9ef6-49c1-ad70-a35f4ac7ed7b
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1.0),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
[I 2024-06-05 10:57:01,282] Trial 0 finished with value: -0.5894736842105263 and parameters: {'iterations': 365, 'learning_rate': 0.00041643212741892865, 'depth': 4, 'l2_leaf_reg': 0.015566198098318055, 'border_count': 129, 'bagging_temperature': 0.013461202793418534, 'random_strength': 0.006030883750945405, 'od_type': 'IncToDec', 'od_wait': 35}. Best is trial 0 with value: -0.5894736842105263.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 

Best parameters: {'iterations': 244, 'learning_rate': 0.0002561750202669847, 'depth': 4, 'l2_leaf_reg': 0.011720141257805879, 'border_count': 33, 'bagging_temperature': 0.046892823599568256, 'random_strength': 1.1635457064527728, 'od_type': 'IncToDec', 'od_wait': 33}
0:	learn: 0.6931319	total: 460us	remaining: 112ms
100:	learn: 0.6916899	total: 50.3ms	remaining: 71.2ms
200:	learn: 0.6903030	total: 99.8ms	remaining: 21.3ms
243:	learn: 0.6897262	total: 121ms	remaining: 0us
0.6684210526315789


0.6526315789473685

## Test

In [32]:
test_df = test_data.select_dtypes(include=['number'])

test_df.drop(['match id', 'team1_id', 'team2_id', 'ground_id','team1_weighted_mom','team1_recent_mom_count'], axis=1, inplace=True)

# test_df.fillna(0,inplace=True)
# test_df.replace([np.inf, -np.inf], 0, inplace=True)
# Replace infinite values with NaN to handle them similarly
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)
test_df.fillna(test_df.mean(), inplace=True)

X_test = pca.transform(scaler.transform(test_df))

In [33]:
best_model.fit(X, y)

0:	learn: 0.6931321	total: 5.46ms	remaining: 1.33s
100:	learn: 0.6918067	total: 82ms	remaining: 116ms
200:	learn: 0.6904673	total: 181ms	remaining: 38.8ms
243:	learn: 0.6899391	total: 220ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x139b96450>

In [34]:
pred = best_model.predict(X_test)
pred_proba = best_model.predict_proba(X_test)

In [35]:
def count_zeros(y_test):
    total_elements = len(y_test)
    count_zeros = np.sum(y_test == 0)
    percentage_zeros = (count_zeros / total_elements) * 100
    return percentage_zeros

In [36]:
print("% zeros in Catboost_model ",count_zeros(pred))

% zeros in Catboost_model  54.24354243542435


In [37]:
df_test = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/6644a1e287df6_test_data_with_samplefeatures.csv')
df_test['winner'] = pred
df_test['win_pred_score'] = np.max(pred_proba, axis=1)

winner_id = []
for i in range(len(df_test)):
  if df_test['winner'][i] == 0:
    winner_id.append(df_test['team1_id'][i])
  else:
    winner_id.append(df_test['team2_id'][i])

df_test['winner_id'] = winner_id

## Train

In [38]:
df_train = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/663e2b6d54457_train_data_with_samplefeatures.csv')

In [39]:
y_pred = best_model.predict(X)
pred_proba = best_model.predict_proba(X)

df_train['winner_new'] = y_pred
df_train['win_pred_score'] = np.max(pred_proba, axis=1)

winner_id = []
for i in range(len(df_train)):
  if df_train['winner'][i] == 0:
    winner_id.append(df_train['team1_id'][i])
  else:
    winner_id.append(df_train['team2_id'][i])

df_train['winner_id_new'] = winner_id

## Making Submission file

In [40]:
params = best_model.get_all_params()
print(params)

{'nan_mode': 'Min', 'eval_metric': 'Logloss', 'iterations': 244, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'od_pval': 0, 'random_score_type': 'NormalWithModelSizeDecrease', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 'eval_fraction': 0, 'force_unit_auto_pair_weights': False, 'l2_leaf_reg': 0.011720141395926476, 'random_strength': 1.1635457277297974, 'od_type': 'IncToDec', 'rsm': 1, 'boost_from_average': False, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'subsample': 0.800000011920929, 'use_best_model': False, 'od_wait': 33, 'class_names': [0, 1], 'random_seed': 0, 'depth': 4, 'posterior_sampling': False, 'border_count': 33, 'classes_count': 0, 'auto_class_weights': 'None', 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtracking': 'AnyImprovement', 'best_model_min_trees': 1

In [41]:
# Extracting the relevant parameters
train_hps_trees = params.get('iterations')
train_hps_depth = params.get('depth')
train_hps_lr = params.get('learning_rate')

print(f"train_hps_trees: {train_hps_trees}")
print(f"train_hps_depth: {train_hps_depth}")
print(f"train_hps_lr: {train_hps_lr}")

train_hps_trees: 244
train_hps_depth: 4
train_hps_lr: 0.00025617502979002893


In [42]:
abs(df.corr()['winner_01']).sort_values(ascending=False)

winner_01                          1.000000
team1_form_factor                  0.147925
team1_batting_strength             0.140372
team1_batsman_performance_index    0.139209
team1_avg_wicket                   0.133810
team1_count_50runs_last15          0.132258
team1_bowler_impact_score          0.131984
team1_strike_rate                  0.129016
team_count_50runs_last15           0.110761
team2_recent_form                  0.105046
team1_overall_win_rate             0.091137
team1_bowler_eco                   0.086270
team1_night_match_win_rate         0.084950
team2_momentum                     0.083399
team1_recent_win_rate              0.082662
team1_scoring_average              0.080175
team1_momentum                     0.079657
team2_average_winning_margin       0.074013
team2_day_match_win_rate           0.073309
team2_overall_win_rate             0.072307
team2_recent_win_rate              0.069638
team1_winp_team2_last15            0.055788
team1only_avg_runs_last15       

In [43]:
df_sub = pd.read_csv('/Users/alokroy/Documents/Programming/Projects/Amex/data/main/submission_template_file1.csv')

df_sub['win_pred_team_id'] = pd.concat([df_test['winner_id'], df_train['winner_id_new']], ignore_index=True)
df_sub['win_pred_score'] = pd.concat([df_test['win_pred_score'], df_train['win_pred_score']], ignore_index=True)
df_sub['train_algorithm'] = 'catboost'
df_sub['is_ensemble'] = 'no'
df_sub['train_hps_trees'] = train_hps_trees
df_sub['train_hps_depth'] = train_hps_depth
df_sub['train_hps_lr'] = train_hps_lr
df_sub['indep_feat_id1'] = pd.concat([test_df['team1_form_factor'], df['team1_form_factor']], ignore_index=True)
df_sub['indep_feat_id2'] = pd.concat([test_df['team1_batting_strength'], df['team1_batting_strength']], ignore_index=True)
df_sub['indep_feat_id3'] = pd.concat([test_df['team1_batsman_performance_index'], df['team1_batsman_performance_index']], ignore_index=True)
df_sub['indep_feat_id4'] = pd.concat([test_df['team1_avg_wicket'], df['team1_avg_wicket']], ignore_index=True)
df_sub['indep_feat_id5'] = pd.concat([test_df['team1_count_50runs_last15'], df['team1_count_50runs_last15']], ignore_index=True)
df_sub['indep_feat_id6'] = pd.concat([test_df['team1_bowler_impact_score'], df['team1_bowler_impact_score']], ignore_index=True)
df_sub['indep_feat_id7'] = pd.concat([test_df['team1_strike_rate'], df['team1_strike_rate']], ignore_index=True)
df_sub['indep_feat_id8'] = pd.concat([test_df['team_count_50runs_last15'], df['team_count_50runs_last15']], ignore_index=True)
df_sub['indep_feat_id9'] = pd.concat([test_df['team2_recent_form'], df['team2_recent_form']], ignore_index=True)
df_sub['indep_feat_id10'] = pd.concat([test_df['team1_overall_win_rate'], df['team1_overall_win_rate']], ignore_index=True)

In [44]:
df_sub.isna().sum()

match id            0
dataset_type        0
win_pred_team_id    0
win_pred_score      0
train_algorithm     0
is_ensemble         0
train_hps_trees     0
train_hps_depth     0
train_hps_lr        0
indep_feat_id1      0
indep_feat_id2      0
indep_feat_id3      0
indep_feat_id4      0
indep_feat_id5      0
indep_feat_id6      0
indep_feat_id7      0
indep_feat_id8      0
indep_feat_id9      0
indep_feat_id10     0
dtype: int64

In [45]:
df_sub.to_csv('submission_file.csv', index=False)