In [109]:
import pandas as pd
import numpy as np

In [110]:
df = pd.read_csv('all_matches.csv')

In [111]:
del df['season']
del df['start_date']
del df['byes']
del df['legbyes']
del df['penalty']
del df['other_wicket_type']
del df['other_player_dismissed']

In [112]:
df['ball'] = None

# Define a function to apply to each group
def calculate_over_ball(group):
    # Initialize current over and ball for each group
    current_over = 0
    current_ball = 1

    # Iterate over the rows in the group
    for index, row in group.iterrows():
        wides = row['wides']  # Assuming 'wides' column exists
        noballs = row['noballs']  # Assuming 'noballs' column exists

        # Set the 'ball' column with the current over and ball
        group.at[index, 'ball'] = f"{current_over}.{current_ball}"

        # If the delivery is legal, increment the balls
        if pd.isna(wides) and pd.isna(noballs):  # NaN values indicating no wides or no-balls
            if current_ball == 6:
                current_over += 1
                current_ball = 1
            else:
                current_ball += 1

    return group

# Group by 'match_id' and 'innings' and apply the calculate_over_ball function
df = df.groupby(['match_id', 'innings']).apply(calculate_over_ball).reset_index(drop=True)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(['match_id', 'innings']).apply(calculate_over_ball).reset_index(drop=True)


In [113]:
innings1_runs = df[df['innings'] == 1].groupby('match_id').apply(lambda x: x['runs_off_bat'].sum() + x['extras'].sum() + 1)

df['target'] = df.apply(lambda x: innings1_runs.get(x['match_id']) if x['innings'] == 2 else 0, axis=1)

In [114]:
df.rename(columns={'ball': 'overs'}, inplace=True)

df['over'] = df['overs'].apply(lambda x: int(x.split('.')[0]))
df['ball'] = df['overs'].apply(lambda x: int(x.split('.')[1]))

In [115]:
df['isPlayerOut'] = 0
 
df.loc[df['player_dismissed'].isna(), 'isPlayerOut'] = 0
df.loc[df['player_dismissed'].notna(), 'isPlayerOut'] = 1

In [116]:
df['striker_total_runs'] = 0
df['non_striker_total_runs'] = 0

In [117]:
player_runs = {}

df['striker_total_runs'] = 0
df['non_striker_total_runs'] = 0

for i, row in df.iterrows():
    striker = row['striker']
    non_striker = row['non_striker']
    runs_off_bat = row['runs_off_bat']
    isPlayerOut = row['isPlayerOut']

    player_runs[striker] = player_runs.get(striker, 0) + runs_off_bat
    player_runs[non_striker] = player_runs.get(non_striker, 0)

    df.at[i, 'striker_total_runs'] = player_runs[striker]
    df.at[i, 'non_striker_total_runs'] = player_runs[non_striker]

    if isPlayerOut:
        if i < len(df) - 1: 
            next_row = df.iloc[i + 1]
            if next_row['striker'] != striker:
                player_runs[striker] = 0
            if next_row['non_striker'] != non_striker:
                player_runs[non_striker] = 0

In [118]:
if 'extras' not in df.columns:
    df['extras'] = 0
else:
    df['extras'] = df['extras'].fillna(0).astype(int)

df['bowler_wickets'] = 0
df['bowler_runs'] = 0

bowler_stats = {}

for i, row in df.iterrows():
    match_id = row['match_id']
    innings = row['innings']
    bowler = row['bowler']
    runs_conceded = row['runs_off_bat'] + row['extras']
    is_wicket = row['isPlayerOut']

    if (match_id, innings, bowler) not in bowler_stats:
        bowler_stats[(match_id, innings, bowler)] = {'wickets': 0, 'runs': 0}

    bowler_stats[(match_id, innings, bowler)]['runs'] += runs_conceded
    if is_wicket:
        bowler_stats[(match_id, innings, bowler)]['wickets'] += 1

    df.at[i, 'bowler_wickets'] = bowler_stats[(match_id, innings, bowler)]['wickets']
    df.at[i, 'bowler_runs'] = bowler_stats[(match_id, innings, bowler)]['runs']

In [119]:
df['team_score'] = (df['runs_off_bat'] + df['extras']).groupby([df['match_id'], df['innings']]).cumsum()

In [120]:
df['team_wickets'] = df.groupby(['match_id', 'innings'])['isPlayerOut'].cumsum()

In [121]:
df['striker_balls'] = 0
df['non_striker_balls'] = 0

player_balls = {}

current_striker = None
current_non_striker = None

def swap_balls():
    global current_striker, current_non_striker
    current_striker, current_non_striker = current_non_striker, current_striker

for i in range(len(df)):
    row = df.iloc[i]
    
    if current_striker != row['striker'] or current_striker is None:
        if current_striker is not None:
            swap_balls()
        current_striker = row['striker']
        current_non_striker = row['non_striker']
        if current_striker not in player_balls:
            player_balls[current_striker] = 0
        if current_non_striker not in player_balls:
            player_balls[current_non_striker] = 0
    
    player_balls[current_striker] += 1
    
    df.at[i, 'striker_balls'] = player_balls[current_striker]
    df.at[i, 'non_striker_balls'] = player_balls[current_non_striker]

    if row['isPlayerOut'] == 1 and i < len(df) - 1:
        next_row = df.iloc[i + 1]
        if current_striker != next_row['striker']:
            player_balls[current_striker] = 0
        elif current_non_striker != next_row['non_striker']:
            player_balls[current_non_striker] = 0

In [122]:
df['batsman_4s'] = 0
df['batsman_6s'] = 0

player_4s = {}
player_6s = {}

for i, row in df.iterrows():
    striker = row['striker']
    runs_off_bat = row['runs_off_bat']

    if striker not in player_4s:
        player_4s[striker] = 0
    if striker not in player_6s:
        player_6s[striker] = 0

    if runs_off_bat == 4:
        player_4s[striker] += 1
    elif runs_off_bat == 6:
        player_6s[striker] += 1

    df.at[i, 'batsman_4s'] = player_4s[striker]
    df.at[i, 'batsman_6s'] = player_6s[striker]

In [123]:
df['bowler_balls_bowled'] = 0

bowler_balls = {}

for i, row in df.iterrows():
    match_id = row['match_id']
    innings = row['innings']
    bowler = row['bowler']

    if (match_id, innings, bowler) not in bowler_balls:
        bowler_balls[(match_id, innings, bowler)] = 0

    bowler_balls[(match_id, innings, bowler)] += 1

    df.at[i, 'bowler_balls_bowled'] = bowler_balls[(match_id, innings, bowler)]

In [124]:
df['remaining_wickets'] = 10

wickets_remaining = {}

for i, row in df.iterrows():
    match_id = row['match_id']
    innings = row['innings']
    is_player_out = row['isPlayerOut']

    if (match_id, innings) not in wickets_remaining:
        wickets_remaining[(match_id, innings)] = 10

    if is_player_out == 1:
        wickets_remaining[(match_id, innings)] -= 1

    df.at[i, 'remaining_wickets'] = wickets_remaining[(match_id, innings)]

In [125]:
df['remaining_balls'] = 120

balls_remaining = {}

for i, row in df.iterrows():
    match_id = row['match_id']
    innings = row['innings']
    wides = row['wides']
    noballs = row['noballs']

    if (match_id, innings) not in balls_remaining:
        balls_remaining[(match_id, innings)] = 120

    if pd.isna(wides) and pd.isna(noballs):
        balls_remaining[(match_id, innings)] -= 1

    df.at[i, 'remaining_balls'] = balls_remaining[(match_id, innings)]

In [126]:
df['runs_to_win'] = 0

def calculate_runs_to_win(row):
    if row['innings'] == 2:
        return max(row['target'] - row['team_score'], 0)
    else:
        return 0

df['runs_to_win'] = df.apply(calculate_runs_to_win, axis=1)
df['runs_to_win'] = df['runs_to_win'].astype(int)

In [132]:
df['total_runs'] = df['runs_off_bat'] + df['extras']
df['runs_last_18_balls'] = df.groupby(['match_id', 'innings'])['total_runs'].transform(
    lambda group: group.rolling(window=18, min_periods=1).sum()
)
df['runs_last_18_balls'] = df['runs_last_18_balls'].fillna(0)
df['runs_last_18_balls'] = df['runs_last_18_balls'].astype(int)

In [136]:
df['wickets_last_18_balls'] = df.groupby(['match_id', 'innings'])['isPlayerOut'].transform(
    lambda x: x.rolling(window=18, min_periods=1).sum()
)
df['wickets_last_18_balls'] = df['wickets_last_18_balls'].astype(int)

In [137]:
df.dtypes

match_id                    int64
venue                      object
innings                     int64
overs                      object
batting_team               object
bowling_team               object
striker                    object
non_striker                object
bowler                     object
runs_off_bat                int64
extras                      int32
wides                     float64
noballs                   float64
wicket_type                object
player_dismissed           object
target                      int64
over                        int64
ball                        int64
isPlayerOut                 int64
striker_total_runs          int64
non_striker_total_runs      int64
bowler_wickets              int64
bowler_runs                 int64
team_score                  int64
team_wickets                int64
striker_balls               int64
non_striker_balls           int64
batsman_4s                  int64
batsman_6s                  int64
bowler_balls_b

In [138]:
new_order = ['match_id', 'venue', 'innings', 'over', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker',
            'bowler', 'runs_off_bat', 'extras', 'wicket_type', 'target', 'isPlayerOut', 'striker_total_runs',
            'non_striker_total_runs', 'striker_balls', 'non_striker_balls', 'batsman_4s', 'batsman_6s',
             'bowler_wickets', 'bowler_runs', 'bowler_balls_bowled', 'team_score', 'team_wickets', 'remaining_wickets',
            'remaining_balls', 'runs_to_win', 'runs_last_18_balls', 'wickets_last_18_balls']

In [139]:
df = df[new_order]

In [141]:
df.to_csv('new_dataset.csv', index=False)