In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
df_deliveries = pd.read_csv('deliveries.csv')

In [3]:
df_deliveries.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [4]:
all_players = set()

all_players.update(df_deliveries['striker'].unique())
all_players.update(df_deliveries['non_striker'].unique())
all_players.update(df_deliveries['bowler'].unique())

team_players = {team: {'striker': set(), 'non_striker': set(), 'bowler': set()} for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team]['striker'].add(row['striker'])
    team_players[team]['non_striker'].add(row['non_striker'])
    team_players[team]['bowler'].add(row['bowler'])

for team, roles in team_players.items():
    print(f"Team: {team}")
    for role, players in roles.items():
        print(f"{role.capitalize()}s: {', '.join(players)}")


Team: England
Strikers: MM Ali, JE Root, SM Curran, BA Stokes, HC Brook, AU Rashid, JM Bairstow, MA Wood, RJW Topley, AAP Atkinson, LS Livingstone, DJ Willey, CR Woakes, JC Buttler, DJ Malan
Non_strikers: MM Ali, JE Root, SM Curran, BA Stokes, HC Brook, AU Rashid, MA Wood, RJW Topley, AAP Atkinson, LS Livingstone, DJ Willey, DJ Malan, JC Buttler, JM Bairstow, CR Woakes
Bowlers: Fazalhaq Farooqi, Kuldeep Yadav, CBRLS Kumara, Mujeeb Ur Rahman, MJ Santner, JJ Bumrah, M Theekshana, Taskin Ahmed, D Madushanka, M Jansen, K Rabada, JDS Neesham, Naveen-ul-Haq, DM de Silva, KA Maharaj, MJ Henry, R Ravindra, Mehedi Hasan Miraz, AD Mathews, RA Jadeja, Shoriful Islam, Mohammed Siraj, Rashid Khan, Mustafizur Rahman, CAK Rajitha, Mohammad Nabi, Mahedi Hasan, G Coetzee, Shakib Al Hasan, Azmatullah Omarzai, Mohammed Shami, L Ngidi, TA Boult, GD Phillips
Team: New Zealand
Strikers: MJ Henry, MS Chapman, R Ravindra, WA Young, KS Williamson, TG Southee, DJ Mitchell, TWM Latham, MJ Santner, JDS Neesham, D

In [5]:
team_players = {team: set() for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team].add(row['striker'])
    team_players[team].add(row['non_striker'])
    
for _, row in df_deliveries.iterrows():
    team = row['bowling_team']
    team_players[team].add(row['bowler'])

for team, players in team_players.items():
    print(f"Team: {team}")
    print(f"Number of Players: {len(players)}")
    print(f"Players: {', '.join(players)}")


Team: England
Number of Players: 15
Players: MM Ali, JE Root, SM Curran, BA Stokes, HC Brook, AU Rashid, JM Bairstow, MA Wood, RJW Topley, AAP Atkinson, LS Livingstone, DJ Willey, CR Woakes, JC Buttler, DJ Malan
Team: New Zealand
Number of Players: 14
Players: MJ Henry, MS Chapman, R Ravindra, WA Young, KS Williamson, TG Southee, DJ Mitchell, TWM Latham, MJ Santner, JDS Neesham, DP Conway, LH Ferguson, TA Boult, GD Phillips
Team: Pakistan
Number of Players: 14
Players: Shaheen Shah Afridi, Babar Azam, Hasan Ali, Mohammad Rizwan, Mohammad Wasim, Shadab Khan, Abdullah Shafique, Saud Shakeel, Fakhar Zaman, Iftikhar Ahmed, Usama Mir, Haris Rauf, Imam-ul-Haq, Mohammad Nawaz
Team: Netherlands
Number of Players: 15
Players: Vikramjit Singh, AT Nidamanuru, Saqib Zulfiqar, SA Edwards, LV van Beek, A Dutt, Shariz Ahmad, MP O'Dowd, W Barresi, RE van der Merwe, R Klein, SA Engelbrecht, BFW de Leede, CN Ackermann, PA van Meekeren
Team: Afghanistan
Number of Players: 13
Players: Ibrahim Zadran, Moha

In [6]:
df_deliveries.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [7]:
df_deliveries[['extras','wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']] = df_deliveries[['extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']].fillna(0)

In [8]:
team_players = {team: set() for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team].add(row['striker'])
    team_players[team].add(row['non_striker'])

    team = row['bowling_team']
    team_players[team].add(row['bowler'])

player_wise_data = []

for _, match in df_deliveries.iterrows():
    match_id = match['match_id']
    season = match['season']
    start_date = match['start_date']
    venue = match['venue']
    batting_team = match['batting_team']
    bowling_team = match['bowling_team']

    batting_players = team_players[batting_team]
    bowling_players = team_players[bowling_team]

    batting_runs = df_deliveries[(df_deliveries['match_id'] == match_id) &
                              (df_deliveries['striker'].isin(batting_players))].groupby('striker')['runs_off_bat'].sum().reset_index()

    batting_data = [{'match_id': match_id,
                     'season': season,
                     'start_date': start_date,
                     'venue': venue,
                     'team': batting_team,
                     'player': player,
                     'runs': batting_runs[batting_runs['striker'] == player]['runs_off_bat'].values[0] if player in batting_runs['striker'].tolist() else 0,
                     'wickets': 0} for player in batting_players]

    bowling_wickets = df_deliveries[(df_deliveries['match_id'] == match_id) &
                              (df_deliveries['bowler'].isin(bowling_players)) & df_deliveries['wicket_type'].isin(['bowled', 'caught', 'caught and bowled', 'lbw', 'stumped'])].groupby('bowler')['wicket_type'].count().reset_index()

    bowling_data = [{'match_id': match_id,
                     'season': season,
                     'start_date': start_date,
                     'venue': venue,
                     'team': bowling_team,
                     'player': player,
                     'runs': 0,
                     'wickets': bowling_wickets[bowling_wickets['bowler'] == player]['wicket_type'].values[0] if player in bowling_wickets['bowler'].tolist() else 0,} for player in bowling_players]

    player_wise_data.extend(batting_data)
    player_wise_data.extend(bowling_data)

player_wise_df = pd.DataFrame(player_wise_data)


In [9]:
player_wise_df = player_wise_df.drop_duplicates(keep='first')

In [10]:
player_wise_df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'team', 'player', 'runs',
       'wickets'],
      dtype='object')

In [11]:
player_wise_df['total_runs'] = player_wise_df.groupby(['match_id', 'season', 'start_date', 'venue', 'team', 'player'])['runs'].transform('sum')
player_wise_df['total_wickets'] = player_wise_df.groupby(['match_id', 'season', 'start_date', 'venue', 'team', 'player'])['wickets'].transform('sum')

In [12]:
player_wise_df.shape

(1576, 10)

In [13]:
player_wise_df = player_wise_df.drop(['runs', 'wickets'], axis=1).reset_index(drop=True)

In [14]:
player_wise_df = player_wise_df.drop_duplicates(keep='first')

In [15]:
player_wise_df.shape

(932, 8)

In [16]:
player_wise_df.to_csv('playerwise_df.csv')

In [17]:
player_wise_df.head(20)

Unnamed: 0,match_id,season,start_date,venue,team,player,total_runs,total_wickets
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,MM Ali,11,0
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,JE Root,77,0
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,SM Curran,14,1
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,BA Stokes,0,0
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,HC Brook,25,0
5,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,AU Rashid,15,0
6,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,JM Bairstow,33,0
7,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,MA Wood,13,0
8,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,RJW Topley,0,0
9,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",England,AAP Atkinson,0,0


In [18]:
categorical_col = [col for col in player_wise_df if player_wise_df[col].dtype == 'object']
categorical_col

['season', 'start_date', 'venue', 'team', 'player']

In [19]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [20]:
mapping = {}

for column in categorical_col:
    player_wise_df[column] = le.fit_transform(player_wise_df[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [21]:
for column_name in categorical_col:
    print(f'Mapping for column "{column_name}":')
    for key, value in mapping[column_name].items():
        print(f'{key}: {value}')
    print()

Mapping for column "season":
2023/24: 0

Mapping for column "start_date":
2023-10-05: 0
2023-10-06: 1
2023-10-07: 2
2023-10-08: 3
2023-10-09: 4
2023-10-10: 5
2023-10-11: 6
2023-10-12: 7
2023-10-13: 8
2023-10-14: 9
2023-10-15: 10
2023-10-16: 11
2023-10-17: 12
2023-10-18: 13
2023-10-19: 14
2023-10-20: 15
2023-10-21: 16
2023-10-22: 17
2023-10-23: 18
2023-10-24: 19
2023-10-25: 20
2023-10-26: 21
2023-10-27: 22
2023-10-28: 23
2023-10-29: 24
2023-10-30: 25
2023-10-31: 26
2023-11-01: 27

Mapping for column "venue":
Arun Jaitley Stadium, Delhi: 0
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 1
Eden Gardens, Kolkata: 2
Himachal Pradesh Cricket Association Stadium, Dharamsala: 3
M Chinnaswamy Stadium, Bengaluru: 4
MA Chidambaram Stadium, Chepauk, Chennai: 5
Maharashtra Cricket Association Stadium, Pune: 6
Narendra Modi Stadium, Ahmedabad: 7
Rajiv Gandhi International Stadium, Uppal, Hyderabad: 8
Wankhede Stadium, Mumbai: 9

Mapping for column "team":
Afghanistan: 0
Austra

In [22]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Models-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor


In [23]:
X = player_wise_df[['match_id', 'season', 'start_date', 'venue', 'team', 'player']]
y = player_wise_df[['total_runs', 'total_wickets']]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, shuffle=True)

In [25]:
model_dict = {
    'LinearRegression': {"model": LinearRegression(), "params": {}},
    'RandomForestRegressor': {"model": RandomForestRegressor(random_state=42),
                     "params": {'n_estimators': list(range(5, 50, 5)), 'max_depth': list(range(1, 10, 2))}},
    'XGBRegressor': {"model": XGBRegressor(), "params": {'n_estimators': list(range(10, 800, 100)), 'learning_rate': [0.001, 0.01, 0.1]}},
    'PolynomialFeatures': {"model": make_pipeline(PolynomialFeatures(), LinearRegression()),
                      "params": {'polynomialfeatures__degree': [2,3]}}
}


In [26]:
def eval_models():
    model_results = pd.DataFrame()
    model_results['Train_RMSE'] = None
    model_results['Test_RMSE'] = None
    model_results['Train_MAE'] = None
    model_results['Test_MAE'] = None
    model_results['best_params'] = None
    best_test_score = math.inf

    for model_name, reg_model in model_dict.items():
        classifier = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=20, verbose=0)
        classifier.fit(X_train, y_train)
        best_model = classifier.best_estimator_

        y_train_predicted = best_model.predict(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, classifier.best_params_)

        y_predicted = best_model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
        test_mae = mean_absolute_error(y_test, y_predicted)

        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model_ours = best_model

        model_results.loc[model_name, ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']] = [train_rmse, test_rmse, train_mae, test_mae, classifier.best_params_]

    print("Best model: ", best_reg_model_ours)
    y_predicted = best_reg_model_ours.predict(X_test)

    return model_results,best_reg_model_ours

In [27]:
model_results,best_reg_model_ours = eval_models()
model_results

LinearRegression 19.570447673481173 {}
RandomForestRegressor 11.777425769734064 {'max_depth': 9, 'n_estimators': 30}


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


XGBRegressor 12.32448268285621 {'learning_rate': 0.01, 'n_estimators': 410}
PolynomialFeatures 19.414859940573148 {'polynomialfeatures__degree': 2}
Best model:  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=410, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params
LinearRegression,19.570448,17.844332,10.178601,9.391541,{}
RandomForestRegressor,11.777426,17.399971,6.128631,8.997713,"{'max_depth': 9, 'n_estimators': 30}"
XGBRegressor,12.324483,16.950362,6.243885,8.454027,"{'learning_rate': 0.01, 'n_estimators': 410}"
PolynomialFeatures,19.41486,17.956892,10.110641,9.380244,{'polynomialfeatures__degree': 2}


In [28]:
print(best_reg_model_ours)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=410, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [39]:
y_predicted = best_reg_model_ours.predict(X_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [31]:
X_test.head(73)

Unnamed: 0,match_id,season,start_date,venue,team,player
1489,31,0,26,2,2,84
106,3,0,2,3,0,95
899,20,0,16,9,8,133
763,17,0,14,6,4,91
99,3,0,2,3,0,31
...,...,...,...,...,...,...
1492,31,0,26,2,2,94
605,14,0,11,1,1,34
357,8,0,5,8,9,73
54,2,0,1,8,7,30


In [32]:
for i, value in enumerate(y_predicted):
    print(f"Index {i}: {value}")

Index 0: [18.728643  1.207959]
Index 1: [2.6353183 0.5500782]
Index 2: [5.74489    0.50392544]
Index 3: [5.337996  1.0174215]
Index 4: [6.539975  0.4385021]
Index 5: [10.820814    0.36327428]
Index 6: [6.875982   0.61546606]
Index 7: [16.122906    0.36000106]
Index 8: [17.090769    0.80159956]
Index 9: [4.8773866 1.3638854]
Index 10: [44.961605    0.30139294]
Index 11: [12.6432295   0.50479496]
Index 12: [14.718518   1.0191642]
Index 13: [10.159376   0.5400863]
Index 14: [14.492005    0.44797096]
Index 15: [14.847461   1.1590185]
Index 16: [7.1412587 1.7704692]
Index 17: [7.0677     0.51377594]
Index 18: [50.31144    0.8243427]
Index 19: [23.765135    0.29559115]
Index 20: [11.816932    0.26929185]
Index 21: [10.829164   0.4776273]
Index 22: [11.986203    0.41861627]
Index 23: [45.694134    0.31036076]
Index 24: [25.49178    0.5052285]
Index 25: [8.781369   0.46129152]
Index 26: [27.874166   0.5994099]
Index 27: [15.239402  0.513282]
Index 28: [16.698933    0.32808945]
Index 29: [22.68

In [33]:
import pickle
pickle_rfc = open("runs_wickets_prediction.pkl","wb")
pickle.dump(best_reg_model_ours, pickle_rfc)
pickle_rfc.close()

In [34]:
india_team = player_wise_df[player_wise_df['team'] == 4]

In [35]:
india_team = india_team.drop(['total_runs', 'total_wickets'], axis='columns')

In [36]:
india_team.head(13)

Unnamed: 0,match_id,season,start_date,venue,team,player
210,5,0,3,5,4,47
211,5,0,3,5,4,119
212,5,0,3,5,4,110
213,5,0,3,5,4,142
214,5,0,3,5,4,121
215,5,0,3,5,4,123
216,5,0,3,5,4,60
217,5,0,3,5,4,131
218,5,0,3,5,4,105
219,5,0,3,5,4,108


In [37]:
india_team_predicted = best_reg_model_ours.predict(india_team)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [38]:
for i, value in enumerate(india_team_predicted):
    print(f"Index {i}: {value}")

Index 0: [16.807663    0.52852243]
Index 1: [12.320901  0.410972]
Index 2: [32.101875   0.7003249]
Index 3: [66.768265    0.31788632]
Index 4: [8.985131 0.410972]
Index 5: [16.23709    0.3950704]
Index 6: [7.7826285 1.1649579]
Index 7: [8.693161   0.74780726]
Index 8: [2.7809606  0.85174376]
Index 9: [9.756524   0.93863326]
Index 10: [17.985062  1.228363]
Index 11: [5.257191   0.79432267]
Index 12: [5.257191  0.8657488]
Index 13: [22.855995   0.6278472]
Index 14: [7.23225  0.817265]
Index 15: [16.840532   0.5138858]
Index 16: [12.03854    0.3790935]
Index 17: [57.33394    0.5627607]
Index 18: [46.528557    0.26750958]
Index 19: [8.893428  0.3790935]
Index 20: [15.87688    0.3631919]
Index 21: [7.3251963 0.9674908]
Index 22: [7.075871   0.32119352]
Index 23: [3.3899279  0.67937183]
Index 24: [11.973494   0.7196889]
Index 25: [16.46433    1.2137266]
Index 26: [5.664803  1.0348948]
Index 27: [5.664803   0.77436787]
Index 28: [21.335264    0.61321056]
Index 29: [7.8117266 0.9960995]
Index 