In [83]:
import math
import numpy as np
import pandas as pd

In [84]:
df_deliveries = pd.read_csv('deliveries.csv')

In [85]:
df_deliveries.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [86]:
all_players = set()

all_players.update(df_deliveries['striker'].unique())
all_players.update(df_deliveries['non_striker'].unique())
all_players.update(df_deliveries['bowler'].unique())

team_players = {team: {'striker': set(), 'non_striker': set(), 'bowler': set()} for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team]['striker'].add(row['striker'])
    team_players[team]['non_striker'].add(row['non_striker'])
    team_players[team]['bowler'].add(row['bowler'])

for team, roles in team_players.items():
    print(f"Team: {team}")
    for role, players in roles.items():
        print(f"{role.capitalize()}s: {', '.join(players)}")


Team: England
Strikers: SM Curran, JE Root, RJW Topley, CR Woakes, HC Brook, AU Rashid, DJ Willey, MA Wood, BA Stokes, LS Livingstone, JC Buttler, MM Ali, AAP Atkinson, DJ Malan, JM Bairstow
Non_strikers: SM Curran, JE Root, RJW Topley, CR Woakes, HC Brook, AU Rashid, DJ Willey, MA Wood, BA Stokes, LS Livingstone, JC Buttler, MM Ali, AAP Atkinson, DJ Malan, JM Bairstow
Bowlers: JDS Neesham, Kuldeep Yadav, L Ngidi, KA Maharaj, Mehedi Hasan Miraz, Mohammad Nabi, GD Phillips, RA Jadeja, JJ Bumrah, G Coetzee, M Theekshana, K Rabada, Fazalhaq Farooqi, TA Boult, MJ Henry, Shoriful Islam, Mohammed Siraj, Taskin Ahmed, Shakib Al Hasan, M Jansen, CAK Rajitha, R Ravindra, D Madushanka, Mahedi Hasan, Azmatullah Omarzai, Naveen-ul-Haq, Mujeeb Ur Rahman, AD Mathews, CBRLS Kumara, Rashid Khan, Mohammed Shami, Mustafizur Rahman, MJ Santner, DM de Silva
Team: New Zealand
Strikers: MS Chapman, KS Williamson, JDS Neesham, TWM Latham, WA Young, DP Conway, LH Ferguson, TG Southee, MJ Santner, TA Boult, DJ

In [87]:
team_players = {team: set() for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team].add(row['striker'])
    team_players[team].add(row['non_striker'])
    
for _, row in df_deliveries.iterrows():
    team = row['bowling_team']
    team_players[team].add(row['bowler'])

for team, players in team_players.items():
    print(f"Team: {team}")
    print(f"Number of Players: {len(players)}")
    print(f"Players: {', '.join(players)}")


Team: England
Number of Players: 15
Players: SM Curran, JE Root, RJW Topley, CR Woakes, HC Brook, AU Rashid, DJ Willey, MA Wood, BA Stokes, LS Livingstone, JC Buttler, MM Ali, AAP Atkinson, DJ Malan, JM Bairstow
Team: New Zealand
Number of Players: 14
Players: MS Chapman, KS Williamson, JDS Neesham, TWM Latham, WA Young, DP Conway, LH Ferguson, TG Southee, MJ Santner, TA Boult, DJ Mitchell, R Ravindra, GD Phillips, MJ Henry
Team: Pakistan
Number of Players: 14
Players: Mohammad Wasim, Saud Shakeel, Babar Azam, Iftikhar Ahmed, Hasan Ali, Imam-ul-Haq, Usama Mir, Mohammad Nawaz, Shaheen Shah Afridi, Mohammad Rizwan, Fakhar Zaman, Haris Rauf, Abdullah Shafique, Shadab Khan
Team: Netherlands
Number of Players: 15
Players: AT Nidamanuru, RE van der Merwe, CN Ackermann, R Klein, A Dutt, LV van Beek, SA Engelbrecht, Saqib Zulfiqar, Shariz Ahmad, Vikramjit Singh, PA van Meekeren, W Barresi, SA Edwards, BFW de Leede, MP O'Dowd
Team: Afghanistan
Number of Players: 13
Players: Azmatullah Omarzai, 

In [88]:
df_deliveries.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [89]:
df_deliveries[['extras','wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']] = df_deliveries[['extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']].fillna(0)

In [90]:
team_players = {team: set() for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team].add(row['striker'])
    team_players[team].add(row['non_striker'])

    team = row['bowling_team']
    team_players[team].add(row['bowler'])

player_wise_data = []

for _, match in df_deliveries.iterrows():
    match_id = match['match_id']
    season = match['season']
    start_date = match['start_date']
    venue = match['venue']
    batting_team = match['batting_team']
    bowling_team = match['bowling_team']

    batting_players = team_players[batting_team]
    bowling_players = team_players[bowling_team]

    batting_runs = df_deliveries[(df_deliveries['match_id'] == match_id) &
                              (df_deliveries['striker'].isin(batting_players))].groupby('striker')['runs_off_bat'].sum().reset_index()

    batting_data = [{'match_id': match_id,
                     'season': season,
                     'start_date': start_date,
                     'venue': venue,
                     'team': batting_team,
                     'opposing_team':bowling_team,
                     'player': player,
                     'runs': batting_runs[batting_runs['striker'] == player]['runs_off_bat'].values[0] if player in batting_runs['striker'].tolist() else 0,
                     'wickets': 0} for player in batting_players]

    bowling_wickets = df_deliveries[(df_deliveries['match_id'] == match_id) &
                              (df_deliveries['bowler'].isin(bowling_players)) & df_deliveries['wicket_type'].isin(['bowled', 'caught', 'caught and bowled', 'lbw', 'stumped'])].groupby('bowler')['wicket_type'].count().reset_index()

    bowling_data = [{'match_id': match_id,
                     'season': season,
                     'start_date': start_date,
                     'venue': venue,
                     'team': bowling_team,
                     'opposing_team':batting_team,
                     'player': player,
                     'runs': 0,
                     'wickets': bowling_wickets[bowling_wickets['bowler'] == player]['wicket_type'].values[0] if player in bowling_wickets['bowler'].tolist() else 0,} for player in bowling_players]

    player_wise_data.extend(batting_data)
    player_wise_data.extend(bowling_data)

player_wise_df = pd.DataFrame(player_wise_data)


In [91]:
player_wise_df = player_wise_df.drop_duplicates(keep='first')

In [92]:
player_wise_df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'team', 'opposing_team',
       'player', 'runs', 'wickets'],
      dtype='object')

In [93]:
player_wise_df['total_runs'] = player_wise_df.groupby(['match_id', 'season', 'start_date', 'venue', 'team', 'player'])['runs'].transform('sum')
player_wise_df['total_wickets'] = player_wise_df.groupby(['match_id', 'season', 'start_date', 'venue', 'team', 'player'])['wickets'].transform('sum')

In [94]:
player_wise_df.shape

(1576, 11)

In [95]:
player_wise_df = player_wise_df.drop(['runs', 'wickets'], axis=1).reset_index(drop=True)

In [96]:
player_wise_df = player_wise_df.drop_duplicates(keep='first')

In [97]:
player_wise_df.shape

(932, 9)

In [98]:
player_wise_df.to_csv('playerwise_df.csv')

In [99]:
player_wise_df = player_wise_df.drop(['match_id', 'season', 'start_date'],axis='columns')

In [100]:
player_wise_df.head(20)

Unnamed: 0,venue,team,opposing_team,player,total_runs,total_wickets
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,SM Curran,14,1
1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,JE Root,77,0
2,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,RJW Topley,0,0
3,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,CR Woakes,11,0
4,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,HC Brook,25,0
5,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,AU Rashid,15,0
6,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,DJ Willey,0,0
7,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,MA Wood,13,0
8,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,BA Stokes,0,0
9,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,LS Livingstone,20,0


In [101]:
categorical_col = [col for col in player_wise_df if player_wise_df[col].dtype == 'object']
categorical_col

['venue', 'team', 'opposing_team', 'player']

In [102]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [103]:
mapping = {}

for column in categorical_col:
    player_wise_df[column] = le.fit_transform(player_wise_df[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [104]:
for column_name in categorical_col:
    print(f'Mapping for column "{column_name}":')
    for key, value in mapping[column_name].items():
        print(f'{key}: {value}')
    print()

Mapping for column "venue":
Arun Jaitley Stadium, Delhi: 0
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 1
Eden Gardens, Kolkata: 2
Himachal Pradesh Cricket Association Stadium, Dharamsala: 3
M Chinnaswamy Stadium, Bengaluru: 4
MA Chidambaram Stadium, Chepauk, Chennai: 5
Maharashtra Cricket Association Stadium, Pune: 6
Narendra Modi Stadium, Ahmedabad: 7
Rajiv Gandhi International Stadium, Uppal, Hyderabad: 8
Wankhede Stadium, Mumbai: 9

Mapping for column "team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9

Mapping for column "opposing_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9

Mapping for column "player":
A Dutt: 0
A Zampa: 1
AAP Atkinson: 2
AD Mathews: 3
AK Markram: 4
AT Carey: 5
AT Nidamanuru: 6
AU Rashid: 7
Abdullah Shafique: 8
Azmatullah Omarzai: 9
BA Stokes: 10
BFW de Le

In [105]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Models-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

In [106]:
X = player_wise_df[['venue', 'team','opposing_team', 'player']]
y = player_wise_df[['total_runs', 'total_wickets']]

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, shuffle=True)

In [108]:
model_dict = {
    'LinearRegression': {"model": LinearRegression(), "params": {}},
    'RandomForestRegressor': {"model": RandomForestRegressor(random_state=42),
                     "params": {'n_estimators': list(range(5, 50, 5)), 'max_depth': list(range(1, 10, 2))}},
    'XGBRegressor': {"model": XGBRegressor(), "params": {'n_estimators': list(range(10, 800, 100)), 'learning_rate': [0.001, 0.01, 0.1]}},
    'PolynomialFeatures': {"model": make_pipeline(PolynomialFeatures(), LinearRegression()),
                      "params": {'polynomialfeatures__degree': [2,3]}}
}


In [109]:
def eval_models():
    model_results = pd.DataFrame()
    model_results['Train_RMSE'] = None
    model_results['Test_RMSE'] = None
    model_results['Train_MAE'] = None
    model_results['Test_MAE'] = None
    model_results['best_params'] = None
    best_test_score = math.inf

    for model_name, reg_model in model_dict.items():
        classifier = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=20, verbose=0)
        classifier.fit(X_train, y_train)
        best_model = classifier.best_estimator_

        y_train_predicted = best_model.predict(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, classifier.best_params_)

        y_predicted = best_model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
        test_mae = mean_absolute_error(y_test, y_predicted)

        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model_ours = best_model

        model_results.loc[model_name, ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']] = [train_rmse, test_rmse, train_mae, test_mae, classifier.best_params_]

    print("Best model: ", best_reg_model_ours)
    y_predicted = best_reg_model_ours.predict(X_test)

    return model_results,best_reg_model_ours

In [110]:
model_results,best_reg_model_ours = eval_models()
model_results

LinearRegression 19.903418364389108 {}
RandomForestRegressor 16.21639119908186 {'max_depth': 5, 'n_estimators': 35}


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


XGBRegressor 12.999298739704567 {'learning_rate': 0.01, 'n_estimators': 410}
PolynomialFeatures 19.80433739592434 {'polynomialfeatures__degree': 2}
Best model:  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=410, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params
LinearRegression,19.903418,16.337503,10.275577,9.300156,{}
RandomForestRegressor,16.216391,15.765567,8.354428,8.674176,"{'max_depth': 5, 'n_estimators': 35}"
XGBRegressor,12.999299,15.729239,6.45151,8.454905,"{'learning_rate': 0.01, 'n_estimators': 410}"
PolynomialFeatures,19.804337,16.514525,10.192209,9.435947,{'polynomialfeatures__degree': 2}


In [111]:
print(best_reg_model_ours)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=410, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [112]:
y_predicted = best_reg_model_ours.predict(X_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [122]:
print(X_test[:40])
print(y_predicted[:40])

      venue  team  opposing_team  player
1345      2     5              2     144
14        7     3              6      52
67        8     5              7     124
552       0     3              0     111
73        8     5              7      11
1136      0     1              5       5
903       9     8              3      21
400       0     0              4      86
449       5     2              6      85
1192      4     3              9      64
749       6     2              4      96
446       5     2              6      84
660       3     8              5      56
856       1     9              5      69
261       8     5              6      65
252       8     6              5      24
558       0     3              0      10
1529      6     8              6     132
69        8     5              7     143
457       5     6              2     137
944       3     6              4     137
54        8     7              5     127
655       3     8              5      37
414       0     

In [115]:
import pickle
pickle_rfc = open("runs_wickets_prediction.pkl","wb")
pickle.dump(best_reg_model_ours, pickle_rfc)
pickle_rfc.close()