In [41]:
import math
import numpy as np
import pandas as pd

In [42]:
df_deliveries = pd.read_csv('../csv_files/deliveries.csv')

In [43]:
df_deliveries.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [44]:
all_players = set()

all_players.update(df_deliveries['striker'].unique())
all_players.update(df_deliveries['non_striker'].unique())
all_players.update(df_deliveries['bowler'].unique())

team_players = {team: {'striker': set(), 'non_striker': set(), 'bowler': set()} for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team]['striker'].add(row['striker'])
    team_players[team]['non_striker'].add(row['non_striker'])
    team_players[team]['bowler'].add(row['bowler'])

for team, roles in team_players.items():
    print(f"Team: {team}")
    for role, players in roles.items():
        print(f"{role.capitalize()}s: {', '.join(players)}")


Team: England
Strikers: JE Root, BA Stokes, DJ Willey, AAP Atkinson, SM Curran, AU Rashid, HC Brook, JM Bairstow, DJ Malan, LS Livingstone, CR Woakes, MA Wood, MM Ali, JC Buttler, RJW Topley
Non_strikers: JE Root, BA Stokes, DJ Willey, AAP Atkinson, SM Curran, AU Rashid, HC Brook, JM Bairstow, DJ Malan, LS Livingstone, CR Woakes, MA Wood, MM Ali, JC Buttler, RJW Topley
Bowlers: GD Phillips, Mustafizur Rahman, MJ Henry, Mohammad Nabi, G Coetzee, Mahedi Hasan, CAK Rajitha, Taskin Ahmed, Rashid Khan, JJ Bumrah, Fazalhaq Farooqi, Mohammed Siraj, Kuldeep Yadav, Naveen-ul-Haq, R Ravindra, MJ Santner, Azmatullah Omarzai, JDS Neesham, Shoriful Islam, TA Boult, Mehedi Hasan Miraz, RA Jadeja, M Jansen, D Madushanka, Mohammed Shami, Mujeeb Ur Rahman, Shakib Al Hasan, AD Mathews, M Theekshana, K Rabada, KA Maharaj, L Ngidi, DM de Silva, CBRLS Kumara
Team: New Zealand
Strikers: LH Ferguson, MS Chapman, GD Phillips, TG Southee, MJ Henry, KS Williamson, TWM Latham, R Ravindra, MJ Santner, JDS Neesham

In [45]:
team_players = {team: set() for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team].add(row['striker'])
    team_players[team].add(row['non_striker'])
    
for _, row in df_deliveries.iterrows():
    team = row['bowling_team']
    team_players[team].add(row['bowler'])

for team, players in team_players.items():
    print(f"Team: {team}")
    print(f"Number of Players: {len(players)}")
    print(f"Players: {', '.join(players)}")


Team: England
Number of Players: 15
Players: JE Root, BA Stokes, DJ Willey, AAP Atkinson, SM Curran, AU Rashid, HC Brook, JM Bairstow, DJ Malan, LS Livingstone, CR Woakes, MA Wood, MM Ali, JC Buttler, RJW Topley
Team: New Zealand
Number of Players: 14
Players: LH Ferguson, MS Chapman, GD Phillips, TG Southee, MJ Henry, KS Williamson, TWM Latham, R Ravindra, MJ Santner, JDS Neesham, WA Young, DP Conway, TA Boult, DJ Mitchell
Team: Pakistan
Number of Players: 14
Players: Imam-ul-Haq, Babar Azam, Mohammad Wasim, Shaheen Shah Afridi, Haris Rauf, Shadab Khan, Usama Mir, Fakhar Zaman, Saud Shakeel, Mohammad Rizwan, Mohammad Nawaz, Abdullah Shafique, Hasan Ali, Iftikhar Ahmed
Team: Netherlands
Number of Players: 15
Players: R Klein, CN Ackermann, SA Engelbrecht, BFW de Leede, MP O'Dowd, PA van Meekeren, RE van der Merwe, AT Nidamanuru, W Barresi, LV van Beek, Shariz Ahmad, Vikramjit Singh, Saqib Zulfiqar, SA Edwards, A Dutt
Team: Afghanistan
Number of Players: 13
Players: Rahmanullah Gurbaz, 

In [46]:
df_deliveries.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [47]:
df_deliveries[['extras','wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']] = df_deliveries[['extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']].fillna(0)

In [48]:
team_players = {team: set() for team in df_deliveries['batting_team'].unique()}

for _, row in df_deliveries.iterrows():
    team = row['batting_team']
    team_players[team].add(row['striker'])
    team_players[team].add(row['non_striker'])

    team = row['bowling_team']
    team_players[team].add(row['bowler'])

player_wise_data = []

for _, match in df_deliveries.iterrows():
    match_id = match['match_id']
    season = match['season']
    start_date = match['start_date']
    venue = match['venue']
    batting_team = match['batting_team']
    bowling_team = match['bowling_team']

    batting_players = team_players[batting_team]
    bowling_players = team_players[bowling_team]

    batting_runs = df_deliveries[(df_deliveries['match_id'] == match_id) &
                              (df_deliveries['striker'].isin(batting_players))].groupby('striker')['runs_off_bat'].sum().reset_index()

    batting_data = [{'match_id': match_id,
                     'season': season,
                     'start_date': start_date,
                     'venue': venue,
                     'team': batting_team,
                     'opposing_team':bowling_team,
                     'player': player,
                     'runs': batting_runs[batting_runs['striker'] == player]['runs_off_bat'].values[0] if player in batting_runs['striker'].tolist() else 0,
                     'wickets': 0} for player in batting_players]

    bowling_wickets = df_deliveries[(df_deliveries['match_id'] == match_id) &
                              (df_deliveries['bowler'].isin(bowling_players)) & df_deliveries['wicket_type'].isin(['bowled', 'caught', 'caught and bowled', 'lbw', 'stumped'])].groupby('bowler')['wicket_type'].count().reset_index()

    bowling_data = [{'match_id': match_id,
                     'season': season,
                     'start_date': start_date,
                     'venue': venue,
                     'team': bowling_team,
                     'opposing_team':batting_team,
                     'player': player,
                     'runs': 0,
                     'wickets': bowling_wickets[bowling_wickets['bowler'] == player]['wicket_type'].values[0] if player in bowling_wickets['bowler'].tolist() else 0,} for player in bowling_players]

    player_wise_data.extend(batting_data)
    player_wise_data.extend(bowling_data)

player_wise_df = pd.DataFrame(player_wise_data)


In [49]:
player_wise_df = player_wise_df.drop_duplicates(keep='first')

In [50]:
player_wise_df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'team', 'opposing_team',
       'player', 'runs', 'wickets'],
      dtype='object')

In [51]:
player_wise_df['total_runs'] = player_wise_df.groupby(['match_id', 'season', 'start_date', 'venue', 'team', 'player'])['runs'].transform('sum')
player_wise_df['total_wickets'] = player_wise_df.groupby(['match_id', 'season', 'start_date', 'venue', 'team', 'player'])['wickets'].transform('sum')

In [52]:
player_wise_df.shape

(1576, 11)

In [53]:
player_wise_df = player_wise_df.drop(['runs', 'wickets'], axis=1).reset_index(drop=True)

In [54]:
player_wise_df = player_wise_df.drop_duplicates(keep='first')

In [55]:
player_wise_df.shape

(932, 9)

In [56]:
file_path = '../csv_files/playerwise_df.csv'
player_wise_df.to_csv(file_path)

In [57]:
player_wise_df = player_wise_df.drop(['match_id', 'season', 'start_date'],axis='columns')

In [58]:
player_wise_df.head(20)

Unnamed: 0,venue,team,opposing_team,player,total_runs,total_wickets
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,JE Root,77,0
1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,DJ Willey,0,0
2,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,AAP Atkinson,0,0
3,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,BA Stokes,0,0
4,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,SM Curran,14,1
5,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,AU Rashid,15,0
6,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,HC Brook,25,0
7,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,JM Bairstow,33,0
8,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,DJ Malan,14,0
9,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,LS Livingstone,20,0


In [59]:
categorical_col = [col for col in player_wise_df if player_wise_df[col].dtype == 'object']
categorical_col

['venue', 'team', 'opposing_team', 'player']

In [60]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [61]:
mapping = {}

for column in categorical_col:
    player_wise_df[column] = le.fit_transform(player_wise_df[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [62]:
for column_name in categorical_col:
    print(f'Mapping for column "{column_name}":')
    for key, value in mapping[column_name].items():
        print(f'{key}: {value}')
    print()

Mapping for column "venue":
Arun Jaitley Stadium, Delhi: 0
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 1
Eden Gardens, Kolkata: 2
Himachal Pradesh Cricket Association Stadium, Dharamsala: 3
M Chinnaswamy Stadium, Bengaluru: 4
MA Chidambaram Stadium, Chepauk, Chennai: 5
Maharashtra Cricket Association Stadium, Pune: 6
Narendra Modi Stadium, Ahmedabad: 7
Rajiv Gandhi International Stadium, Uppal, Hyderabad: 8
Wankhede Stadium, Mumbai: 9

Mapping for column "team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9

Mapping for column "opposing_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9

Mapping for column "player":
A Dutt: 0
A Zampa: 1
AAP Atkinson: 2
AD Mathews: 3
AK Markram: 4
AT Carey: 5
AT Nidamanuru: 6
AU Rashid: 7
Abdullah Shafique: 8
Azmatullah Omarzai: 9
BA Stokes: 10
BFW de Le

In [63]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Models-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

In [64]:
X = player_wise_df[['venue', 'team','opposing_team', 'player']]
y = player_wise_df[['total_runs', 'total_wickets']]

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, shuffle=True)

In [66]:
model_dict = {
    'LinearRegression': {"model": LinearRegression(), "params": {}},
    'RandomForestRegressor': {"model": RandomForestRegressor(random_state=42),
                     "params": {'n_estimators': list(range(5, 50, 5)), 'max_depth': list(range(1, 10, 2))}},
    'XGBRegressor': {"model": XGBRegressor(), "params": {'n_estimators': list(range(10, 800, 100)), 'learning_rate': [0.001, 0.01, 0.1]}},
    'PolynomialFeatures': {"model": make_pipeline(PolynomialFeatures(), LinearRegression()),
                      "params": {'polynomialfeatures__degree': [2,3]}}
}


In [67]:
def eval_models():
    model_results = pd.DataFrame()
    model_results['Train_RMSE'] = None
    model_results['Test_RMSE'] = None
    model_results['Train_MAE'] = None
    model_results['Test_MAE'] = None
    model_results['best_params'] = None
    best_test_score = math.inf

    for model_name, reg_model in model_dict.items():
        classifier = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=20, verbose=0)
        classifier.fit(X_train, y_train)
        best_model = classifier.best_estimator_

        y_train_predicted = best_model.predict(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, classifier.best_params_)

        y_predicted = best_model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
        test_mae = mean_absolute_error(y_test, y_predicted)

        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model_ours = best_model

        model_results.loc[model_name, ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']] = [train_rmse, test_rmse, train_mae, test_mae, classifier.best_params_]

    print("Best model: ", best_reg_model_ours)
    y_predicted = best_reg_model_ours.predict(X_test)

    return model_results,best_reg_model_ours

In [68]:
model_results,best_reg_model_ours = eval_models()
model_results

LinearRegression 18.932702025202165 {}
RandomForestRegressor 15.943348509746187 {'max_depth': 5, 'n_estimators': 40}
XGBRegressor 12.562149178198966 {'learning_rate': 0.01, 'n_estimators': 410}
PolynomialFeatures 18.851359656570487 {'polynomialfeatures__degree': 2}
Best model:  RandomForestRegressor(max_depth=5, n_estimators=40, random_state=42)


Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params
LinearRegression,18.932702,20.365811,9.759964,10.493366,{}
RandomForestRegressor,15.943349,19.601436,8.258547,9.871909,"{'max_depth': 5, 'n_estimators': 40}"
XGBRegressor,12.562149,20.140793,6.30938,9.379651,"{'learning_rate': 0.01, 'n_estimators': 410}"
PolynomialFeatures,18.85136,20.597791,9.751348,10.540588,{'polynomialfeatures__degree': 2}


In [69]:
print(best_reg_model_ours)

RandomForestRegressor(max_depth=5, n_estimators=40, random_state=42)


In [70]:
y_predicted = best_reg_model_ours.predict(X_test)

In [71]:
print(X_test[:40])
print(y_predicted[:40])

      venue  team  opposing_team  player
617       1     1              9       1
244       8     6              5      33
555       0     3              0       7
1488      2     2              7      66
402       0     0              4       9
949       3     6              4     137
350       8     9              7      12
1090      9     8              2     132
263       8     5              6       6
1097      9     8              2      56
417       0     4              0      38
260       8     5              6     101
196       5     1              4      22
757       6     4              2     108
248       8     6              5     107
754       6     4              2      51
1004      5     7              0      44
510       7     4              7     119
813       4     7              1      40
497       7     7              4      87
96        3     0              2     113
1205      4     9              3      75
1135      0     1              5     122
306       3     

In [72]:
import pickle
pickle_rfc = open("../pickle_files/runs_wickets_prediction.pkl","wb")
pickle.dump(best_reg_model_ours, pickle_rfc)
pickle_rfc.close()