In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the per-match bowling records and convert "date" to datetime so the
# `.dt` accessor can be used on it later. The frame summary is shown by the
# following cell, so it is not repeated here.
df = pd.read_csv("./bowling_custom.csv")
df["date"] = pd.to_datetime(df["date"])

In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11919 entries, 0 to 11918
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   match_id       11919 non-null  int64         
 1   date           11919 non-null  datetime64[ns]
 2   bowler         11919 non-null  object        
 3   wickets        11919 non-null  int64         
 4   balls_bowled   11919 non-null  int64         
 5   runs_conceded  11919 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 558.8+ KB


In [18]:
# First season to keep, and the reference season that recency is counted from.
train_year = 2018
test_year = 2023

# Collapse the per-match rows into one row of totals per (season, bowler),
# keeping only seasons from train_year onwards.
df["year"] = df["date"].dt.year
df_bowl = df.drop(["date", "match_id"], axis=1)
df_bowl = df_bowl[(df_bowl["year"] >= train_year)].copy()

df_bowl = df_bowl.groupby(["year", "bowler"]).sum().reset_index()

# Recency of each season: 1 for test_year, increasing by one per season going
# back. Derived from test_year (previously a repeated literal) so that changing
# the constant above is enough to shift the reference season.
df_bowl["time_value"] = test_year - df_bowl["year"] + 1
df_bowl = df_bowl.drop(["year"], axis=1)

# Rescale the season totals to the range [1, 25]. With a lower bound of 1 none
# of the scaled columns can be zero when they are later used as divisors.
columns_to_scale = ["runs_conceded", "balls_bowled", "wickets"]
scaler = MinMaxScaler(feature_range=(1, 25))
df_bowl[columns_to_scale] = scaler.fit_transform(df_bowl[columns_to_scale])

df_bowl.head()

Unnamed: 0,bowler,wickets,balls_bowled,runs_conceded,time_value
0,A Dananjaya,1.0,2.356265,3.0,6
1,A Mishra,9.914286,14.031941,12.234043,6
2,AD Russell,9.914286,14.326781,16.106383,6
3,AJ Tye,20.2,20.7543,20.06383,6
4,AR Patel,3.057143,10.140049,10.276596,6


In [19]:
# Build the three rate features from the season totals and keep only those;
# the totals themselves are discarded afterwards.
# NOTE(review): the totals were min-max scaled to [1, 25] before this point, so
# these ratios are relative indicators rather than the conventional cricket
# statistics, and the factor 6 is applied to strike_rate only (not to
# economy_rate) - confirm both are intended.
df_bowl = df_bowl.assign(
    strike_rate=(df_bowl["balls_bowled"] / df_bowl["wickets"]) * 6,
    bowling_average=df_bowl["runs_conceded"] / df_bowl["wickets"],
    economy_rate=df_bowl["runs_conceded"] / df_bowl["balls_bowled"],
).drop(columns=["balls_bowled", "runs_conceded", "wickets"])

In [20]:
# Per-bowler average of every feature over all retained seasons.
df_mean = df_bowl.groupby("bowler").mean()

# Split on recency. time_value counts seasons back from 2023 (2023 -> 1), so a
# value of 6 is the 2018 season: that season becomes the held-out frame and
# every later season goes into the training frame.
season_age = df_bowl["time_value"]
df_test = df_bowl.loc[season_age == 6].copy()
df_train = df_bowl.loc[season_age < 6].copy()

In [21]:
# The groupby above left "bowler" as the index; turn it back into an ordinary
# column so it can be filtered on and displayed next to the features.
# Note: running this cell a second time would add a stray "index" column.
df_mean = df_mean.reset_index()

In [22]:
# Preview the per-bowler averages.
df_mean.head()

Unnamed: 0,bowler,time_value,strike_rate,bowling_average,economy_rate
0,A Badoni,1.5,5.08564,0.809536,0.945239
1,A Dananjaya,6.0,14.137592,3.0,1.273201
2,A Mishra,3.8,8.344797,1.239748,0.900478
3,A Nortje,2.5,8.284965,1.400709,1.00755
4,A Zampa,2.5,9.732154,1.416347,0.90486


In [23]:
# Restrict the averages to bowlers that occur in the held-out frame, then
# show how many bowlers are left.
in_test = df_mean["bowler"].isin(df_test["bowler"])
df_mean = df_mean.loc[in_test].copy()
len(df_mean)

95

In [24]:
# Preview the averages again after restricting them to held-out bowlers.
df_mean.head()

Unnamed: 0,bowler,time_value,strike_rate,bowling_average,economy_rate
1,A Dananjaya,6.0,14.137592,3.0,1.273201
2,A Mishra,3.8,8.344797,1.239748,0.900478
5,AD Russell,3.5,6.619132,1.286512,1.17094
8,AJ Tye,4.25,9.683732,1.942671,1.1916
10,AR Patel,3.5,14.316358,2.142697,0.882882


In [25]:
# Summary statistics of the per-season features (all seasons, both splits).
df_bowl.describe()

Unnamed: 0,time_value,strike_rate,bowling_average,economy_rate
count,666.0,666.0,666.0,666.0
mean,3.283784,10.229784,1.819769,1.06525
std,1.727863,3.714011,0.737516,0.160801
min,1.0,3.594499,0.549811,0.650774
25%,2.0,7.769042,1.32142,0.952694
50%,3.0,9.518215,1.648142,1.046701
75%,5.0,12.014742,2.144014,1.166534
max,6.0,31.120393,5.851064,1.639299


In [26]:
def weighted_score(frame, w_strike=1, w_average=1, w_economy=1, w_time=0.25):
    """Return the weighted sum of the rate features and recency for each row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns "strike_rate", "bowling_average",
        "economy_rate" and "time_value".
    w_strike, w_average, w_economy, w_time : float
        Weights applied to the respective column.

    Returns
    -------
    pandas.Series
        One score per row, indexed like ``frame``.
    """
    return (
        w_strike * frame["strike_rate"]
        + w_average * frame["bowling_average"]
        + w_economy * frame["economy_rate"]
        + w_time * frame["time_value"]
    )


# One set of weights shared by both splits, so the training and held-out
# targets cannot drift apart.
w1, w2, w3, w4 = 1, 1, 1, 0.25

# NOTE: the target is an exact linear combination of columns that are also
# part of X, so a linear model is able to reproduce it from the features alone.
weighted_score_train = weighted_score(df_train, w1, w2, w3, w4)
X_train = df_train
y_train = weighted_score_train

weighted_score_test = weighted_score(df_test, w1, w2, w3, w4)
X_test = df_test
y_test = weighted_score_test

In [27]:
# Check the columns and dtypes of the training frame before modelling.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 571 entries, 95 to 665
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bowler           571 non-null    object 
 1   time_value       571 non-null    int32  
 2   strike_rate      571 non-null    float64
 3   bowling_average  571 non-null    float64
 4   economy_rate     571 non-null    float64
dtypes: float64(3), int32(1), object(1)
memory usage: 24.5+ KB


In [28]:
# Remove the bowler name so that only the numeric feature columns are passed
# to the model. `columns=` already names the axis, so no `axis` argument is
# needed.
X_train_excluded = X_train.drop(columns="bowler")
X_test_excluded = X_test.drop(columns="bowler")

In [29]:
# Linear regression with scikit-learn's default settings.
model = LinearRegression()

In [30]:
# Fit on the training features (bowler name removed) against the weighted score.
model.fit(X_train_excluded, y_train)

In [31]:
# Predict from each bowler's averaged features.
# - A separate name is used so that df_test keeps holding the held-out frame;
#   earlier cells read df_test["bowler"] and would fail on a re-run otherwise.
# - The training columns are selected explicitly (instead of dropping
#   "bowler") so the feature set and its order always match what the model was
#   fitted on, even if extra columns have since been added to df_mean.
X_mean = df_mean[X_train_excluded.columns]
y_pred = model.predict(X_mean)

In [32]:
# Mean squared error between the held-out weighted scores and the predictions.
# NOTE(review): y_pred is produced from df_mean (per-bowler averages) whereas
# y_test comes from the held-out season's own rows - confirm this is the
# comparison that is wanted.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

Mean Squared Error on Test Set: 9.198328536574326


In [33]:
# Show the raw predicted scores, one per row of df_mean.
print(y_pred)

[19.91079339 11.43502231  9.95158401 13.88050287 18.21693701 17.55913931
 11.87092315  9.81987819 14.61161079 15.68891639 11.08714477 12.9089365
 14.42710668 14.83126089 16.70154862 17.1445655  19.50374732 12.22611919
  9.73085224 17.18345367 15.76616693 12.7589421  10.11784982 11.43186355
 13.8880711  16.5468782  10.60338043 13.36023733 18.48546623 13.51018646
 14.24247016 14.24364617 11.78362016 17.97460711 12.51292663 13.22658929
 15.63513343 16.10271445 11.55315004 17.31106103 13.56660364 17.50642788
 12.85958719 17.33258226 12.68963769 11.47180006 11.1421063  18.83980667
  9.29461971 21.09365092 16.26552709 14.36387894 16.20208537 13.33642291
 28.19474786 15.09681501 13.43092821 11.7955063  12.17543017 11.43876288
 13.4741678  14.79304062 11.9651899  14.54913318 17.55979199 13.59898315
 11.63354305 13.35782109 12.51925591 13.34618666 16.16055233 14.49801274
 10.53979241 15.49005753 11.6817358  15.83705412 14.70209076 11.11000866
 12.67500119 12.54625093 16.46697319 15.81247092 15.

In [34]:
# Names to report a predicted score for. They are matched against
# df_mean["bowler"] further down; names that do not occur there are dropped
# silently by that filter.
players = [
    "S Dhawan",
    "P Simran Singh",
    "PBB Rajapaksa",
    "JM Sharma",
    "M Shahrukh Khan",
    "SM Curran",
    "Sikandar Raza",
    "NT Ellis",
    "Harpreet Brar",
    "RD Chahar",
    "Arshdeep Singh",
    "Rahmanullah Gurbaz",
    "Mandeep Singh",
    "N Rana",
    "RK Singh",
    "AD Russell",
    "AS Roy",
    "SN Thakur",
    "SP Narine",
    "TG Southee",
    "CV Varun",
    "UT Yadav",
]



In [35]:
# Attach the predicted score to each bowler's row and preview the first three.
df_mean = df_mean.assign(result=y_pred)
df_mean.head(3)

Unnamed: 0,bowler,time_value,strike_rate,bowling_average,economy_rate,result
1,A Dananjaya,6.0,14.137592,3.0,1.273201,19.910793
2,A Mishra,3.8,8.344797,1.239748,0.900478,11.435022
5,AD Russell,3.5,6.619132,1.286512,1.17094,9.951584


In [36]:
# Keep only the rows whose bowler is in the players list and count them.
is_listed = df_mean["bowler"].isin(players)
df_res = df_mean.loc[is_listed].copy()
len(df_res)

6

In [37]:
# Reduce the result to the bowler name and the predicted score.
df_res = df_res.loc[:, ["bowler", "result"]]
df_res

Unnamed: 0,bowler,result
5,AD Russell,9.951584
156,N Rana,11.633543
211,SN Thakur,12.546251
212,SP Narine,16.466973
231,TG Southee,13.224024
236,UT Yadav,18.226545


In [38]:
# Order the listed bowlers from highest to lowest predicted score.
# NOTE(review): the score adds strike_rate, bowling_average and economy_rate
# with positive weights; if lower values of those are the better ones, the
# top of this table holds the weakest bowlers - confirm the intended direction.
df_res = df_res.sort_values("result", ascending=False)
df_res

Unnamed: 0,bowler,result
236,UT Yadav,18.226545
212,SP Narine,16.466973
231,TG Southee,13.224024
211,SN Thakur,12.546251
156,N Rana,11.633543
5,AD Russell,9.951584
