In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv("./batting_custom.csv")
df["strike_rate"].replace([np.inf, -np.inf, np.nan], 100, inplace=True)

df["date"] = pd.to_datetime(df["date"])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14551 entries, 0 to 14550
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   match_id     14551 non-null  int64         
 1   date         14551 non-null  datetime64[ns]
 2   batter       14551 non-null  object        
 3   batter_runs  14551 non-null  int64         
 4   balls_faced  14551 non-null  int64         
 5   out          14551 non-null  int64         
 6   strike_rate  14551 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 795.9+ KB


In [4]:
inf_rows = df[np.isinf(df['strike_rate'])]
print(inf_rows)

Empty DataFrame
Columns: [match_id, date, batter, batter_runs, balls_faced, out, strike_rate]
Index: []


In [5]:
df.head()

Unnamed: 0,match_id,date,batter,batter_runs,balls_faced,out,strike_rate
0,335982,2008-04-18,P Kumar,18,15,0,120.0
1,335982,2008-04-18,SB Joshi,3,8,1,37.5
2,335982,2008-04-18,Z Khan,3,8,1,37.5
3,335982,2008-04-18,AA Noffke,9,10,1,90.0
4,335982,2008-04-18,B Akhil,0,2,1,0.0


In [6]:
train_year = 2018
test_year = 2023

df["year"] = df["date"].dt.year

df_bat = df.drop("date", axis=1)
df_bat = df_bat[(df_bat["year"] >= train_year)].copy()
df_bat["time_value"] = df_bat["year"] - train_year + 1
df_bat = df_bat.drop(["year", "match_id"], axis=1)

df_mean = df_bat.groupby("batter").mean()

# df_bat = pd.get_dummies(df_bat, columns=['batter'], prefix='')
# df_mean = pd.get_dummies(df_mean, columns=['batter'], prefix='')

df_train = df_bat[(df_bat["time_value"] < 6)].copy()
df_test = df_bat[(df_bat["time_value"] == 6)].copy()

In [7]:
columns_to_scale = ["time_value", "batter_runs", "balls_faced", "out", "strike_rate"]

scaler = MinMaxScaler(feature_range=(0, 10))
df_train[columns_to_scale] = scaler.fit_transform(df_train[columns_to_scale])
df_test[columns_to_scale] = scaler.fit_transform(df_test[columns_to_scale])

df_mean[columns_to_scale] = scaler.fit_transform(df_mean[columns_to_scale])

In [8]:
df_mean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 321 entries, A Badoni to Yuvraj Singh
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   batter_runs  321 non-null    float64
 1   balls_faced  321 non-null    float64
 2   out          321 non-null    float64
 3   strike_rate  321 non-null    float64
 4   time_value   321 non-null    float64
dtypes: float64(5)
memory usage: 15.0+ KB


In [9]:
df_mean.describe()

Unnamed: 0,batter_runs,balls_faced,out,strike_rate,time_value
count,321.0,321.0,321.0,321.0,321.0
mean,1.929153,2.188069,6.537258,3.178765,5.685813
std,1.541022,1.573805,3.124435,1.309541,3.001953
min,0.0,0.0,0.0,0.0,0.0
25%,0.652174,0.869565,5.0,2.414773,3.555556
50%,1.594203,1.73913,7.2,3.398438,5.846154
75%,2.877847,3.229814,9.104478,3.887831,8.0
max,10.0,10.0,10.0,10.0,10.0


In [10]:
df_train.describe()

Unnamed: 0,batter_runs,balls_faced,out,strike_rate,time_value
count,4418.0,4418.0,4418.0,4418.0,4418.0
mean,1.522813,2.319149,7.512449,1.950727,5.297646
std,1.560507,1.975227,4.323405,1.104293,3.660471
min,0.0,0.0,0.0,0.0,0.0
25%,0.357143,0.857143,10.0,1.25,2.5
50%,1.0,1.714286,10.0,1.904762,5.0
75%,2.285714,3.428571,10.0,2.525253,7.5
max,10.0,10.0,10.0,10.0,10.0


In [11]:
df_test.describe()

Unnamed: 0,batter_runs,balls_faced,out,strike_rate,time_value
count,1112.0,1112.0,1112.0,1112.0,1112.0
mean,1.688835,2.279676,7.553957,2.030201,0.0
std,1.785553,2.001671,4.300458,1.150141,0.0
min,0.0,0.0,0.0,0.0,0.0
25%,0.310078,0.735294,10.0,1.25,0.0
50%,1.085271,1.617647,10.0,2.045455,0.0
75%,2.403101,3.125,10.0,2.719298,0.0
max,10.0,10.0,10.0,10.0,0.0


In [12]:
w1, w2, w3, w4, w5 = 1, 1, 0.3, 1, 0.3
weighted_score_train = (
    w1 * df_train["batter_runs"]
    + w2 * df_train["balls_faced"]
    + w3 * (10 - df_train["out"])
    + w4 * df_train["strike_rate"]
    + w5 * df_train["time_value"]
)
X_train = df_train
y_train = weighted_score_train

w1, w2, w3, w4, w5 = 1, 1, 0.3, 1, 0.3
weighted_score_test = (
    w1 * df_test["batter_runs"]
    + w2 * df_test["balls_faced"]
    + w3 * (10 - df_test["out"])
    + w4 * df_test["strike_rate"]
    + w5 * df_test["time_value"]
)

X_test = df_test
y_test = weighted_score_test

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4418 entries, 9021 to 13438
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   batter       4418 non-null   object 
 1   batter_runs  4418 non-null   float64
 2   balls_faced  4418 non-null   float64
 3   out          4418 non-null   float64
 4   strike_rate  4418 non-null   float64
 5   time_value   4418 non-null   float64
dtypes: float64(5), object(1)
memory usage: 241.6+ KB


In [14]:
X_train_excluded = X_train.drop(columns="batter", axis=1)
X_test_excluded = X_test.drop(columns="batter", axis=1)
# df_mean_excluded = df_mean.drop(columns="batter", axis=1)

In [15]:
model = LinearRegression()

In [16]:
model.fit(X_train_excluded, y_train)

In [17]:
y_pred = model.predict(X_test_excluded)

In [18]:
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

Mean Squared Error on Test Set: 3.006630477984023e-29


In [19]:
y_pred = model.predict(df_mean)

In [20]:
print(y_pred)

[11.70938373  6.94927536 11.48818223  7.81313269  9.34238268  6.06394928
  6.8013285  14.88964314 11.1585523   7.34973701 13.23588491  9.10615942
 10.79127308 15.27536232  3.74790373  4.71651803 14.99649946 12.77255194
 10.90259644 10.30907365  5.3777657   8.91004141  8.96735503 13.27902563
 10.43432971  9.26992754 11.21018412 12.13421502  8.43983483  8.12176178
 10.28201663 12.3504186  12.13370007  6.43070652 11.13707729  8.51690821
  7.86379875 15.18023542 10.40269151  7.12882699  7.73814415  4.62137681
 19.22166451 11.75452899 10.47572242 11.46468973  4.81847826  8.86992083
  4.97708333 15.36290543 10.30376354  6.58239734  9.83448572  9.71884536
 13.73868349 15.02354857  7.82035329  5.49061106  8.1414539   6.66551893
  6.58926548  7.68940289 12.99438585 14.17250164 10.83516272 12.67059179
 13.80515865 17.91444909  9.83362248 11.14631292 14.12789855 11.72631341
 13.80615474  8.92735066  7.02227355 19.84566543  8.14198585  5.25999396
  8.23317482  6.45577713  6.12481884 13.38850608 12

In [21]:
players = [
    "S Dhawan",
    "P Simran Singh",
    "PBB Rajapaksa",
    "JM Sharma",
    "M Shahrukh Khan",
    "SM Curran",
    "Sikandar Raza",
    "NT Ellis",
    "Harpreet Brar",
    "RD Chahar",
    "Arshdeep Singh",
    "Rahmanullah Gurbaz",
    "Mandeep Singh",
    "N Rana",
    "RK Singh",
    "AD Russell",
    "AS Roy",
    "SN Thakur",
    "SP Narine",
    "TG Southee",
    "CV Varun",
    "UT Yadav",
]

In [44]:
def get_batsman(match_id):
    df = pd.read_csv("./batting_custom.csv")
    df = df[(df["match_id"]) == match_id]
    players = df["batter"]
    return players.tolist()


players = get_batsman(1359507)

In [45]:
df_mean["result"] = y_pred
df_reset = df_mean.reset_index()

In [46]:
df_reset.head()

Unnamed: 0,batter,batter_runs,balls_faced,out,strike_rate,time_value,result
0,A Badoni,2.242926,2.443064,8.095238,3.737679,9.047619,11.709384
1,A Dananjaya,0.57971,0.869565,0.0,2.5,0.0,6.949275
2,A Manohar,2.474916,2.474916,9.230769,3.630657,8.923077,11.488182
3,A Mishra,0.990338,1.847826,5.0,2.574968,3.0,7.813133
4,A Nortje,0.676329,0.748792,2.222222,3.050595,8.444444,9.342383


In [47]:
result_df = pd.DataFrame()
result_dict = dict()

for player in players:
    # Filter rows based on the condition
    player_df = df_reset[df_reset["batter"] == player]

    # Concatenate the filtered DataFrame to the result DataFrame
    result_df = pd.concat([result_df, player_df])

    # Assuming 'result' is a unique column in your DataFrame
    result_dict[player] = player_df['result'].iloc[0]

# Display the result_dict and result_df
print(result_dict)


{'DP Conway': 19.845665426209862, 'AM Rahane': 12.772551944832369, 'S Dube': 13.243390281259591, 'RA Jadeja': 12.370107945026405, 'MS Dhoni': 12.631543163790308, 'N Jagadeesan': 11.57083122479377, 'N Rana': 14.041325742907123, 'VR Iyer': 14.287439740459158, 'JJ Roy': 14.870399641873892, 'RK Singh': 14.382489373118103, 'AD Russell': 13.235884906735158, 'UT Yadav': 7.598549214975847, 'CV Varun': 7.689402892799633, 'SP Narine': 9.666870196300902, 'D Wiese': 12.670591787439617, 'RD Gaikwad': 17.2925471991603}


In [48]:
sorted_dict_desc = dict(sorted(result_dict.items(), key=lambda item: item[1], reverse=True))

print(sorted_dict_desc)


{'DP Conway': 19.845665426209862, 'RD Gaikwad': 17.2925471991603, 'JJ Roy': 14.870399641873892, 'RK Singh': 14.382489373118103, 'VR Iyer': 14.287439740459158, 'N Rana': 14.041325742907123, 'S Dube': 13.243390281259591, 'AD Russell': 13.235884906735158, 'AM Rahane': 12.772551944832369, 'D Wiese': 12.670591787439617, 'MS Dhoni': 12.631543163790308, 'RA Jadeja': 12.370107945026405, 'N Jagadeesan': 11.57083122479377, 'SP Narine': 9.666870196300902, 'CV Varun': 7.689402892799633, 'UT Yadav': 7.598549214975847}


In [50]:
{
    "DP Conway": 19.845665426209862,
    "RD Gaikwad": 17.2925471991603,
    "JJ Roy": 14.870399641873892,
    "RK Singh": 14.382489373118103,
    "VR Iyer": 14.287439740459158,
    "N Rana": 14.041325742907123,
    "S Dube": 13.243390281259591,
    "AD Russell": 13.235884906735158,
    "AM Rahane": 12.772551944832369,
    "D Wiese": 12.670591787439617,
    "MS Dhoni": 12.631543163790308,
    "RA Jadeja": 12.370107945026405,
    "N Jagadeesan": 11.57083122479377,
    "SP Narine": 9.666870196300902,
    "CV Varun": 7.689402892799633,
    "UT Yadav": 7.598549214975847,
}

{'DP Conway': 19.845665426209862,
 'RD Gaikwad': 17.2925471991603,
 'JJ Roy': 14.870399641873892,
 'RK Singh': 14.382489373118103,
 'VR Iyer': 14.287439740459158,
 'N Rana': 14.041325742907123,
 'S Dube': 13.243390281259591,
 'AD Russell': 13.235884906735158,
 'AM Rahane': 12.772551944832369,
 'D Wiese': 12.670591787439617,
 'MS Dhoni': 12.631543163790308,
 'RA Jadeja': 12.370107945026405,
 'N Jagadeesan': 11.57083122479377,
 'SP Narine': 9.666870196300902,
 'CV Varun': 7.689402892799633,
 'UT Yadav': 7.598549214975847}