<center>
<img src="https://i0.wp.com/stockify.net.in/wp-content/uploads/2023/06/Here-Are-The-5-Richest-IPL-Teams.png?fit=768%2C512&ssl=1" width=1000>
</center>

In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [4]:
# --- Step 1: Data Preparation ---
# Both CSV files are read from the local 'IPL match' folder.
match_data = pd.read_csv("IPL match/IPL matches 1.csv")
ball_data = pd.read_csv("IPL match/Ball by Ball 1.csv")

# Attach the match-level columns to every delivery row through the shared 'ID' key
# (inner join, which is the merge default).
merged_data = ball_data.merge(match_data, on='ID')

<div style="text-align:center; border-radius:15px; padding:15px; color:white; margin:0; font-family: 'Orbitron', sans-serif; background: #11001C; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.3); overflow:hidden; margin-bottom: 1em;"><div style="font-size:250%; color:#FEE100"><b>Type 1 </b></div><div><span style="color:#6666FF">Supervised</span> Models: A hybrid approach using two separate regressors is employed for forecasting, focusing on simplicity and direct correlation. <span style="color:#6666FF">Linear Regression</span> is used to predict Total Runs for batters, assuming a direct, scalable relationship between seasons. <span style="color:#6666FF">Random Forest Regressor</span> is used for Total Wickets for bowlers, leveraging its non-linear strength to account for the variability in wicket-taking performance. Both models base their predictions solely on the player's single preceding season's performance.</div></div>

### Predictive Regression Goal
- The objective is to generate simplified, direct numerical forecasts for the next season. 
- The core features are minimal: the previous season's total runs for batting predictions, and the previous season's total wickets for bowling predictions.
- This approach prioritizes interpretability and speed over complex feature interaction, providing a baseline prediction for:

Batting: **Total Runs** to be scored.

Bowling: **Total Wickets** to be taken.

In [7]:
# --- Step 2: Feature Engineering ---
# Batting stats: one row per (batter, Season).
batsman_stats = merged_data.groupby(['batter', 'Season']).agg(
    total_runs=('batsman_run', 'sum'),
    # Counts every delivery row recorded against the batter.
    # NOTE(review): this presumably includes wides, which are not normally counted
    # as balls faced - confirm against the dataset's extras columns.
    balls_faced=('ballnumber', 'count'),
    fours=('batsman_run', lambda x: (x == 4).sum()),
    sixes=('batsman_run', lambda x: (x == 6).sum())
).reset_index()

# Strike rate = runs per 100 balls.
batsman_stats['strike_rate'] = (batsman_stats['total_runs'] / batsman_stats['balls_faced']) * 100
# Assign the cleaned column back instead of calling replace(..., inplace=True) on
# batsman_stats['strike_rate']: an in-place call on a column selection is not
# guaranteed to update the parent frame (and never does under pandas Copy-on-Write).
batsman_stats['strike_rate'] = batsman_stats['strike_rate'].replace([np.inf, -np.inf], 0)
batsman_stats = batsman_stats.fillna(0)
batsman_stats.head()

Unnamed: 0,batter,Season,total_runs,balls_faced,fours,sixes,strike_rate
0,A Ashish Reddy,2012,35,30,3,1,116.666667
1,A Ashish Reddy,2013,125,90,8,5,138.888889
2,A Ashish Reddy,2015,73,46,3,5,158.695652
3,A Ashish Reddy,2016,47,30,2,4,156.666667
4,A Badoni,2022,161,139,11,7,115.827338


In [8]:
# Bowling stats: one row per (bowler, Season).
bowler_stats = (
    merged_data
    .groupby(['bowler', 'Season'])
    .agg(
        total_wickets=('isWicketDelivery', 'sum'),
        total_runs_conceded=('total_run', 'sum'),
        total_balls_bowled=('ballnumber', 'count'),
    )
    .reset_index()
)

# Economy rate = runs conceded per six balls bowled.
runs_per_ball = bowler_stats['total_runs_conceded'] / bowler_stats['total_balls_bowled']
bowler_stats['economy_rate'] = runs_per_ball * 6
bowler_stats = bowler_stats.fillna(0)
bowler_stats.head()

Unnamed: 0,bowler,Season,total_wickets,total_runs_conceded,total_balls_bowled,economy_rate
0,A Ashish Reddy,2012,11,238,169,8.449704
1,A Ashish Reddy,2013,3,71,41,10.390244
2,A Ashish Reddy,2015,4,51,37,8.27027
3,A Ashish Reddy,2016,1,40,23,10.434783
4,A Badoni,2022,2,12,12,6.0


In [9]:
# --- Step 3: Data Preparation for ML Models ---
# Batting: pair each season's run total with the same batter's total from the preceding row.
prior_runs = batsman_stats.groupby('batter')['total_runs'].shift(1)
batsman_stats_ml = batsman_stats.assign(prev_season_runs=prior_runs)
# A batter's first row has no preceding value, so it cannot be used as a training example.
batsman_stats_ml = batsman_stats_ml.dropna(subset=['prev_season_runs'])

# Single feature (previous total) and the target (current total).
X_batting = batsman_stats_ml[['prev_season_runs']]
y_batting = batsman_stats_ml['total_runs']

In [10]:
# Bowling: pair each season's wicket total with the same bowler's total from the preceding row.
prior_wickets = bowler_stats.groupby('bowler')['total_wickets'].shift(1)
bowler_stats_ml = bowler_stats.assign(prev_season_wickets=prior_wickets)
# A bowler's first row has no preceding value, so it cannot be used as a training example.
bowler_stats_ml = bowler_stats_ml.dropna(subset=['prev_season_wickets'])

# Single feature (previous total) and the target (current total).
X_bowling = bowler_stats_ml[['prev_season_wickets']]
y_bowling = bowler_stats_ml['total_wickets']

In [11]:
# --- Step 4: Model Training and Prediction ---
# Predict Runs for Batters
# 80/20 split with a fixed seed so the same rows are held out on every run.
X_train_bat, X_test_bat, y_train_bat, y_test_bat = train_test_split(X_batting, y_batting, test_size=0.2, random_state=42)
model_runs = LinearRegression()
model_runs.fit(X_train_bat, y_train_bat)
# The held-out rows were previously created but never used; report R^2 on them
# so the quality of this baseline is visible.
print(f"Runs model R^2 on held-out data: {model_runs.score(X_test_bat, y_test_bat):.3f}")

In [12]:
# Predict Wickets for Bowlers
# 80/20 split with a fixed seed so the same rows are held out on every run.
X_train_bowl, X_test_bowl, y_train_bowl, y_test_bowl = train_test_split(X_bowling, y_bowling, test_size=0.2, random_state=42)
model_wickets = RandomForestRegressor(n_estimators=100, random_state=42)
model_wickets.fit(X_train_bowl, y_train_bowl)
# The held-out rows were previously created but never used; report R^2 on them
# so the quality of this baseline is visible.
print(f"Wickets model R^2 on held-out data: {model_wickets.score(X_test_bowl, y_test_bowl):.3f}")

In [13]:
# --- Step 5: Final Predictions for Active Players ---
# A player counts as active when they have a stats row in the most recent season.
latest_season = match_data['Season'].max()
in_latest_batting = batsman_stats['Season'] == latest_season
in_latest_bowling = bowler_stats['Season'] == latest_season
active_batters = batsman_stats.loc[in_latest_batting, 'batter'].unique()
active_bowlers = bowler_stats.loc[in_latest_bowling, 'bowler'].unique()

In [14]:
# Latest-season rows of the active batters; that season's run total becomes the
# 'previous season' input for the model.
is_active_batter = batsman_stats['batter'].isin(active_batters)
is_latest_batting_row = batsman_stats['Season'] == latest_season
prediction_data_batters = (
    batsman_stats[is_active_batter & is_latest_batting_row]
    .copy()
    .rename(columns={'total_runs': 'prev_season_runs'})
)
X_predict_bat = prediction_data_batters[['prev_season_runs']]

In [15]:
# Latest-season rows of the active bowlers; that season's wicket total becomes the
# 'previous season' input for the model.
is_active_bowler = bowler_stats['bowler'].isin(active_bowlers)
is_latest_bowling_row = bowler_stats['Season'] == latest_season
prediction_data_bowlers = (
    bowler_stats[is_active_bowler & is_latest_bowling_row]
    .copy()
    .rename(columns={'total_wickets': 'prev_season_wickets'})
)
X_predict_bowl = prediction_data_bowlers[['prev_season_wickets']]

In [16]:
# Attach predictions only when there is at least one active player to score.
# (Plain guards replace the former loops over one-element lists.)
if not prediction_data_batters.empty:
    prediction_data_batters['predicted_runs_next_season'] = (
        model_runs.predict(X_predict_bat).round().astype(int)
    )

if not prediction_data_bowlers.empty:
    prediction_data_bowlers['predicted_wickets_next_season'] = (
        model_wickets.predict(X_predict_bowl).round().astype(int)
    )

In [17]:
# --- Final Output in Proper Table Format ---
print("\nPredicted Total Runs for Active Batters in the Next Season:")
# The prediction column only exists when the frame was non-empty, so keep the same guard.
if not prediction_data_batters.empty:
    top_batters = (
        prediction_data_batters[['batter', 'predicted_runs_next_season']]
        .sort_values(by='predicted_runs_next_season', ascending=False)
        .head(10)
    )
    print(top_batters.to_string(index=False))




Predicted Total Runs for Active Batters in the Next Season:
         batter  predicted_runs_next_season
        V Kohli                         570
     RD Gaikwad                         458
        R Parag                         451
        TM Head                         447
      SV Samson                         421
B Sai Sudharsan                         418
       KL Rahul                         414
       N Pooran                         399
      SP Narine                         391
Abhishek Sharma                         388


In [18]:
print("\nPredicted Total Wickets for Active Bowlers in the Next Season:")
# The prediction column only exists when the frame was non-empty, so keep the same guard.
if not prediction_data_bowlers.empty:
    top_bowlers = (
        prediction_data_bowlers[['bowler', 'predicted_wickets_next_season']]
        .sort_values(by='predicted_wickets_next_season', ascending=False)
        .head(10)
    )
    print(top_bowlers.to_string(index=False))


Predicted Total Wickets for Active Bowlers in the Next Season:
        bowler  predicted_wickets_next_season
      HV Patel                             16
    Avesh Khan                             15
  Mukesh Kumar                             15
     JJ Bumrah                             15
      CV Varun                             15
  Harshit Rana                             14
   T Natarajan                             14
      MA Starc                             14
Arshdeep Singh                             14
    PJ Cummins                             13


<div style="text-align:center; border-radius:15px; padding:15px; color:white; margin:0; font-family: 'Orbitron', sans-serif; background: #11001C; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.3); overflow:hidden; margin-bottom: 1em;"><div style="font-size:250%; color:#FEE100"><b>Type 2</b></div><div><span style="color:#6666FF">Supervised</span> Model: <span style="color:#6666FF">Random Forest Regressor</span> is used for <span style="color:#6666FF">precision forecasting</span> of individual player statistics. This method is fundamentally different from clustering as it aims to predict the <span style="color:#6666FF">exact numerical outcome</span> for the next season, utilizing historical data as training examples. The model is independently trained for each metric, enabling it to capture the distinct factors influencing: <span style="color:#6666FF">Runs</span> to be <span style="color:#6666FF">scored</span>, <span style="color:#6666FF">Balls</span> to be <span style="color:#6666FF">faced</span>, <span style="color:#6666FF">Fours</span>, and <span style="color:#6666FF">Sixes</span> to be <span style="color:#6666FF">hit</span>, along with Wickets taken, resulting Economy Rate, and Overs Bowled (represented in the Overs.Balls cricket format) for the <span style="color:#6666FF">next</span> season.</div></div>

### Predictive Regression Goal
- The core objective is high-fidelity regression: generating a specific numerical output for each player based on their most recent season's features (runs, balls faced, fours and sixes for batters; wickets, runs conceded and overs bowled for bowlers).
- The **Random Forest Regressor** is chosen for its ability to handle complex, non-linear feature relationships common in sports performance data, providing a robust and generalized predictive capability. 
- This supervised approach gives users actionable, projected statistics for team drafting and strategic planning.

In [21]:
# --- Step 2: Advanced Feature Engineering ---
# Per-(batter, Season) batting totals.
batting_groups = merged_data.groupby(['batter', 'Season'])
batsman_stats = batting_groups.agg(
    total_runs=('batsman_run', 'sum'),
    balls_faced=('ballnumber', 'count'),
    fours=('batsman_run', lambda runs: runs.eq(4).sum()),
    sixes=('batsman_run', lambda runs: runs.eq(6).sum()),
).reset_index()

# Strike rate = runs per 100 balls; non-finite and missing values become 0.
batsman_stats['strike_rate'] = batsman_stats['total_runs'] / batsman_stats['balls_faced'] * 100
batsman_stats = batsman_stats.replace([np.inf, -np.inf], 0).fillna(0)

# Season labels containing a '/' are cut at the slash, then everything is parsed as a number
# so seasons can be compared numerically.
season_labels = batsman_stats['Season'].astype(str).str.split('/').str[0]
batsman_stats['Season'] = pd.to_numeric(season_labels)

In [22]:
# Per-(bowler, Season) bowling totals, including overs and leftover balls.
bowling_groups = merged_data.groupby(['bowler', 'Season'])
bowler_stats = bowling_groups.agg(
    total_wickets=('isWicketDelivery', 'sum'),
    total_runs_conceded=('total_run', 'sum'),
    total_balls_bowled=('ballnumber', 'count'),
).reset_index()

# Split the ball count into completed six-ball overs and the remainder.
balls_bowled = bowler_stats['total_balls_bowled']
bowler_stats['total_overs_bowled'] = balls_bowled.floordiv(6)
bowler_stats['balls_in_partial_over'] = balls_bowled.mod(6)

# Economy rate = runs conceded per six balls, computed from the raw ball count.
bowler_stats['economy_rate'] = bowler_stats['total_runs_conceded'] / balls_bowled * 6
bowler_stats = bowler_stats.replace([np.inf, -np.inf], 0).fillna(0)

# Season labels containing a '/' are cut at the slash, then everything is parsed as a number
# so seasons can be compared numerically.
season_labels = bowler_stats['Season'].astype(str).str.split('/').str[0]
bowler_stats['Season'] = pd.to_numeric(season_labels)

In [23]:
# Batting training table: each row carries the same batter's values from the preceding row.
batsman_ml_data = batsman_stats.copy()
batting_lags = [
    ('prev_runs', 'total_runs'),
    ('prev_balls', 'balls_faced'),
    ('prev_fours', 'fours'),
    ('prev_sixes', 'sixes'),
]
for lag_column, source_column in batting_lags:
    batsman_ml_data[lag_column] = batsman_ml_data.groupby('batter')[source_column].shift(1)
# Rows without a preceding season (each batter's first row) have NaN lags and are removed.
batsman_ml_data = batsman_ml_data.dropna()

# Bowling training table, built the same way (overs and leftover balls are lagged separately).
bowler_ml_data = bowler_stats.copy()
bowling_lags = [
    ('prev_wickets', 'total_wickets'),
    ('prev_runs_conceded', 'total_runs_conceded'),
    ('prev_overs_bowled', 'total_overs_bowled'),
    ('prev_partial_balls', 'balls_in_partial_over'),
]
for lag_column, source_column in bowling_lags:
    bowler_ml_data[lag_column] = bowler_ml_data.groupby('bowler')[source_column].shift(1)
bowler_ml_data = bowler_ml_data.dropna()

# Batting: shared feature matrix and one target per predicted statistic.
batting_feature_columns = [lag_column for lag_column, _ in batting_lags]
X_batting = batsman_ml_data[batting_feature_columns]
y_batting_runs = batsman_ml_data['total_runs']
y_batting_balls = batsman_ml_data['balls_faced']
y_batting_fours = batsman_ml_data['fours']
y_batting_sixes = batsman_ml_data['sixes']

# Bowling: shared feature matrix and one target per predicted statistic.
bowling_feature_columns = [lag_column for lag_column, _ in bowling_lags]
X_bowling = bowler_ml_data[bowling_feature_columns]
y_bowling_wickets = bowler_ml_data['total_wickets']
y_bowling_runs_conceded = bowler_ml_data['total_runs_conceded']
y_bowling_overs = bowler_ml_data['total_overs_bowled']
y_bowling_partial_balls = bowler_ml_data['balls_in_partial_over']

In [24]:
# --- Step 3: Model Training (UPDATED) ---
# One forest per batting target, all fitted on the same previous-season features.
# random_state is fixed so that re-running the notebook reproduces the same predictions,
# matching the seeded forest used in the Type 1 section.
model_runs = RandomForestRegressor(n_estimators=100, random_state=42)
model_runs.fit(X_batting, y_batting_runs)
model_balls = RandomForestRegressor(n_estimators=100, random_state=42)
model_balls.fit(X_batting, y_batting_balls)
model_fours = RandomForestRegressor(n_estimators=100, random_state=42)
model_fours.fit(X_batting, y_batting_fours)
model_sixes = RandomForestRegressor(n_estimators=100, random_state=42)
model_sixes.fit(X_batting, y_batting_sixes)

In [25]:
# One forest per bowling target, all fitted on the same previous-season features.
# random_state is fixed so that re-running the notebook reproduces the same predictions,
# matching the seeded forest used in the Type 1 section.
model_wickets = RandomForestRegressor(n_estimators=100, random_state=42)
model_wickets.fit(X_bowling, y_bowling_wickets)
model_runs_conceded = RandomForestRegressor(n_estimators=100, random_state=42)
model_runs_conceded.fit(X_bowling, y_bowling_runs_conceded)

# Completed overs and leftover balls are modelled separately
# (these two models replace a single balls-bowled model).
model_overs_bowled = RandomForestRegressor(n_estimators=100, random_state=42)
model_overs_bowled.fit(X_bowling, y_bowling_overs)
model_partial_balls_bowled = RandomForestRegressor(n_estimators=100, random_state=42)
model_partial_balls_bowled.fit(X_bowling, y_bowling_partial_balls)

In [26]:
# --- Step 4: Making Predictions ---
# Normalise the raw match seasons exactly like the stats tables (cut at '/', parse as number)
# before taking the maximum; calling int() on a raw label containing a '/' would fail.
match_seasons = pd.to_numeric(match_data['Season'].astype(str).str.split('/').str[0])
latest_season = int(match_seasons.max())
lookback_seasons = 4
start_season = latest_season - lookback_seasons

# Get all stats within the lookback window (4 seasons)
active_batters_data = batsman_stats[batsman_stats['Season'] >= start_season].copy()
active_bowlers_data = bowler_stats[bowler_stats['Season'] >= start_season].copy()

# 1. IDENTIFY PLAYERS ACTIVE IN THE LATEST SEASON for filtering inactive players
# This step dynamically removes any player who did not participate in the most recent season.
latest_season_batters = active_batters_data[active_batters_data['Season'] == latest_season]['batter'].unique()
latest_season_bowlers = active_bowlers_data[active_bowlers_data['Season'] == latest_season]['bowler'].unique()

# 2. FILTER DATA: Keep only the stats for the players found in the latest season
active_batters_data_filtered_latest = active_batters_data[active_batters_data['batter'].isin(latest_season_batters)]
active_bowlers_data_filtered_latest = active_bowlers_data[active_bowlers_data['bowler'].isin(latest_season_bowlers)]

# 3. SELECT MOST RECENT STATS: one row per player, taken from their highest Season value.
# Because every remaining player has a latest-season row, this is always that row;
# the lookback window therefore does not change which rows are selected.
active_batters_data_filtered = active_batters_data_filtered_latest.loc[active_batters_data_filtered_latest.groupby('batter')['Season'].idxmax()]
active_bowlers_data_filtered = active_bowlers_data_filtered_latest.loc[active_bowlers_data_filtered_latest.groupby('bowler')['Season'].idxmax()]

print(f"Total batters from the last {lookback_seasons} seasons, active in Season {latest_season}, after filtering: {len(active_batters_data_filtered)}")
print(f"Total bowlers from the last {lookback_seasons} seasons, active in Season {latest_season}, after filtering: {len(active_bowlers_data_filtered)}")

# Prepare Batting Features for Prediction:
# the selected season's totals take the role of the 'prev_*' training features.
prediction_features_batters = active_batters_data_filtered.rename(columns={
    'total_runs': 'prev_runs',
    'balls_faced': 'prev_balls',
    'fours': 'prev_fours',
    'sixes': 'prev_sixes'
})[['prev_runs', 'prev_balls', 'prev_fours', 'prev_sixes']]

# Prepare Bowling Features for Prediction (uses the overs / partial-balls columns)
prediction_features_bowlers = active_bowlers_data_filtered.rename(columns={
    'total_wickets': 'prev_wickets',
    'total_runs_conceded': 'prev_runs_conceded',
    'total_overs_bowled': 'prev_overs_bowled',
    'balls_in_partial_over': 'prev_partial_balls'
})[['prev_wickets', 'prev_runs_conceded', 'prev_overs_bowled', 'prev_partial_balls']]

Total batters from the last 4 seasons, active in Season 2024, after filtering: 171
Total bowlers from the last 4 seasons, active in Season 2024, after filtering: 138


In [27]:
# Select the prediction inputs by the training frames' own column lists so they have exactly
# the columns, in the same order, that the models were fitted on. A missing column raises
# KeyError here instead of failing later inside predict().
# (The previous drop of 'Season' was a no-op: that column is never part of these frames.)
X_pred_batting = prediction_features_batters[list(X_batting.columns)]
X_pred_bowling = prediction_features_bowlers[list(X_bowling.columns)]

In [28]:
# Run every batting model on the same prediction inputs (one array per statistic).
predicted_runs, predicted_balls, predicted_fours, predicted_sixes = [
    fitted_model.predict(X_pred_batting)
    for fitted_model in (model_runs, model_balls, model_fours, model_sixes)
]

In [29]:
# Run every bowling model on the same prediction inputs (one array per statistic).
predicted_wickets, predicted_runs_conceded, predicted_overs, predicted_partial_balls = [
    fitted_model.predict(X_pred_bowling)
    for fitted_model in (model_wickets, model_runs_conceded, model_overs_bowled, model_partial_balls_bowled)
]

In [30]:
# --- Step 5: Consolidating Results and Calculating Final Metrics ---

# 5.1 Consolidate Batting Predictions
# Start from the batter names (keeping the filtered frame's index) and attach each
# prediction rounded to a whole number.
raw_batting_predictions = {
    'predicted_runs': predicted_runs,
    'predicted_balls': predicted_balls,
    'predicted_fours': predicted_fours,
    'predicted_sixes': predicted_sixes,
}
batting_predictions = active_batters_data_filtered[['batter']].copy()
for column_name, raw_values in raw_batting_predictions.items():
    batting_predictions[column_name] = raw_values.round(0).astype(int)

# Predicted strike rate = runs per 100 balls, computed from the rounded values.
predicted_runs_per_ball = batting_predictions['predicted_runs'] / batting_predictions['predicted_balls']
batting_predictions['predicted_strike_rate'] = predicted_runs_per_ball * 100

# A zero predicted ball count yields inf or NaN; report those as 0.
batting_predictions = batting_predictions.replace([np.inf, -np.inf, np.nan], 0)

In [31]:
# 5.2 Consolidate Bowling Predictions
whole_overs = predicted_overs.round(0).astype(int)
# Leftover balls are clamped to the 0-5 range of an incomplete over.
leftover_balls = np.clip(predicted_partial_balls.round(0).astype(int), 0, 5)

bowling_predictions = active_bowlers_data_filtered[['bowler']].assign(
    predicted_wickets=predicted_wickets.round(0).astype(int),
    predicted_runs_conceded=predicted_runs_conceded.round(0).astype(int),
    predicted_overs=whole_overs,
    predicted_partial_balls=leftover_balls,
)

# Display text in 'Overs.Balls' form, e.g. 12 overs and 3 balls -> '12.3'.
bowling_predictions['predicted_overs_balls_format'] = (
    bowling_predictions['predicted_overs'].astype(str)
    .str.cat(bowling_predictions['predicted_partial_balls'].astype(str), sep='.')
)

# Total predicted deliveries, needed as the economy-rate denominator.
predicted_total_balls = bowling_predictions['predicted_overs'] * 6 + bowling_predictions['predicted_partial_balls']

# Predicted economy rate = runs conceded per six balls.
bowling_predictions['predicted_economy_rate'] = (
    bowling_predictions['predicted_runs_conceded'] / predicted_total_balls * 6
)

# A zero ball count yields inf or NaN; report those as 0.
bowling_predictions = bowling_predictions.replace([np.inf, -np.inf, np.nan], 0)

In [32]:
# --- Step 5: Final Output Table ---
print("\nComplete Predicted IPL Dataset for the Next Season")

# Header typo fixed: the table lists the top run-scorers (players), not "run-scores".
print("\n--- Final Batting Predictions for Next Season (Top 10 Run-Scorers) ---")
# The last expression is rendered as the cell's table output.
batting_predictions.sort_values(by='predicted_runs', ascending=False).head(10)


Complete Predicted IPL Dataset for the Next Season

--- Final Batting Predictions for Next Season (Top 10 Run-Scores) ---


Unnamed: 0,batter,predicted_runs,predicted_balls,predicted_fours,predicted_sixes,predicted_strike_rate
2408,TM Head,501,309,43,17,162.135922
1779,R Parag,467,340,37,17,137.352941
901,JC Buttler,416,288,37,14,144.444444
349,B Sai Sudharsan,416,323,39,15,128.79257
2568,YBK Jaiswal,407,298,48,16,136.577181
2344,Shubman Gill,405,277,31,16,146.209386
1092,KL Rahul,395,318,36,18,124.213836
2020,S Dube,389,263,30,20,147.908745
1921,RR Pant,376,263,25,22,142.965779
1702,PP Shaw,370,230,34,15,160.869565


In [33]:
print("\n--- Final Bowling Predictions for Next Season (Top 10 Wicket-takers) ---")
# Overs are shown through the combined 'Overs.Balls' text column rather than the two raw columns.
bowling_display_columns = [
    'bowler',
    'predicted_wickets',
    'predicted_runs_conceded',
    'predicted_overs_balls_format',
    'predicted_economy_rate',
]
bowling_predictions[bowling_display_columns].sort_values(by='predicted_wickets', ascending=False).head(10)


--- Final Bowling Predictions for Next Season (Top 10 Wicket-takers) ---


Unnamed: 0,bowler,predicted_wickets,predicted_runs_conceded,predicted_overs_balls_format,predicted_economy_rate
1644,SP Narine,19,406,56.3,7.185841
736,JJ Bumrah,19,394,51.3,7.650485
840,KK Ahmed,19,425,52.2,8.121019
1287,PVD Chameera,19,267,35.3,7.521127
242,Avesh Khan,17,362,43.3,8.321839
573,HV Patel,15,364,34.2,10.601942
1314,R Ashwin,14,309,40.2,7.661157
233,Arshdeep Singh,14,380,43.2,8.769231
1739,T Natarajan,14,342,45.2,7.544118
386,CV Varun,14,365,44.2,8.233083


<div style="text-align:center; border-radius:15px; padding:15px; color:white; margin:0; font-family: 'Orbitron', sans-serif; background: #11001C; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.3); overflow:hidden; margin-bottom: 1em;"><div style="font-size:250%; color:#FEE100"><b>Type 3</b></div><div><span style="color:#6666FF">Supervised Gradient Boosting</span> Model: <span style="color:#6666FF">XGBoost Regressor</span> is used for <span style="color:#6666FF">precision forecasting</span> of individual player statistics. This method is fundamentally different from clustering as it aims to predict the <span style="color:#6666FF">exact numerical outcome</span> for the next season, utilizing historical data as training examples. The model is independently trained for each metric, enabling it to capture the distinct factors influencing: <span style="color:#6666FF">Runs</span> to be <span style="color:#6666FF">scored</span>, <span style="color:#6666FF">Balls</span> to be <span style="color:#6666FF">faced</span>, <span style="color:#6666FF">Fours</span>, and <span style="color:#6666FF">Sixes</span> to be <span style="color:#6666FF">hit</span>, along with <b>Wickets</b> taken, <b>Runs Conceded</b>, and <b>Balls Bowled</b> for the <span style="color:#6666FF">next</span> season.</div></div>

### Predictive Gradient Boosting Goal

- The core objective is high-fidelity regression: generating a specific numerical output for each player based on their most recent season's features (runs, balls faced, fours and sixes for batters; wickets, runs conceded and balls bowled for bowlers).
- The **XGBoost Regressor** is chosen for its ability to handle complex, non-linear feature relationships common in sports performance data, providing a robust and generalized predictive capability.
- This supervised approach gives users actionable, projected statistics for team drafting and strategic planning.

In [36]:
# --- Step 2: Advanced Feature Engineering ---
# Batting stats: named aggregations, one output column per entry.
batting_aggregations = {
    'total_runs': ('batsman_run', 'sum'),
    'balls_faced': ('ballnumber', 'count'),
    'fours': ('batsman_run', lambda x: (x == 4).sum()),
    'sixes': ('batsman_run', lambda x: (x == 6).sum()),
}
batsman_stats = (
    merged_data
    .groupby(['batter', 'Season'])
    .agg(**batting_aggregations)
    .reset_index()
)

# Strike rate = runs per 100 balls; infinities and gaps are zeroed afterwards.
batsman_stats['strike_rate'] = (batsman_stats['total_runs'] / batsman_stats['balls_faced']) * 100
batsman_stats = batsman_stats.replace([np.inf, -np.inf], 0)
batsman_stats = batsman_stats.fillna(0)

# Keep only the part of each Season label before any '/', then parse it as a number.
batsman_stats['Season'] = pd.to_numeric(
    batsman_stats['Season'].astype(str).str.split('/').str[0]
)

In [37]:
# Bowling stats: named aggregations, one output column per entry.
bowling_aggregations = {
    'total_wickets': ('isWicketDelivery', 'sum'),
    'total_runs_conceded': ('total_run', 'sum'),
    'total_balls_bowled': ('ballnumber', 'count'),
}
bowler_stats = (
    merged_data
    .groupby(['bowler', 'Season'])
    .agg(**bowling_aggregations)
    .reset_index()
)

# Economy rate = runs conceded per six balls; infinities and gaps are zeroed afterwards.
bowler_stats['economy_rate'] = (bowler_stats['total_runs_conceded'] / bowler_stats['total_balls_bowled']) * 6
bowler_stats = bowler_stats.replace([np.inf, -np.inf], 0)
bowler_stats = bowler_stats.fillna(0)

# Keep only the part of each Season label before any '/', then parse it as a number.
bowler_stats['Season'] = pd.to_numeric(
    bowler_stats['Season'].astype(str).str.split('/').str[0]
)

In [38]:
# Batting training table: lag every statistic by one row within each batter.
batting_lag_sources = {
    'prev_runs': 'total_runs',
    'prev_balls': 'balls_faced',
    'prev_fours': 'fours',
    'prev_sixes': 'sixes',
}
batsman_ml_data = batsman_stats.copy()
for lagged_name, source_name in batting_lag_sources.items():
    batsman_ml_data[lagged_name] = batsman_ml_data.groupby('batter')[source_name].shift(1)
# The first row of every batter has no predecessor (NaN lags) and is discarded.
batsman_ml_data = batsman_ml_data.dropna()

# Bowling training table: same construction, keyed by bowler.
bowling_lag_sources = {
    'prev_wickets': 'total_wickets',
    'prev_runs_conceded': 'total_runs_conceded',
    'prev_balls_bowled': 'total_balls_bowled',
}
bowler_ml_data = bowler_stats.copy()
for lagged_name, source_name in bowling_lag_sources.items():
    bowler_ml_data[lagged_name] = bowler_ml_data.groupby('bowler')[source_name].shift(1)
bowler_ml_data = bowler_ml_data.dropna()

# Batting features (lagged columns) and one target per statistic.
X_batting = batsman_ml_data[list(batting_lag_sources)]
y_batting_runs = batsman_ml_data['total_runs']
y_batting_balls = batsman_ml_data['balls_faced']
y_batting_fours = batsman_ml_data['fours']
y_batting_sixes = batsman_ml_data['sixes']

# Bowling features (lagged columns) and one target per statistic.
X_bowling = bowler_ml_data[list(bowling_lag_sources)]
y_bowling_wickets = bowler_ml_data['total_wickets']
y_bowling_runs_conceded = bowler_ml_data['total_runs_conceded']
y_bowling_balls_bowled = bowler_ml_data['total_balls_bowled']

In [39]:
# --- Step 3: Model Training ---
# Every batting regressor shares the same settings; only the target differs.
batting_xgb_settings = dict(objective='reg:squarederror', n_estimators=100)
model_runs = xgb.XGBRegressor(**batting_xgb_settings)
model_balls = xgb.XGBRegressor(**batting_xgb_settings)
model_fours = xgb.XGBRegressor(**batting_xgb_settings)
model_sixes = xgb.XGBRegressor(**batting_xgb_settings)

# Fit each model on the shared previous-season feature matrix.
model_runs.fit(X_batting, y_batting_runs)
model_balls.fit(X_batting, y_batting_balls)
model_fours.fit(X_batting, y_batting_fours)
model_sixes.fit(X_batting, y_batting_sixes)

In [40]:
# Every bowling regressor shares the same settings; only the target differs.
bowling_xgb_settings = dict(objective='reg:squarederror', n_estimators=100)
model_wickets = xgb.XGBRegressor(**bowling_xgb_settings)
model_runs_conceded = xgb.XGBRegressor(**bowling_xgb_settings)
model_balls_bowled = xgb.XGBRegressor(**bowling_xgb_settings)

# Fit each model on the shared previous-season feature matrix.
model_wickets.fit(X_bowling, y_bowling_wickets)
model_runs_conceded.fit(X_bowling, y_bowling_runs_conceded)
model_balls_bowled.fit(X_bowling, y_bowling_balls_bowled)

In [41]:
# --- Step 4: Making Predictions ---
# Normalise the raw match seasons exactly like the stats tables (cut at '/', parse as number)
# before taking the maximum; calling int() on a raw label containing a '/' would fail.
match_seasons = pd.to_numeric(match_data['Season'].astype(str).str.split('/').str[0])
latest_season = int(match_seasons.max())
lookback_seasons = 5
start_season = latest_season - lookback_seasons

# Keep every player-season inside the lookback window.
active_batters_data = batsman_stats[batsman_stats['Season'] >= start_season].copy()
active_bowlers_data = bowler_stats[bowler_stats['Season'] >= start_season].copy()

# Hand-maintained exclusion list.
# NOTE(review): unlike the Type 2 section, players are not required to have played in the
# latest season here, so a player's most recent row may be several seasons old - confirm
# this is intended.
inactive_players = ['PP Shaw', 'R Dhawan', 'RR Rossouw']

active_batters_data = active_batters_data[~active_batters_data['batter'].isin(inactive_players)]
active_bowlers_data = active_bowlers_data[~active_bowlers_data['bowler'].isin(inactive_players)]

# One row per player: the row with their highest Season value inside the window.
active_batters_data_filtered = active_batters_data.loc[active_batters_data.groupby('batter')['Season'].idxmax()]
active_bowlers_data_filtered = active_bowlers_data.loc[active_bowlers_data.groupby('bowler')['Season'].idxmax()]

print(f"Total batters from the last {lookback_seasons} seasons after filtering: {len(active_batters_data_filtered)}")
print(f"Total bowlers from the last {lookback_seasons} seasons after filtering: {len(active_bowlers_data_filtered)}")

Total batters from the last 5 seasons after filtering: 344
Total bowlers from the last 5 seasons after filtering: 267


In [42]:
# The selected season's totals take the role of the 'prev_*' training features.
batting_feature_names = {
    'total_runs': 'prev_runs',
    'balls_faced': 'prev_balls',
    'fours': 'prev_fours',
    'sixes': 'prev_sixes',
}
renamed_batting = active_batters_data_filtered.rename(columns=batting_feature_names)
prediction_features_batters = renamed_batting[list(batting_feature_names.values())]

bowling_feature_names = {
    'total_wickets': 'prev_wickets',
    'total_runs_conceded': 'prev_runs_conceded',
    'total_balls_bowled': 'prev_balls_bowled',
}
renamed_bowling = active_bowlers_data_filtered.rename(columns=bowling_feature_names)
prediction_features_bowlers = renamed_bowling[list(bowling_feature_names.values())]

In [43]:
# Batting predictions: one rounded integer column per fitted model, next to the batter name.
batting_output_models = {
    'predicted_runs': model_runs,
    'predicted_balls_faced': model_balls,
    'predicted_fours': model_fours,
    'predicted_sixes': model_sixes,
}
final_predictions_batters = active_batters_data_filtered[['batter']].copy()
for output_column, fitted_model in batting_output_models.items():
    final_predictions_batters[output_column] = (
        fitted_model.predict(prediction_features_batters).round().astype(int)
    )

In [44]:
# Bowling predictions: one rounded integer column per fitted model, next to the bowler name.
bowling_output_models = {
    'predicted_wickets': model_wickets,
    'predicted_runs_conceded': model_runs_conceded,
    'predicted_balls_bowled': model_balls_bowled,
}
final_predictions_bowlers = active_bowlers_data_filtered[['bowler']].copy()
for output_column, fitted_model in bowling_output_models.items():
    final_predictions_bowlers[output_column] = (
        fitted_model.predict(prediction_features_bowlers).round().astype(int)
    )

In [45]:
# --- Step 5: Final Output Table ---
print("\nComplete Predicted IPL Dataset for the Next Season")

print("\nPredicted Batting Statistics:")
# Ten highest predicted run totals, printed as plain text without the index column.
top_predicted_batters = final_predictions_batters.sort_values(by='predicted_runs', ascending=False).head(10)
print(top_predicted_batters.to_string(index=False))


Complete Predicted IPL Dataset for the Next Season

Predicted Batting Statistics:
        batter  predicted_runs  predicted_balls_faced  predicted_fours  predicted_sixes
   YBK Jaiswal             549                    300               55               20
       TM Head             482                    229               37               15
      KL Rahul             467                    348               47               18
       R Parag             434                    291               41               16
    MP Stoinis             417                    237               23               21
  Shubman Gill             415                    272               32               13
Shashank Singh             404                    232               30               24
       C Green             401                    342               44               12
    JC Buttler             377                    290               40               13
       V Kohli             375       

In [46]:
print("\nPredicted Bowling Statistics:")
# Ten highest predicted wicket totals, printed as plain text without the index column.
top_predicted_bowlers = final_predictions_bowlers.sort_values(by='predicted_wickets', ascending=False).head(10)
print(top_predicted_bowlers.to_string(index=False))


Predicted Bowling Statistics:
            bowler  predicted_wickets  predicted_runs_conceded  predicted_balls_bowled
      PVD Chameera                 23                      280                     223
          KK Ahmed                 21                      460                     332
         JJ Bumrah                 19                      311                     332
          OF Smith                 18                      245                     304
          R Ashwin                 18                      371                     198
        Avesh Khan                 17                      359                     356
         SP Narine                 17                      416                     327
         KR Mayers                 16                      210                     213
Azmatullah Omarzai                 16                      233                     253
          CV Varun                 15                      362                     270


<div style="text-align:center; border-radius:15px; padding:15px; color:white; margin:0; font-family: 'Orbitron', sans-serif; background: #11001C; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.3); overflow:hidden; margin-bottom: 1em;"><div style="font-size:250%; color:#FEE100"><b>Type 4</b></div><div><span style="color:#6666FF">Unsupervised</span> Model: <span style="color:#6666FF">K-Means Clustering</span> is used to categorize players into <span style="color:#6666FF">Performance Tiers</span> (clusters) for the next season based on their <span style="color:#6666FF">recent statistical patterns</span>, rather than predicting exact numerical values.</div></div>

In [48]:
# --- Step 2: Advanced Feature Engineering ---
# Batting stats: one row per (batter, Season).
batsman_stats = merged_data.groupby(['batter', 'Season']).agg(
    total_runs=('batsman_run', 'sum'),
    balls_faced=('ballnumber', 'count'),
    fours=('batsman_run', lambda x: (x == 4).sum()),
    sixes=('batsman_run', lambda x: (x == 6).sum())
).reset_index()

# Strike rate = runs per 100 balls; non-finite and missing values become 0.
batsman_stats['strike_rate'] = (batsman_stats['total_runs'] / batsman_stats['balls_faced']) * 100
batsman_stats = batsman_stats.replace([np.inf, -np.inf], 0).fillna(0)

# Convert Season to numeric (cutting labels at any '/'), as the Type 2 and Type 3 sections do.
# The clustering preparation below compares Season against integers, which needs a numeric column.
batsman_stats['Season'] = pd.to_numeric(batsman_stats['Season'].astype(str).str.split('/').str[0])

In [49]:
# Bowling stats: one row per (bowler, Season).
bowler_stats = merged_data.groupby(['bowler', 'Season']).agg(
    total_wickets=('isWicketDelivery', 'sum'),
    total_runs_conceded=('total_run', 'sum'),
    total_balls_bowled=('ballnumber', 'count')
).reset_index()

# Convert total balls bowled into completed overs and remaining balls.
bowler_stats['total_overs_bowled'] = bowler_stats['total_balls_bowled'] // 6
bowler_stats['balls_in_partial_over'] = bowler_stats['total_balls_bowled'] % 6

# Economy rate = runs conceded per six balls (uses the original total_balls_bowled).
bowler_stats['economy_rate'] = (bowler_stats['total_runs_conceded'] / bowler_stats['total_balls_bowled']) * 6
bowler_stats = bowler_stats.replace([np.inf, -np.inf], 0).fillna(0)

# Convert Season to numeric (cutting labels at any '/'), as the Type 2 and Type 3 sections do.
# The clustering preparation below compares Season against integers, which needs a numeric column.
bowler_stats['Season'] = pd.to_numeric(bowler_stats['Season'].astype(str).str.split('/').str[0])

In [50]:
# --- Step 3: Preparing Data for Clustering ---
# The window covers the newest season found in the batting table and the four before it.
lookback_seasons = 4
latest_season = int(batsman_stats['Season'].max())
start_season = latest_season - lookback_seasons

# Get players who were active in the LATEST season, and their most recent stats (within lookback)
def filter_active_players(df, player_col, latest=None, start=None):
    """Return one row per player who appears in the latest season.

    For every player with at least one row in season ``latest``, the row with
    the highest ``Season`` value inside the window ``Season >= start`` is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-player, per-season statistics. Must contain a ``'Season'`` column
        and the column named by ``player_col``.
    player_col : str
        Name of the column that identifies the player (e.g. ``'batter'``).
    latest : optional
        Season that defines who counts as "active". When omitted, the
        notebook-level ``latest_season`` is used.
    start : optional
        First season of the lookback window (inclusive). When omitted, the
        notebook-level ``start_season`` is used.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows; empty when no player matches.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if latest is None:
        latest = latest_season
    if start is None:
        start = start_season

    played_latest = df[player_col].isin(df.loc[df['Season'] == latest, player_col])
    in_window = df['Season'] >= start
    active_data = df[in_window & played_latest].copy()

    # Nothing to pick from: return the empty frame instead of indexing with no labels.
    if active_data.empty:
        return active_data

    # Get the stats from the player's most recent season within the lookback window
    most_recent_rows = active_data.groupby(player_col)['Season'].idxmax()
    return active_data.loc[most_recent_rows]

# Filter for active players
active_batters_data = filter_active_players(df=batsman_stats, player_col='batter')
active_bowlers_data = filter_active_players(df=bowler_stats, player_col='bowler')

# Select features for clustering
batting_feature_cols = ['total_runs', 'balls_faced', 'strike_rate', 'fours', 'sixes']
bowling_feature_cols = [
    'total_wickets',
    'total_runs_conceded',
    'economy_rate',
    'total_overs_bowled',
    'balls_in_partial_over',
]
batting_features = active_batters_data[batting_feature_cols]
bowling_features = active_bowlers_data[bowling_feature_cols]

In [51]:
# --- Step 4: K-Means Model Training ---
n_clusters = 4

# Standardise the batting features before clustering.
scaler_bat = StandardScaler()
X_bat_scaled = scaler_bat.fit_transform(batting_features)

# Fixed random_state keeps the cluster assignment reproducible across runs.
kmeans_bat = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
batter_cluster_ids = kmeans_bat.fit_predict(X_bat_scaled)
active_batters_data['cluster_label'] = batter_cluster_ids

In [52]:
# Standardise the bowling features, then cluster them with the same settings
# (n_clusters, n_init, random_state) used for the batters.
scaler_bowl = StandardScaler()
X_bowl_scaled = scaler_bowl.fit_transform(bowling_features)

kmeans_bowl = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
bowler_cluster_ids = kmeans_bowl.fit_predict(X_bowl_scaled)
active_bowlers_data['cluster_label'] = bowler_cluster_ids

In [53]:
# --- Step 5: Consolidating Results and Interpreting Clusters (CORRECTED) ---

# 5.1 Batting Cluster Averages and Interpretation
# Each cluster's mean stats serve as the "prediction" for every player in it.
batting_cluster_summary = (
    active_batters_data
    .groupby('cluster_label')
    .agg(
        predicted_runs=('total_runs', 'mean'),
        predicted_balls_faced=('balls_faced', 'mean'),
        predicted_fours=('fours', 'mean'),
        predicted_sixes=('sixes', 'mean'),
        predicted_strike_rate=('strike_rate', 'mean'),
    )
    .reset_index()
)

# Rounding metrics to integers (except strike rate)
for batting_count_col in ('predicted_runs', 'predicted_balls_faced', 'predicted_fours', 'predicted_sixes'):
    batting_cluster_summary[batting_count_col] = (
        batting_cluster_summary[batting_count_col].round(0).astype(int)
    )

# Define Batting Tiers
def get_batter_tier(runs):
    """Map a runs value to a batting performance tier name.

    Thresholds (checked from the top down, so the ranges are contiguous):
        runs > 350          -> 'Top Performers'
        300 < runs <= 350   -> 'Moderate Performers'
        150 <= runs <= 300  -> 'Average Performers'
        otherwise           -> 'Least Performers'

    Integer inputs classify exactly as before. Fractional values between 300
    and 301 previously fell through to 'Least Performers'; they now land in
    'Moderate Performers'.

    Parameters
    ----------
    runs : int or float
        Runs value to classify.

    Returns
    -------
    str
        One of the four tier names listed above.
    """
    if runs > 350.0:
        return 'Top Performers'
    if runs > 300.0:
        return 'Moderate Performers'
    if runs >= 150.0:
        return 'Average Performers'
    return 'Least Performers'

# Label every cluster with a tier derived from its (rounded) average runs.
batting_cluster_summary['Tier_Name'] = batting_cluster_summary['predicted_runs'].apply(get_batter_tier)

# Merge cluster summary back into player data
batting_summary_cols = [
    'cluster_label',
    'Tier_Name',
    'predicted_runs',
    'predicted_balls_faced',
    'predicted_fours',
    'predicted_sixes',
    'predicted_strike_rate',
]
batting_raw_stat_cols = ['Season', 'total_runs', 'balls_faced', 'fours', 'sixes', 'strike_rate']
batting_predictions = (
    active_batters_data
    .merge(batting_cluster_summary[batting_summary_cols], on='cluster_label', how='left')
    .drop(columns=batting_raw_stat_cols)  # keep only the player and the cluster-level values
)

# Sort tiers by performance (highest runs first)
tier_order_bat = ['Top Performers', 'Moderate Performers', 'Average Performers', 'Least Performers']
batting_predictions['Tier_Name'] = pd.Categorical(
    batting_predictions['Tier_Name'],
    categories=tier_order_bat,
    ordered=True,
)
batting_predictions = batting_predictions.sort_values(by=['Tier_Name', 'batter'])

In [54]:
# 5.2 Bowling Cluster Averages and Interpretation
# Each cluster's mean stats serve as the "prediction" for every player in it.
bowling_cluster_summary = (
    active_bowlers_data
    .groupby('cluster_label')
    .agg(
        predicted_wickets=('total_wickets', 'mean'),
        predicted_runs_conceded=('total_runs_conceded', 'mean'),
        predicted_economy_rate=('economy_rate', 'mean'),
        predicted_overs=('total_overs_bowled', 'mean'),
        predicted_partial_balls=('balls_in_partial_over', 'mean'),
    )
    .reset_index()
)

# Rounding metrics (economy rate is left as a float)
for bowling_count_col in ('predicted_wickets', 'predicted_runs_conceded', 'predicted_overs', 'predicted_partial_balls'):
    bowling_cluster_summary[bowling_count_col] = (
        bowling_cluster_summary[bowling_count_col].round(0).astype(int)
    )

# Define Bowling Tiers
def get_bowler_tier(wickets):
    """Map a wickets value to a bowling performance tier name.

    Thresholds (checked from the top down):
        wickets >= 15        -> 'Top Performers'
        10 <= wickets < 15   -> 'Moderate Performers'
        5 <= wickets < 10    -> 'Average Performers'
        wickets < 5          -> 'Least Performers'

    The earlier conditions were written as ``10 >= wickets < 15`` and
    ``5 > wickets < 10``. Python reads those as ``wickets <= 10`` and
    ``wickets < 5``, so 11-14 wickets came out as 'Least Performers' and
    0-9 wickets as 'Moderate Performers'.

    Parameters
    ----------
    wickets : int or float
        Wickets value to classify.

    Returns
    -------
    str
        One of the four tier names listed above.
    """
    if wickets >= 15:
        return 'Top Performers'
    if wickets >= 10:
        return 'Moderate Performers'
    if wickets >= 5:
        return 'Average Performers'
    return 'Least Performers'

# Label every cluster with a tier derived from its (rounded) average wickets.
bowling_cluster_summary['Tier_Name'] = bowling_cluster_summary['predicted_wickets'].apply(get_bowler_tier)

# Merge cluster summary back into player data
bowling_summary_cols = [
    'cluster_label',
    'Tier_Name',
    'predicted_wickets',
    'predicted_runs_conceded',
    'predicted_economy_rate',
    'predicted_overs',
    'predicted_partial_balls',
]
bowling_raw_stat_cols = [
    'Season',
    'total_wickets',
    'total_runs_conceded',
    'total_balls_bowled',
    'economy_rate',
    'total_overs_bowled',
    'balls_in_partial_over',
]
bowling_predictions = (
    active_bowlers_data
    .merge(bowling_cluster_summary[bowling_summary_cols], on='cluster_label', how='left')
    .drop(columns=bowling_raw_stat_cols)  # keep only the player and the cluster-level values
)

# Sort tiers by performance (highest wickets first)
tier_order_bowl = ['Top Performers', 'Moderate Performers', 'Average Performers', 'Least Performers']
bowling_predictions['Tier_Name'] = pd.Categorical(
    bowling_predictions['Tier_Name'],
    categories=tier_order_bowl,
    ordered=True,
)
bowling_predictions = bowling_predictions.sort_values(by=['Tier_Name', 'bowler'])

In [108]:
# 5.3 FINAL OUTPUT FORMATTING AND DISPLAY (CORRECTED)

# Overs Format Correction: build an 'Overs.Balls' text column (e.g., 15.3) from the
# integer overs and integer leftover balls.
# The column is created here, before any per-tier filtering, so that every
# filtered slice of bowling_predictions already contains it.
predicted_overs_text = bowling_predictions['predicted_overs'].astype(str)
predicted_balls_text = bowling_predictions['predicted_partial_balls'].astype(str)
bowling_predictions['predicted_overs_bowled'] = predicted_overs_text + '.' + predicted_balls_text

# --- Output Batting Predictions ---
batting_banner = "=" * 80
print("\n" + batting_banner)
print(f"UNSUPERVISED PREDICTION: BATTER PERFORMANCE TIERS (Based on Season {latest_season})")
print(batting_banner)

batting_display_cols = [
    'batter',
    'predicted_runs',
    'predicted_balls_faced',
    'predicted_fours',
    'predicted_sixes',
    'predicted_strike_rate',
]

for tier in tier_order_bat:
    # FIX 1: keep a single row per batter to correct naming issues (e.g., Arshad Khan (2))
    tier_data = (
        batting_predictions[batting_predictions['Tier_Name'] == tier]
        .drop_duplicates(subset=['batter'], keep='first')
    )

    # Tiers that no cluster was mapped to are skipped entirely.
    if tier_data.empty:
        continue

    # Header values come from the first cluster summary row carrying this tier.
    avg_row = batting_cluster_summary[batting_cluster_summary['Tier_Name'] == tier].iloc[0]

    print(f"\n--- Tier: {tier} ---")
    print(f"(Expected Performance: {avg_row['predicted_runs']} Runs, {avg_row['predicted_strike_rate']:.1f} SR)")

    # Show at most ten batters, highest predicted runs first.
    top_batters = (
        tier_data[batting_display_cols]
        .sort_values(by='predicted_runs', ascending=False)
        .head(10)
    )
    print(top_batters.to_string(index=False))


UNSUPERVISED PREDICTION: BATTER PERFORMANCE TIERS (Based on Season 2024)

--- Tier: Top Performers ---
(Expected Performance: 437 Runs, 156.9 SR)
         batter  predicted_runs  predicted_balls_faced  predicted_fours  predicted_sixes  predicted_strike_rate
Abhishek Sharma             437                    283               39               24             156.923843
  Abishek Porel             437                    283               39               24             156.923843
        VR Iyer             437                    283               39               24             156.923843
        V Kohli             437                    283               39               24             156.923843
    Tilak Varma             437                    283               39               24             156.923843
        TM Head             437                    283               39               24             156.923843
       T Stubbs             437                    283               

In [110]:

# --- Output Bowling Predictions ---
bowling_banner = "=" * 80
print("\n" + bowling_banner)
print(f"UNSUPERVISED PREDICTION: BOWLER PERFORMANCE TIERS (Based on Season {latest_season})")
print(bowling_banner)

bowling_display_cols = [
    'bowler',
    'predicted_wickets',
    'predicted_runs_conceded',
    'predicted_economy_rate',
    'predicted_overs_bowled',
]

for tier in tier_order_bowl:
    # FIX 1: keep a single row per bowler to correct naming issues (e.g., Arshad Khan (2))
    tier_data = (
        bowling_predictions[bowling_predictions['Tier_Name'] == tier]
        .drop_duplicates(subset=['bowler'], keep='first')
    )

    # Tiers that no cluster was mapped to are skipped entirely.
    if tier_data.empty:
        continue

    # Header values come from the first cluster summary row carrying this tier.
    avg_row = bowling_cluster_summary[bowling_cluster_summary['Tier_Name'] == tier].iloc[0]

    # FIX 2: Correct Overs Format for Header Display ('Overs.Balls')
    formatted_overs = f"{avg_row['predicted_overs']}.{avg_row['predicted_partial_balls']}"

    print(f"\n--- Tier: {tier} ---")
    print(f"(Expected Performance: {avg_row['predicted_wickets']} Wickets, {avg_row['predicted_economy_rate']:.2f} Econ, {formatted_overs} Overs)")

    # Show at most ten bowlers, highest predicted wickets first.
    # 'predicted_overs_bowled' must already exist on bowling_predictions at this point.
    top_bowlers = (
        tier_data[bowling_display_cols]
        .sort_values(by='predicted_wickets', ascending=False)
        .head(10)
    )
    print(top_bowlers.to_string(index=False))



UNSUPERVISED PREDICTION: BOWLER PERFORMANCE TIERS (Based on Season 2024)

--- Tier: Top Performers ---
(Expected Performance: 15 Wickets, 8.80 Econ, 44.3 Overs)
           bowler  predicted_wickets  predicted_runs_conceded  predicted_economy_rate predicted_overs_bowled
       AD Russell                 15                      391                8.803389                   44.3
      Mohsin Khan                 15                      391                8.803389                   44.3
Mustafizur Rahman                 15                      391                8.803389                   44.3
    Naveen-ul-Haq                 15                      391                8.803389                   44.3
       Noor Ahmad                 15                      391                8.803389                   44.3
       PJ Cummins                 15                      391                8.803389                   44.3
        PP Chawla                 15                      391              