## Import important libraries


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Players Data 2022 to 2024

train_data=pd.read_excel("IPL_training_dataset(1).xlsx")
train_data.head()

Unnamed: 0,Player_name,Current_team,Role,Age,Innings_22,Innings_23,Innings_24,Bowling_Innings_22,Bowling_Innings_23,Bowling_Innings_24,Runs_22,Runs_23,Runs_24,Wickets_22,Wickets_23,Wickets_24,Retained_23,Retained_24,Retained_25
0,Anshul Kamboj,CSK,All-Rounder,24,0.0,0,1,0,0,3,0,0,2,0,0,2,-1,-1,0
1,Deepak Hooda,CSK,All-Rounder,30,14.0,12,9,2,0,0,451,84,145,1,0,0,1,1,0
2,Rachin Ravindra,CSK,All-Rounder,25,0.0,0,10,0,0,0,0,0,222,0,0,0,-1,-1,1
3,Ravichandran Ashwin,CSK,All-Rounder,38,12.0,10,9,17,13,14,191,67,86,12,14,9,1,1,0
4,Ravindra Jadeja,CSK,All-Rounder,36,10.0,12,11,10,16,14,116,190,267,5,20,8,1,1,1


In [6]:
# Players Data 2025

test_data=pd.read_excel("TestData(1).xlsx")
test_data.head()

Unnamed: 0,Player,Role,Age,Batting_Innings,Runs,Bowling_Innings,Wickets,Current_Team,Retained_24,Retained_25
0,Sai Sudharsan,Batter,23,15,759,0,0,GT,1,1
1,Suryakumar Yadav,Batter,34,16,717,0,0,MI,1,1
2,Virat Kohli,Batter,36,15,657,0,0,RCB,1,1
3,Shubman Gill,Batter,25,15,650,0,0,GT,1,1
4,Mitchell Marsh,All Rounder,33,13,627,0,0,LSG,1,0


## Feature Construction¶

In [9]:
# Batting Impact of training and test data

# On train data
train_data['Total_runs']=train_data['Runs_22']+train_data['Runs_23']+train_data['Runs_24']
train_data['Total_bat_innings']=train_data['Innings_22']+train_data['Innings_23']+train_data['Innings_24']
train_data['Batting_Impact']=train_data['Total_runs']/(train_data['Total_bat_innings']+1)
train_data["Batting_Impact"] = train_data["Batting_Impact"].fillna(0)


# On test data
test_data['Batting_Impact']=test_data['Runs']/(test_data['Batting_Innings']+1)
test_data["Batting_Impact"] = test_data["Batting_Impact"].fillna(0)



In [11]:
# Bowling impact of training and test data

# On train data
train_data['Total_Wickets']=train_data['Wickets_22']+train_data['Wickets_23']+train_data['Wickets_24']
train_data['Total_ball_innings']=train_data['Bowling_Innings_22']+train_data['Bowling_Innings_23']+train_data['Bowling_Innings_24']
train_data['Bowling_Impact']=train_data['Total_Wickets']/(train_data['Total_ball_innings']+1)
train_data['Bowling_Impact']=train_data['Bowling_Impact'].fillna(0)


# On test data
test_data['Bowling_Impact']=test_data['Wickets']/(test_data['Bowling_Innings']+1)
test_data['Bowling_Impact']=test_data['Bowling_Impact'].fillna(0)



In [13]:
# IsAllRounder

# On train data
train_data['IsAllRounder'] = train_data['Role'].str.lower().str.contains('all').astype(int)


# On test data
test_data['IsAllRounder'] = test_data['Role'].str.lower().str.contains('all').astype(int)




In [15]:
# IsWicketKeeper

# On train data
train_data['IsWicketKeeper'] = train_data['Role'].str.lower().str.contains('WicketKeeper Batter').astype(int)

# On test data
test_data['IsWicketKeeper'] = test_data['Role'].str.lower().str.contains('WicketKeeper Batter').astype(int)



In [17]:
# IsBatter

# On train data
train_data['IsBatter'] = train_data['Role'].str.lower().str.contains('batter').astype(int)

#  On test data
test_data['IsBatter'] = test_data['Role'].str.lower().str.contains('batter').astype(int)


In [19]:
# IsBowler

# On train data
train_data['IsBowler'] = train_data['Role'].str.lower().str.contains('bowler').astype(int)

#  On test data
test_data['IsBowler'] = test_data['Role'].str.lower().str.contains('bowler').astype(int)


In [21]:
# Performance Score

def compute_score(row):
    if row["IsBatter"]:
        return 0.8 * row["Batting_Impact"] + 0.2 * row["Bowling_Impact"]
    elif row["IsBowler"]:
        return 0.2 * row["Batting_Impact"] + 0.8 * row["Bowling_Impact"]
    else:
        return 0.6 * row["Batting_Impact"] + 0.4 * row["Bowling_Impact"]  

# On train data
train_data['PerformanceScore'] = train_data.apply(compute_score, axis=1)

#  On test data
test_data['PerformanceScore'] = test_data.apply(compute_score, axis=1)


In [23]:
# RunsPerAge

# On train data
train_data["RunsPerAge"] = train_data["Total_runs"] / (train_data["Age"] + 1)

#  On test data
test_data["RunsPerAge"] = test_data["Runs"] / (test_data["Age"] + 1)



In [25]:
# WicketsPerInning

train_data["WicketsPerInning"] = train_data["Total_Wickets"] / (train_data["Total_ball_innings"] + 1)

test_data["WicketsPerInning"] = test_data["Wickets"] / (test_data["Bowling_Innings"] + 1)

train_data["WicketsPerInning"] = train_data["WicketsPerInning"].fillna(0)
test_data["WicketsPerInning"] = test_data["WicketsPerInning"].fillna(0)

In [27]:
# IsExperienced

# On train data
train_data["Total_Innings"] = train_data["Total_bat_innings"] + train_data["Total_ball_innings"]
train_data["IsExperienced"] = (train_data["Total_Innings"] >= 20).astype(int)

# On test data
test_data["Total_Innings"] = test_data["Batting_Innings"] + test_data["Bowling_Innings"]
test_data["IsExperienced"] = (test_data["Total_Innings"] >= 8).astype(int)


In [29]:
# PerformancePerAge

#  On train data
train_data["PerformancePerAge"] = train_data["PerformanceScore"] / (train_data["Age"] + 1)

# On test data
test_data["PerformancePerAge"] = test_data["PerformanceScore"] / (test_data["Age"] + 1)


In [31]:
# RecentWeightedRuns

#  On train data
train_data["RecentWeightedRuns"] = (
    0.2 * train_data["Runs_22"] + 0.3 * train_data["Runs_23"] + 0.5 * train_data["Runs_24"]
)

# On test data
test_data["RecentWeightedRuns"] = test_data["Runs"]


In [33]:
# RecentWeightedWickets

#  On train data
train_data["RecentWeightedWickets"] = (
    0.2 * train_data["Wickets_22"] + 0.3 * train_data["Wickets_23"] + 0.5 * train_data["Wickets_24"]
)

# On test data
test_data["RecentWeightedWickets"] = test_data['Wickets']


In [35]:
# WasRetainedConsistently

#  On train data
train_data["WasRetainedConsistently"] = (
    (train_data["Retained_23"] == 1) &
    (train_data["Retained_24"] == 1) 
    
    
).astype(int)


# On test data
test_data["WasRetainedConsistently"] =  (
    (test_data["Retained_24"] == 1) &
    (test_data["Retained_25"] == 1) 
    
).astype(int)


In [37]:
# WasRetainedBefore

#  On train data
train_data["WasRetainedBefore"] = (
    (train_data["Retained_23"] == 1) |
    (train_data["Retained_24"] == 1) |
     (train_data['Retained_25'] ==1)
    
    
).astype(int)


# On test data
test_data["WasRetainedBefore"] =  (
    (test_data["Retained_24"] == 1) |
    (test_data["Retained_25"] == 1) 
    
).astype(int)


In [39]:
# RetentionTrend

#  On train data
train_data[["Retained_23", "Retained_24", "Retained_25"]] = train_data[["Retained_23", "Retained_24", "Retained_25"]].replace(-1, 0)
train_data["RetentionTrend"] = (
    train_data["Retained_23"] + train_data["Retained_24"]+train_data["Retained_25"]
) / 3

# On test data
test_data["RetentionTrend"] =(test_data["Retained_24"]+test_data["Retained_25"])/2


In [41]:
# ImpactDifference

#  On train data
train_data["ImpactDifference"] = train_data["Batting_Impact"] - train_data["Bowling_Impact"]

# On test data
test_data["ImpactDifference"] = test_data["Batting_Impact"] - test_data["Bowling_Impact"]


In [43]:
# Team Strength

team_strength_map = train_data.groupby("Current_team")["Retained_25"].mean().to_dict()
train_data["TeamStrength"] = train_data["Current_team"].map(team_strength_map)


test_data["TeamStrength"] = test_data["Current_Team"].map(team_strength_map)
test_data["TeamStrength"] = test_data["TeamStrength"].fillna(0)

## Feature Selection

In [46]:
features = ["Age",
            "IsExperienced",
            "Batting_Impact",
            "Bowling_Impact",
            "WasRetainedConsistently",
            "WasRetainedBefore",
            "RunsPerAge",
            "PerformancePerAge",
            "RecentWeightedRuns",
            "RetentionTrend",
            "TeamStrength",
            "RecentWeightedWickets"
]


## Train_Test split¶

In [49]:
X_train = train_data[features]
X_train

Unnamed: 0,Age,IsExperienced,Batting_Impact,Bowling_Impact,WasRetainedConsistently,WasRetainedBefore,RunsPerAge,PerformancePerAge,RecentWeightedRuns,RetentionTrend,TeamStrength,RecentWeightedWickets
0,24,0,1.000000,0.500000,0,0,0.080000,0.032000,1.0,0.000000,0.421053,1.0
1,30,1,18.888889,0.333333,1,1,21.935484,0.369892,187.9,0.666667,0.421053,0.2
2,25,0,20.181818,0.000000,0,1,8.538462,0.465734,111.0,0.333333,0.421053,0.0
3,38,1,10.750000,0.777778,1,1,8.820513,0.173362,101.3,0.666667,0.421053,11.1
4,36,1,16.852941,0.804878,1,1,15.486486,0.281992,213.7,1.000000,0.421053,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...
166,32,1,12.437500,1.136364,0,1,6.030303,0.102927,80.6,0.333333,0.571429,10.4
167,25,1,8.909091,0.864865,0,1,3.769231,0.095143,25.9,0.333333,0.571429,10.2
168,27,0,1.750000,0.818182,1,1,0.250000,0.035877,1.4,0.666667,0.571429,3.3
169,34,1,34.333333,0.000000,0,1,26.485714,0.784762,373.9,0.666667,0.571429,0.0


In [51]:
y_train = train_data["Retained_25"] 
y_train

0      0
1      0
2      1
3      0
4      1
      ..
166    1
167    1
168    0
169    1
170    0
Name: Retained_25, Length: 171, dtype: int64

In [53]:
X_test = test_data[features] 
X_test

Unnamed: 0,Age,IsExperienced,Batting_Impact,Bowling_Impact,WasRetainedConsistently,WasRetainedBefore,RunsPerAge,PerformancePerAge,RecentWeightedRuns,RetentionTrend,TeamStrength,RecentWeightedWickets
0,23,1,47.437500,0.000000,1,1,31.625000,1.581250,759,1.0,0.388889,0
1,34,1,42.176471,0.000000,1,1,20.485714,0.964034,717,1.0,0.428571,0
2,36,1,41.062500,0.000000,1,1,17.756757,0.887838,657,1.0,0.388889,0
3,25,1,40.625000,0.000000,1,1,25.000000,1.250000,650,1.0,0.388889,0
4,33,1,44.785714,0.000000,0,1,18.441176,0.790336,627,0.5,0.500000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
188,29,0,0.000000,0.333333,0,1,0.000000,0.008889,0,0.5,0.421053,1
189,31,0,0.000000,0.333333,0,1,0.000000,0.008333,0,0.5,0.700000,1
190,26,0,0.000000,0.333333,0,1,0.000000,0.009877,0,0.5,0.266667,1
191,20,0,0.000000,0.500000,0,0,0.000000,0.009524,0,0.0,0.266667,1


## Model Training¶

In [56]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,max_depth=5, min_samples_split=10, min_samples_leaf=5,  random_state=42)
rf.fit(X_train, y_train)


## Predict on test data

In [61]:
predictions = rf.predict(X_test)

test_data["Predicted_RF_2026"] = predictions
test_data[["Player","Current_Team","Age","Predicted_RF_2026"]].sample(50)

Unnamed: 0,Player,Current_Team,Age,Predicted_RF_2026
11,Heinrich Klassen,SRH,34,1
48,Hardik Pandya,MI,31,1
4,Mitchell Marsh,LSG,33,1
22,Shivam Dube,CSK,32,1
160,Yuzvendra Chahal,PBKS,35,0
23,Ishan Kishan,SRH,27,1
68,Ruturaj Gaikwad,CSK,28,1
168,Zeeshan Ansari,SRH,25,0
104,Harshal Patel,SRH,34,0
113,Atharva Taide,SRH,25,1


In [63]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {scores.mean():.2%}")


Cross-Validation Accuracy: 84.77%
