In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
# Load the data
data = pd.read_csv("Dataset_T20_Ind.csv")

In [3]:
# Preprocess the data
le_team = LabelEncoder()
data['Team'] = le_team.fit_transform(data['Team'])

In [4]:
# Define input features for batsmen
batsmen_features = ['Runs', 'HS', 'Bat Av']

In [5]:
# Define input features for bowlers
bowlers_features = ['Wkts', 'Bowl Av']

In [6]:
# Define input features for all-rounders
allrounders_features = ['Runs', 'Wkts']

In [7]:
# Define input features for wicketkeeper-batsmen
wk_batsmen_features = ['Runs', 'Ct']


In [8]:
# Combine all input features
all_features = batsmen_features + bowlers_features + allrounders_features + wk_batsmen_features

In [9]:
# Define target variable
target_variable = 'Role'  # Assuming 'Role' contains information about batsmen, bowlers, all-rounders, etc.


In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[all_features], data[target_variable], test_size=0.2, random_state=42)


In [11]:
#data.drop(['Span'], axis=1)
data

Unnamed: 0,Player,Span,Mat,Runs,HS,Bat Av,100,Wkts,Bowl Av,5,...,vs AFG(avg),vs PAK(avg),vs BAN(avg),vs SA(avg),vs ENG(avg),vs IRE(avg),vs NED(avg),vs SL(avg),vs NZ(avg),vs WI(avg)
0,RG Sharma,2018-2022,59,1639,85,29.26,0,0,0.0,0,...,74/0,11/0,37.14/0,12.33/0,35.66/0,48.5/0,53/0,22.16/0,40.2,36.92/0
1,RR Pant,2018-2022,57,830,65,22.43,0,0,0.0,0,...,0/0,26.5/0,13.33/0,15.42/0,19.28/0,0/0,0/0,20.50/0,22.33/0,32.63/0
2,HH Pandya,2019-2023,57,1077,71,29.1,0,40,28.97,0,...,0/0,28/14.14,5.0/14,33.40/40.33,42.83/22.35,75/45,0/0,14.40/35.66,26.16/27.75,20.14/33
3,B Kumar,2018-2022,55,45,16,9.0,0,59,22.28,1,...,0/0.8,0/18.83,0/0,0/13.30,3/25.44,0/39,0/4.5,14.50/15.87,2.0/33.57,1.0/29
4,YS Chahal,2019-2023,53,3,1,1.5,0,52,30.46,0,...,0/0,0/75,0/17,0/31.28,0/31.62,0/10,0/0,0/23.41,1/41.25,0/29.81
5,SA Yadav,2021-2023,53,1841,117,46.02,3,0,0.0,0,...,6/0,14.25/0,30/0,62.33/0,45.66/0,7.5/0,0/0,63.50/0,47.33,40.80/0
6,V Kohli,2018-2022,52,1902,122,59.43,1,0,0.0,0,...,0/0,78/0,0/0,34.4/0,57.57/0,4.5/0,0/0,28/0,22.8/0,59.66/0
7,KL Rahul,2018-2022,49,1497,91,34.81,0,0,0.0,0,...,65.5/0,8.75/0,37.25/0,58.5/0,20.85/0,70/0,9/0,30.75/0,46.0/0,40.5/0
8,SS Iyer,2019-2022,43,960,74,33.1,0,0,0.0,0,...,0/0,0/0,54/0,19.33/0,37.25/0,0/0,0/0,121/0,28.0/0,19.57/0
9,Arshdeep Singh,2022-2023,36,32,12,16.0,0,54,18.87,0,...,0/7,0/15.33,0/19,0/17,0/16.5,0/32,0/18.5,0/32.33,0/15.55,13.50/19.21


In [12]:
# Replace '-' and '*' with NaN
#data.replace(['-', '*'], [np.nan, np.nan], inplace=True)

# Remove asterisks and convert columns to numeric
#for column in dataset.columns:
 #   dataset[column] = pd.to_numeric(dataset[column].str.replace('*', ''), errors='coerce')

# Handle missing values, for example, by replacing them with the mean
#data.fillna(data.mean(), inplace=True)

In [13]:
# Create and train the random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [14]:
# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

In [15]:

# Evaluate the classifier
print("Classification Report:\n", classification_report(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

         all       1.00      1.00      1.00         1
         bat       0.67      0.50      0.57         4
      bat/wk       0.00      0.00      0.00         1
         bow       1.00      1.00      1.00         4

    accuracy                           0.70        10
   macro avg       0.67      0.62      0.64        10
weighted avg       0.77      0.70      0.73        10



In [16]:
# Feature importances (optional)
feature_importances = pd.DataFrame({'Feature': all_features, 'Importance': rf_classifier.feature_importances_})
print("\nFeature Importances:\n", feature_importances.sort_values(by='Importance', ascending=False))



Feature Importances:
    Feature  Importance
2   Bat Av    0.219800
3     Wkts    0.128938
1       HS    0.121097
6     Wkts    0.120230
5     Runs    0.111630
0     Runs    0.104417
4  Bowl Av    0.086430
7     Runs    0.074232
8       Ct    0.033226


In [17]:
# Use the trained model to make predictions on the entire dataset
all_predictions = rf_classifier.predict(data[all_features])

In [18]:
# Add predictions to the original dataset
#data['Role'] = le_team.inverse_transform(all_predictions)


In [19]:
# Filter top 6 batsmen
top_batsmen = data[data['Role'] == 'bat'].nlargest(6, 'Runs')

In [20]:
# Filter top 4 bowlers
top_bowlers = data[data['Role'] == 'bow'].nlargest(4, 'Wkts')

In [21]:
# Filter top 3 all-rounders
top_allrounders = data[data['Role'] == 'all'].nlargest(3, ['Runs', 'Wkts'])


In [22]:
# Filter top 2 wicketkeeper-batsmen
top_wk_batsmen = data[data['Role'] == 'bat/wk'].nlargest(2, 'Runs')



In [23]:
# Display the selected players
print("\nTop 6 Batsmen:\n", top_batsmen[['Player', 'Runs', 'HS', 'Bat Av']])



Top 6 Batsmen:
        Player  Runs   HS  Bat Av
6     V Kohli  1902  122   59.43
5    SA Yadav  1841  117   46.02
0   RG Sharma  1639   85   29.26
8     SS Iyer   960   74   33.10
16   S Dhawan   568   52   25.81
19  SV Samson   355   77   19.72


In [24]:
print("\nTop 4 Bowlers:\n", top_bowlers[['Player', 'Wkts', 'Bowl Av']])



Top 4 Bowlers:
             Player  Wkts  Bowl Av
3          B Kumar    59    22.28
9   Arshdeep Singh    54    18.87
4        YS Chahal    52    30.46
14        HV Patel    29    26.55


In [25]:
print("\nTop 3 All-rounders:\n", top_allrounders[['Player', 'Runs', 'Wkts']])



Top 3 All-rounders:
        Player  Runs  Wkts
2   HH Pandya  1077    40
20   DJ Hooda   368     6
17  RA Jadeja   341    20


In [26]:
print("\nTop 2 Wicketkeeper-Batsmen:\n", top_wk_batsmen[['Player', 'Runs', 'Ct']])


Top 2 Wicketkeeper-Batsmen:
      Player  Runs  Ct
7  KL Rahul  1497  17
1   RR Pant   830  25
