In [1]:
## Importing libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from pycaret.regression import setup, compare_models, create_model, save_model, load_model
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import learning_curve, train_test_split
from tqdm import tqdm

In [2]:
## Importing dataset

batsman_df = pd.read_csv(r"C:\Users\Harsh\Documents\dream11\ipl\Merged Batting CSV\batting.csv")
bowler_df = pd.read_csv(r"C:\Users\Harsh\Documents\dream11\ipl\Merged Bowling CSV\bowling.csv")

In [3]:
# Converting object data types to categorical variables

# get list of categorical columns
cat_cols = batsman_df.select_dtypes(include=['object']).columns.tolist()
le = LabelEncoder()
for col in cat_cols:
    le.fit(batsman_df[col])
    batsman_df[col] = le.transform(batsman_df[col])

In [4]:
# Converting object data types to categorical variables

# get list of categorical columns
cat_cols = bowler_df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    le.fit(bowler_df[col])
    bowler_df[col] = le.transform(bowler_df[col])

In [5]:
print(batsman_df.isna().sum())
print(bowler_df.isna().sum())

player        0
runs          0
balls         0
4s            0
6s            0
SR            0
bowler        0
fielders      0
kind          0
player_out    0
date          0
team2         0
winner        0
result        0
venue         0
team1         0
MF            0
30s           0
50s           0
100s          0
0s            0
d11           0
dtype: int64
player            0
overs             0
runs              0
maidens           0
wicket            0
econrate         11
date              0
team2             0
winner            0
result            0
venue             0
team1             0
MF                0
3_wicket_haul     0
4_wicket_haul     0
5_wicket_haul     0
d11               0
dtype: int64


In [6]:
# Replacing na values with mean 
bowler_df.fillna(bowler_df.mean(), inplace=True)
print(bowler_df.isna().sum())

player           0
overs            0
runs             0
maidens          0
wicket           0
econrate         0
date             0
team2            0
winner           0
result           0
venue            0
team1            0
MF               0
3_wicket_haul    0
4_wicket_haul    0
5_wicket_haul    0
d11              0
dtype: int64


In [7]:
# Checking whether any infinity value
print("For batsman:")
print(np.isinf(batsman_df).any())
print("For bowler:")
print(np.isinf(bowler_df).any())

For batsman:
player        False
runs          False
balls         False
4s            False
6s            False
SR            False
bowler        False
fielders      False
kind          False
player_out    False
date          False
team2         False
winner        False
result        False
venue         False
team1         False
MF            False
30s           False
50s           False
100s          False
0s            False
d11           False
dtype: bool
For bowler:
player           False
overs            False
runs             False
maidens          False
wicket           False
econrate          True
date             False
team2            False
winner           False
result           False
venue            False
team1            False
MF               False
3_wicket_haul    False
4_wicket_haul    False
5_wicket_haul    False
d11              False
dtype: bool


In [8]:
# replace infinity values in the 'column_name' column with the mean of the column
column_mean = bowler_df['econrate'].replace([np.inf, -np.inf], np.nan).mean()
bowler_df['econrate'].replace([np.inf, -np.inf], column_mean, inplace=True)

In [9]:
print("Bowler df Columns:")
print(bowler_df.columns)

print("Batsman df Columns:")
print(batsman_df.columns)

Bowler df Columns:
Index(['player', 'overs', 'runs', 'maidens', 'wicket', 'econrate', 'date',
       'team2', 'winner', 'result', 'venue', 'team1', 'MF', '3_wicket_haul',
       '4_wicket_haul', '5_wicket_haul', 'd11'],
      dtype='object')
Batsman df Columns:
Index(['player', 'runs', 'balls', '4s', '6s', 'SR', 'bowler', 'fielders',
       'kind', 'player_out', 'date', 'team2', 'winner', 'result', 'venue',
       'team1', 'MF', '30s', '50s', '100s', '0s', 'd11'],
      dtype='object')


In [10]:
bowler_df = bowler_df[['player', 'date',
       'team2', 'venue', 'team1', 'MF','d11']]
print("Bowler df Columns:")
print(bowler_df.columns)

batsman_df = batsman_df[['player','date', 'team2','venue',
       'team1', 'MF','d11']]
print("Batsman df Columns:")
print(batsman_df.columns)

Bowler df Columns:
Index(['player', 'date', 'team2', 'venue', 'team1', 'MF', 'd11'], dtype='object')
Batsman df Columns:
Index(['player', 'date', 'team2', 'venue', 'team1', 'MF', 'd11'], dtype='object')


### PyCaret: A Low-Code Machine Learning Library in Python

PyCaret is a powerful open-source, low-code machine learning library in Python that streamlines the entire machine learning workflow. With PyCaret, you can automate various aspects of machine learning experiments, saving you time and making you more productive.

#### Key Features and Benefits

- End-to-End Machine Learning: PyCaret provides a seamless end-to-end machine learning experience, handling data preprocessing, feature engineering, model training, hyperparameter tuning, and model evaluation.

- Rapid Experimentation: Replace hundreds of lines of code with just a few words! PyCaret enables you to quickly iterate through different models and configurations, making experiments exponentially fast and efficient.

- Integration with Popular Libraries: PyCaret acts as a convenient Python wrapper around various well-known machine learning libraries and frameworks, including scikit-learn, XGBoost, LightGBM, CatBoost, Optuna, Hyperopt, Ray, and more.

#### How PyCaret Works

1. **Import the Library**: Start by importing the PyCaret library into your Python environment.

2. **Load and Preprocess Data**: Load your dataset and perform necessary data preprocessing using PyCaret's simple interface.

3. **Setup the Experiment**: With just one line of code, set up your machine learning experiment, specifying the target variable and any additional configuration settings.

4. **Compare Models**: PyCaret lets you quickly compare multiple models with minimal effort, so you can identify the best-performing ones.

5. **Tune Hyperparameters**: Utilize PyCaret's built-in hyperparameter tuning capabilities to fine-tune your models and improve their performance.

6. **Evaluate and Analyze Results**: PyCaret provides comprehensive model evaluation metrics and visualizations, making it easy to analyze the results of your experiments.

7. **Finalize and Deploy Models**: Once you're satisfied with a model, you can finalize it for production and deploy it with ease.

PyCaret is an excellent tool for both beginners and experienced data scientists, empowering them to build high-quality machine learning models efficiently.

Give PyCaret a try and witness the speed and productivity it brings to your machine learning projects!

For more information, visit the [PyCaret GitHub repository](https://github.com/pycaret/pycaret).


In [11]:
# Set up the PyCaret environment for the batsman dataset
# Make sure to replace 'target' with the name of the target column (the one you want to predict)
batsman_regression_setup = setup(
    data=batsman_df, target='d11',train_size=0.7, verbose=False, preprocess=False
)

# Compare all models and print the results for the batsman dataset
best_batsman_model = compare_models(verbose=True)
compare_models(fold = 5, round = 4, sort = 'R2', turbo = True)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,19.9549,734.9322,27.0999,0.1602,1.2401,3.7344,2.411
catboost,CatBoost Regressor,21.2548,789.6962,28.0929,0.0983,1.2807,3.9447,2.402
xgboost,Extreme Gradient Boosting,20.9919,790.7274,28.115,0.0965,1.2615,3.7574,0.897
lightgbm,Light Gradient Boosting Machine,21.4279,798.8314,28.2534,0.0881,1.2912,4.0236,0.975
gbr,Gradient Boosting Regressor,21.92,827.8775,28.7618,0.0551,1.3159,4.1719,1.008
et,Extra Trees Regressor,20.9606,840.3014,28.9786,0.0393,1.2751,3.8639,1.844
dummy,Dummy Regressor,22.6138,876.8078,29.5992,-0.0007,1.347,4.3514,0.579
br,Bayesian Ridge,22.6171,877.1316,29.6045,-0.001,1.3471,4.3523,0.483
omp,Orthogonal Matching Pursuit,22.6185,877.3339,29.6079,-0.0012,1.3472,4.3527,0.55
llar,Lasso Least Angle Regression,22.62,877.4265,29.6095,-0.0014,1.3472,4.3529,0.581


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,20.3279,753.5256,27.4464,0.1399,1.2517,3.7932,1.72
catboost,CatBoost Regressor,21.3604,799.5025,28.2727,0.0876,1.2858,3.967,1.442
lightgbm,Light Gradient Boosting Machine,21.5116,805.6375,28.3809,0.0806,1.2934,4.0449,0.728
xgboost,Extreme Gradient Boosting,21.1227,807.2471,28.4098,0.0786,1.2671,3.7808,0.66
gbr,Gradient Boosting Regressor,21.9654,831.3666,28.8288,0.0516,1.3177,4.1811,0.67
et,Extra Trees Regressor,21.2559,853.9439,29.2186,0.0253,1.2821,3.8839,1.27
dummy,Dummy Regressor,22.6129,876.7489,29.6049,-0.0002,1.3471,4.351,0.42
br,Bayesian Ridge,22.6146,876.9882,29.609,-0.0004,1.3472,4.3523,0.478
omp,Orthogonal Matching Pursuit,22.6149,877.136,29.6115,-0.0006,1.3472,4.3518,0.446
lasso,Lasso Regression,22.616,877.2347,29.6133,-0.0007,1.3472,4.3525,0.52


In [12]:
# Create the Random Forest Regressor model for the batsman dataset using the top-performing model's hyperparameters
batsman_rf_model = create_model('rf', fold=5)

# Save the trained Random Forest Regressor model model for the batsman dataset to a file
save_model(batsman_rf_model, 'batsman_Random_Forest_Regressor_model')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,22.3211,854.8935,29.2386,0.0498,1.3469,4.2705
1,22.0206,818.6301,28.6117,0.0482,1.3398,4.2616
2,22.1176,844.3365,29.0575,0.0578,1.3118,4.3255
3,21.9769,859.3222,29.3142,0.0514,1.2993,4.1408
4,21.3906,779.6506,27.9222,0.0506,1.2907,3.9072
Mean,21.9654,831.3666,28.8288,0.0516,1.3177,4.1811
Std,0.3109,29.4619,0.5147,0.0033,0.0221,0.1496


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\harsh\AppData\Local\Temp\joblib),
          steps=[('placeholder', None),
                 ('trained_model',
                  GradientBoostingRegressor(random_state=7776))]),
 'batsman_Gradient_Boosting_Regressor_model.pkl')

In [13]:
# Set up the PyCaret environment for the bowler dataset
# Make sure to replace 'target' with the name of the target column (the one you want to predict)
bowler_regression_setup = setup(bowler_df, target='d11', train_size=0.7, 
                                verbose=False, preprocess=False)

# Compare all models and print the results for the bowler dataset
best_bowler_model = compare_models(verbose = True)
compare_models(fold = 5,round = 4,sort = 'R2',turbo = True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,11.2709,164.3806,12.8205,0.0522,1.7371,0.3548,1.383
catboost,CatBoost Regressor,11.9225,167.5667,12.9439,0.0339,1.7797,0.3719,1.702
lightgbm,Light Gradient Boosting Machine,12.0358,167.7101,12.9497,0.033,1.7862,0.3746,0.85
gbr,Gradient Boosting Regressor,12.3504,169.5679,13.0215,0.0223,1.8041,0.3837,1.025
xgboost,Extreme Gradient Boosting,11.4657,170.8738,13.071,0.0147,1.7503,0.3626,0.783
ada,AdaBoost Regressor,12.65,173.3926,13.1674,0.0002,1.8102,0.4014,1.08
omp,Orthogonal Matching Pursuit,12.6101,173.6233,13.1763,-0.0011,1.8189,0.3911,0.532
br,Bayesian Ridge,12.6099,173.6337,13.1767,-0.0012,1.8189,0.3911,0.515
llar,Lasso Least Angle Regression,12.6097,173.6748,13.1782,-0.0014,1.8189,0.3911,0.538
dummy,Dummy Regressor,12.6149,173.6745,13.1782,-0.0014,1.8191,0.3913,0.581


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,11.4646,168.9304,12.997,0.0267,1.7487,0.3618,1.54
catboost,CatBoost Regressor,11.9504,168.948,12.9975,0.0266,1.7815,0.3726,1.664
lightgbm,Light Gradient Boosting Machine,12.0572,169.0498,13.0017,0.026,1.7896,0.3741,0.746
gbr,Gradient Boosting Regressor,12.359,169.8936,13.0342,0.0211,1.8051,0.3837,0.94
ada,AdaBoost Regressor,12.6676,173.5254,13.1727,0.0002,1.8079,0.4048,0.746
dummy,Dummy Regressor,12.6144,173.6637,13.1779,-0.0006,1.8193,0.3913,0.49
br,Bayesian Ridge,12.6112,173.6926,13.179,-0.0007,1.8192,0.3911,0.478
omp,Orthogonal Matching Pursuit,12.6124,173.7009,13.1794,-0.0008,1.8192,0.3912,0.484
lasso,Lasso Regression,12.6116,173.7598,13.1816,-0.0011,1.8192,0.3912,0.482
en,Elastic Net,12.6107,173.7611,13.1816,-0.0011,1.8192,0.3911,0.516


In [14]:
# Create the Random Forest Regressor model for the bowler dataset using the top-performing model's hyperparameters
bowler_rf_model = create_model('rf', fold=5)

# Save the trained Random Forest Regressor model for the bowler dataset to a file
save_model(bowler_rf_model, 'bowler_Random_Forest_Regressor_model')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,12.4101,172.0637,13.1173,0.0231,1.8462,0.3764
1,12.3034,168.9833,12.9994,0.015,1.807,0.3791
2,12.3897,170.5033,13.0577,0.019,1.7846,0.3897
3,12.4108,170.5834,13.0608,0.0254,1.8033,0.3891
4,12.2808,167.3344,12.9358,0.0231,1.7845,0.3845
Mean,12.359,169.8936,13.0342,0.0211,1.8051,0.3837
Std,0.0556,1.6084,0.0617,0.0037,0.0226,0.0053


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\harsh\AppData\Local\Temp\joblib),
          steps=[('placeholder', None),
                 ('trained_model',
                  GradientBoostingRegressor(random_state=2763))]),
 'bowler_Gradient_Boosting_Regressor_model.pkl')

In [15]:
# Split the datasets into train and test sets 
train_batsman_df, test_batsman_df = train_test_split(
    batsman_df, test_size=0.2, random_state=42
)
train_bowler_df, test_bowler_df = train_test_split(
    bowler_df, test_size=0.2, random_state=42
)

In [16]:
X_train_batsman = train_batsman_df.iloc[:,:-1]
y_train_batsman = train_batsman_df.iloc[:,-1:]

X_test_batsman = test_batsman_df.iloc[:,:-1]
y_test_batsman = test_batsman_df.iloc[:,-1:]

X_train_bowler = train_bowler_df.iloc[:,:-1]
y_train_bowler = train_bowler_df.iloc[:,-1:]

X_test_bowler = test_bowler_df.iloc[:,:-1]
y_test_bowler = test_bowler_df.iloc[:,-1:]

In [17]:
# Print X_train_batsman column names
print("X_train Batsman Columns:")
print(X_train_batsman.columns)

# Print y_train_batsman column names
print("y_train Batsman Columns:")
print(y_train_batsman.columns)

# Print X_train_bowler column names
print("X_train Bowler Columns:")
print(X_train_bowler.columns)

# Print y_train_bowler column names
print("y_train Bowler Columns:")
print(y_train_bowler.columns)

X_train Batsman Columns:
Index(['player', 'date', 'team2', 'venue', 'team1', 'MF'], dtype='object')
y_train Batsman Columns:
Index(['d11'], dtype='object')
X_train Bowler Columns:
Index(['player', 'date', 'team2', 'venue', 'team1', 'MF'], dtype='object')
y_train Bowler Columns:
Index(['d11'], dtype='object')


In [18]:
## Predicting for batsman
batsman_rf_model.fit(X_train_batsman, y_train_batsman)
pred = batsman_rf_model.predict(X_test_batsman)
r2 = r2_score(pred,y_test_batsman)
print('The r2 using gbr is:',r2)

The r2 using gbr is: -29.761309773177064


In [19]:
## Predicting for bowler
bowler_rf_model.fit(X_train_bowler, y_train_bowler)
pred = bowler_rf_model.predict(X_test_bowler)
r2 = r2_score(pred,y_test_bowler)
print('The r2 using gbr is:',r2)

The r2 using gbr is: -55.84587437927597
