In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from pycaret.regression import setup, compare_models, create_model, save_model

In [2]:
## Importing dataset

batsman_df = pd.read_csv(r"C:\Users\Harsh\Documents\dream11\ipl\Merged Batting CSV\batting.csv")
bowler_df = pd.read_csv(r"C:\Users\Harsh\Documents\dream11\ipl\Merged Bowling CSV\bowling.csv")

In [3]:
# Converting object data types to categorical variables

# get list of categorical columns
cat_cols = batsman_df.select_dtypes(include=['object']).columns.tolist()
le = LabelEncoder()
for col in cat_cols:
    le.fit(batsman_df[col])
    batsman_df[col] = le.transform(batsman_df[col])

In [4]:
# Converting object data types to categorical variables

# get list of categorical columns
cat_cols = bowler_df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    le.fit(bowler_df[col])
    bowler_df[col] = le.transform(bowler_df[col])

In [5]:
print(batsman_df.isna().sum())
print(bowler_df.isna().sum())

player        0
runs          0
balls         0
4s            0
6s            0
SR            0
bowler        0
fielders      0
kind          0
player_out    0
date          0
team2         0
winner        0
result        0
venue         0
team1         0
MF            0
30s           0
50s           0
100s          0
0s            0
d11           0
dtype: int64
player            0
overs             0
runs              0
maidens           0
wicket            0
econrate         11
date              0
team2             0
winner            0
result            0
venue             0
team1             0
MF                0
3_wicket_haul     0
4_wicket_haul     0
5_wicket_haul     0
d11               0
dtype: int64


In [6]:
# Replacing na values with mean 
bowler_df.fillna(bowler_df.mean(), inplace=True)
print(bowler_df.isna().sum())

player           0
overs            0
runs             0
maidens          0
wicket           0
econrate         0
date             0
team2            0
winner           0
result           0
venue            0
team1            0
MF               0
3_wicket_haul    0
4_wicket_haul    0
5_wicket_haul    0
d11              0
dtype: int64


In [7]:
# Checking whether any infinity value
print("For batsman:")
print(np.isinf(batsman_df).any())
print("For bowler:")
print(np.isinf(bowler_df).any())

For batsman:
player        False
runs          False
balls         False
4s            False
6s            False
SR            False
bowler        False
fielders      False
kind          False
player_out    False
date          False
team2         False
winner        False
result        False
venue         False
team1         False
MF            False
30s           False
50s           False
100s          False
0s            False
d11           False
dtype: bool
For bowler:
player           False
overs            False
runs             False
maidens          False
wicket           False
econrate          True
date             False
team2            False
winner           False
result           False
venue            False
team1            False
MF               False
3_wicket_haul    False
4_wicket_haul    False
5_wicket_haul    False
d11              False
dtype: bool


In [8]:
# replace infinity values in the 'column_name' column with the mean of the column
column_mean = bowler_df['econrate'].replace([np.inf, -np.inf], np.nan).mean()
bowler_df['econrate'].replace([np.inf, -np.inf], column_mean, inplace=True)
print(np.isinf(bowler_df).any())

player           False
overs            False
runs             False
maidens          False
wicket           False
econrate         False
date             False
team2            False
winner           False
result           False
venue            False
team1            False
MF               False
3_wicket_haul    False
4_wicket_haul    False
5_wicket_haul    False
d11              False
dtype: bool


In [9]:
num_unique_venues = len(batsman_df['venue'].unique())
print(f"The 'venue' column has {num_unique_venues} unique values.")
venue_grouped_batsman = batsman_df.groupby(['player','venue'])['d11'].mean()
venue_grouped_batsman

The 'venue' column has 49 unique values.


player  venue
0       9        24.5
        11        8.0
        13       28.0
        19       30.5
        22       32.5
                 ... 
604     22       27.0
        29        4.0
        33        0.5
        35        2.0
        38       -1.0
Name: d11, Length: 5465, dtype: float64

In [10]:
num_unique_venues = len(bowler_df['venue'].unique())
print(f"The 'venue' column has {num_unique_venues} unique values.")
venue_grouped_bowler = bowler_df.groupby(['player','venue'])['d11'].mean()
venue_grouped_bowler

The 'venue' column has 49 unique values.


player  venue
0       2        25.000
        9        12.500
        11       25.000
        19       12.500
        22       12.500
                  ...  
471     40        0.000
        41       25.000
        42       25.000
        43       25.000
        46       16.625
Name: d11, Length: 4379, dtype: float64

In [11]:
venue_grouped_batsman = pd.DataFrame(venue_grouped_batsman)
venue_grouped_bowler = pd.DataFrame(venue_grouped_bowler)

In [12]:
venue_grouped_batsman.reset_index(inplace = True)
venue_grouped_batsman

Unnamed: 0,player,venue,d11
0,0,9,24.5
1,0,11,8.0
2,0,13,28.0
3,0,19,30.5
4,0,22,32.5
...,...,...,...
5460,604,22,27.0
5461,604,29,4.0
5462,604,33,0.5
5463,604,35,2.0


In [13]:
venue_grouped_bowler.reset_index(inplace = True)
venue_grouped_bowler

Unnamed: 0,player,venue,d11
0,0,2,25.000
1,0,9,12.500
2,0,11,25.000
3,0,19,12.500
4,0,22,12.500
...,...,...,...
4374,471,40,0.000
4375,471,41,25.000
4376,471,42,25.000
4377,471,43,25.000


### PyCaret: A Low-Code Machine Learning Library in Python

PyCaret is a powerful open-source, low-code machine learning library in Python that streamlines the entire machine learning workflow. With PyCaret, you can automate various aspects of machine learning experiments, saving you time and making you more productive.

#### Key Features and Benefits

- End-to-End Machine Learning: PyCaret provides a seamless end-to-end machine learning experience, handling data preprocessing, feature engineering, model training, hyperparameter tuning, and model evaluation.

- Rapid Experimentation: Replace hundreds of lines of code with just a few words! PyCaret enables you to quickly iterate through different models and configurations, making experiments exponentially fast and efficient.

- Integration with Popular Libraries: PyCaret acts as a convenient Python wrapper around various well-known machine learning libraries and frameworks, including scikit-learn, XGBoost, LightGBM, CatBoost, Optuna, Hyperopt, Ray, and more.

#### How PyCaret Works

1. **Import the Library**: Start by importing the PyCaret library into your Python environment.

2. **Load and Preprocess Data**: Load your dataset and perform necessary data preprocessing using PyCaret's simple interface.

3. **Setup the Experiment**: With just one line of code, set up your machine learning experiment, specifying the target variable and any additional configuration settings.

4. **Compare Models**: PyCaret lets you quickly compare multiple models with minimal effort, so you can identify the best-performing ones.

5. **Tune Hyperparameters**: Utilize PyCaret's built-in hyperparameter tuning capabilities to fine-tune your models and improve their performance.

6. **Evaluate and Analyze Results**: PyCaret provides comprehensive model evaluation metrics and visualizations, making it easy to analyze the results of your experiments.

7. **Finalize and Deploy Models**: Once you're satisfied with a model, you can finalize it for production and deploy it with ease.

PyCaret is an excellent tool for both beginners and experienced data scientists, empowering them to build high-quality machine learning models efficiently.

Give PyCaret a try and witness the speed and productivity it brings to your machine learning projects!

For more information, visit the [PyCaret GitHub repository](https://github.com/pycaret/pycaret).


In [14]:
# Set up the PyCaret environment for the batsman dataset
# Make sure to replace 'target' with the name of the target column (the one you want to predict)
batsman_regression_setup = setup(
    data=venue_grouped_batsman, target='d11',train_size=0.7, verbose=False, preprocess=False
)
# Compare all models and print the results for the batsman dataset
best_batsman_model = compare_models(fold = 5, round = 4, sort = 'R2', turbo = True, verbose=True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,15.3755,420.9202,20.4987,0.1051,1.0892,2.7488,1.417
gbr,Gradient Boosting Regressor,15.6534,428.9732,20.7009,0.0879,1.1125,2.9203,0.763
lightgbm,Light Gradient Boosting Machine,15.6395,435.4154,20.8572,0.0733,1.0988,2.7988,0.878
xgboost,Extreme Gradient Boosting,15.5539,442.3036,21.0151,0.0586,1.0736,2.5591,0.812
lasso,Lasso Regression,16.624,471.1155,21.6943,-0.0015,1.1581,3.1447,0.878
ridge,Ridge Regression,16.6249,471.1157,21.6943,-0.0015,1.1581,3.1445,0.898
en,Elastic Net,16.6244,471.1133,21.6942,-0.0015,1.1581,3.1446,0.966
lar,Least Angle Regression,16.6249,471.1157,21.6943,-0.0015,1.1581,3.1445,0.937
llar,Lasso Least Angle Regression,16.624,471.1155,21.6943,-0.0015,1.1581,3.1447,0.964
lr,Linear Regression,16.6249,471.1157,21.6943,-0.0015,1.1581,3.1445,1.547


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,15.4423,420.7987,20.5072,0.1065,1.0927,2.7637,1.462
gbr,Gradient Boosting Regressor,15.7083,431.0744,20.7585,0.0843,1.1164,2.941,0.842
lightgbm,Light Gradient Boosting Machine,15.7092,437.0106,20.8995,0.0719,1.101,2.7951,1.066
xgboost,Extreme Gradient Boosting,15.6844,447.7629,21.1543,0.0491,1.0858,2.6101,1.078
dummy,Dummy Regressor,16.6129,471.2513,21.7047,-0.0011,1.1589,3.1478,0.922
omp,Orthogonal Matching Pursuit,16.6259,471.549,21.7116,-0.0017,1.1591,3.1477,0.628
en,Elastic Net,16.6325,471.6373,21.7137,-0.0019,1.1587,3.1458,0.718
lar,Least Angle Regression,16.6331,471.6418,21.7138,-0.0019,1.1586,3.1457,0.628
llar,Lasso Least Angle Regression,16.632,471.6397,21.7137,-0.0019,1.1587,3.1459,0.814
ridge,Ridge Regression,16.6331,471.6418,21.7138,-0.0019,1.1586,3.1457,0.71


<catboost.core.CatBoostRegressor at 0x1f5f2e4fbb0>

In [15]:
# Create the Catboost Regressor model for the batsman dataset using the top-performing model's hyperparameters
batsman_catboost_model = create_model('catboost', fold=5)

# Save the trained Catboost Regressor model for the batsman dataset to a file
save_model(batsman_catboost_model, 'batsman_Catboost_Regressor_model')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15.8975,446.898,21.14,0.0804,1.1281,2.7689
1,15.7521,421.0243,20.5189,0.1142,1.0861,2.7384
2,15.2367,402.9362,20.0733,0.1154,1.0999,2.9449
3,15.0678,393.2204,19.8298,0.118,1.1211,3.0529
4,15.2573,439.9148,20.9741,0.1043,1.0285,2.3136
Mean,15.4423,420.7987,20.5072,0.1065,1.0927,2.7637
Std,0.3225,20.6218,0.5031,0.0139,0.0354,0.2529


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\harsh\AppData\Local\Temp\joblib),
          steps=[('placeholder', None),
                 ('trained_model',
                  <catboost.core.CatBoostRegressor object at 0x000001F5F379F910>)]),
 'batsman_Catboost_Regressor_model.pkl')

In [16]:
# Set up the PyCaret environment for the batsman dataset
# Make sure to replace 'target' with the name of the target column (the one you want to predict)
bowler_regression_setup = setup(
    data=venue_grouped_bowler, target='d11', train_size=0.7, verbose=False, preprocess=False
)
# Compare all models and print the results for the batsman dataset
best_bowler_model = compare_models(fold = 5, round = 4, sort = 'R2', turbo = True, verbose=True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,9.5685,119.2744,10.9124,-0.0026,1.4928,0.3724,0.752
omp,Orthogonal Matching Pursuit,9.6736,119.6986,10.9334,-0.0065,1.5012,0.3751,0.629
br,Bayesian Ridge,9.6746,119.7099,10.9341,-0.0067,1.5013,0.3751,0.576
lasso,Lasso Regression,9.6748,119.7267,10.9347,-0.0068,1.5013,0.3752,0.805
llar,Lasso Least Angle Regression,9.6748,119.7267,10.9347,-0.0068,1.5013,0.3752,0.642
en,Elastic Net,9.6757,119.7495,10.9357,-0.0069,1.5013,0.3752,0.634
ridge,Ridge Regression,9.677,119.7841,10.9373,-0.0072,1.5014,0.3753,0.72
lar,Least Angle Regression,9.677,119.7841,10.9373,-0.0072,1.5014,0.3753,0.648
lr,Linear Regression,9.677,119.7841,10.9373,-0.0072,1.5014,0.3753,1.089
dummy,Dummy Regressor,9.6758,119.7823,10.9377,-0.0074,1.5016,0.375,0.603


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,9.5536,119.3707,10.9187,0.0015,1.4943,0.372,0.672
ada,AdaBoost Regressor,9.6786,119.6489,10.9325,-0.0012,1.4927,0.3773,0.738
omp,Orthogonal Matching Pursuit,9.6728,119.6773,10.9339,-0.0014,1.5032,0.3751,0.58
br,Bayesian Ridge,9.6737,119.6842,10.9344,-0.0015,1.5033,0.3751,0.548
dummy,Dummy Regressor,9.6727,119.6994,10.9353,-0.0017,1.5034,0.3749,0.574
llar,Lasso Least Angle Regression,9.6747,119.7293,10.9363,-0.0018,1.5034,0.3752,0.602
lasso,Lasso Regression,9.6747,119.7293,10.9363,-0.0018,1.5034,0.3752,0.638
en,Elastic Net,9.6756,119.7607,10.9377,-0.0021,1.5035,0.3752,0.546
lr,Linear Regression,9.6768,119.7967,10.9394,-0.0024,1.5036,0.3752,0.562
lar,Least Angle Regression,9.6768,119.7967,10.9394,-0.0024,1.5036,0.3752,0.57


In [17]:
# Create the Gradient Boosting Regressor model for the bowler dataset using the top-performing model's hyperparameters
bowler_gbr_model = create_model('gbr', fold=5)

# Save the trained Gradient Boosting Regressor model for the bowler dataset to a file
save_model(bowler_gbr_model, 'bowler_Gradient_Boosting_Regressor_model')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,9.7505,121.9103,11.0413,0.0174,1.5218,0.3859
1,9.9508,128.5907,11.3398,-0.0211,1.5936,0.3683
2,9.4116,118.6515,10.8927,-0.011,1.4851,0.3562
3,9.8853,123.8007,11.1266,0.0025,1.5081,0.3795
4,8.77,103.9,10.1931,0.0197,1.3629,0.3701
Mean,9.5536,119.3707,10.9187,0.0015,1.4943,0.372
Std,0.4338,8.3779,0.3906,0.0158,0.0751,0.0102


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\harsh\AppData\Local\Temp\joblib),
          steps=[('placeholder', None),
                 ('trained_model',
                  GradientBoostingRegressor(random_state=1800))]),
 'bowler_Gradient_Boosting_Regressor_model.pkl')

In [18]:
# Split the datasets into train and test sets
train_batsman_df, test_batsman_df = train_test_split(
    venue_grouped_batsman, test_size=0.2, random_state=42
)
train_bowler_df, test_bowler_df = train_test_split(
    venue_grouped_bowler, test_size=0.2, random_state=42
)

In [19]:
X_train_batsman, y_train_batsman = train_batsman_df.iloc[:,:-1], train_batsman_df.iloc[:,-1:]

X_test_batsman, y_test_batsman = test_batsman_df.iloc[:,:-1], test_batsman_df.iloc[:,-1:]

X_train_bowler, y_train_bowler = train_bowler_df.iloc[:,:-1], train_bowler_df.iloc[:,-1:]

X_test_bowler, y_test_bowler = test_bowler_df.iloc[:,:-1], test_bowler_df.iloc[:,-1:]

In [20]:
# Print X_train_batsman column names
print("X_train Batsman Columns:")
print(X_train_batsman.columns)

# Print y_train_batsman column names
print("y_train Batsman Columns:")
print(y_train_batsman.columns)

# Print X_train_bowler column names
print("X_train Bowler Columns:")
print(X_train_bowler.columns)

# Print y_train_bowler column names
print("y_train Bowler Columns:")
print(y_train_bowler.columns)

X_train Batsman Columns:
Index(['player', 'venue'], dtype='object')
y_train Batsman Columns:
Index(['d11'], dtype='object')
X_train Bowler Columns:
Index(['player', 'venue'], dtype='object')
y_train Bowler Columns:
Index(['d11'], dtype='object')


In [21]:
## Predicting for batsman
batsman_catboost_model.fit(X_train_batsman, y_train_batsman)
pred = batsman_catboost_model.predict(X_test_batsman)
r2 = r2_score(pred,y_test_batsman)
print('The r2 using gbr is:',r2)

The r2 using gbr is: -4.6730677515907635


In [22]:
## Predicting for bowler
bowler_gbr_model.fit(X_train_bowler, y_train_bowler)
pred = bowler_gbr_model.predict(X_test_bowler)
r2 = r2_score(pred,y_test_bowler)
print('The r2 using gbr is:',r2)

The r2 using gbr is: -31.554075936715144
