In [1]:
## Importing libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from pycaret.regression import setup, compare_models, create_model, save_model, load_model
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import learning_curve, train_test_split

In [2]:
# Importing dataset

batsman_df = pd.read_csv(r"C:\Users\harsh\Documents\dream11\ipl\Merged Batting CSV\batting.csv")
bowler_df = pd.read_csv(r"C:\Users\Harsh\Documents\dream11\ipl\Merged Bowling CSV\bowling.csv")

In [3]:
# print(batsman_df.dtypes)
# print(bowler_df.dtypes)

In [4]:
# get list of categorical columns
cat_cols = batsman_df.select_dtypes(include=['object']).columns.tolist()

# Converting object data types to categorical variables
le = LabelEncoder()
for col in cat_cols:
    le.fit(batsman_df[col])
    batsman_df[col] = le.transform(batsman_df[col])

In [5]:
cat_cols = bowler_df.select_dtypes(include=['object']).columns.tolist()
cat_cols
for col in cat_cols:
    le.fit(bowler_df[col])
    bowler_df[col] = le.transform(bowler_df[col])

In [6]:
# Checking whether any na values and replacing it with mean 
# print(bowler_df.isna().sum())
bowler_df.fillna(bowler_df.mean(), inplace=True)
# print(bowler_df.isna().sum())
# Checking whether any infinity value
# print(np.isinf(bowler_df).any())

In [7]:
# replace infinity values in the 'column_name' column with the mean of the column
column_mean = bowler_df['econrate'].replace([np.inf, -np.inf], np.nan).mean()
bowler_df['econrate'].replace([np.inf, -np.inf], column_mean, inplace=True)
print(np.isinf(bowler_df).any())

player           False
overs            False
runs             False
maidens          False
wicket           False
econrate         False
date             False
team2            False
winner           False
result           False
venue            False
team1            False
MF               False
3_wicket_haul    False
4_wicket_haul    False
5_wicket_haul    False
d11              False
dtype: bool


### PyCaret: A Low-Code Machine Learning Library in Python

PyCaret is a powerful open-source, low-code machine learning library in Python that streamlines the entire machine learning workflow. With PyCaret, you can automate various aspects of machine learning experiments, saving you time and making you more productive.

#### Key Features and Benefits

- End-to-End Machine Learning: PyCaret provides a seamless end-to-end machine learning experience, handling data preprocessing, feature engineering, model training, hyperparameter tuning, and model evaluation.

- Rapid Experimentation: Replace hundreds of lines of code with just a few words! PyCaret enables you to quickly iterate through different models and configurations, making experiments exponentially fast and efficient.

- Integration with Popular Libraries: PyCaret acts as a convenient Python wrapper around various well-known machine learning libraries and frameworks, including scikit-learn, XGBoost, LightGBM, CatBoost, Optuna, Hyperopt, Ray, and more.

#### How PyCaret Works

1. **Import the Library**: Start by importing the PyCaret library into your Python environment.

2. **Load and Preprocess Data**: Load your dataset and perform necessary data preprocessing using PyCaret's simple interface.

3. **Setup the Experiment**: With just one line of code, set up your machine learning experiment, specifying the target variable and any additional configuration settings.

4. **Compare Models**: PyCaret lets you quickly compare multiple models with minimal effort, so you can identify the best-performing ones.

5. **Tune Hyperparameters**: Utilize PyCaret's built-in hyperparameter tuning capabilities to fine-tune your models and improve their performance.

6. **Evaluate and Analyze Results**: PyCaret provides comprehensive model evaluation metrics and visualizations, making it easy to analyze the results of your experiments.

7. **Finalize and Deploy Models**: Once you're satisfied with a model, you can finalize it for production and deploy it with ease.

PyCaret is an excellent tool for both beginners and experienced data scientists, empowering them to build high-quality machine learning models efficiently.

Give PyCaret a try and witness the speed and productivity it brings to your machine learning projects!

For more information, visit the [PyCaret GitHub repository](https://github.com/pycaret/pycaret).


In [8]:
# This code is using the PyCaret library to perform regression modeling on a batsman dataset.

# The setup() function is used to prepare the data for modeling. 
# The batsman_df dataset is passed to the function, and the target parameter is set to 'd11', 
# which is the name of the column that contains the target variable to be predicted. 
# The train_size parameter is set to 0.1, which means that 10% of the data will be used for training 
# and the remaining 90% will be used for testing. The preprocess parameter is set to False, 
# which means that PyCaret will not perform any preprocessing on the data.

batsman_regression_setup = setup(
    batsman_df, target='d11', train_size=0.1, verbose=False, preprocess=False
)

# The compare_models() function is used to train and evaluate all available regression models in PyCaret
# on the batsman dataset. The best-performing model is automatically selected based on the metric specified
# in the sort parameter, which is set to the R-squared ('R2') in this case. The verbose parameter is set to
# True to print the results of each model.

best_batsman_model = compare_models(verbose=True)
compare_models(fold = 5,round = 4,sort = 'R2',turbo = True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,0.9926,1.8293,1.3481,0.9979,0.2164,0.1822,1.932
br,Bayesian Ridge,0.9958,1.8309,1.349,0.9979,0.216,0.1819,0.829
ridge,Ridge Regression,1.0085,1.8468,1.3557,0.9978,0.2149,0.1812,1.006
lar,Least Angle Regression,1.1111,2.18,1.4671,0.9975,0.226,0.2028,1.156
xgboost,Extreme Gradient Boosting,0.5571,3.3841,1.4277,0.9967,0.0411,0.0256,1.038
gbr,Gradient Boosting Regressor,0.6379,3.4399,1.3797,0.9967,0.0809,0.0553,1.013
catboost,CatBoost Regressor,0.4408,3.9713,1.4347,0.9962,0.0446,0.0278,2.977
et,Extra Trees Regressor,0.6225,4.0261,1.5959,0.996,0.0352,0.0201,1.096
huber,Huber Regressor,1.5556,3.8073,1.9501,0.9956,0.3412,0.3193,0.903
en,Elastic Net,1.6078,3.936,1.9815,0.9954,0.3551,0.3332,0.996


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,0.9907,1.8188,1.3469,0.9979,0.2174,0.1816,1.232
ridge,Ridge Regression,1.0082,1.8415,1.3559,0.9979,0.2157,0.1806,0.738
br,Bayesian Ridge,0.9942,1.8212,1.3479,0.9979,0.2169,0.1812,0.546
gbr,Gradient Boosting Regressor,0.6442,3.328,1.5267,0.9964,0.0851,0.0577,0.748
et,Extra Trees Regressor,0.6332,3.7144,1.6243,0.996,0.0371,0.0223,0.816
xgboost,Extreme Gradient Boosting,0.6486,3.7861,1.7099,0.9959,0.0438,0.0295,0.92
huber,Huber Regressor,1.5571,3.8384,1.958,0.9956,0.3331,0.3153,0.64
en,Elastic Net,1.6073,3.9333,1.9815,0.9955,0.3531,0.3333,0.558
lasso,Lasso Regression,1.6203,4.0014,1.9986,0.9954,0.3481,0.3337,0.966
llar,Lasso Least Angle Regression,1.6191,3.996,1.9973,0.9954,0.3491,0.3335,0.666


In [9]:
# After the best-performing model is identified, the create_model() function is used to create a model of the
# same type as the best-performing model. In this case, the 'lr' model type, which stands for 
# Linear Regression, is selected. The fold parameter is set to 5 to perform 5-fold cross-validation
# during training.
batsman_lr_model = create_model('lr', fold=5)

# Finally, the save_model() function is used to save the trained model to a file named 
# 'batsman_Linear_Regression_model'. This file can be loaded later to make predictions on new data.
save_model(batsman_lr_model, 'batsman_Linear_Regression_model')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.0032,1.6692,1.292,0.9983,0.2269,0.1599
1,1.0075,1.8233,1.3503,0.9978,0.2396,0.1833
2,0.981,2.1597,1.4696,0.9972,0.1816,0.1913
3,0.971,1.6156,1.2711,0.9981,0.2262,0.2008
4,0.9909,1.8263,1.3514,0.9981,0.2126,0.1726
Mean,0.9907,1.8188,1.3469,0.9979,0.2174,0.1816
Std,0.0136,0.1897,0.0691,0.0004,0.0198,0.0143


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\harsh\AppData\Local\Temp\joblib),
          steps=[('placeholder', None),
                 ('trained_model', LinearRegression(n_jobs=-1))]),
 'batsman_Linear_Regression_model.pkl')

In [10]:
# This code is using the PyCaret library to perform regression modeling on a batsman dataset.

# The setup() function is used to prepare the data for modeling. 
# The bowler_df dataset is passed to the function, and the target parameter is set to 'd11', 
# which is the name of the column that contains the target variable to be predicted. 
# The train_size parameter is set to 0.1, which means that 10% of the data will be used for training 
# and the remaining 90% will be used for testing. The preprocess parameter is set to False, 
# which means that PyCaret will not perform any preprocessing on the data.

bowler_regression_setup = setup(
    bowler_df, target='d11', train_size=0.1, verbose=False, preprocess=False
)

# The compare_models() function is used to train and evaluate all available regression models in PyCaret
# on the bowler dataset. The best-performing model is automatically selected based on the metric specified
# in the sort parameter, which is set to the R-squared ('R2') in this case. The verbose parameter is set to
# True to print the results of each model.

best_bowler_model = compare_models(verbose=True)
compare_models(fold = 5,round = 4,sort = 'R2',turbo = True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.0028,0.0013,0.0222,1.0,0.0008,0.0001,0.76
xgboost,Extreme Gradient Boosting,0.0053,0.028,0.062,0.9998,0.0015,0.0002,0.921
et,Extra Trees Regressor,0.0155,0.1113,0.1518,0.9994,0.0094,0.0006,0.818
rf,Random Forest Regressor,0.0183,0.1123,0.1591,0.9994,0.0059,0.0009,0.921
dt,Decision Tree Regressor,0.0117,0.1168,0.1081,0.9993,0.0026,0.0004,0.61
lightgbm,Light Gradient Boosting Machine,0.1806,0.5132,0.6809,0.997,0.1164,0.0096,0.954
catboost,CatBoost Regressor,0.1339,0.6362,0.5093,0.9964,0.0803,0.0051,1.443
ada,AdaBoost Regressor,0.3896,0.7857,0.8278,0.9954,0.0256,0.0233,0.719
lr,Linear Regression,4.8995,30.5656,5.5263,0.8194,0.8993,0.2445,0.622
br,Bayesian Ridge,4.9136,30.575,5.5272,0.8194,0.9066,0.244,0.645


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.0143,0.0747,0.1338,0.9996,0.0032,0.0005,0.772
xgboost,Extreme Gradient Boosting,0.0148,0.1228,0.1636,0.9993,0.0037,0.0005,0.756
et,Extra Trees Regressor,0.0242,0.2037,0.2364,0.9989,0.0079,0.0009,0.708
rf,Random Forest Regressor,0.0292,0.2012,0.2827,0.9989,0.0115,0.0016,0.952
dt,Decision Tree Regressor,0.0234,0.2813,0.2372,0.9985,0.0056,0.0008,0.632
lightgbm,Light Gradient Boosting Machine,0.1917,0.4202,0.5997,0.9976,0.091,0.0092,0.968
ada,AdaBoost Regressor,0.5028,0.8261,0.9056,0.9952,0.029,0.0302,0.612
catboost,CatBoost Regressor,0.1574,1.0467,0.6711,0.9942,0.0856,0.0059,1.562
lr,Linear Regression,4.8962,30.5281,5.5238,0.8204,0.9002,0.2441,0.694
br,Bayesian Ridge,4.9141,30.5474,5.5256,0.8203,0.9084,0.2436,0.636


In [11]:
# Create the Gradient Boosting Regressor model for the bowler dataset using the top-performing model's hyperparameters
bowler_gbr_model = create_model('gbr', fold=5)

# Save the trained Gradient Boosting Regressor model for the bowler dataset to a file
save_model(bowler_gbr_model, 'bowler_Gradient_Boosting_Regressor_model')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0006,0.0,0.002,1.0,0.0004,0.0
1,0.0008,0.0,0.0037,1.0,0.0004,0.0
2,0.001,0.0001,0.0086,1.0,0.0005,0.0
3,0.0649,0.3714,0.6094,0.998,0.0134,0.0024
4,0.0042,0.002,0.045,1.0,0.0014,0.0002
Mean,0.0143,0.0747,0.1338,0.9996,0.0032,0.0005
Std,0.0253,0.1483,0.2383,0.0008,0.0051,0.0009


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\harsh\AppData\Local\Temp\joblib),
          steps=[('placeholder', None),
                 ('trained_model',
                  GradientBoostingRegressor(random_state=4015))]),
 'bowler_Gradient_Boosting_Regressor_model.pkl')

In [12]:
# Split the datasets into train and test sets
train_batsman_df, test_batsman_df = train_test_split(
    batsman_df, test_size=0.2, random_state=42
)
train_bowler_df, test_bowler_df = train_test_split(
    bowler_df, test_size=0.2, random_state=42
)

In [13]:
X_train_batsman = train_batsman_df.iloc[:,:-1]
y_train_batsman = train_batsman_df.iloc[:,-1:]

X_test_batsman = test_batsman_df.iloc[:,:-1]
y_test_batsman = test_batsman_df.iloc[:,-1:]

X_train_bowler = train_bowler_df.iloc[:,:-1]
y_train_bowler = train_bowler_df.iloc[:,-1:]

X_test_bowler = test_bowler_df.iloc[:,:-1]
y_test_bowler = test_bowler_df.iloc[:,-1:]

In [14]:
# Print X_train_batsman column names
print("X_train Batsman Columns:")
print(X_train_batsman.columns)

# Print y_train_batsman column names
print("y_train Batsman Columns:")
print(y_train_batsman.columns)

# Print X_train_bowler column names
print("X_train Bowler Columns:")
print(X_train_bowler.columns)

# Print y_train_bowler column names
print("y_train Bowler Columns:")
print(y_train_bowler.columns)

X_train Batsman Columns:
Index(['player', 'runs', 'balls', '4s', '6s', 'SR', 'bowler', 'fielders',
       'kind', 'player_out', 'date', 'team2', 'winner', 'result', 'venue',
       'team1', 'MF', '30s', '50s', '100s', '0s'],
      dtype='object')
y_train Batsman Columns:
Index(['d11'], dtype='object')
X_train Bowler Columns:
Index(['player', 'overs', 'runs', 'maidens', 'wicket', 'econrate', 'date',
       'team2', 'winner', 'result', 'venue', 'team1', 'MF', '3_wicket_haul',
       '4_wicket_haul', '5_wicket_haul'],
      dtype='object')
y_train Bowler Columns:
Index(['d11'], dtype='object')


In [16]:
## Predicting for batsman
batsman_lr_model.fit(X_train_batsman, y_train_batsman)
pred = batsman_lr_model.predict(X_test_batsman)
r2 = r2_score(pred,y_test_batsman)
print('The r2 using gbr is:',r2)

The r2 using gbr is: 0.9978683960678743


In [17]:
## Predicting for bowler
bowler_gbr_model.fit(X_train_bowler, y_train_bowler)
pred = bowler_gbr_model.predict(X_test_bowler)
r2 = r2_score(pred,y_test_bowler)
print('The r2 using gbr is:',r2)

The r2 using gbr is: 0.9999999797046697
