<a href="https://colab.research.google.com/github/Kamran-imaz/IPL_Score_Prediction/blob/main/IPL_SCORE_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6d1be6b9f9b119c82f24797381aa78495d623bf31359faa1db6ef312998c9a83
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('IPL_SCORE').getOrCreate()

In [5]:
data_frame = spark.read.csv('/content/dataset/deliveries.csv',header = True, inferSchema=True )


In [6]:
#length of the dataset
data_frame.count()

40750

In [7]:
data_frame.show()

+--------+------+--------------------+--------------------+----+----+-----------+---------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|match_id|inning|        batting_team|        bowling_team|over|ball|     batter|   bowler|non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|fielder|
+--------+------+--------------------+--------------------+----+----+-----------+---------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   1| SC Ganguly|  P Kumar|BB McCullum|           0|         1|         1|    legbyes|        0|              NA|            NA|     NA|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   2|BB McCullum|  P Kumar| SC Ganguly|           0|         0|         0|       NULL|        0|              NA|            NA|    

In [8]:
# checking wether is_wicket is having value other than 0 or not?
use_of_iswicket = data_frame.filter(data_frame['is_wicket'] != 0).count()
use_of_iswicket

2061

In [9]:
# checking the usage of player_dismissed,dismissal_kind and fielder
data_frame.filter(data_frame['player_dismissed'] != 'NA').show()
data_frame.filter(data_frame['dismissal_kind'] != 'NA').show()
data_frame.filter(data_frame['fielder']!='NA').show()

# from these we can say that the the 3 columns are not valuable for us our target is to predict the score.
# hence we remove these 3 columns.

+--------+------+--------------------+--------------------+----+----+----------+----------+-------------+------------+----------+----------+-----------+---------+----------------+--------------+---------------+
|match_id|inning|        batting_team|        bowling_team|over|ball|    batter|    bowler|  non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|        fielder|
+--------+------+--------------------+--------------------+----+----+----------+----------+-------------+------------+----------+----------+-----------+---------+----------------+--------------+---------------+
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   5|   2|SC Ganguly|    Z Khan|  BB McCullum|           0|         0|         0|       NULL|        1|      SC Ganguly|        caught|      JH Kallis|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|  12|   1|RT Ponting| JH Kallis|  BB McCullum|           0|         0|         0|       NULL|    

In [10]:
# droping the columns.
data_frame = data_frame.drop('player_dismissed','dismissal_kind','fielder')

In [11]:
# further we can see that there is no need of 'extra_type, batter, bowler,non_striker, batsman_runs' column also
data_frame = data_frame.drop('extras_type', 'batter', 'bowler','non_striker','batsman_runs')
data_frame.show()

+--------+------+--------------------+--------------------+----+----+----------+----------+---------+
|match_id|inning|        batting_team|        bowling_team|over|ball|extra_runs|total_runs|is_wicket|
+--------+------+--------------------+--------------------+----+----+----------+----------+---------+
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   1|         1|         1|        0|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   2|         0|         0|        0|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   3|         1|         1|        0|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   4|         0|         0|        0|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   5|         0|         0|        0|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   6|         0|         0|        0|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   7|         1|  

In [12]:
# here we need to check wether the columns contains null values or not?
# data_frame.filter(data_frame['batting_team'].isNull()).count() ---- No NULL Value found
# data_frame.filter(data_frame['bowling_team'].isNull()).count() ---- No NULL Value found

In [13]:
# since we removed the unwanted columns by manually but that is not enough since there might be lots of hard
# to noticeable columns so we need to apply principal component analysis here.

# since the vector assembler cannot convert the string directly so we need to convert it into numerical value using string indexer.
from pyspark.ml.feature import StringIndexer
indexers = [
    StringIndexer(inputCol='batting_team', outputCol='batting_team_index'),
    StringIndexer(inputCol='bowling_team', outputCol='bowling_team_index')
]
for indexer in indexers:
    data_frame = indexer.fit(data_frame).transform(data_frame)


In [21]:
inputs = ['match_id','extra_runs','batting_team_index', 'bowling_team_index', 'inning', 'over', 'ball', 'is_wicket']
from pyspark.ml.feature import PCA, VectorAssembler
assembler = VectorAssembler(inputCols=inputs,outputCol='features')
new_df = assembler.transform(data_frame)
new_df = new_df.drop('batting_team','bowling_team')
new_df.show()

+--------+------+----+----+----------+----------+---------+------------------+------------------+--------------------+
|match_id|inning|over|ball|extra_runs|total_runs|is_wicket|batting_team_index|bowling_team_index|            features|
+--------+------+----+----+----------+----------+---------+------------------+------------------+--------------------+
|  335982|     1|   0|   1|         1|         1|        0|               7.0|               0.0|[335982.0,1.0,7.0...|
|  335982|     1|   0|   2|         0|         0|        0|               7.0|               0.0|(8,[0,2,4,6],[335...|
|  335982|     1|   0|   3|         1|         1|        0|               7.0|               0.0|[335982.0,1.0,7.0...|
|  335982|     1|   0|   4|         0|         0|        0|               7.0|               0.0|(8,[0,2,4,6],[335...|
|  335982|     1|   0|   5|         0|         0|        0|               7.0|               0.0|(8,[0,2,4,6],[335...|
|  335982|     1|   0|   6|         0|         0

In [22]:
#since we have already imported PCA
pca = PCA(inputCol='features',outputCol='pca_features',k=8)
model = pca.fit(new_df)
new_df = model.transform(new_df)

In [23]:
new_df.show()

+--------+------+----+----+----------+----------+---------+------------------+------------------+--------------------+--------------------+
|match_id|inning|over|ball|extra_runs|total_runs|is_wicket|batting_team_index|bowling_team_index|            features|        pca_features|
+--------+------+----+----+----------+----------+---------+------------------+------------------+--------------------+--------------------+
|  335982|     1|   0|   1|         1|         1|        0|               7.0|               0.0|[335982.0,1.0,7.0...|[-335982.00000737...|
|  335982|     1|   0|   2|         0|         0|        0|               7.0|               0.0|(8,[0,2,4,6],[335...|[-335982.00000739...|
|  335982|     1|   0|   3|         1|         1|        0|               7.0|               0.0|[335982.0,1.0,7.0...|[-335982.00000723...|
|  335982|     1|   0|   4|         0|         0|        0|               7.0|               0.0|(8,[0,2,4,6],[335...|[-335982.00000726...|
|  335982|     1|   

In [24]:
loadings = model.pc.toArray()

In [25]:
# taken from chatgpt reference.
original_columns = ['match_id','extra_runs','batting_team_index', 'bowling_team_index', 'inning', 'over', 'ball', 'is_wicket']

# PCA loadings
for i, pc in enumerate(loadings):
    print(f"Principal Component {i+1} Loadings:")
    for j, loading in enumerate(pc):
        print(f"  {original_columns[j]}: {loading}")


Principal Component 1 Loadings:
  match_id: -0.9999999999952109
  extra_runs: -2.8021683937977314e-06
  batting_team_index: 9.585381057551388e-07
  bowling_team_index: 8.872284140421739e-07
  inning: 5.4847479432834027e-08
  over: -8.912994418091828e-08
  ball: -9.1284486934976e-08
  is_wicket: 2.9473896729743616e-08
Principal Component 2 Loadings:
  match_id: 9.216330511398363e-08
  extra_runs: -0.0009196502682798827
  batting_team_index: -0.00029165711185319214
  bowling_team_index: -0.0018287263406168233
  inning: -0.0005701252437237262
  over: 0.009253715763342291
  ball: -0.9992179346975576
  is_wicket: 0.038383428976593775
Principal Component 3 Loadings:
  match_id: -1.2961019814363525e-06
  extra_runs: -0.0038133824811116246
  batting_team_index: -0.72311465338482
  bowling_team_index: -0.6906768917728265
  inning: -0.005621500302658776
  over: 0.004707069192498768
  ball: 0.001526171192382218
  is_wicket: 1.946811328280231e-05
Principal Component 4 Loadings:
  match_id: 4.16297

In [31]:
# Hence we can say that all are very important for us so now we can move to training of our model.
# we will be using linear regression, decision tree regressor, random forest regressor, support vector regressor, neural network.
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
#since we have already done the vector assembler....
lr = LinearRegression(featuresCol='features',labelCol='total_runs')
dtr = DecisionTreeRegressor(featuresCol='features',labelCol='total_runs')
rfr = RandomForestRegressor(featuresCol='features',labelCol='total_runs')
gbtr = GBTRegressor(featuresCol='features',labelCol='total_runs')


In [33]:
paramGrid = ParamGridBuilder().build()
models = [("Linear_Regression",lr),("Decision_Tree_Regressor",dtr),("Random_Forest_Regressor",rfr),("Gradient_Boosting_Regressor",gbtr)]
for name,model in models:
  crossval = CrossValidator(numFolds=5,estimator=model,estimatorParamMaps=paramGrid,evaluator=RegressionEvaluator(labelCol='total_runs'))
  cvModel = crossval.fit(new_df)
  predictions = cvModel.transform(new_df)
  evaluator = RegressionEvaluator(labelCol='total_runs')
  rmse = evaluator.evaluate(predictions)
  print(f"RMSE {name} is {rmse}")

RMSE Linear_Regression is 1.5387369826793211
RMSE Decision_Tree_Regressor is 1.5254465500762537
RMSE Random_Forest_Regressor is 1.5263223978506848
RMSE Gradient_Boosting_Regressor is 1.5054300914997487
