<a href="https://colab.research.google.com/github/Kamran-imaz/IPL_Score_Prediction/blob/main/IPL_SCORE_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table of Contents

1. Spark Session Creation and Dataset Loading
2. Data Preprocessing
3. Principal Component Analysis
4. Model Training and Testing
5. Stacking Method
6. Testing on Real Data



In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=cb14dc42115b78210bd33d7e9864695766c7554695306b9eb47a7be63b48e9e8
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('IPL_SCORE').getOrCreate()

In [94]:
data_frame = spark.read.csv('/content/dataset/data.csv',header = True, inferSchema=True )


# Data Pre-Processing

In [95]:
# count no of rows in the data_frame
data_frame.count()

76014

In [96]:
# let's look at our dataset once...
data_frame.show()

+---+----------+--------------------+--------------------+--------------------+-----------+---------+----+-------+-----+-----------+--------------+-------+-----------+-----+
|mid|      date|               venue|        batting_team|        bowling_team|    batsman|   bowler|runs|wickets|overs|runs_last_5|wickets_last_5|striker|non-striker|total|
+---+----------+--------------------+--------------------+--------------------+-----------+---------+----+-------+-----+-----------+--------------+-------+-----------+-----+
|  1|2008-04-18|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...| SC Ganguly|  P Kumar|   1|      0|  0.1|          1|             0|      0|          0|  222|
|  1|2008-04-18|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|BB McCullum|  P Kumar|   1|      0|  0.2|          1|             0|      0|          0|  222|
|  1|2008-04-18|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|BB McCullum|  P Kumar|   2|      0|  0.2|          

In [97]:
# lets check wether we are having any null value in venue. we cannot delete the venue column directly
# because it depends on the stadium pitch as well. so we will run pca and cross-check once!
# as off now there is no such null valus in the venue. so we need not insert any values.
data_frame.filter(data_frame['venue'].isNull()).count()

0

In [98]:
# Let us Remove Some Un-Wanted Columns (less-Variance Columns Manually)
data_frame = data_frame.drop('mid','date','batsman','bowler','striker','non-striker')
data_frame.show()

+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+
|               venue|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|
+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+
|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.1|          1|             0|  222|
|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.2|          1|             0|  222|
|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.2|          2|             0|  222|
|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.3|          2|             0|  222|
|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.4|          2|             0|  222|
|M Chinnaswamy Sta...|Kolkata Knight Ri...|Royal Challengers...|

In [99]:
# let us look how many unique teams and venues are there!
data_frame.select('venue').distinct().count()

35

In [100]:
# since we are having huge no of venues so it will be messy when we do one-hot encoding and string Indexing
# so let us remove this column also.
data_frame = data_frame.drop('venue')

In [101]:
# so now let us look at the batting_team column
data_frame.select('batting_team').distinct().count()
data_frame.select('batting_team').distinct().show()

+--------------------+
|        batting_team|
+--------------------+
| Sunrisers Hyderabad|
| Chennai Super Kings|
|     Deccan Chargers|
|Kochi Tuskers Kerala|
|    Rajasthan Royals|
|Royal Challengers...|
|Kolkata Knight Ri...|
|     Kings XI Punjab|
|       Pune Warriors|
|    Delhi Daredevils|
|      Mumbai Indians|
|Rising Pune Super...|
|       Gujarat Lions|
|Rising Pune Super...|
+--------------------+



In [102]:
# let us write only the consistent teams which play most of the matches in one list for the future purpose.
teams=['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
              'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
              'Delhi Daredevils', 'Sunrisers Hyderabad']

In [103]:
# now let us remove other teams from our datasets.
data_frame = data_frame.filter(data_frame['batting_team'].isin(teams) & data_frame['bowling_team'].isin(teams))
data_frame.show()

+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+
|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|
+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+
|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.1|          1|             0|  222|
|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.2|          1|             0|  222|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.2|          2|             0|  222|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.3|          2|             0|  222|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.4|          2|             0|  222|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.5|          2|             0|  222|
|Kolkata Knight Ri...|Royal Challengers...|   3|      0|  0.6|          3|             0|  222|
|Kolkata Knight Ri...|Royal Challengers.

In [104]:
# now count the length of the dataset.
data_frame.count()

53811

In [105]:
# now let us convert the batting_team to numerical variable string Indexer followed by one-hot encoder
from pyspark.ml.feature import StringIndexer, OneHotEncoder
indexer = StringIndexer(inputCol='batting_team',outputCol='batting_team_index')
data_frame = indexer.fit(data_frame).transform(data_frame)
encoder = OneHotEncoder(inputCol='batting_team_index',outputCol='batting_team_vector')
data_frame = encoder.fit(data_frame).transform(data_frame)
data_frame.show()

+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+
|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|
+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+
|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.1|          1|             0|  222|               4.0|      (7,[4],[1.0])|
|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.2|          1|             0|  222|               4.0|      (7,[4],[1.0])|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.2|          2|             0|  222|               4.0|      (7,[4],[1.0])|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.3|          2|             0|  222|               4.0|      (7,[4],[1.0])|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0

In [136]:
# now let us check which team is assigned which index number.
data_frame.select('batting_team','batting_team_index').distinct().show()

+--------------------+------------------+
|        batting_team|batting_team_index|
+--------------------+------------------+
| Sunrisers Hyderabad|               7.0|
|Kolkata Knight Ri...|               4.0|
| Chennai Super Kings|               2.0|
|     Kings XI Punjab|               1.0|
|Royal Challengers...|               3.0|
|      Mumbai Indians|               0.0|
|    Delhi Daredevils|               6.0|
|    Rajasthan Royals|               5.0|
+--------------------+------------------+



In [142]:
teams_index = {0: 'Mumbai Indians', 1: 'Kings XI Punjab', 2: 'Chennai Super Kings',
               3: 'Royal Challengers Bangalore', 4: 'Kolkata Knight Riders',
               5: 'Rajasthan Royals', 6: 'Delhi Daredevils', 7: 'Sunrisers Hyderabad'}

In [106]:
# in the same way let us do for bowling_team also
indexer = StringIndexer(inputCol='bowling_team',outputCol='bowling_team_index')
data_frame = indexer.fit(data_frame).transform(data_frame)
encoder = OneHotEncoder(inputCol='bowling_team_index',outputCol='bowling_team_vector')
data_frame = encoder.fit(data_frame).transform(data_frame)
data_frame.show()

+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+
|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|bowling_team_index|bowling_team_vector|
+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+
|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.1|          1|             0|  222|               4.0|      (7,[4],[1.0])|               2.0|      (7,[2],[1.0])|
|Kolkata Knight Ri...|Royal Challengers...|   1|      0|  0.2|          1|             0|  222|               4.0|      (7,[4],[1.0])|               2.0|      (7,[2],[1.0])|
|Kolkata Knight Ri...|Royal Challengers...|   2|      0|  0.2|          2|             0|  222|               4.0|      (7,[4],[1.

In [107]:
# remove the first five overs of the every match.
data_frame = data_frame.filter(data_frame['overs']>=5)

In [108]:
data_frame.count()

40108

In [109]:
data_frame.select('batting_team_index').distinct().show()

+------------------+
|batting_team_index|
+------------------+
|               0.0|
|               7.0|
|               1.0|
|               4.0|
|               3.0|
|               2.0|
|               6.0|
|               5.0|
+------------------+



In [110]:
# now let us use the vector assembler \
features = ['runs','wickets','overs','runs_last_5','wickets_last_5','batting_team_index','bowling_team_index']
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = features,outputCol='features')
data_frame = assembler.transform(data_frame)
data_frame.show()

+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+
|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|bowling_team_index|bowling_team_vector|            features|
+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+
|Kolkata Knight Ri...|Royal Challengers...|  61|      0|  5.1|         59|             0|  222|               4.0|      (7,[4],[1.0])|               2.0|      (7,[2],[1.0])|[61.0,0.0,5.1,59....|
|Kolkata Knight Ri...|Royal Challengers...|  61|      1|  5.2|         59|             1|  222|               4.0|      (7,[4],[1.0])|               2.0|      (7,[2],[1.0])|[61.0,1.0,5.2,59....|
|Kolkata Knight Ri...|Roy

# Applying Principal Component Analysis

In [111]:
# now let us apply PCA to check the top factors
from pyspark.ml.feature import PCA
pca = PCA(k=7,inputCol='features',outputCol='pca_features')
model = pca.fit(data_frame)
data_frame = model.transform(data_frame)

In [112]:
loadings = model.pc.toArray()
for i, pc in enumerate(loadings):
    print(f"Principal Component {i+1} Loadings:")
    for j, loading in enumerate(pc):
        print(f"  {features[j]}: {loading}")

Principal Component 1 Loadings:
  runs: -0.9813757274475713
  wickets: -0.15109898916664044
  overs: 0.008684533844366116
  runs_last_5: -0.011858569275273416
  wickets_last_5: -0.10602316328268503
  batting_team_index: 0.04285700481354677
  bowling_team_index: -0.02787644601025793
Principal Component 2 Loadings:
  runs: -0.015696352544848857
  wickets: -0.11167138682359938
  overs: -0.056622554885935795
  runs_last_5: 0.06253655113633953
  wickets_last_5: 0.6425326516467382
  batting_team_index: 0.49712975132933773
  bowling_team_index: -0.5658444961673523
Principal Component 3 Loadings:
  runs: -0.09023437527044673
  wickets: -0.14963097629481462
  overs: -0.0713707428157487
  runs_last_5: 0.0915057883690677
  wickets_last_5: 0.7072253094560303
  batting_team_index: -0.5834131936468129
  bowling_team_index: 0.33979794490423926
Principal Component 4 Loadings:
  runs: -0.1688364642812037
  wickets: 0.969309486445601
  overs: -0.0018712694257607416
  runs_last_5: 0.013587165214664609
  

# Model Training and Testing

In [113]:
# Here we will be using different Regression algorithms to train our model.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [114]:
data_frame.printSchema()

root
 |-- batting_team: string (nullable = true)
 |-- bowling_team: string (nullable = true)
 |-- runs: integer (nullable = true)
 |-- wickets: integer (nullable = true)
 |-- overs: double (nullable = true)
 |-- runs_last_5: integer (nullable = true)
 |-- wickets_last_5: integer (nullable = true)
 |-- total: integer (nullable = true)
 |-- batting_team_index: double (nullable = false)
 |-- batting_team_vector: vector (nullable = true)
 |-- bowling_team_index: double (nullable = false)
 |-- bowling_team_vector: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- pca_features: vector (nullable = true)



In [116]:
lr = LinearRegression(featuresCol='features',labelCol='total')
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 1.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())
crossVal_lr = CrossValidator(estimator=lr,estimatorParamMaps=paramGrid,evaluator=RegressionEvaluator(labelCol='total'),numFolds=3)
cv_lr = crossVal_lr.fit(data_frame)

In [118]:
predictions_lr = cv_lr.transform(data_frame)
predictions_lr.select('total','prediction').show(5)

+-----+------------------+
|total|        prediction|
+-----+------------------+
|  222|191.05811112075722|
|  222|184.25323988562516|
|  222|183.62012722646577|
|  222|182.98701456730635|
|  222|182.35065768021775|
+-----+------------------+
only showing top 5 rows



In [119]:
# let us look at the rmse score of it
evaluator_lr = RegressionEvaluator(labelCol='total')
rmse_lr = evaluator_lr.evaluate(predictions_lr)
print(f"RMSE: {rmse_lr}")

RMSE: 17.60254449572745


In [178]:
# this is an additional step to add the lr_predictions into the data_frame
data_frame = data_frame.withColumnRenamed('r_prediction','lr_prediction')
data_frame.show()

+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+--------------------+------------------+
|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|bowling_team_index|bowling_team_vector|            features|        pca_features|     lr_prediction|
+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+--------------------+------------------+
|Kolkata Knight Ri...|Royal Challengers...|  61|      0|  5.1|         59|             0|  222|               4.0|      (7,[4],[1.0])|               2.0|      (7,[2],[1.0])|[61.0,0.0,5.1,59....|[-70.282032081893...|191.05811112075722|
|Kolkata Knight Ri...|Royal Challengers...|  61|      1|  5.

In [121]:
# Now let us look for the Decision tree Regressor
dtr = DecisionTreeRegressor(featuresCol='features',labelCol='total')
crossVal_dtr = CrossValidator(estimator=dtr,estimatorParamMaps=paramGrid,evaluator=RegressionEvaluator(labelCol='total'),numFolds=3)
cv_dtr = crossVal_dtr.fit(data_frame)
predictions_dtr = cv_dtr.transform(data_frame)
predictions_dtr.select('total','prediction').show()

+-----+------------------+
|total|        prediction|
+-----+------------------+
|  222|  188.876130198915|
|  222|  188.876130198915|
|  222|  188.876130198915|
|  222|  188.876130198915|
|  222|  188.876130198915|
|  222|  188.876130198915|
|  222|  188.876130198915|
|  222|  188.876130198915|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|177.31890874882407|
|  222|185.53346855983773|
|  222|185.53346855983773|
+-----+------------------+
only showing top 20 rows



In [123]:
evaluator_dtr = RegressionEvaluator(labelCol='total')
rmse_dtr = evaluator_dtr.evaluate(predictions_dtr)
print(f"RMSE: {rmse_dtr}")

RMSE: 19.911042741291645


In [180]:
# this is an additional step to add the dtr_predictions into the data_frame
data_frame = data_frame.join(predictions_dtr.select('features', predictions_dtr['prediction'].alias('dtr_prediction')), on='features', how='inner')
data_frame.show()

+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+
|            features|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|bowling_team_index|bowling_team_vector|        pca_features|     lr_prediction|    dtr_prediction|
+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+
|[61.0,0.0,5.1,59....|Kolkata Knight Ri...|Royal Challengers...|  61|      0|  5.1|         59|             0|  222|               4.0|      (7,[4],[1.0])|               2.0|      (7,[2],[1.0])|[-70.282032081893...|191.05811112075722|  18

In [125]:
# Now let us do the same for RandomForest Regressor
rfr = RandomForestRegressor(featuresCol='features',labelCol='total')
crossVal_rfr = CrossValidator(estimator=rfr,estimatorParamMaps=paramGrid,evaluator=RegressionEvaluator(labelCol='total'),numFolds=3)
cv_rfr = crossVal_rfr.fit(data_frame)
predictions_rfr=cv_rfr.transform(data_frame)
predictions_rfr.select('total','prediction').show()

+-----+------------------+
|total|        prediction|
+-----+------------------+
|  222|189.75908509140447|
|  222|188.20430008039858|
|  222|188.20430008039858|
|  222|188.20430008039858|
|  222|188.20430008039858|
|  222|188.20430008039858|
|  222| 187.2673721014033|
|  222| 187.2673721014033|
|  222| 180.6133392590447|
|  222| 180.6133392590447|
|  222| 180.6133392590447|
|  222| 180.6133392590447|
|  222| 180.6133392590447|
|  222| 180.6133392590447|
|  222|179.43037324756196|
|  222|179.43037324756196|
|  222| 180.6133392590447|
|  222|179.43037324756196|
|  222|172.57232031645268|
|  222|172.57232031645268|
+-----+------------------+
only showing top 20 rows



In [126]:
evaluator_rfr = RegressionEvaluator(labelCol='total')
rmse_rfr = evaluator_rfr.evaluate(predictions_rfr)
print(f"RMSE: {rmse_rfr}")

RMSE: 19.069656522301866


In [182]:
# this is an additional step to add the rfr_predictions into the data_frame
data_frame = data_frame.join(predictions_rfr.select('features', predictions_rfr['prediction'].alias('rfr_prediction')), on='features', how='inner')
data_frame.show()

+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+------------------+
|            features|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|bowling_team_index|bowling_team_vector|        pca_features|     lr_prediction|    dtr_prediction|    rfr_prediction|
+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+------------------+
|[61.0,0.0,5.1,59....|Kolkata Knight Ri...|Royal Challengers...|  61|      0|  5.1|         59|             0|  222|               4.0|      (7,[4],[1.0])|               2.0|      (

In [127]:
# now let us do the same for GBT Regressor also
gbt = GBTRegressor(featuresCol='features',labelCol='total')
crossVal_gbt = CrossValidator(estimator=gbt,estimatorParamMaps=paramGrid,evaluator=RegressionEvaluator(labelCol='total'),numFolds=3)
cv_gbt = crossVal_gbt.fit(data_frame)
predictions_gbt = cv_gbt.transform(data_frame)
predictions_gbt.select('total','prediction').show

In [129]:
evaluator_gbt = RegressionEvaluator(labelCol='total')
rmse_gbt = evaluator_gbt.evaluate(predictions_gbt)
print(f"RMSE: {rmse_gbt}")

RMSE: 16.130322581581613


In [185]:
# this is an additional step to add the gbt_predictions into the data_frame
data_frame = data_frame.join(predictions_gbt.select('features', predictions_gbt['prediction'].alias('gbt_prediction')), on='features', how='inner')
data_frame.show()

+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+------------------+------------------+
|            features|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|bowling_team_index|bowling_team_vector|        pca_features|     lr_prediction|    dtr_prediction|    rfr_prediction|    gbt_prediction|
+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+------------------+------------------+
|[61.0,0.0,5.1,59....|Kolkata Knight Ri...|Royal Challengers...|  61|      0|  5.1|         59|             0|  222|        

# Stacking Method

In [186]:
# since all the models are not performing well than exepcted so we will apply stacking method here....
assembler = VectorAssembler(inputCols=['lr_prediction','dtr_prediction','rfr_prediction','gbt_prediction'],outputCol='output_features')
data_frame = assembler.transform(data_frame)
data_frame.show()


+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+------------------+------------------+--------------------+
|            features|        batting_team|        bowling_team|runs|wickets|overs|runs_last_5|wickets_last_5|total|batting_team_index|batting_team_vector|bowling_team_index|bowling_team_vector|        pca_features|     lr_prediction|    dtr_prediction|    rfr_prediction|    gbt_prediction|     output_features|
+--------------------+--------------------+--------------------+----+-------+-----+-----------+--------------+-----+------------------+-------------------+------------------+-------------------+--------------------+------------------+------------------+------------------+------------------+--------------------+
|[61.0,0.0,5.1,59....|Kolkata Knight Ri...|Royal Challengers.

In [190]:
# since we got better rmse for gbt so we will use gbt here.....
meta_learner = GBTRegressor(featuresCol='output_features',labelCol='total')
cvModel_meta = CrossValidator(estimator=meta_learner,estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(labelCol='total'),numFolds=3)
cv_meta = cvModel_meta.fit(data_frame)
cv_predictions = cv_meta.transform(data_frame)
cv_predictions.select('total','prediction').show()

+-----+------------------+
|total|        prediction|
+-----+------------------+
|  222|187.82933765540378|
|  222|174.54975900969433|
|  222|174.54975900969433|
|  222|174.54975900969433|
|  222|174.54975900969433|
|  222|174.54975900969433|
|  222|174.54975900969433|
|  222|174.54975900969433|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|178.97835659799588|
|  222|186.32799593713156|
|  222|186.32799593713156|
+-----+------------------+
only showing top 20 rows



In [191]:
evaluator = RegressionEvaluator(labelCol='total')
rmse_meta = evaluator.evaluate(cv_predictions)
print(f"RMSE: {rmse_meta}")

RMSE: 15.151120024955409


In [192]:
# The error only reduced to 15. Let's try Linear Regression
meta_learner = LinearRegression(featuresCol='output_features',labelCol='total')
cvModel_meta = CrossValidator(estimator=meta_learner,estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(labelCol='total'),numFolds=3)
cv_meta = cvModel_meta.fit(data_frame)
cv_predictions = cv_meta.transform(data_frame)
cv_predictions.select('total','prediction').show()

+-----+------------------+
|total|        prediction|
+-----+------------------+
|  222|183.97925773887647|
|  222| 174.5908500005827|
|  222|174.40272295949777|
|  222|174.21459591841278|
|  222|174.02550486722302|
|  222|173.83737782613807|
|  222| 171.1829185896324|
|  222| 171.2765736662267|
|  222|173.01220339695226|
|  222| 173.7086910265407|
|  222|173.80620214355432|
|  222|173.90371326056794|
|  222|172.96307805514317|
|  222|174.24437309791224|
|  222|174.32297324713764|
|  222|174.41952035404645|
|  222|  174.532086398429|
|  222|174.60972253754957|
|  222|181.91373032078198|
|  222|181.55300251462413|
+-----+------------------+
only showing top 20 rows



In [193]:
evaluator = RegressionEvaluator(labelCol='total')
rmse_meta = evaluator.evaluate(cv_predictions)
print(f"RMSE: {rmse_meta}")

RMSE: 15.627277272843829


In [None]:
# out of this GBT Regressor model performed well compared to other 3 models. so we will take this model.

# Testing The Model on Real Data

In [147]:
from pyspark.sql import Row
def predict_score(runs,wickets,overs,runs_last_5,wickets_last_5,batting_team_index,bowling_team_index):
  data = [Row(runs=runs, wickets=wickets, overs=overs, runs_last_5=runs_last_5,
                wickets_last_5=wickets_last_5, batting_team_index=batting_team_index,
                bowling_team_index=bowling_team_index)]
  batt_name = teams_index.get(batting_team_index)
  bowl_name = teams_index.get(bowling_team_index)
  exp_df = spark.createDataFrame(data)
  exp_assembler = VectorAssembler(inputCols=features,outputCol='features')
  exp_df = exp_assembler.transform(exp_df)
  exp_df = cv_gbt.transform(exp_df)
  print(f"{batt_name} v/s {bowl_name}")
  print(f"{batt_name}: {runs}/{wickets}")
  print(f"Overs: {overs}/20")
  pred_score = exp_df.select('prediction').collect()[0]['prediction']
  print(f"Predicted Score is: {round(pred_score)}")

In [148]:
predict_score(50,3,8,10,1,1,2)

Kings XI Punjab v/s Chennai Super Kings
Kings XI Punjab: 50/3
Overs: 8/20
Predicted Score is: 135


In [149]:
predict_score(100,0,10,26,0,1,2)

Kings XI Punjab v/s Chennai Super Kings
Kings XI Punjab: 100/0
Overs: 10/20
Predicted Score is: 194


In [150]:
predict_score(100,0,7,50,0,4,7)

Kolkata Knight Riders v/s Sunrisers Hyderabad
Kolkata Knight Riders: 100/0
Overs: 7/20
Predicted Score is: 202


In [157]:
predict_score(115,3,9,36,1,3,5)

Royal Challengers Bangalore v/s Rajasthan Royals
Royal Challengers Bangalore: 115/3
Overs: 9/20
Predicted Score is: 199
