In [0]:
spark

In [0]:
from pyspark.sql import SparkSession
#create session
spark = SparkSession.builder.appName("IPL Winner Prediction").getOrCreate() 

In [0]:
spark

In [0]:
df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("dbfs:/FileStore/shared_uploads/kavil.jain@csu.fullerton.edu/final_data.csv")

In [0]:
df.show(15)

+------+--------------------+--------------------+----------+---------+----------+------------+------------+------------------+------------------+------+
|   _c0|        batting_team|        bowling_team|      city|runs_left|balls_left|wickets_left|total_runs_x|               crr|               rrr|result|
+------+--------------------+--------------------+----------+---------+----------+------------+------------+------------------+------------------+------+
|122604| Chennai Super Kings|Kolkata Knight Ri...|   Chennai|      123|        84|          10|         202|13.166666666666666| 8.785714285714286|     1|
|194224|      Mumbai Indians| Sunrisers Hyderabad|    Mumbai|       91|        53|          10|         193| 9.134328358208956| 10.30188679245283|     0|
|191195|    Rajasthan Royals|        Punjab Kings|    Mumbai|        0|         8|          10|         189|            10.125|               0.0|     1|
|205267| Chennai Super Kings|    Rajasthan Royals|    Jaipur|       50|     

In [0]:
train_data, test_data = df.randomSplit([0.7, 0.3], seed=1)# Splitting the data into train and test sets

In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
categorical_cols = ['batting_team', 'bowling_team', 'city'] # Defining categorical columns
# Encoding categorical features
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid="keep") for col in categorical_cols]
encoder = OneHotEncoder(inputCols=[col+"_index" for col in categorical_cols], outputCols=[col+"_vec" for col in categorical_cols])
# Assembling features vector
feature_cols = [col for col in df.columns if col not in categorical_cols + ['result']]
assembler = VectorAssembler(inputCols=feature_cols + [col+"_vec" for col in categorical_cols], outputCol="features")

In [0]:
print(feature_cols)

['_c0', 'runs_left', 'balls_left', 'wickets_left', 'total_runs_x', 'crr', 'rrr']


In [0]:
from pyspark.ml.classification import LogisticRegression
# Defining the logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="result")

In [0]:
help(LogisticRegression)

Help on class LogisticRegression in module pyspark.ml.classification:

class LogisticRegression(_JavaProbabilisticClassifier, _LogisticRegressionParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
 |  LogisticRegression(*, featuresCol: str = 'features', labelCol: str = 'label', predictionCol: str = 'prediction', maxIter: int = 100, regParam: float = 0.0, elasticNetParam: float = 0.0, tol: float = 1e-06, fitIntercept: bool = True, threshold: float = 0.5, thresholds: Optional[List[float]] = None, probabilityCol: str = 'probability', rawPredictionCol: str = 'rawPrediction', standardization: bool = True, weightCol: Optional[str] = None, aggregationDepth: int = 2, family: str = 'auto', lowerBoundsOnCoefficients: Optional[pyspark.ml.linalg.Matrix] = None, upperBoundsOnCoefficients: Optional[pyspark.ml.linalg.Matrix] = None, lowerBoundsOnIntercepts: Optional[pyspark.ml.linalg.Vector] = None, upperBoundsOnIntercepts: Optional[pyspark.ml.linalg.Vector] = None, maxBlockSizeIn

In [0]:
# Creating a pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexers + [encoder] + [assembler] + [lr])

In [0]:
# Training the model
model = pipeline.fit(train_data)

In [0]:
# Making predictions on the test set
predictions = model.transform(test_data)
# Calculating accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="result", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8429520448411875


In [0]:
# Example prediction
example_prediction = model.transform(test_data).collect()[131]
print("Example prediction:", example_prediction)

Example prediction: Row(_c0=2108, batting_team='Rajasthan Royals', bowling_team='Deccan Chargers', city='Hyderabad', runs_left=28, balls_left=23, wickets_left=10, total_runs_x=214, crr=11.505154639175258, rrr=7.304347826086956, result=1, batting_team_index=3.0, bowling_team_index=8.0, city_index=4.0, batting_team_vec=SparseVector(13, {3: 1.0}), bowling_team_vec=SparseVector(13, {8: 1.0}), city_vec=SparseVector(32, {4: 1.0}), features=SparseVector(65, {0: 2108.0, 1: 28.0, 2: 23.0, 3: 10.0, 4: 214.0, 5: 11.5052, 6: 7.3043, 10: 1.0, 28: 1.0, 37: 1.0}), rawPrediction=DenseVector([-1.4554, 1.4554]), probability=DenseVector([0.1892, 0.8108]), prediction=1.0)


In [0]:
test_data.display()

_c0,batting_team,bowling_team,city,runs_left,balls_left,wickets_left,total_runs_x,crr,rrr,result
133,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,214,117,10,222,16.0,10.974358974358974,0
138,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,213,112,10,222,6.75,11.410714285714286,0
154,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,198,97,10,222,6.260869565217392,12.24742268041237,0
156,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,198,95,10,222,5.76,12.505263157894737,0
157,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,198,94,10,222,5.538461538461538,12.638297872340424,0
158,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,198,93,10,222,5.333333333333333,12.774193548387096,0
159,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,197,92,10,222,5.357142857142857,12.847826086956522,0
187,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,177,68,10,222,5.1923076923076925,15.617647058823527,0
197,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,165,59,10,222,5.60655737704918,16.779661016949152,0
198,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,165,58,10,222,5.516129032258065,17.06896551724138,0
