In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, mean, count
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, ClusteringEvaluator, RegressionEvaluator
from pyspark.ml.clustering import KMeans
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import mean, count, when
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Create (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName("IPL_ML_Experiments").getOrCreate()

# Read the ball-by-ball CSV (header row, inferred column types) and drop every
# row that has a null in any of the listed columns.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("deliveries.csv")
    .dropna(subset=["batter", "bowler", "over", "ball", "total_runs"])
)

25/06/25 05:50:37 WARN Utils: Your hostname, Molphie resolves to a loopback address: 127.0.1.1; using 192.168.6.223 instead (on interface enp4s0)
25/06/25 05:50:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/25 05:50:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/25 05:50:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Классификация
Цель: предсказать для каждой подачи, будет ли удар на 4 или 6 очков (boundary) или нет.

In [3]:
# Binary label: 1 when batsman_runs is exactly 4 or 6, otherwise 0.
df_cls = df.withColumn(
    "is_boundary",
    when(col("batsman_runs").isin(4, 6), 1).otherwise(0),
)

# One StringIndexer per categorical column; each writes to "<column>_idx".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_idx")
    for name in ("batter", "bowler", "batting_team")
]

# Ball position first, then the indexed categoricals in indexer order.
features_cls = ["over", "ball"] + [ix.getOutputCol() for ix in indexers]
assembler_cls = VectorAssembler(outputCol="features", inputCols=features_cls)

# NOTE(review): maxBins=1024 is presumably chosen so that it is at least the
# number of distinct batter/bowler index values — confirm against the data.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="is_boundary",
    numTrees=30,
    maxDepth=5,
    maxBins=1024,
    seed=42,
)

In [4]:
# Indexers -> feature assembler -> random forest, run as a single pipeline.
pipeline_cls = Pipeline(stages=[*indexers, assembler_cls, rf])

# NOTE(review): the pipeline is fitted and scored on the same DataFrame, so
# both metrics printed below are in-sample figures.
model_cls = pipeline_cls.fit(df_cls)
pred_cls = model_cls.transform(df_cls)

evaluator_cls = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="is_boundary",
)
roc_auc = evaluator_cls.evaluate(pred_cls)

# Accuracy: rows whose predicted class equals the label, over all rows.
acc = (
    pred_cls.where(col("is_boundary") == col("prediction")).count()
    / df_cls.count()
)
print(f"RandomForest\nROC AUC: {roc_auc:.3f}\nAccuracy: {acc:.3f}\n")

25/06/25 05:50:48 WARN DAGScheduler: Broadcasting large task binary with size 1299.0 KiB


RandomForest
ROC AUC: 0.576
Accuracy: 0.836



# Кластеризация баттеров

In [5]:
# Per-batter profile: mean batsman_runs per ball, share of balls scored as 4 or
# 6, and number of balls faced. Only batters with more than 100 balls are kept.
agg = (
    df.groupBy("batter")
    .agg(
        mean(col("batsman_runs")).alias("avg_runs"),
        (
            count(when(col("batsman_runs").isin(4, 6), True)) / count("*")
        ).alias("boundary_pct"),
        count("*").alias("balls_faced"),
    )
    .where(col("balls_faced") > 100)
)

# Cluster the batters on the two rate features (balls_faced is only a filter).
vec_assembler = VectorAssembler(
    outputCol="features",
    inputCols=["avg_runs", "boundary_pct"],
)
kmeans = KMeans(seed=1, k=4)
pipeline_clus = Pipeline(stages=[vec_assembler, kmeans])

model_clus = pipeline_clus.fit(agg)
pred_clus = model_clus.transform(agg)

# Silhouette comes from a ClusteringEvaluator with default settings; the
# training cost is read from the summary of the last pipeline stage (KMeans).
evaluator_clus = ClusteringEvaluator()
silhouette = evaluator_clus.evaluate(pred_clus)
wssse = model_clus.stages[-1].summary.trainingCost
print(f"KMeans\nSilhouette: {silhouette:.3f}\nWSSSE: {wssse:.2f}\n")

KMeans
Silhouette: 0.644
WSSSE: 1.70



# Регрессия

In [6]:
# Linear regression predicting total_runs from the assembled "features" vector.
lr = LinearRegression(labelCol="total_runs", featuresCol="features")

# Reuses `indexers` and `assembler_cls` defined earlier in the notebook.
# NOTE(review): the StringIndexer outputs therefore enter the linear model as
# plain numeric indices rather than one-hot vectors.
pipeline_reg = Pipeline(stages=[*indexers, assembler_cls, lr])
model_reg = pipeline_reg.fit(df)
pred_reg = model_reg.transform(df)

evaluator_reg = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="total_runs",
)

25/06/25 05:50:55 WARN Instrumentation: [8db144b4] regParam is zero, which might cause numerical instability and overfitting.


In [7]:
# RMSE uses the evaluator as configured; R2 reuses the same evaluator with the
# metric overridden through a param map, so label/prediction columns match.
rmse = evaluator_reg.evaluate(pred_reg)
r2 = evaluator_reg.evaluate(pred_reg, {evaluator_reg.metricName: "r2"})
print(f"LinearRegression\nRMSE: {rmse:.3f}\nR2: {r2:.3f}")

LinearRegression
RMSE: 1.617
R2: 0.012


In [8]:
def _top_categories(frame, column, n=10):
    """Return the `n` most frequent values of `column` in `frame` as a list.

    Rows are collected straight from the DataFrame; no RDD conversion or
    Python lambda is involved.
    """
    ranked = frame.groupBy(column).count().orderBy(col("count").desc()).limit(n)
    return [row[column] for row in ranked.collect()]


# Share of balls on which batsman_runs is 4 or 6; defined once and reused for
# both the batter-level and the bowler-level aggregates.
_boundary_share = count(when(col("batsman_runs").isin(4, 6), True)) / count("*")

# Per-batter and per-bowler aggregates.
# NOTE(review): these statistics are computed over the same rows the model is
# trained and scored on, and they are derived from batsman_runs, which is
# presumably a component of the total_runs label — possible target leakage,
# confirm before trusting the metrics below.
batter_stats = df.groupBy("batter").agg(
    mean("batsman_runs").alias("batter_avg"),
    _boundary_share.alias("batter_boundary_pct"),
    count("*").alias("batter_balls"),
)
bowler_stats = df.groupBy("bowler").agg(
    mean("batsman_runs").alias("bowler_avg"),
    _boundary_share.alias("bowler_boundary_pct"),
    count("*").alias("bowler_balls"),
)

# Ten most frequent batters / bowlers; every other player is bucketed as
# "other" below. Counting on `df` is equivalent to counting on the joined
# frame: the left joins only add columns, because each stats frame holds
# exactly one row per key.
top_batters = _top_categories(df, "batter")
top_bowlers = _top_categories(df, "bowler")

# Attach the aggregates and derive the match-phase flags and the reduced
# categorical columns in a single chain, so `df_ext` is assigned exactly once.
# NOTE(review): the phase thresholds depend on how `over` is indexed; if it is
# 0-based, `<= 6` covers seven overs — confirm against the data.
df_ext = (
    df.join(batter_stats, on="batter", how="left")
    .join(bowler_stats, on="bowler", how="left")
    .withColumn("is_powerplay", (col("over") <= 6).cast("int"))
    .withColumn("is_death", (col("over") >= 16).cast("int"))
    .withColumn("inning", col("inning").cast("int"))
    .withColumn("batter_mod", when(col("batter").isin(top_batters), col("batter")).otherwise("other"))
    .withColumn("bowler_mod", when(col("bowler").isin(top_bowlers), col("bowler")).otherwise("other"))
)

# Index, then one-hot encode, each categorical column.
batter_indexer = StringIndexer(inputCol="batter_mod", outputCol="batter_mod_idx")
bowler_indexer = StringIndexer(inputCol="bowler_mod", outputCol="bowler_mod_idx")
batting_team_indexer = StringIndexer(inputCol="batting_team", outputCol="batting_team_idx")
bowling_team_indexer = StringIndexer(inputCol="bowling_team", outputCol="bowling_team_idx")

batter_encoder = OneHotEncoder(inputCol="batter_mod_idx", outputCol="batter_ohe")
bowler_encoder = OneHotEncoder(inputCol="bowler_mod_idx", outputCol="bowler_ohe")
batting_team_encoder = OneHotEncoder(inputCol="batting_team_idx", outputCol="batting_team_ohe")
bowling_team_encoder = OneHotEncoder(inputCol="bowling_team_idx", outputCol="bowling_team_ohe")

# Assemble every feature into a single vector column.
features = [
    "over", "ball", "inning", "is_powerplay", "is_death",
    "batter_avg", "batter_boundary_pct", "bowler_avg", "bowler_boundary_pct",
    "batter_balls", "bowler_balls",
    "batter_ohe", "bowler_ohe", "batting_team_ohe", "bowling_team_ohe"
]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Model and pipeline. The regressor gets its own name so that it does not
# overwrite the `rf` classifier defined for the classification task;
# re-running that cell would otherwise pick up this regressor instead.
rf_reg = RandomForestRegressor(featuresCol="features", labelCol="total_runs", numTrees=100, maxDepth=8, seed=42)
pipeline = Pipeline(stages=[
    batter_indexer, bowler_indexer, batting_team_indexer, bowling_team_indexer,
    batter_encoder, bowler_encoder, batting_team_encoder, bowling_team_encoder,
    assembler, rf_reg
])

# NOTE(review): fitted and scored on the same DataFrame — metrics are in-sample.
model = pipeline.fit(df_ext)
pred = model.transform(df_ext)

rmse = RegressionEvaluator(labelCol="total_runs", predictionCol="prediction", metricName="rmse").evaluate(pred)
r2 = RegressionEvaluator(labelCol="total_runs", predictionCol="prediction", metricName="r2").evaluate(pred)
print(f"RandomForestRegressor:\nRMSE: {rmse:.3f}\nR2: {r2:.3f}")


25/06/25 05:51:09 WARN DAGScheduler: Broadcasting large task binary with size 1388.0 KiB
25/06/25 05:51:11 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/06/25 05:51:14 WARN DAGScheduler: Broadcasting large task binary with size 4.8 MiB
25/06/25 05:51:17 WARN DAGScheduler: Broadcasting large task binary with size 1144.2 KiB

RandomForestRegressor:
RMSE: 1.596
R2: 0.037


                                                                                