In [1]:
import pyspark as ps
import json
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth
from pyspark.sql.functions import col, explode
from pyspark.sql import functions as F

In [2]:
master_url = 'local[*]'
#SparkContext.setSystemProperty('spark.executor.memory', '24g')
spark = SparkSession.builder.master(master_url).appName("data-miner").getOrCreate()

In [3]:
sc = spark.sparkContext

In [4]:
dataset_path = './matches-1k.json'
league_df = spark.read.option("inferTimestamp", "false").option("mode", "DROPMALFORMED").json(dataset_path)

In [5]:
league_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- gameCreation: long (nullable = true)
 |-- gameDuration: long (nullable = true)
 |-- gameId: long (nullable = true)
 |-- gameMode: string (nullable = true)
 |-- gameType: string (nullable = true)
 |-- gameVersion: string (nullable = true)
 |-- mapId: long (nullable = true)
 |-- participantIdentities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- participantId: long (nullable = true)
 |    |    |-- player: struct (nullable = true)
 |    |    |    |-- accountId: string (nullable = true)
 |    |    |    |-- currentAccountId: string (nullable = true)
 |    |    |    |-- currentPlatformId: string (nullable = true)
 |    |    |    |-- matchHistoryUri: string (nullable = true)
 |    |    |    |-- platformId: string (nullable = true)
 |    |    |    |-- profileIcon: long (nullable = true)
 |    |    |    |-- summonerId: string (nullable = true)
 |    |    |    |-- summo

In [6]:
cleansed_league_df = league_df
cleansed_league_df = cleansed_league_df.drop('_id', 'gameCreation', 'gameDuration', 'gameId', 'gameMode', 'gameType', 'gameVersion', 'mapId', 'participantIdentities', 'platformId', 'queueId', 'seasonId', 'teams') # can use teams
cleansed_league_df = cleansed_league_df.withColumn("participants",
    F.transform(
        cleansed_league_df["participants"],
        lambda x: x.withField("stats", 
                                x["stats"].dropFields('champLevel', 'combatPlayerScore',
                                    'damageDealtToObjectives', 'damageDealtToTurrets', 'damageSelfMitigated', 'doubleKills',
                                    'firstBloodAssist', 'firstBloodKill', 'firstInhibitorAssist', 'firstInhibitorKill', 
                                    'firstTowerAssist', 'firstTowerKill', 'goldSpent', 'inhibitorKills',
                                    'killingSprees', 'largestCriticalStrike', 'largestKillingSpree',
                                    'largestMultiKill', 'longestTimeSpentLiving', 'magicDamageDealt', 
                                    'magicDamageDealtToChampions', 'magicalDamageTaken', 'neutralMinionsKilled', 
                                    'neutralMinionsKilledEnemyJungle', 'neutralMinionsKilledTeamJungle', 
                                    'objectivePlayerScore', 'participantId', 'pentaKills', 
                                    'perk0Var1', 'perk0Var2', 'perk0Var3', 'perk1Var1', 'perk1Var2', 
                                    'perk1Var3', 'perk2Var1', 'perk2Var2', 'perk2Var3', 
                                    'perk3Var1', 'perk3Var2', 'perk3Var3', 'perk4Var1', 'perk4Var2', 
                                    'perk4Var3', 'perk5Var1', 'perk5Var2', 'perk5Var3', 'perkPrimaryStyle', 
                                    'perkSubStyle', 'physicalDamageDealt', 'physicalDamageDealtToChampions', 
                                    'physicalDamageTaken', 'playerScore0', 'playerScore1', 'playerScore2', 
                                    'playerScore3', 'playerScore4', 'playerScore5', 'playerScore6', 'playerScore7', 
                                    'playerScore8', 'playerScore9', 'quadraKills', 'sightWardsBoughtInGame', 
                                    'statPerk0', 'statPerk1', 'statPerk2', 'timeCCingOthers', 'totalDamageDealt', 
                                    'totalMinionsKilled', 
                                    'totalPlayerScore', 'totalScoreRank', 'totalTimeCrowdControlDealt', 'totalUnitsHealed', 
                                    'tripleKills', 'trueDamageDealt', 'trueDamageDealtToChampions', 'trueDamageTaken', 
                                    'turretKills', 'unrealKills', 'visionScore', 'visionWardsBoughtInGame', 'wardsKilled', 
                                    'wardsPlaced')
                            )
    )
)
cleansed_league_df = cleansed_league_df.withColumn("participants", 
    F.transform(
        cleansed_league_df["participants"],
        lambda x: x.dropFields('timeline', 'participantId', 'killingSprees', 'kills', 'largestCriticalStrike', 'largestKillingSpree', 'largestMultiKill', 'longestTimeSpentLiving', 'magicDamageDealt', 'magicDamageDealtToChampions', 'magicalDamageTaken', 'neutralMinionsKilled', 'neutralMinionsKilledEnemyJungle', 'neutralMinionsKilledTeamJungle', 'objectivePlayerScore', 'participantId', 'pentaKills', 'perk0', 'perk0Var1', 'perk0Var2', 'perk0Var3', 'perk1', 'perk1Var1', 'perk1Var2', 'perk1Var3', 'perk2', 'perk2Var1', 'perk2Var2', 'perk2Var3', 'perk3', 'perk3Var1', 'perk3Var2', 'perk3Var3', 'perk4', 'perk4Var1', 'perk4Var2', 'perk4Var3', 'perk5', 'perk5Var1', 'perk5Var2', 'perk5Var3', 'perkPrimaryStyle', 'perkSubStyle', 'physicalDamageDealt', 'physicalDamageDealtToChampions', 'physicalDamageTaken', 'playerScore0', 'playerScore1', 'playerScore2', 'playerScore3', 'playerScore4', 'playerScore5', 'playerScore6', 'playerScore7', 'playerScore8', 'playerScore9', 'quadraKills', 'sightWardsBoughtInGame', 'statPerk0', 'statPerk1', 'statPerk2', 'timeCCingOthers', 'totalDamageDealt', 'totalDamageDealtToChampions', 'totalDamageTaken', 'totalHeal', 'totalMinionsKilled', 'totalPlayerScore', 'totalScoreRank', 'totalTimeCrowdControlDealt', 'totalUnitsHealed', 'tripleKills', 'trueDamageDealt', 'trueDamageDealtToChampions', 'trueDamageTaken', 'turretKills', 'unrealKills', 'visionScore', 'visionWardsBoughtInGame', 'wardsKilled', 'wardsPlaced', 'teamId') # can use timeline later
    )
)
for i in range(7):
    item = "item" + str(i)
    cleansed_league_df = cleansed_league_df.withColumn("participants", 
        F.transform(
            cleansed_league_df["participants"],
            lambda x: x.withField("stats" , 
                x["stats"].withField(item, 
                    x["stats"][item].dropFields('from', 'gold', 'into', 'stats', 'tags')
                )
            )
        )
    )

cleansed_league_df.printSchema()

root
 |-- participants: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- championId: string (nullable = true)
 |    |    |-- spell1Id: string (nullable = true)
 |    |    |-- spell2Id: string (nullable = true)
 |    |    |-- stats: struct (nullable = true)
 |    |    |    |-- assists: long (nullable = true)
 |    |    |    |-- deaths: long (nullable = true)
 |    |    |    |-- goldEarned: long (nullable = true)
 |    |    |    |-- item0: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item1: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item2: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item3: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item4: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |

In [7]:
def explode_df(nested_df):
    new_df = nested_df
    for column in nested_df.columns:
        if cleansed_league_df.schema[column].dataType.typeName() == 'array':
            new_df = nested_df.selectExpr("*", f"explode({column}) as {column}_exploded").drop(column)
    return new_df

exploded_league_df = explode_df(cleansed_league_df)
exploded_league_df.printSchema()

root
 |-- participants_exploded: struct (nullable = true)
 |    |-- championId: string (nullable = true)
 |    |-- spell1Id: string (nullable = true)
 |    |-- spell2Id: string (nullable = true)
 |    |-- stats: struct (nullable = true)
 |    |    |-- assists: long (nullable = true)
 |    |    |-- deaths: long (nullable = true)
 |    |    |-- goldEarned: long (nullable = true)
 |    |    |-- item0: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item1: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item2: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item3: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item4: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item5: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item6: struct (nu

In [8]:
def categorize(value,mean,std_dev):
    if value <= mean - std_dev:
        return "Low"
    elif value <= mean + std_dev:
        return "Medium"
    else:
        return "High"

In [9]:
MeanDeaths = exploded_league_df.agg(F.mean("participants_exploded.stats.deaths")).collect()[0][0]
StdDevDeaths = exploded_league_df.agg(F.stddev("participants_exploded.stats.deaths")).collect()[0][0]

MeanKills = exploded_league_df.agg(F.mean("participants_exploded.stats.kills")).collect()[0][0]
StdDevKills = exploded_league_df.agg(F.stddev("participants_exploded.stats.kills")).collect()[0][0]

MeanAssists = exploded_league_df.agg(F.mean("participants_exploded.stats.assists")).collect()[0][0]
StdDevAssists = exploded_league_df.agg(F.stddev("participants_exploded.stats.assists")).collect()[0][0]

MeanGoldEarned = exploded_league_df.agg(F.mean("participants_exploded.stats.goldEarned")).collect()[0][0]
StdDevGoldEarned = exploded_league_df.agg(F.stddev("participants_exploded.stats.goldEarned")).collect()[0][0]

MeanTotalDamageDealtToChampions = exploded_league_df.agg(F.mean("participants_exploded.stats.totalDamageDealtToChampions")).collect()[0][0]
StdDevTotalDamageDealtToChampions = exploded_league_df.agg(F.stddev("participants_exploded.stats.totalDamageDealtToChampions")).collect()[0][0]

MeanTotalDamageTaken = exploded_league_df.agg(F.mean("participants_exploded.stats.totalDamageTaken")).collect()[0][0]
StdDevTotalDamageTaken = exploded_league_df.agg(F.stddev("participants_exploded.stats.totalDamageTaken")).collect()[0][0]

MeanTotalHeal = exploded_league_df.agg(F.mean("participants_exploded.stats.totalHeal")).collect()[0][0]
StdDevTotalHeal = exploded_league_df.agg(F.stddev("participants_exploded.stats.totalHeal")).collect()[0][0]

MeantotalDamageDealtToChampions = exploded_league_df.agg(F.mean("participants_exploded.stats.totalDamageDealtToChampions")).collect()[0][0]
StdDevtotalDamageDealtToChampions = exploded_league_df.agg(F.stddev("participants_exploded.stats.totalDamageDealtToChampions")).collect()[0][0]

print(MeanDeaths,StdDevDeaths)
print(MeanKills,StdDevKills)
print(MeanAssists,StdDevAssists)
print(MeanGoldEarned,StdDevGoldEarned)
print(MeanTotalDamageDealtToChampions,StdDevTotalDamageDealtToChampions)
print(MeanTotalDamageTaken,StdDevTotalDamageTaken)
print(MeanTotalHeal,StdDevTotalHeal)
print(MeantotalDamageDealtToChampions,StdDevtotalDamageDealtToChampions)

categorize(10,MeanDeaths,StdDevDeaths)

5.2353 2.8521478546419137
5.2222 4.0593441573114655
7.9182 5.581919431741583
10306.0865 3502.812180939095
15328.9367 9464.465698347085
20153.444 10121.023860858335
5607.1283 5725.508000607954
15328.9367 9464.465698347085


'High'

In [10]:
league_rdd = exploded_league_df.rdd

In [11]:
def itemize(record):
    items = []
    participant = record.participants_exploded
    items.append("Champion = "+ participant.championId)
    items.append("Spell = "+ participant.spell1Id)
    items.append("Spell = "+ participant.spell2Id)
    stats = participant['stats']
    items.append("Win = " + str(stats.win))
    for i in range(7):
        item = stats['item' + str(i)]
        item_name = item['name'] if item else None
        if item_name: items.append("Item = " + item_name)
    for i in range(6):
        perk = stats['perk' + str(i)]
        perk_name = item['name'] if perk else None
        if perk_name: items.append("Perk = " + perk_name)
    return list(set(items))

In [12]:
items_rdd = league_rdd.map(itemize)

In [13]:
minSupport = 0.025

In [14]:
model = FPGrowth.train(items_rdd, minSupport=minSupport)
result = model.freqItemsets()
result_sorted = result.sortBy(lambda x : (-len(x.items), -x.freq))
fi = result_sorted.collect()
for i in fi:
    print(i)

Py4JJavaError: An error occurred while calling o381.trainFPGrowthModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 11 in stage 49.0 failed 1 times, most recent failure: Lost task 11.0 in stage 49.0 (TID 248) (LAPTOP-UROEI7PJ executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.base/java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.PlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.ServerSocket.implAccept(Unknown Source)
	at java.base/java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 21 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1266)
	at org.apache.spark.mllib.fpm.FPGrowth.run(FPGrowth.scala:216)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainFPGrowthModel(PythonMLLibAPI.scala:573)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.base/java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.base/java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.PlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.ServerSocket.implAccept(Unknown Source)
	at java.base/java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 21 more


In [None]:
from itertools import combinations
data = items_rdd
min_support = 0.025

item_counts = data.flatMap(lambda transaction: [(item, 1) for item in transaction]).reduceByKey(lambda a, b: a + b)
total_count = data.count()

sup = int(min_support * total_count)
freq_items = item_counts.filter(lambda x: x[1] >= sup)
rules = []
k = 2
while freq_items.count() > 0:
    rules.append(freq_items.collect())
    candidate_counts = data.flatMap(lambda transaction: [(pair, 1) for pair in combinations(transaction, k)]).reduceByKey(lambda a, b: a + b)
    freq_items = candidate_counts.filter(lambda x: x[1] >= sup).map(lambda x: (x[0], x[1]))
    k += 1

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\Ahmed\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ahmed\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ahmed\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
min_support = minSupport
total_count = items_rdd.count()
sup = int(total_count * min_support)

broadcasted_items = sc.broadcast(items_rdd.map(lambda x: set(x)).collect())
def sup_filter(x):
    x_sup = len([1 for t in broadcasted_items.value if x.issubset(t)])
    if x_sup >= sup:
        return x, x_sup
    return None

rules = []

k = 1
ck = items_rdd.flatMap(lambda x: set(x)).distinct().collect()
ck = [{x} for x in ck]

while len(ck) > 0:
    fk = sc.parallelize(ck).map(sup_filter).filter(lambda x: x is not None).collect()
    if len(fk): rules.append(fk)
    k += 1
    f_k_items = [item for item in map(lambda x: x[0], fk)]
    ck = [i1 | i2 for i, i1 in enumerate(f_k_items) for i2 in f_k_items[i + 1:] if list(i1)[:k - 2] == list(i2)[:k - 2]]

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 12 in stage 3.0 failed 1 times, most recent failure: Lost task 12.0 in stage 3.0 (TID 51) (LAPTOP-UROEI7PJ executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.base/java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.PlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.ServerSocket.implAccept(Unknown Source)
	at java.base/java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 15 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.base/java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.base/java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.base/java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.PlainSocketImpl.accept(Unknown Source)
	at java.base/java.net.ServerSocket.implAccept(Unknown Source)
	at java.base/java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 15 more


In [None]:
print(rules[5])

[({'Perk = Oracle Lens', 'Item = Oracle Lens', 'Win = True', 'Item = Boots of Mobility', 'Spell = Flash', 'Spell = Ignite'}, 287), ({'Item = Control Ward', 'Perk = Oracle Lens', 'Item = Oracle Lens', 'Win = True', 'Spell = Flash', 'Spell = Ignite'}, 300), ({'Perk = Oracle Lens', 'Item = Oracle Lens', 'Win = False', 'Item = Enchantment: Runic Echoes', 'Spell = Flash', 'Spell = Smite'}, 263), ({"Item = Sorcerer's Shoes", 'Perk = Oracle Lens', 'Item = Oracle Lens', 'Item = Enchantment: Runic Echoes', 'Spell = Flash', 'Spell = Smite'}, 405), ({'Perk = Oracle Lens', 'Item = Oracle Lens', 'Win = False', 'Item = Boots of Mobility', 'Spell = Flash', 'Spell = Ignite'}, 286), ({'Item = Control Ward', 'Perk = Oracle Lens', 'Item = Oracle Lens', 'Win = False', 'Spell = Flash', 'Spell = Ignite'}, 419), ({'Item = Control Ward', 'Perk = Oracle Lens', 'Item = Oracle Lens', 'Item = Boots of Mobility', 'Spell = Flash', 'Spell = Ignite'}, 282), ({'Spell = Teleport', 'Win = True', 'Item = Warding Totem (T