In [None]:
import pyspark as ps
import json
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth
from pyspark.sql.functions import col, explode
from pyspark.sql import functions as F

In [None]:
# Cluster configuration: executor memory has to be registered as a system
# property before the session (and its SparkContext) is created.
master_url = 'spark://spark-master:7077'
SparkContext.setSystemProperty('spark.executor.memory', '24g')
spark = (
    SparkSession.builder
    .master(master_url)
    .appName("data-miner")
    .getOrCreate()
)

In [None]:
# Keep a handle on the SparkContext that backs the session created above.
sc = spark.sparkContext

In [None]:
# Load the raw match dump. Timestamp inference is switched off and the
# reader runs in DROPMALFORMED mode, so unparsable records are skipped.
dataset_path = 'dbfs:/matches-1k.json'
league_df = (
    spark.read
    .option("inferTimestamp", "false")
    .option("mode", "DROPMALFORMED")
    .json(dataset_path)
)

In [None]:
# Inspect the inferred schema of the raw matches before pruning it.
league_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- gameCreation: long (nullable = true)
 |-- gameDuration: long (nullable = true)
 |-- gameId: long (nullable = true)
 |-- gameMode: string (nullable = true)
 |-- gameType: string (nullable = true)
 |-- gameVersion: string (nullable = true)
 |-- mapId: long (nullable = true)
 |-- participantIdentities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- participantId: long (nullable = true)
 |    |    |-- player: struct (nullable = true)
 |    |    |    |-- accountId: string (nullable = true)
 |    |    |    |-- currentAccountId: string (nullable = true)
 |    |    |    |-- currentPlatformId: string (nullable = true)
 |    |    |    |-- matchHistoryUri: string (nullable = true)
 |    |    |    |-- platformId: string (nullable = true)
 |    |    |    |-- profileIcon: long (nullable = true)
 |    |    |    |-- summonerId: string (nullable = true)
 |    |    |    |-- summo

In [None]:
# --- Field lists -----------------------------------------------------------
# Top-level match columns that are removed entirely.
MATCH_COLUMNS_TO_DROP = (
    '_id', 'gameCreation', 'gameDuration', 'gameId', 'gameMode', 'gameType',
    'gameVersion', 'mapId', 'participantIdentities', 'platformId', 'queueId',
    'seasonId', 'teams',
)  # can use teams

# Fields removed from every participant's `stats` struct.
STATS_FIELDS_TO_DROP = (
    'assists', 'champLevel', 'combatPlayerScore',
    'damageDealtToObjectives', 'damageDealtToTurrets', 'damageSelfMitigated', 'deaths', 'doubleKills',
    'firstBloodAssist', 'firstBloodKill', 'firstInhibitorAssist', 'firstInhibitorKill',
    'firstTowerAssist', 'firstTowerKill', 'goldEarned', 'goldSpent', 'inhibitorKills',
    'killingSprees', 'kills', 'largestCriticalStrike', 'largestKillingSpree',
    'largestMultiKill', 'longestTimeSpentLiving', 'magicDamageDealt',
    'magicDamageDealtToChampions', 'magicalDamageTaken', 'neutralMinionsKilled',
    'neutralMinionsKilledEnemyJungle', 'neutralMinionsKilledTeamJungle',
    'objectivePlayerScore', 'participantId', 'pentaKills',
    'perk0Var1', 'perk0Var2', 'perk0Var3', 'perk1Var1', 'perk1Var2',
    'perk1Var3', 'perk2Var1', 'perk2Var2', 'perk2Var3',
    'perk3Var1', 'perk3Var2', 'perk3Var3', 'perk4Var1', 'perk4Var2',
    'perk4Var3', 'perk5Var1', 'perk5Var2', 'perk5Var3', 'perkPrimaryStyle',
    'perkSubStyle', 'physicalDamageDealt', 'physicalDamageDealtToChampions',
    'physicalDamageTaken', 'playerScore0', 'playerScore1', 'playerScore2',
    'playerScore3', 'playerScore4', 'playerScore5', 'playerScore6', 'playerScore7',
    'playerScore8', 'playerScore9', 'quadraKills', 'sightWardsBoughtInGame',
    'statPerk0', 'statPerk1', 'statPerk2', 'timeCCingOthers', 'totalDamageDealt',
    'totalDamageDealtToChampions', 'totalDamageTaken', 'totalHeal', 'totalMinionsKilled',
    'totalPlayerScore', 'totalScoreRank', 'totalTimeCrowdControlDealt', 'totalUnitsHealed',
    'tripleKills', 'trueDamageDealt', 'trueDamageDealtToChampions', 'trueDamageTaken',
    'turretKills', 'unrealKills', 'visionScore', 'visionWardsBoughtInGame', 'wardsKilled',
    'wardsPlaced',
)

# Fields removed from the participant struct itself (list kept verbatim,
# including names that repeat).
PARTICIPANT_FIELDS_TO_DROP = (
    'timeline', 'participantId', 'killingSprees', 'kills', 'largestCriticalStrike',
    'largestKillingSpree', 'largestMultiKill', 'longestTimeSpentLiving', 'magicDamageDealt',
    'magicDamageDealtToChampions', 'magicalDamageTaken', 'neutralMinionsKilled',
    'neutralMinionsKilledEnemyJungle', 'neutralMinionsKilledTeamJungle',
    'objectivePlayerScore', 'participantId', 'pentaKills',
    'perk0', 'perk0Var1', 'perk0Var2', 'perk0Var3',
    'perk1', 'perk1Var1', 'perk1Var2', 'perk1Var3',
    'perk2', 'perk2Var1', 'perk2Var2', 'perk2Var3',
    'perk3', 'perk3Var1', 'perk3Var2', 'perk3Var3',
    'perk4', 'perk4Var1', 'perk4Var2', 'perk4Var3',
    'perk5', 'perk5Var1', 'perk5Var2', 'perk5Var3',
    'perkPrimaryStyle', 'perkSubStyle', 'physicalDamageDealt',
    'physicalDamageDealtToChampions', 'physicalDamageTaken',
    'playerScore0', 'playerScore1', 'playerScore2', 'playerScore3', 'playerScore4',
    'playerScore5', 'playerScore6', 'playerScore7', 'playerScore8', 'playerScore9',
    'quadraKills', 'sightWardsBoughtInGame', 'statPerk0', 'statPerk1', 'statPerk2',
    'timeCCingOthers', 'totalDamageDealt', 'totalDamageDealtToChampions', 'totalDamageTaken',
    'totalHeal', 'totalMinionsKilled', 'totalPlayerScore', 'totalScoreRank',
    'totalTimeCrowdControlDealt', 'totalUnitsHealed', 'tripleKills', 'trueDamageDealt',
    'trueDamageDealtToChampions', 'trueDamageTaken', 'turretKills', 'unrealKills',
    'visionScore', 'visionWardsBoughtInGame', 'wardsKilled', 'wardsPlaced', 'teamId',
)  # can use timeline later

# Fields removed from each `stats.itemN` struct.
ITEM_FIELDS_TO_DROP = ('from', 'gold', 'into', 'stats', 'tags')


def _strip_stats(participant):
    """Return the participant with the unwanted `stats` fields removed."""
    slim_stats = participant["stats"].dropFields(*STATS_FIELDS_TO_DROP)
    return participant.withField("stats", slim_stats)


def _strip_participant(participant):
    """Return the participant without the unwanted participant-level fields."""
    return participant.dropFields(*PARTICIPANT_FIELDS_TO_DROP)


def _slim_item_slot(slot_name):
    """Build a transform that prunes the item struct stored in `stats[slot_name]`."""
    def _apply(participant):
        slim_item = participant["stats"][slot_name].dropFields(*ITEM_FIELDS_TO_DROP)
        return participant.withField(
            "stats", participant["stats"].withField(slot_name, slim_item)
        )
    return _apply


# --- Apply the pruning steps, one withColumn per step, in order --------------
cleansed_league_df = league_df.drop(*MATCH_COLUMNS_TO_DROP)

participant_steps = [_strip_stats, _strip_participant]
participant_steps += [_slim_item_slot("item" + str(slot)) for slot in range(7)]

for step in participant_steps:
    cleansed_league_df = cleansed_league_df.withColumn(
        "participants",
        F.transform(cleansed_league_df["participants"], step),
    )

cleansed_league_df.printSchema()

root
 |-- participants: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- championId: string (nullable = true)
 |    |    |-- spell1Id: string (nullable = true)
 |    |    |-- spell2Id: string (nullable = true)
 |    |    |-- stats: struct (nullable = true)
 |    |    |    |-- item0: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item1: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item2: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item3: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item4: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item5: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item6: struct (nullable = true)
 |   

In [None]:
def explode_df(nested_df):
    """Explode every top-level array column of ``nested_df``.

    Each array column ``c`` is replaced by a column named ``c_exploded``
    holding one array element per row. The explosions are chained, so a
    frame with several array columns yields one row per combination of
    their elements.

    Args:
        nested_df: DataFrame whose top-level array columns should be exploded.

    Returns:
        A new DataFrame; ``nested_df`` itself is returned when it has no
        array columns.
    """
    new_df = nested_df
    for column in nested_df.columns:
        # Inspect the schema of the frame that was passed in, not a notebook
        # global, so the function works for any DataFrame.
        if nested_df.schema[column].dataType.typeName() == 'array':
            # Build on new_df so that earlier explosions are not discarded.
            # Backticks keep column names with special characters parseable.
            new_df = new_df.selectExpr(
                "*", f"explode(`{column}`) as `{column}_exploded`"
            ).drop(column)
    return new_df

# Explode the cleansed frame's array columns and show the resulting schema.
exploded_league_df = explode_df(cleansed_league_df)
exploded_league_df.printSchema()

root
 |-- participants_exploded: struct (nullable = true)
 |    |-- championId: string (nullable = true)
 |    |-- spell1Id: string (nullable = true)
 |    |-- spell2Id: string (nullable = true)
 |    |-- stats: struct (nullable = true)
 |    |    |-- item0: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item1: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item2: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item3: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item4: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item5: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item6: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- perk0: string (nullable = true)
 |    |    |-- perk1: string (

In [None]:
# Drop down to the RDD API; the exploded rows are mapped with plain Python next.
league_rdd = exploded_league_df.rdd

In [None]:
def itemize(record):
    """Turn one exploded participant row into a transaction of labelled items.

    The transaction contains the champion, both summoner spells, the win
    flag, the name of every non-empty item slot (``item0``..``item6``) and
    every non-empty perk slot (``perk0``..``perk5``).

    Args:
        record: Row with a ``participants_exploded`` struct exposing
            ``championId``, ``spell1Id``, ``spell2Id`` and ``stats``.

    Returns:
        list[str]: unique labels such as ``"Champion = ..."`` or
        ``"Item = ..."``, in first-seen order.
    """
    participant = record.participants_exploded
    stats = participant['stats']

    items = []
    # The id fields are nullable; skip missing values rather than failing on
    # ``str + None``.
    if participant.championId is not None:
        items.append("Champion = " + participant.championId)
    for spell in (participant.spell1Id, participant.spell2Id):
        if spell is not None:
            items.append("Spell = " + spell)
    items.append("Win = " + str(stats.win))

    for i in range(7):
        item = stats['item' + str(i)]
        item_name = item['name'] if item else None
        if item_name:
            items.append("Item = " + item_name)

    for i in range(6):
        # Use the perk value itself; per the printed schema the perk fields
        # are plain strings. Reading the leftover `item` variable here would
        # emit the last item slot's name once per perk slot.
        perk = stats['perk' + str(i)]
        if perk:
            items.append("Perk = " + str(perk))

    # Transactions must not contain duplicates (both spells share a prefix
    # and item slots can repeat); dict.fromkeys de-duplicates while keeping
    # a deterministic order.
    return list(dict.fromkeys(items))

In [None]:
# Convert every participant row into its list of labelled items.
items_rdd = league_rdd.map(itemize)

In [None]:
def _largest_then_most_frequent(itemset):
    """Sort key: bigger itemsets first, ties broken by higher frequency."""
    return (-len(itemset.items), -itemset.freq)


# Mine frequent itemsets with FP-Growth, order them, and print each one.
model = FPGrowth.train(items_rdd, minSupport=0.025)
result = model.freqItemsets()
result_sorted = result.sortBy(_largest_then_most_frequent)
fi = result_sorted.collect()
for itemset in fi:
    print(itemset)

FreqItemset(items=['Item = Control Ward', 'Spell = Ignite', 'Perk = Oracle Lens', 'Item = Oracle Lens', 'Win = False', 'Spell = Flash'], freq=419)
FreqItemset(items=['Item = Ninja Tabi', 'Spell = Teleport', 'Item = Warding Totem (Trinket)', 'Perk = Warding Totem (Trinket)', 'Win = False', 'Spell = Flash'], freq=418)
FreqItemset(items=['Item = Enchantment: Runic Echoes', "Item = Sorcerer's Shoes", 'Spell = Smite', 'Perk = Oracle Lens', 'Item = Oracle Lens', 'Spell = Flash'], freq=405)
FreqItemset(items=['Item = Infinity Edge', "Item = Berserker's Greaves", 'Spell = Heal', 'Item = Farsight Alteration', 'Perk = Farsight Alteration', 'Spell = Flash'], freq=392)
FreqItemset(items=['Item = Ninja Tabi', 'Spell = Teleport', 'Item = Warding Totem (Trinket)', 'Perk = Warding Totem (Trinket)', 'Win = True', 'Spell = Flash'], freq=377)
FreqItemset(items=["Item = Berserker's Greaves", 'Spell = Heal', 'Item = Farsight Alteration', 'Perk = Farsight Alteration', 'Win = False', 'Spell = Flash'], freq=3