In [0]:
import pyspark as ps
import json
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth
from pyspark.sql.functions import col, explode
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

In [0]:
master_url = 'spark://spark-master:7077'
SparkContext.setSystemProperty('spark.executor.memory', '24g')
spark = SparkSession.builder.master(master_url).appName("data-miner").getOrCreate()

In [0]:
sc = spark.sparkContext

In [0]:
dataset_path = 'dbfs:/matches-1k.json'
league_df = spark.read.option("inferTimestamp", "false").option("mode", "DROPMALFORMED").json(dataset_path)

In [0]:
league_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- gameCreation: long (nullable = true)
 |-- gameDuration: long (nullable = true)
 |-- gameId: long (nullable = true)
 |-- gameMode: string (nullable = true)
 |-- gameType: string (nullable = true)
 |-- gameVersion: string (nullable = true)
 |-- mapId: long (nullable = true)
 |-- participantIdentities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- participantId: long (nullable = true)
 |    |    |-- player: struct (nullable = true)
 |    |    |    |-- accountId: string (nullable = true)
 |    |    |    |-- currentAccountId: string (nullable = true)
 |    |    |    |-- currentPlatformId: string (nullable = true)
 |    |    |    |-- matchHistoryUri: string (nullable = true)
 |    |    |    |-- platformId: string (nullable = true)
 |    |    |    |-- profileIcon: long (nullable = true)
 |    |    |    |-- summonerId: string (nullable = true)
 |    |    |    |-- summo

In [0]:
cleansed_league_df = league_df
cleansed_league_df = cleansed_league_df.drop('_id', 'gameCreation', 'gameDuration', 'gameId', 'gameMode', 'gameType', 'gameVersion', 'mapId', 'participantIdentities', 'platformId', 'queueId', 'seasonId', 'teams') # can use teams
cleansed_league_df = cleansed_league_df.withColumn("participants",
    F.transform(
        cleansed_league_df["participants"],
        lambda x: x.withField("stats", 
                                x["stats"].dropFields('champLevel', 'combatPlayerScore',
                                    'damageDealtToObjectives', 'damageDealtToTurrets', 'damageSelfMitigated', 'doubleKills',
                                    'firstBloodAssist', 'firstBloodKill', 'firstInhibitorAssist', 'firstInhibitorKill', 
                                    'firstTowerAssist', 'firstTowerKill', 'goldSpent', 'inhibitorKills',
                                    'killingSprees', 'largestCriticalStrike', 'largestKillingSpree',
                                    'largestMultiKill', 'longestTimeSpentLiving', 'magicDamageDealt', 
                                    'magicDamageDealtToChampions', 'magicalDamageTaken', 'neutralMinionsKilled', 
                                    'neutralMinionsKilledEnemyJungle', 'neutralMinionsKilledTeamJungle', 
                                    'objectivePlayerScore', 'participantId', 'pentaKills', 
                                    'perk0Var1', 'perk0Var2', 'perk0Var3', 'perk1Var1', 'perk1Var2', 
                                    'perk1Var3', 'perk2Var1', 'perk2Var2', 'perk2Var3', 
                                    'perk3Var1', 'perk3Var2', 'perk3Var3', 'perk4Var1', 'perk4Var2', 
                                    'perk4Var3', 'perk5Var1', 'perk5Var2', 'perk5Var3', 'perkPrimaryStyle', 
                                    'perkSubStyle', 'physicalDamageDealt', 'physicalDamageDealtToChampions', 
                                    'physicalDamageTaken', 'playerScore0', 'playerScore1', 'playerScore2', 
                                    'playerScore3', 'playerScore4', 'playerScore5', 'playerScore6', 'playerScore7', 
                                    'playerScore8', 'playerScore9', 'quadraKills', 'sightWardsBoughtInGame', 
                                    'statPerk0', 'statPerk1', 'statPerk2', 'timeCCingOthers', 'totalDamageDealt', 
                                    'totalMinionsKilled', 
                                    'totalPlayerScore', 'totalScoreRank', 'totalTimeCrowdControlDealt', 'totalUnitsHealed', 
                                    'tripleKills', 'trueDamageDealt', 'trueDamageDealtToChampions', 'trueDamageTaken', 
                                    'turretKills', 'unrealKills', 'visionScore', 'wardsKilled', 
                                    'wardsPlaced')
                            )
    )
)
cleansed_league_df = cleansed_league_df.withColumn("participants", 
    F.transform(
        cleansed_league_df["participants"],
        lambda x: x.dropFields('timeline', 'participantId', 'killingSprees', 'kills', 'largestCriticalStrike', 'largestKillingSpree', 'largestMultiKill', 'longestTimeSpentLiving', 'magicDamageDealt', 'magicDamageDealtToChampions', 'magicalDamageTaken', 'neutralMinionsKilled', 'neutralMinionsKilledEnemyJungle', 'neutralMinionsKilledTeamJungle', 'objectivePlayerScore', 'participantId', 'pentaKills', 'perk0', 'perk0Var1', 'perk0Var2', 'perk0Var3', 'perk1', 'perk1Var1', 'perk1Var2', 'perk1Var3', 'perk2', 'perk2Var1', 'perk2Var2', 'perk2Var3', 'perk3', 'perk3Var1', 'perk3Var2', 'perk3Var3', 'perk4', 'perk4Var1', 'perk4Var2', 'perk4Var3', 'perk5', 'perk5Var1', 'perk5Var2', 'perk5Var3', 'perkPrimaryStyle', 'perkSubStyle', 'physicalDamageDealt', 'physicalDamageDealtToChampions', 'physicalDamageTaken', 'playerScore0', 'playerScore1', 'playerScore2', 'playerScore3', 'playerScore4', 'playerScore5', 'playerScore6', 'playerScore7', 'playerScore8', 'playerScore9', 'quadraKills', 'sightWardsBoughtInGame', 'statPerk0', 'statPerk1', 'statPerk2', 'timeCCingOthers', 'totalDamageDealt', 'totalDamageDealtToChampions', 'totalDamageTaken', 'totalHeal', 'totalMinionsKilled', 'totalPlayerScore', 'totalScoreRank', 'totalTimeCrowdControlDealt', 'totalUnitsHealed', 'tripleKills', 'trueDamageDealt', 'trueDamageDealtToChampions', 'trueDamageTaken', 'turretKills', 'unrealKills', 'visionScore', 'wardsKilled', 'wardsPlaced', 'teamId')
    )
)
for i in range(7):
    item = "item" + str(i)
    cleansed_league_df = cleansed_league_df.withColumn("participants", 
        F.transform(
            cleansed_league_df["participants"],
            lambda x: x.withField("stats" , 
                x["stats"].withField(item, 
                    x["stats"][item].dropFields('from', 'gold', 'into', 'stats', 'tags')
                )
            )
        )
    )

cleansed_league_df.printSchema()

root
 |-- participants: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- championId: string (nullable = true)
 |    |    |-- spell1Id: string (nullable = true)
 |    |    |-- spell2Id: string (nullable = true)
 |    |    |-- stats: struct (nullable = true)
 |    |    |    |-- assists: long (nullable = true)
 |    |    |    |-- deaths: long (nullable = true)
 |    |    |    |-- goldEarned: long (nullable = true)
 |    |    |    |-- item0: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item1: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item2: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item3: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- item4: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |

In [0]:
def explode_df(nested_df):
    new_df = nested_df
    for column in nested_df.columns:
        if cleansed_league_df.schema[column].dataType.typeName() == 'array':
            new_df = nested_df.selectExpr("*", f"explode({column}) as {column}_exploded").drop(column)
    return new_df

exploded_league_df = explode_df(cleansed_league_df)
exploded_league_df.printSchema()

root
 |-- participants_exploded: struct (nullable = true)
 |    |-- championId: string (nullable = true)
 |    |-- spell1Id: string (nullable = true)
 |    |-- spell2Id: string (nullable = true)
 |    |-- stats: struct (nullable = true)
 |    |    |-- assists: long (nullable = true)
 |    |    |-- deaths: long (nullable = true)
 |    |    |-- goldEarned: long (nullable = true)
 |    |    |-- item0: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item1: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item2: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item3: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item4: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item5: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- item6: struct (nu

In [0]:
def categorize(value,mean,std_dev):
    if value <= mean - std_dev:
        return "Low"
    elif value <= mean + std_dev:
        return "Medium"
    else:
        return "High"

In [0]:
MeanDeaths = exploded_league_df.agg(F.mean("participants_exploded.stats.deaths")).collect()[0][0]
StdDevDeaths = exploded_league_df.agg(F.stddev("participants_exploded.stats.deaths")).collect()[0][0]

MeanKills = exploded_league_df.agg(F.mean("participants_exploded.stats.kills")).collect()[0][0]
StdDevKills = exploded_league_df.agg(F.stddev("participants_exploded.stats.kills")).collect()[0][0]

MeanAssists = exploded_league_df.agg(F.mean("participants_exploded.stats.assists")).collect()[0][0]
StdDevAssists = exploded_league_df.agg(F.stddev("participants_exploded.stats.assists")).collect()[0][0]

MeanGoldEarned = exploded_league_df.agg(F.mean("participants_exploded.stats.goldEarned")).collect()[0][0]
StdDevGoldEarned = exploded_league_df.agg(F.stddev("participants_exploded.stats.goldEarned")).collect()[0][0]

MeanTotalDamageDealtToChampions = exploded_league_df.agg(F.mean("participants_exploded.stats.totalDamageDealtToChampions")).collect()[0][0]
StdDevTotalDamageDealtToChampions = exploded_league_df.agg(F.stddev("participants_exploded.stats.totalDamageDealtToChampions")).collect()[0][0]

MeanTotalDamageTaken = exploded_league_df.agg(F.mean("participants_exploded.stats.totalDamageTaken")).collect()[0][0]
StdDevTotalDamageTaken = exploded_league_df.agg(F.stddev("participants_exploded.stats.totalDamageTaken")).collect()[0][0]

MeanTotalHeal = exploded_league_df.agg(F.mean("participants_exploded.stats.totalHeal")).collect()[0][0]
StdDevTotalHeal = exploded_league_df.agg(F.stddev("participants_exploded.stats.totalHeal")).collect()[0][0]

MeanVisionWardsBoughtInGame = exploded_league_df.agg(F.mean("participants_exploded.stats.visionWardsBoughtInGame")).collect()[0][0]
StdDevVisionWardsBoughtInGame = exploded_league_df.agg(F.stddev("participants_exploded.stats.visionWardsBoughtInGame")).collect()[0][0]

# print(MeanDeaths,StdDevDeaths)
# print(MeanKills,StdDevKills)
# print(MeanAssists,StdDevAssists)
# print(MeanGoldEarned,StdDevGoldEarned)
# print(MeanTotalDamageDealtToChampions,StdDevTotalDamageDealtToChampions)
# print(MeanTotalDamageTaken,StdDevTotalDamageTaken)
# print(MeanTotalHeal,StdDevTotalHeal)
# print(MeantotalDamageDealtToChampions,StdDevtotalDamageDealtToChampions)

# categorize(10,MeanDeaths,StdDevDeaths)

In [0]:
league_rdd = exploded_league_df.rdd

In [0]:
def itemize(record):
    items = []
    participant = record.participants_exploded
    items.append("Champion = "+ participant.championId)
    items.append("Spell = "+ participant.spell1Id)
    items.append("Spell = "+ participant.spell2Id)
    stats = participant['stats']
    items.append("Win = " + str(stats.win))
    for i in range(7):
        item = stats['item' + str(i)]
        item_name = item['name'] if item else None
        if item_name: items.append("Item = " + item_name)
    for i in range(6):
        perk = stats['perk' + str(i)]
        perk_name = item['name'] if perk else None
        if perk_name: items.append("Perk = " + perk_name)
    damage_done = stats["totalDamageDealtToChampions"]
    items.append("Total Damage Done = " + categorize(damage_done, MeanTotalDamageDealtToChampions, StdDevTotalDamageDealtToChampions))
    # damage_taken = stats["totalDamageTaken"]
    # items.append("Total Damage Taken = " + categorize(damage_taken, MeanTotalDamageTaken, StdDevTotalDamageTaken))
    # deaths = stats["deaths"]
    # items.append("Deaths = " + categorize(deaths, MeanDeaths, StdDevDeaths))
    kills = stats["kills"]
    items.append("Kills = " + categorize(kills, MeanKills, StdDevKills))
    # assists = stats["assists"]
    # items.append("Assists = " + categorize(assists, MeanAssists, StdDevAssists))
    gold_earned = stats["goldEarned"]
    items.append("Gold Earned = " + categorize(gold_earned, MeanGoldEarned, StdDevGoldEarned))
    # total_heal = stats["totalHeal"]
    # items.append("Total Heal = " + categorize(total_heal, MeanTotalHeal, StdDevTotalHeal))
    ward_bought = stats["visionWardsBoughtInGame"]
    items.append("Wards Bought = " + categorize(ward_bought, MeanVisionWardsBoughtInGame, StdDevVisionWardsBoughtInGame))
    return set(items)

In [0]:
items_rdd = league_rdd.map(itemize)

In [0]:
minSupport = 0.025

In [0]:
model = FPGrowth.train(items_rdd, minSupport=minSupport)
result = model.freqItemsets()
result_sorted = result.sortBy(lambda x : (-len(x.items), -x.freq))
fi = result_sorted.collect()
for i in fi:
    print(i)

FreqItemset(items=['Spell = Teleport', 'Item = Warding Totem (Trinket)', 'Perk = Warding Totem (Trinket)', 'Win = True', 'Kills = Medium', 'Gold Earned = Medium', 'Total Damage Done = Medium', 'Wards Bought = Medium', 'Spell = Flash'], freq=416)
FreqItemset(items=['Spell = Teleport', 'Item = Warding Totem (Trinket)', 'Perk = Warding Totem (Trinket)', 'Win = False', 'Kills = Medium', 'Gold Earned = Medium', 'Total Damage Done = Medium', 'Wards Bought = Medium', 'Spell = Flash'], freq=413)
FreqItemset(items=['Item = Ninja Tabi', 'Spell = Teleport', 'Item = Warding Totem (Trinket)', 'Perk = Warding Totem (Trinket)', 'Kills = Medium', 'Gold Earned = Medium', 'Total Damage Done = Medium', 'Wards Bought = Medium', 'Spell = Flash'], freq=391)
FreqItemset(items=['Spell = Smite', 'Perk = Oracle Lens', 'Item = Oracle Lens', 'Win = False', 'Kills = Medium', 'Gold Earned = Medium', 'Total Damage Done = Medium', 'Wards Bought = Medium', 'Spell = Flash'], freq=301)
FreqItemset(items=["Item = Berserk

In [0]:
# from itertools import combinations
# data = items_rdd
# min_support = minSupport

# item_counts = data.flatMap(lambda transaction: [(item, 1) for item in transaction]).reduceByKey(lambda a, b: a + b)
# total_count = data.count()

# sup = int(min_support * total_count)
# freq_items = item_counts.filter(lambda x: x[1] >= sup)
# rules = []
# k = 2
# while freq_items.count() > 0:
#     rules.append(freq_items.collect())
#     candidate_counts = data.flatMap(lambda transaction: [(pair, 1) for pair in combinations(transaction, k)]).reduceByKey(lambda a, b: a + b)
#     freq_items = candidate_counts.filter(lambda x: x[1] >= sup).map(lambda x: (x[0], x[1]))
#     k += 1

In [0]:
print(items_rdd.take(1))

[{'Total Damage Done = Medium', "Item = Mercury's Treads", 'Kills = Medium', 'Wards Bought = Medium', 'Item = Sapphire Crystal', 'Item = Shard of True Ice', 'Item = Oracle Lens', 'Win = False', 'Perk = Oracle Lens', 'Gold Earned = Medium', "Item = Athene's Unholy Grail", 'Item = Ardent Censer', 'Spell = Flash', 'Item = Cloth Armor', 'Champion = Karma', 'Spell = Ignite'}]


In [0]:
cause = ("Item", "Spell", "Perk", "Champion", "Wards Bought")
# effect = ("Win", "Total Damage Done", "Total Damage Taken", "Deaths", "Kills", "Assists", "Gold Earned", "Total Heal")
effect = ("Win", "Total Damage Done", "Kills", "Assists", "Gold Earned")

In [0]:
# cause = ("Item", "Spell", "Perk", "Champion")
# effect = ("Win")
# x = set(["Item = Mercury's Treads", 'Item = Sapphire Crystal', 'Item = Shard of True Ice', 'Item = Oracle Lens', 'Perk = Oracle Lens', "Item = Athene's Unholy Grail", 'Item = Ardent Censer', 'Spell = Flash', 'Item = Cloth Armor', 'Champion = Karma', 'Spell = Ignite'])

# lhs = set([name for name in x if any(name.startswith(c) for c in cause)])
# rhs = set([name for name in x if any(name.startswith(e) for e in effect)])
# print(rhs)
# rule_conf = len([1 for t in broadcasted_items.value if lhs.issubset(t)])
# print(rule_conf)

In [0]:
items_rdd.cache()

Out[136]: PythonRDD[1039] at RDD at PythonRDD.scala:58

In [0]:
min_support = minSupport
total_count = items_rdd.count()
sup = int(total_count * min_support)

broadcasted_items = sc.broadcast(items_rdd.map(lambda x: set(x)).collect())

def sup_filter(x):
    cnt_join = len([1 for t in broadcasted_items.value if x.issubset(t)])
    if cnt_join >= sup:
        lhs = set([name for name in x if any(name.startswith(c) for c in cause)])
        rule_conf = len([1 for t in broadcasted_items.value if lhs.issubset(t)])
        if rule_conf == 0: return None
        return x, cnt_join / total_count, cnt_join / rule_conf
    return None

rules = []

k = 1
ck = items_rdd.flatMap(lambda x: set(x)).distinct().collect()
ck = [{x} for x in ck]

while len(ck) > 0:
    fk = sc.parallelize(ck).map(sup_filter).filter(lambda x: x is not None).collect()
    if len(fk): rules.append(fk)
    k += 1
    f_k_items = [item for item in map(lambda x: x[0], fk)]
    ck = [i1 | i2 for i, i1 in enumerate(f_k_items) for i2 in f_k_items[i + 1:] if list(i1)[:k - 2] == list(i2)[:k - 2]]

In [0]:
for rule in reversed(rules):
    rule.sort(key=lambda x: -x[2])
    for sub in rule:
        items = sub[0]
        sup = sub[1]
        conf = sub[2]
        lhs = set([name for name in items if any(name.startswith(c) for c in cause)])
        rhs = set([name for name in items if any(name.startswith(e) for e in effect)])
        if (len(lhs) == 0 or len(rhs) == 0): continue
        print(f"{' and '.join(lhs)} -> {' and '.join(rhs)}")
        print(f"Support: {sup}, Confidence: {conf}\n")

Wards Bought = Medium and Item = Doran's Blade and Perk = Farsight Alteration and Spell = Flash and Item = Farsight Alteration and Item = Berserker's Greaves -> Total Damage Done = Medium and Kills = Medium and Gold Earned = Medium
Support: 0.0264, Confidence: 0.639225181598063

Spell = Teleport and Wards Bought = Medium and Item = Warding Totem (Trinket) and Spell = Flash and Perk = Warding Totem (Trinket) and Item = Ninja Tabi -> Total Damage Done = Medium and Kills = Medium and Gold Earned = Medium
Support: 0.0391, Confidence: 0.6147798742138365

Wards Bought = Medium and Perk = Farsight Alteration and Spell = Flash and Spell = Heal and Item = Farsight Alteration and Item = Berserker's Greaves -> Total Damage Done = Medium and Kills = Medium and Gold Earned = Medium
Support: 0.0299, Confidence: 0.49833333333333335

Spell = Teleport and Item = Warding Totem (Trinket) and Spell = Flash and Perk = Warding Totem (Trinket) and Item = Ninja Tabi -> Total Damage Done = Medium and Kills = M

In [0]:
import re
query_str = "Item = Warding Totem (Trinket) and Spell = Teleport and Spell = Flash and Perk = Warding Totem (Trinket) and Item = Ninja Tabi -> Total Damage Done = Medium and Kills = Medium and Gold Earned = Medium and Win = False"

query = set(re.split(r" and | -> ", query_str))

print(query)
total_count = items_rdd.count()
def sup_calc(x):
    lhs = set([name for name in query if any(name.startswith(c) for c in cause)])
    rhs = set([name for name in query if any(name.startswith(e) for e in effect)])
    return (1, (int(lhs.issubset(set(x))), int(query.issubset(set(x)))))

map_phase = items_rdd.map(sup_calc)
map_phase.collect()
reduce_phase = map_phase.reduceByKey(lambda a, b : (a[0] + b[0], a[1] + b[1]))
query_conf, query_sup = reduce_phase.collect()[0][1]

try:
    print(f"Support: {query_sup / total_count}, Confidence: {query_sup / query_conf}\n")
except:
    print("This rule is invalid or does not ocour in the data")

{'Total Damage Done = Medium', 'Kills = Medium', 'Spell = Teleport', 'Win = False', 'Gold Earned = Medium', 'Item = Warding Totem (Trinket)', 'Spell = Flash', 'Perk = Warding Totem (Trinket)', 'Item = Ninja Tabi'}
Support: 0.0252, Confidence: 0.3169811320754717

