In [1]:
import concurrent.futures
import json
import os
import time
from datetime import date, datetime, timedelta, timezone
from itertools import chain
from threading import Lock

import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T
import requests
import yaml
from dateutil.relativedelta import relativedelta
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf
from pyspark.sql.window import Window

In [2]:
# Spark bootstrap: build the configuration, then obtain the session through
# SparkSession.builder (constructing a bare SQLContext(sc) is deprecated since
# Spark 3.0). `sc`, `sqlContext` and `spark` stay bound for the later cells.
appName = "PySpark TFT puuids"
master = "local[*]"

# NOTE(review): with master "local[*]" everything runs inside the driver JVM,
# so the executor.* / dynamicAllocation.* / speculation settings below are
# probably ignored -- confirm before relying on them.
# NOTE(review): "spark.local.dir" is an absolute, machine-specific path; Spark
# itself warns that a cluster manager may override it.
spark_settings = [
    ("spark.executor.memory", "40g"),
    ("spark.driver.memory", "40g"),
    ("spark.executor.memoryOverhead", "8g"),
    ("spark.local.dir", "/home/mai/spark-temp"),
    ("spark.sql.session.timeZone", "UTC"),
    ("spark.dynamicAllocation.enabled", "true"),
    ("spark.dynamicAllocation.minExecutors", "2"),
    ("spark.dynamicAllocation.maxExecutors", "50"),
    ("spark.speculation", "true"),
]
conf = SparkConf().setAppName(appName).setMaster(master).setAll(spark_settings)

# getOrCreate() reuses an already-running context/session if there is one,
# exactly like the previous SparkContext.getOrCreate(conf=conf) call did.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
# Legacy alias kept for any cell that still refers to `sqlContext`; passing the
# session explicitly wraps the existing session instead of creating another.
sqlContext = SQLContext(sc, sparkSession=spark)

25/02/19 18:02:02 WARN Utils: Your hostname, LAPTOP-4O0SI9BK resolves to a loopback address: 127.0.1.1; using 172.30.74.42 instead (on interface eth0)
25/02/19 18:02:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/19 18:02:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/19 18:02:04 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Read the match table from parquet into `df` (one row per player per match,
# judging by the puuid/placement columns in the schema printed below).
df = spark.read.format('parquet').load('./data/tft_match_data_cleaned/matches_euw1_CHALLENGER_I.parquet')

                                                                                

In [4]:
# Show the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- metadata_data_version: string (nullable = true)
 |-- metadata_match_id: string (nullable = true)
 |-- info_endOfGameResult: string (nullable = true)
 |-- info_gameCreation: long (nullable = true)
 |-- info_game_datetime: long (nullable = true)
 |-- info_game_length: double (nullable = true)
 |-- info_game_version: string (nullable = true)
 |-- info_mapId: long (nullable = true)
 |-- info_queueId: long (nullable = true)
 |-- info_tft_game_type: string (nullable = true)
 |-- info_tft_set_core_name: string (nullable = true)
 |-- info_tft_set_number: long (nullable = true)
 |-- gold_left: long (nullable = true)
 |-- last_round: long (nullable = true)
 |-- level: long (nullable = true)
 |-- partner_group_id: integer (nullable = true)
 |-- placement: long (nullable = true)
 |-- players_eliminated: long (nullable = true)
 |-- puuid: string (nullable = true)
 |-- riotIdGameName: string (nullable = true)
 |-- riotIdTagline: string (nullable = true)
 |-- skill_tree: integer (nullable =

In [42]:
def _count_units_with_tier(tier):
    """Return a Column counting entries of the `units` array whose `tier` field equals `tier`."""
    return F.size(F.filter(F.col('units'), lambda unit: unit.tier == tier))


# Per-player board summary for one match.
# The match filter is applied first, while `metadata_match_id` is still a
# column of the frame (the final projection does not keep it); this also
# mirrors the filter-then-select order used by the other inspection cells.
(
    df.where(F.col('metadata_match_id') == 'EUW1_7291286802')
    .withColumn('num_units', F.size(F.col('units')))
    .withColumn('num_traits', F.size(F.col('traits')))
    .withColumn('num_units_tier_2', _count_units_with_tier(2))
    .withColumn('num_units_tier_3', _count_units_with_tier(3))
    .withColumn('num_units_tier_4', _count_units_with_tier(4))
    # Extremes of the `rarity` field across the player's units.
    .withColumn('highest_unit_rarity', F.array_max(F.col('units.rarity')))
    .withColumn('lowest_unit_rarity', F.array_min(F.col('units.rarity')))
    # Keep only the units sitting at the highest rarity, then take the best
    # tier among them.
    .withColumn(
        'units_with_highest_unit_rarity',
        F.filter(F.col('units'), lambda unit: unit.rarity == F.col('highest_unit_rarity')),
    )
    .withColumn(
        'highest_tier_of_highest_unit_rarity',
        F.array_max(F.transform('units_with_highest_unit_rarity', lambda unit: unit.tier)),
    )
    .select(
        'puuid', 'placement', 'num_units', 'num_traits',
        'highest_unit_rarity', 'highest_tier_of_highest_unit_rarity',
        'lowest_unit_rarity',
        'num_units_tier_2', 'num_units_tier_3', 'num_units_tier_4', 'level',
    )
    .orderBy('placement')
    .show()
)

+--------------------+---------+---------+----------+-------------------+-----------------------------------+------------------+----------------+----------------+----------------+-----+
|               puuid|placement|num_units|num_traits|highest_unit_rarity|highest_tier_of_highest_unit_rarity|lowest_unit_rarity|num_units_tier_2|num_units_tier_3|num_units_tier_4|level|
+--------------------+---------+---------+----------+-------------------+-----------------------------------+------------------+----------------+----------------+----------------+-----+
|DqaoIU1D3Dfm4fSRT...|        1|       11|        11|                  9|                                  3|                 1|               8|               1|               0|   10|
|bfWbsSQiKtdfCaxPU...|        2|        8|         8|                  8|                                  1|                 0|               5|               2|               0|    8|
|-8TNcPruyzGFSODc5...|        3|       10|        11|                 

In [18]:
# One row per (player, trait) for a single match: explode the `traits` array
# and flatten the struct fields of interest next to the per-player columns.
player_cols = ['puuid', 'gold_left', 'placement', 'last_round', 'level',
               'players_eliminated', 'total_damage_to_players']

traits_long = (
    df.where(F.col('metadata_match_id') == 'EUW1_7291286802')
    # `units` is carried through this projection but dropped by the next one.
    .select(*player_cols, F.explode('traits').alias('traits'), 'units')
    .select(*player_cols, 'traits.name', 'traits.tier_current',
            'traits.style', 'traits.tier_total')
)
traits_long.show()

+--------------------+---------+---------+----------+-----+------------------+-----------------------+-----------------+------------+-----+----------+
|               puuid|gold_left|placement|last_round|level|players_eliminated|total_damage_to_players|             name|tier_current|style|tier_total|
+--------------------+---------+---------+----------+-----+------------------+-----------------------+-----------------+------------+-----+----------+
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|                 0|                     91|    TFT13_Academy|           2|    2|         4|
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|                 0|                     91|TFT13_BloodHunter|           1|    3|         1|
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|                 0|                     91| TFT13_Experiment|           0|    0|         3|
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|                 0|                

In [22]:
# One row per (player, unit) for a single match: explode the `units` array
# and flatten the struct fields of interest next to the per-player columns.
player_cols = ['puuid', 'gold_left', 'placement', 'last_round', 'level',
               'players_eliminated', 'total_damage_to_players']

units_long = (
    df.where(F.col('metadata_match_id') == 'EUW1_7291286802')
    .select(*player_cols, F.explode('units').alias('units'))
    .select(*player_cols, 'units.character_id', 'units.itemNames',
            'units.rarity', 'units.tier')
)
# Show up to 100 rows instead of the default 20.
units_long.show(100)

+--------------------+---------+---------+----------+-----+------------------+-----------------------+------------------+--------------------+------+----+
|               puuid|gold_left|placement|last_round|level|players_eliminated|total_damage_to_players|      character_id|           itemNames|rarity|tier|
+--------------------+---------+---------+----------+-----+------------------+-----------------------+------------------+--------------------+------+----+
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|                 0|                     91|      TFT13_Irelia|                  []|     0|   2|
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|                 0|                     91|       TFT13_Leona|                  []|     1|   2|
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|                 0|                     91|        TFT13_Rell|[TFT_Item_Bramble...|     1|   1|
|4XZMPNXlRKTe9jXDe...|       44|        5|        30|    8|           