# Step 1: Collect summonerId of challenger TFT players

In [1]:
import pyspark.sql.types as T
import pyspark.sql.functions as F

from pyspark import SparkContext, SparkConf, SQLContext

from dateutil.relativedelta import relativedelta
from pyspark.sql.functions import pandas_udf
from pyspark.sql.window import Window

import pandas as pd
import numpy as np

from datetime import date, datetime, timedelta, timezone
import os
import json
import requests
import time
import yaml

In [2]:
# Build the Spark configuration for this notebook and derive the context / session from it.
appName = "PySpark TFT players by rank"
master = "local[*]"

# Every setting other than the app name and master, kept together as (key, value) pairs.
# NOTE(review): spark.local.dir is an absolute, user-specific path; the Spark log warns that
# a cluster manager may override it.
# NOTE(review): the dynamic-allocation settings presumably have no effect with a local[*]
# master -- confirm before relying on them.
spark_settings = [
    ("spark.executor.memory", "40g"),
    ("spark.driver.memory", "40g"),
    ("spark.executor.memoryOverhead", "8g"),
    ("spark.local.dir", "/home/mai/spark-temp"),
    ("spark.sql.session.timeZone", "UTC"),
    ("spark.dynamicAllocation.enabled", "true"),
    ("spark.dynamicAllocation.minExecutors", "2"),
    ("spark.dynamicAllocation.maxExecutors", "50"),
    ("spark.speculation", "true"),
]
conf = SparkConf().setAppName(appName).setMaster(master).setAll(spark_settings)

# Reuse a running SparkContext if there is one, otherwise start one with `conf`.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

25/02/05 15:11:46 WARN Utils: Your hostname, LAPTOP-4O0SI9BK resolves to a loopback address: 127.0.1.1; using 172.26.83.22 instead (on interface eth0)
25/02/05 15:11:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/05 15:11:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/05 15:11:48 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/02/05 15:11:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Set up

In [3]:
# retrieve API key
# Parse the local YAML file first, then pull the key out of the parsed mapping.
with open('./api_key.yaml', 'r') as file:
    api_key_config = yaml.safe_load(file)
API_KEY = api_key_config['api_key']

In [4]:
# Confirm the key loaded without echoing the secret into the saved notebook output:
# keep the first six characters and the dashes, mask every other character.
API_KEY[:6] + ''.join('-' if ch == '-' else '*' for ch in API_KEY[6:])

'RGAPI-********-****-****-****-************'

In [6]:
# function to get the players
def get_players(queue, tier, division, entry_limit):
    """Collect league entries from the Riot API endpoint at BASE_URL.

    Sends GET requests to the module-level ``BASE_URL`` authenticated with
    ``API_KEY``, passing ``queue`` and the current page number as query
    parameters.

    Two response shapes are handled:
      * a JSON list of entries (paginated): pages are requested until one
        comes back with fewer than ``entry_limit`` entries;
      * a JSON object holding the entries under ``'entries'`` (the shape the
        challenger endpoint used in this notebook returns): it is treated as
        the complete result, so only one request is made.

    Args:
        queue: Value sent as the ``queue`` query parameter (e.g. 'RANKED_TFT').
        tier: Kept for interface compatibility; not sent with the request,
            because the endpoint in BASE_URL determines what is fetched.
        division: Kept for interface compatibility; not sent with the request.
        entry_limit: Number of entries in a full page. A page shorter than
            this ends pagination.

    Returns:
        A list of the entry dicts gathered so far. On a non-200, non-429
        status the error is printed and the partial list is returned.
    """
    players = []
    page = 1
    headers = {'X-Riot-Token': API_KEY}

    while True:
        # Let requests encode the query string; the page number must be sent,
        # otherwise every iteration would fetch the same data.
        params = {'queue': queue, 'page': page}
        response = requests.get(BASE_URL, headers=headers, params=params, timeout=30)

        if response.status_code == 200:
            data = response.json()
            if isinstance(data, dict):
                # Object-shaped response: the records are under 'entries' and
                # there is nothing further to page through.
                players.extend(data.get('entries', []))
                break
            players.extend(data)
            if len(data) < entry_limit:
                # Short (or empty) page means this was the last one.
                break
            page += 1
            # Pause between successive page requests to stay under the rate limit.
            time.sleep(1.2)
        elif response.status_code == 429:  # Rate-limited response
            print("Rate limit exceeded. Waiting before retrying...")
            # Honour the server's "Retry-After" header, defaulting to 10 seconds,
            # then retry the same page.
            retry_after = int(response.headers.get("Retry-After", 10))
            time.sleep(retry_after)
        else:
            print(f'Error: {response.status_code} - {response.text}')
            break
    return players

In [22]:
# Parameters for querying the API.

# Region of the player (KR or EUW1).
REGION = 'kr'

# Queue of the player. We are interested in players who reached challenger in ranked queue.
QUEUE = 'RANKED_TFT'

# Tier / rank of the player.
TIER = 'CHALLENGER'

# Subdivision of the tier (Master, Grandmaster and Challenger have only one
# subdivision, so this defaults to I).
DIVISION = 'I'

# API endpoint, built from the region above.
BASE_URL = 'https://' + REGION + '.api.riotgames.com/tft/league/v1/challenger'

In [23]:
# construct the url and the headers with the API
# The queue must be sent as a named query parameter (`queue=...`); a bare
# `?RANKED_TFT` carries no parameter name.
url = f'{BASE_URL}?queue={QUEUE}'
# Riot authenticates requests through the X-Riot-Token header.
headers = {
    'X-Riot-Token': API_KEY
}

In [24]:
# get the response
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
response = requests.get(url, headers=headers, timeout=30)

In [25]:
# Fail loudly on any non-200 response: otherwise `data` would stay undefined
# (or stale from an earlier run) and the following cells would break or
# silently reuse old data.
if response.status_code != 200:
    raise RuntimeError(f'Error: {response.status_code} - {response.text}')
# parse json response; the player records are under the 'entries' key
data = response.json()['entries']

In [26]:
# convert the json response into a dataframe
# Flatten the list of entry records into a pandas DataFrame.
tft_players = pd.json_normalize(data=data)

In [27]:
# Preview the first five rows.
tft_players.head(5)

Unnamed: 0,summonerId,leaguePoints,rank,wins,losses,veteran,inactive,freshBlood,hotStreak
0,ILwCVBsVzwAXll29Rjm0pJ2MpcJho9F2Gs6Q-ROOaVmLH4...,1791,I,253,103,True,False,False,False
1,PcJtqqwPzHaQZRM3MyrTFw44HJjnPLZEOb9aZwurjaisJi...,1773,I,222,96,True,False,False,True
2,EeFn7RQBLE0Px47lujB7fbrEz-Zc1e9pCoaekXcWrDnZqAm7,1772,I,391,212,True,False,False,True
3,ONyjHlWH-BWFc-KuSrCGcVMerIzzIum7WHcoDyTD5Hd_F4...,1749,I,230,90,True,False,False,False
4,nXJWod4yFn7ZkCumBLfZcY-4gm82y0Rv6SAUIIcOhGhFWes,1724,I,305,206,True,False,False,True


In [28]:
# convert the Pandas dataframe into a PySpark dataframe for later usage
# Convert only while `tft_players` is still the pandas frame, so re-running this
# cell does not hand an already-converted Spark DataFrame to createDataFrame.
if isinstance(tft_players, pd.DataFrame):
    tft_players = spark.createDataFrame(tft_players)

In [29]:
# Print the first 20 rows, truncating long cell values.
tft_players.show(n=20, truncate=True)

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+------------+----+----+------+-------+--------+----------+---------+
|          summonerId|leaguePoints|rank|wins|losses|veteran|inactive|freshBlood|hotStreak|
+--------------------+------------+----+----+------+-------+--------+----------+---------+
|ILwCVBsVzwAXll29R...|        1791|   I| 253|   103|   true|   false|     false|    false|
|PcJtqqwPzHaQZRM3M...|        1773|   I| 222|    96|   true|   false|     false|     true|
|EeFn7RQBLE0Px47lu...|        1772|   I| 391|   212|   true|   false|     false|     true|
|ONyjHlWH-BWFc-KuS...|        1749|   I| 230|    90|   true|   false|     false|    false|
|nXJWod4yFn7ZkCumB...|        1724|   I| 305|   206|   true|   false|     false|     true|
|bqmYXUsjey6gxdy9m...|        1691|   I| 440|   254|   true|   false|     false|     true|
|AI-SS6_RFbA2bmJnO...|        1667|   I| 341|   183|   true|   false|     false|    false|
|2YFjhQo1LenjZen4N...|        1664|   I| 257|   116|   true|   false|     false|    false|

                                                                                

In [30]:
# write the PySpark dataframe to a parquet file
# Persist the players as parquet, replacing any previous export for the same
# region / tier / division.
tft_players.write.parquet(
    f'./data/tft_players/players_{REGION}_{TIER}_{DIVISION}.parquet',
    mode='overwrite',
)