## Clash Royale Data Collection

This file is intended to do data collection only.  
The data will be acquired through python (a)sync wrapper from official clash royale api from this [Github](https://github.com/cgrok/clashroyale) Page. 

The way I collect data is to recursively call get_player_battles (tag). Each request will return 25 battles, which also contains 25 opponent players' tags. I will randomly choose one from these tags and request this player battle.  
To start the loop, I will call get_top_players() to request the top 200 players data. and randomly choose one from which to start the loop.

I plan to define some functions that will, instead of randomly select, select the player tag with lowest/highest trophies to get a evenly distributed data in trophies.

In [1]:
# import necessary packages.
import clashroyale
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import time
import random
import json
import sys   

from datetime import datetime
from dateutil import parser

In [10]:
## My Clash Royale Developer Key
# Follow https://github.com/cgrok/clashroyale#documentation to get your developer keys
import configparser
config = configparser.ConfigParser()
config.read("config.ini")
SECRETDEVKEY = config.get("Key","SECRETDEVKEY")

In [11]:
client.session.close()

In [17]:
## start session
client = clashroyale.RoyaleAPI(SECRETDEVKEY)
# get the top player to start
def get_top_players():
    players = client.get_top_players() 
    return players

def get_player_battles(tag):
    battles = client.get_player_battles(tag)
    return battles

## time counter. Count collapsed time of one request. 
# start counting
time_start = time.time()
## get top 200 players.
players = get_top_players()
now = time.time()

print(len(players),type(players))
print("It took ", now-time_start, " to run one \"get_top_players()\" method")

200 <class 'list'>
It took  0.5910444259643555  to run one "get_top_players()" method


Get top 200 players (tag) to start my loop. I will randomly choose one from these 200 players, and get the tag to start the loop.

In [20]:
## get random player tag from the requested top 200 players
random.shuffle(players)
tag = players[0].raw_data["tag"]
tag

'220J9VJJP'

In [21]:
battles = get_player_battles(tag)
rdata = battles[0].raw_data

In [22]:
## get the next tag to have get_player_battles function can performed recursively
def next_tag(battles):
    idx = random.randint(0,len(battles)-1)
    #print (len(battles),idx)
    tag = battles[idx].raw_data["opponent"][0]["tag"]
    return tag
print(next_tag(battles))

2YJRUQ2Q


In [13]:
## A "prove of concept cell" this try,except method to handle keyboardinterrupt will work to handle the keyboard interrupt while writing to file.

# try:
#     for i in range(5):
#         time.sleep(1)
#         print(i)
# except KeyboardInterrupt:
#     print("Interrupted, Your program is closing")
#     raise KeyboardInterrupt

0
1
Interrupted, Your program is closing


KeyboardInterrupt: 

In [23]:
# the following cell will delete the last line of file. This code is copied from https://stackoverflow.com/a/10289740/10344378
import os
def delete_last_line(file_name):
    with open(file_name, "r+", encoding = "utf-8") as file:

        #Move the pointer (similar to a cursor in a text editor) to the end of the file. 
        file.seek(0, os.SEEK_END)

        #This code means the following code skips the very last character in the file - 
        #i.e. in the case the last line is null we delete the last line 
        #and the penultimate one
        pos = file.tell() - 1

        #Read each character in the file one at a time from the penultimate 
        #character going backwards, searching for a newline character
        #If we find a new line, exit the search
        while pos > 0 and file.read(1) != "\n":
            pos -= 1
            file.seek(pos, os.SEEK_SET)

        #So long as we're not at the start of the file, delete all the characters ahead of this position
        if pos > 0:
            file.seek(pos, os.SEEK_SET)
            file.truncate()



In [56]:
## upload file to google drive
from googleapiclient import http

file_metadata = {'name': 'battles_temp.json'}
media = http.MediaFileUpload('battles_temp.json',
                        mimetype='json')
file = drive_service.files().create(body=file_metadata,
                                    media_body=media,
                                    fields='id').execute()
print ('File ID: %s' % file.get('id'))

NameError: name 'drive_service' is not defined

In [24]:
## start the loop

## 2 parameters that track the file size. and control the loop.
battle_df_size_tracker =0
battle_df_size = 120000

## 
with open('battles_temp.json', 'r+') as f:
    battle_df_size_tracker = sum(1 for row in f)
    #print(battle_df_size_tracker)
    # status bar
    with tqdm(total=battle_df_size, initial=battle_df_size_tracker) as pbar:
        while True:
            while True:
                #skip api timeout error, etc.
                try:
                    temp_battles = get_player_battles(tag)
                    ## check if requested data is not empty.
                    if not temp_battles:
                        tag = next_tag(battles)
                    else:
                        battles = temp_battles
                        break
                except Exception as e:
                    e = sys.exc_info()
                    print(datetime.now(), end = ": ")
                    print (e)
            ## check if the result data only has lenth of one, which will not be in a list, a dict instead. transform to list here.    
            if not isinstance(battles,list):
                battles = list(battles)
            
            for battle in battles:
                ## write to file while requesting in case of any unexpected errors.
                try:
                    json.dump(battle.raw_data, f)
                    f.write('\n')
                except KeyboardInterrupt:
                    ## in case that the json dump is not complete, delete the interrupted dump line (I persummed this will be the last line).
                    delete_last_line('battles_temp.json')
                    print("Last line deleted")
                    raise KeyboardInterrupt
            
            ## update status bar
            pbar.update(len(battles))
            
            ## file size tracker.
            battle_df_size_tracker += len(battles)
            #print(row_count, sum(1 for row in f))
            if battle_df_size_tracker > battle_df_size:
                break
            # get tag for next battle
            tag = next_tag(battles)




In [2]:
## show file size
with open("battles_temp.json", "r") as f:
    print(sum(1 for row in f))

120004


In [4]:
## show trophies distribution
battles_df = pd.read_json("battles_temp.json", lines=True)
battles_df.opponent[battles_df.opponent.apply(lambda x:"startTrophies" in x[0].keys())].apply(lambda x:x[0]["startTrophies"]).sort_values().plot(kind = "kde")

OSError: [WinError 87] The parameter is incorrect

In [29]:
## convert timestamp to human readable
dt = battles_df.utcTime.apply(lambda x: datetime.utcfromtimestamp(x))
# check month distribution of the dataset
dt.apply(lambda x:x.month).value_counts()

10    49296
9       687
3        26
Name: utcTime, dtype: int64

In [18]:
## get only ladder games
battles_df = battles_df[battles_df["type"]=="PvP"]
battles_df.month.value_counts()

AttributeError: 'DataFrame' object has no attribute 'month'