## Clash Royale Data Collection

This file is intended to do data collection only.  
The data is acquired through the Python (a)sync wrapper for the official Clash Royale API, available on this [GitHub](https://github.com/cgrok/clashroyale) page. 

I collect data by repeatedly calling `get_player_battles(tag)`. Each request returns 25 battles, which also contain the tags of the 25 opponent players. I randomly choose one of these tags and request that player's battles in turn.  
To start the loop, I call `get_top_players()` to request the top players' data (1000 players in the run below) and randomly choose one of them as the starting point.

I plan to define some functions that, instead of selecting randomly, select the player tag with the lowest/highest trophies, so that the collected data is evenly distributed across trophy levels.

In [73]:
# import necessary packages.
import clashroyale
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import time
import random
import json
import re
import sys   
import os

from datetime import datetime
from dateutil import parser

In [2]:
## My Clash Royale Developer Key
# Follow https://github.com/cgrok/clashroyale#documentation to get your developer keys
# The key is not hard-coded in the notebook: it is read from the local file
# config.ini, section "Key", option "SECRETDEVKEY".
import configparser
config = configparser.ConfigParser()
config.read("config.ini")
SECRETDEVKEY = config.get("Key","SECRETDEVKEY")

In [3]:
## start session
client = clashroyale.RoyaleAPI(SECRETDEVKEY)
# get the top player to start
def get_top_players():
    """Return whatever the module-level API client reports as the top players."""
    return client.get_top_players()

def get_player_battles(tag):
    """Return the battle log that the module-level API client reports for *tag*."""
    return client.get_player_battles(tag)

## Timer: measure the elapsed wall-clock time of one request.
# start counting
time_start = time.time()
## Fetch the top players; this list seeds the crawl below.
players = get_top_players()
now = time.time()

# Report how many players came back, the container type, and the elapsed seconds.
print(len(players),type(players))
print("It took ", now-time_start, " to run one \"get_top_players()\" method")

1000 <class 'list'>
It took  1.2294042110443115  to run one "get_top_players()" method


Get the top players' tags to start my loop (the request above returned 1000 players rather than 200). I randomly choose one of these players and use that player's tag to start the loop.

In [4]:
## Pick a random player tag from the requested top players:
## shuffle the list in place, then take the tag of the first entry.
random.shuffle(players)
tag = players[0].raw_data["tag"]
# Bare expression so the notebook displays the chosen tag.
tag

'QGURU8QC'

In [5]:
# Fetch the battle log of the seed player and keep the first battle's raw data in rdata.
battles = get_player_battles(tag)
rdata = battles[0].raw_data

In [6]:
## get the next tag so that get_player_battles can be called repeatedly, one player after another
def next_tag(battles):
    """Return the tag of the first opponent in one randomly chosen battle.

    battles: indexable, non-empty sequence of battle objects whose
        ``raw_data`` mapping has an ``"opponent"`` list of dicts with a
        ``"tag"`` key.
    """
    last_index = len(battles) - 1
    chosen_battle = battles[random.randint(0, last_index)]
    first_opponent = chosen_battle.raw_data["opponent"][0]
    return first_opponent["tag"]
print(next_tag(battles))

9899VQCP


In [7]:
## A "proof of concept" cell: this try/except pattern handles a KeyboardInterrupt that occurs while writing to the file.

# try:
#     for i in range(5):
#         time.sleep(1)
#         print(i)
# except KeyboardInterrupt:
#     print("Interrupted, Your program is closing")
#     raise KeyboardInterrupt

In [8]:
# the following cell will delete the last line of file. This code is copied from https://stackoverflow.com/a/10289740/10344378
import os
def delete_last_line(file_name):
    """Remove the last line of *file_name* in place.

    The file is treated as newline-delimited records. The last record is
    dropped whether or not it is newline-terminated (i.e. also when it is a
    partially written line), and the newline that terminates the previous
    record is kept so that later appends start on a fresh line. A file that
    holds a single line ends up empty; an empty file is left untouched.

    The file is processed as bytes: a ``\\n`` byte can never occur inside a
    multi-byte UTF-8 character, so this is safe for UTF-8 text and avoids
    seeking to arbitrary offsets in text mode.

    file_name: path of the file to modify.
    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    chunk_size = 4096
    with open(file_name, "rb+") as file:
        file.seek(0, os.SEEK_END)
        end = file.tell()
        if end == 0:
            return

        # A trailing newline only terminates the last line; it is not an
        # extra empty line, so exclude it from the backwards search.
        file.seek(end - 1, os.SEEK_SET)
        if file.read(1) == b"\n":
            end -= 1

        # Search backwards, one chunk at a time, for the newline that ends
        # the previous line. If there is none, the whole file is one line.
        cut = 0
        pos = end
        while pos > 0:
            start = max(0, pos - chunk_size)
            file.seek(start, os.SEEK_SET)
            newline_at = file.read(pos - start).rfind(b"\n")
            if newline_at != -1:
                # Keep the newline itself so the previous line stays terminated.
                cut = start + newline_at + 1
                break
            pos = start

        file.truncate(cut)

# Name of today's output file: "<YYYY-MM-DD>_battles.json".
today_date = datetime.now().date().isoformat()
fname = f"{today_date}_battles.json"

# Report when the file is already there; otherwise create it empty.
exists = os.path.isfile(fname)
if not exists:
    with open(fname, "w") as f:
        pass
else:
    print(exists)

True


In [9]:
## upload file to google drive
# NOTE: `drive_service` is not created anywhere in this notebook, so this cell
# raises NameError as it stands. An authorised Drive service object has to be
# built first (e.g. with googleapiclient.discovery.build("drive", "v3", ...)).
from googleapiclient import http

file_metadata = {'name': 'battles_temp.json'}
# The upload needs a full MIME type; a bare 'json' is not one.
media = http.MediaFileUpload('battles_temp.json',
                        mimetype='application/json')
# Request only the new file's id in the response.
file = drive_service.files().create(body=file_metadata,
                                    media_body=media,
                                    fields='id').execute()
print ('File ID: %s' % file.get('id'))

NameError: name 'drive_service' is not defined

In [25]:
## start the loop

## 2 parameters that track the file size. and control the loop.
## Two values that track the file size and control the loop:
## the number of records already on disk, and the number of records to collect.
battle_df_size_tracker = 0
battle_df_size = 120000
## Seconds to wait after a failed request before trying again.
retry_delay = 1

## Keep the error counters when this cell is re-run; create them on the first run.
try:
    Unauthorized_count
    API_count
    total_iter = 0
except NameError:
    Unauthorized_count = 0
    API_count = 0

with open(fname, 'r+') as f:
    # Count the records already collected. Reading through the whole file also
    # leaves the position at the end, so the writes below append.
    battle_df_size_tracker = sum(1 for row in f)
    # status bar
    with tqdm(total=battle_df_size, initial=battle_df_size_tracker) as pbar:
        while True:
            # Keep requesting until a non-empty battle log comes back.
            while True:
                # skip api timeout error, etc.
                try:
                    temp_battles = get_player_battles(tag)
                    if not temp_battles:
                        # Nothing returned for this player: pick another
                        # opponent from the last good battle log.
                        tag = next_tag(battles)
                    else:
                        battles = temp_battles
                        break
                except Exception:
                    # The substring checks run against the text of the whole
                    # exc_info tuple (exception type, value and traceback).
                    err_info = sys.exc_info()
                    if "Unauthorized" in str(err_info):
                        Unauthorized_count += 1
                    elif "API request timed out" in str(err_info):
                        API_count += 1
                    else:
                        print(datetime.now(), end = ": ")
                        print(err_info)
                    # Pause so a persistent failure does not become a tight
                    # loop of back-to-back requests.
                    time.sleep(retry_delay)

            # A single result is not delivered inside a list. Wrap it so the
            # code below can always iterate over battles. (list() on a lone
            # battle object would not wrap it.)
            # NOTE(review): assumes a lone battle exposes `raw_data`, as the
            # elements iterated below do - confirm against the API wrapper.
            if not isinstance(battles, list):
                battles = [battles] if hasattr(battles, "raw_data") else list(battles)

            for battle in battles:
                ## write to file while requesting in case of any unexpected errors.
                try:
                    json.dump(battle.raw_data, f)
                    f.write('\n')
                except KeyboardInterrupt:
                    # delete_last_line reopens the file by name, so everything
                    # still buffered in `f` must be on disk first; otherwise
                    # the partial record would be written out after the
                    # truncation when `f` is closed.
                    f.flush()
                    ## the interrupted dump is presumed to be the last line.
                    delete_last_line(fname)
                    print("Last line deleted")
                    # Bare raise keeps the original traceback.
                    raise

            ## update status bar
            pbar.update(len(battles))

            ## file size tracker.
            battle_df_size_tracker += len(battles)
            if battle_df_size_tracker > battle_df_size:
                print("Unauthorized Error count:", Unauthorized_count)
                print("API Time Out Error count:", API_count)
                break
            # get tag for next battle
            tag = next_tag(battles)

Unauthorized Error count: 0
API Time Out Error count: 16



## Preprocess

Check whether each line of the file is valid JSON; if it is not, try to fix it, or skip (drop) the line.

In [129]:
# Parse test.json one line at a time. A line that fails to parse is reported
# (message and 0-based line index) and then either repaired or skipped:
#   * "Expecting ':' delimiter" -> insert the missing ": " and parse again
#   * "Expecting value:"        -> skip the line (e.g. an empty line)
# Any other parse error is only reported; the line is dropped.
data = []
with open("test.json", "r+") as ftest:
    for idx, line in enumerate(ftest):
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            e_str = str(e)
            print(e_str)
            print(idx)
            if "Expecting ':' delimiter" in e_str:
                # e.pos is the character offset reported as "(char N)" in the
                # message: the place where the parser wanted the ':'.
                fixed_line = line[:e.pos].strip() + ": " + line[e.pos:]
                print(line.strip()+" => "+fixed_line.strip())
                # If the repaired line is still invalid this raises, as before.
                data.append(json.loads(fixed_line))
                print("Solved: " + e_str)
                print()
            elif "Expecting value:" in e_str:
                print("Solved: Line skipped")
                print()
data

Expecting ':' delimiter: line 1 column 6 (char 5)
3
{"1" 2} => {"1": 2}
Solved: Expecting ':' delimiter: line 1 column 6 (char 5)

Expecting value: line 2 column 1 (char 1)
4
Solved: Line skipped



[{'0': 1}, {'1': 2}, {'1': 2}, {'1': 2}, {'1': 2}]

In [131]:
## Load the collected battles from fname, one JSON record per line.
## A line that fails to parse is reported (message and 0-based line index), then:
##   * "Expecting ':' delimiter" -> insert the missing ": " and parse again
##   * "Expecting value:"        -> skip the line (e.g. an empty line)
##   * anything else             -> stop loading
data = []
with open(fname, "r") as f:
    with tqdm(total=120015, initial=0) as pbar:
        for idx, line in enumerate(f):
            try:
                data.append(json.loads(line))
                # The bar only advances for lines that parsed on the first try.
                pbar.update()
            except json.JSONDecodeError as e:
                e_str = str(e)
                print(e_str)
                print(idx)
                if "Expecting ':' delimiter" in e_str:
                    # e.pos is the character offset reported as "(char N)" in
                    # the message: the place where the parser wanted the ':'.
                    fixed_line = line[:e.pos].strip() + ": " + line[e.pos:]
                    print(line.strip()+" => "+fixed_line.strip())
                    # If the repaired line is still invalid this raises, as before.
                    data.append(json.loads(fixed_line))
                    print("Solved: " + e_str)
                    print()
                elif "Expecting value:" in e_str:
                    print("Solved: Line skipped")
                    print()
                else:
                    # Unknown kind of damage: stop instead of guessing a repair.
                    break




In [135]:
battles_df = pd.DataFrame(data)

In [33]:
#rage_df.to_csv("rage_battle.csv")

In [134]:
# ## show trophies distribution
# battles_df = pd.read_json("battles_temp.json", lines=True)
# battles_df.opponent[battles_df.opponent.apply(lambda x:"startTrophies" in x[0].keys())].apply(lambda x:x[0]["startTrophies"]).sort_values().plot(kind = "kde")

In [136]:
## convert the POSIX-second timestamps in utcTime to (naive, UTC) datetimes
# Vectorised replacement for calling datetime.utcfromtimestamp on every row
# (that function is deprecated in recent Python versions).
dt = pd.to_datetime(battles_df.utcTime, unit="s")
# check month distribution of the dataset
dt.dt.month.value_counts()

1     119121
12       846
3         78
Name: utcTime, dtype: int64

In [152]:
## get only ladder games
battles_df = battles_df[battles_df["type"]=="PvP"]
# Vectorised replacement for calling datetime.utcfromtimestamp on every row
# (that function is deprecated in recent Python versions).
dt = pd.to_datetime(battles_df.utcTime, unit="s")
# Day-of-month distribution for January only: days in other months are masked
# to NaN, which value_counts leaves out (so the day index is float, as before).
dt.dt.day.where(dt.dt.month == 1).value_counts()

22.0    25150
21.0    11692
20.0    11570
19.0     3623
18.0     1114
17.0      562
16.0      378
15.0      222
14.0      187
13.0      167
12.0      131
3.0        88
6.0        63
9.0        62
10.0       62
7.0        61
8.0        41
2.0        41
11.0       39
1.0        38
4.0        36
5.0        30
Name: utcTime, dtype: int64