In [1]:
from collections import defaultdict
import json
import os

def problem_web_counts(directory, expected_total=5000):
    """Print how many problems in ``directory`` come from each known website.

    Every sub-folder of ``directory`` that contains a ``metadata.json`` is
    read and classified by the first known site name found as a substring of
    its ``"url"`` field. URLs that match no known site are printed as-is and
    are not included in any count.

    Args:
        directory: Folder whose sub-folders each hold one problem.
        expected_total: Denominator printed next to every count. Defaults to
            5000, the value this report used to hard-code.

    Returns:
        None. The report is written to stdout only.

    Raises:
        KeyError: If a loaded ``metadata.json`` object has no ``"url"`` key.
    """
    # Checked in this order; the first substring hit wins.
    known_sites = (
        "codeforce",
        "codewar",
        "codechef",
        "leetcode",
        "hackerrank",
        "atcoder",
        "kattis",
    )

    counts = defaultdict(int)

    for problem in os.listdir(directory):
        filename = os.path.join(directory, problem, "metadata.json")
        if not os.path.isfile(filename):
            continue

        # Only the parse needs the file handle; classify after it is closed.
        with open(filename, 'r') as f:
            data = json.load(f)
        problem_url = data["url"]

        for site in known_sites:
            if site in problem_url:
                counts[site] += 1
                break
        else:
            # Unknown website: surface the URL so it can be inspected by hand.
            print(problem_url)

    print("{} set counts:".format(directory))

    total = 0
    for site, site_count in counts.items():
        print("{}: {}/{}".format(site, site_count, expected_total))
        total += site_count
    print("Total counts: {}/{}".format(total, expected_total))

In [2]:
# Print the per-website problem counts for the "train" and "test" folders.
problem_web_counts("train")
print("\n")  # blank lines separating the two reports
problem_web_counts("test")

train set counts:
codechef: 1112/5000
leetcode: 739/5000
codewar: 2515/5000
atcoder: 61/5000
hackerrank: 96/5000
codeforce: 477/5000
Total counts: 5000/5000


test set counts:
codeforce: 2953/5000
kattis: 1236/5000
atcoder: 696/5000
leetcode: 38/5000
codechef: 61/5000
hackerrank: 16/5000
Total counts: 5000/5000


## The code below adds tags only to Codeforces problems

In [5]:
import json
import os
import requests
from bs4 import BeautifulSoup

def add_tags_codeforce(directory, timeout=30):
    """Scrape tags for the Codeforces problems in ``directory`` into their metadata.

    For each sub-folder holding a ``metadata.json`` whose ``"url"`` contains
    "codeforce", the problem page is downloaded, the text of every
    ``<span class="tag-box">`` element is collected, and all but the last one
    are stored under the ``"tags"`` key before the file is rewritten in place.
    Problems from other sites are left untouched. The path of every processed
    metadata file is printed.

    Args:
        directory: Folder to process; can be either "train" or "test".
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the run indefinitely.

    Raises:
        KeyError: If a loaded ``metadata.json`` object has no ``"url"`` key.
        requests.RequestException: If the page cannot be fetched at all
            (connection failure or timeout).
    """
    # Loop through all problems in the directory
    for problem in os.listdir(directory):
        filename = os.path.join(directory, problem, "metadata.json")
        if not os.path.isfile(filename):
            continue

        # Read and close the file before touching the network.
        with open(filename, 'r') as f:
            data = json.load(f)
        problem_url = data["url"]  # Problem url

        if "codeforce" not in problem_url:
            continue
        print(filename)

        problem_page = requests.get(problem_url, timeout=timeout)
        if not problem_page.ok:
            # An error page carries no tag boxes; writing now would replace
            # the metadata's tags with an empty list, so leave the file as is.
            print("Skipping {}: HTTP {}".format(filename, problem_page.status_code))
            continue

        problem_soup = BeautifulSoup(problem_page.content, 'html.parser')
        tag_spans = problem_soup.find_all("span", {"class": "tag-box"})  # Find all tags
        problem_tags = [tag.text.strip() for tag in tag_spans]
        # Ignore the difficulty tag.
        # NOTE(review): assumes the difficulty tag is always present and last;
        # a page without one would lose its final real tag - confirm.
        data["tags"] = problem_tags[:-1]

        with open(filename, 'w') as f:
            json.dump(data, f)

In [None]:
# Add tags to the Codeforces problems under "train"; the path of each
# processed metadata file is printed as it is handled.
add_tags_codeforce("train")

train/2085/metadata.json
train/2071/metadata.json
train/2049/metadata.json
train/2076/metadata.json
train/1607/metadata.json
train/2082/metadata.json
train/2040/metadata.json
train/2078/metadata.json
train/2281/metadata.json
train/2275/metadata.json
train/2047/metadata.json
train/1696/metadata.json
train/2013/metadata.json
train/2221/metadata.json
train/0112/metadata.json
train/0115/metadata.json
train/2226/metadata.json
train/2014/metadata.json
train/2219/metadata.json
train/2022/metadata.json
train/2025/metadata.json
train/2217/metadata.json
train/2228/metadata.json
train/2079/metadata.json
train/2046/metadata.json
train/2274/metadata.json
train/2280/metadata.json
train/2273/metadata.json
train/2041/metadata.json
train/2048/metadata.json
train/1606/metadata.json
train/2083/metadata.json
train/2245/metadata.json
train/2077/metadata.json
train/2070/metadata.json
train/2084/metadata.json
train/2216/metadata.json
train/2024/metadata.json
train/2229/metadata.json
train/2023/metadata.json


## Dump the new test files (with hint tags) into json_files/test.json

In [75]:
# Map each numeric problem id to its folder name under "test". Entries whose
# name is not a plain number (e.g. hidden files) are skipped, because int()
# would raise ValueError on them.
test_files = {
    int(name): name
    for name in os.listdir("test")
    if name.isdecimal()
}

# open() does not create missing parent folders, so make sure it exists.
os.makedirs("json_files", exist_ok=True)

# json.dump writes the integer keys as JSON strings, so they come back as
# str when the file is loaded again.
with open("json_files/test.json", 'w') as fp:
    json.dump(test_files, fp)

In [78]:
# Display the folder names stored in test_files in lexicographic order.
sorted(test_files.values())

['0000',
 '0001',
 '0002',
 '0003',
 '0004',
 '0005',
 '0006',
 '0007',
 '0008',
 '0009',
 '0010',
 '0011',
 '0012',
 '0013',
 '0014',
 '0015',
 '0016',
 '0017',
 '0018',
 '0019',
 '0020',
 '0021',
 '0022',
 '0023',
 '0024',
 '0025',
 '0026',
 '0027',
 '0028',
 '0029',
 '0030',
 '0031',
 '0032',
 '0033',
 '0034',
 '0035',
 '0036',
 '0037',
 '0038',
 '0039',
 '0040',
 '0041',
 '0042',
 '0043',
 '0044',
 '0045',
 '0046',
 '0047',
 '0048',
 '0049',
 '0050',
 '0051',
 '0052',
 '0053',
 '0054',
 '0055',
 '0056',
 '0057',
 '0058',
 '0059',
 '0060',
 '0061',
 '0062',
 '0063',
 '0064',
 '0065',
 '0066',
 '0067',
 '0068',
 '0069',
 '0070',
 '0071',
 '0072',
 '0073',
 '0074',
 '0075',
 '0076',
 '0077',
 '0078',
 '0079',
 '0080',
 '0081',
 '0082',
 '0083',
 '0084',
 '0085',
 '0086',
 '0087',
 '0088',
 '0089',
 '0090',
 '0091',
 '0092',
 '0093',
 '0094',
 '0095',
 '0096',
 '0097',
 '0098',
 '0099',
 '0100',
 '0101',
 '0102',
 '0103',
 '0104',
 '0105',
 '0106',
 '0107',
 '0108',
 '0109',
 '0110',
 