In [81]:
import json
from dateutil.parser import parse
from datetime import datetime
from pprint import pprint as pp
from tqdm.notebook import tqdm, trange

In [1]:
# Location of the raw samples JSON file that the next cell loads.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
path = (
    r"C:\Users\Konrad Grudzinski\OneDrive - University of Glasgow"
    r"\Computing\4th Year\Individual Project\Source\covid19\samples.json"
)

In [125]:
# Load the raw sample metadata from `path`.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default (a legacy code page on Windows), which
# can mis-decode or reject non-ASCII text.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [84]:
def get_week(date: datetime) -> tuple[int, int]:
    """Return the ISO 8601 week-numbering year and week number of ``date``.

    The returned year is the year the ISO week belongs to, which can differ
    from ``date.year`` at both ends of a calendar year:

    * the first days of January may fall in week 52/53 of the previous year;
    * the last days of December may fall in week 1 of the following year.

    Args:
        date: The date (or datetime) to classify.

    Returns:
        A ``(year, week_num)`` tuple.
    """
    # isocalendar() already yields the year that owns the week, so both
    # year-boundary cases are handled without manual adjustment.
    iso_year, week_num, _weekday = date.isocalendar()
    return iso_year, week_num

In [85]:
def extract_info(d: dict[str, str]) -> tuple[str, int, int, int]:
    """Extract the binning fields from one sample's metadata record.

    Args:
        d: Record with a "country" entry (only the text before the first ":"
            is kept) and a "collection_date" entry, parsed year-first.

    Returns:
        A ``(country, year, month, week)`` tuple. ``year`` and ``week`` come
        from ``get_week``; ``month`` is the calendar month of the parsed date.

    Raises:
        KeyError: If "country" or "collection_date" is missing from ``d``.
        ValueError: If the collection date cannot be parsed.

    NOTE(review): ``year`` is the year paired with the week number, so around
    New Year it can differ from the calendar year that ``month`` belongs to.
    Returning both years would require changing the tuple shape and callers.
    """
    country = d["country"].partition(":")[0]
    # dateutil fills any field missing from the string (e.g. a bare "2020" or
    # "2020-03") from `default`, which is *today's date* unless given. Pin it
    # so partial dates always land in the same bin regardless of the run date.
    date = parse(
        d["collection_date"],
        yearfirst=True,
        dayfirst=False,
        default=datetime(1900, 1, 1),
    )
    year, week = get_week(date)
    return country, year, date.month, week

---

In [86]:
# One (sample name, country, year, month, week) row per sample is collected here.
week_month_data = list()

In [87]:
# Build all rows in a single expression so that re-running this cell replaces
# the list instead of appending a second copy of every sample to it.
# tqdm reads the total from len(data.items()), so no intermediate list copy
# of the items is needed for the progress bar.
week_month_data = [
    (sampleName, *extract_info(info))
    for sampleName, info in tqdm(data.items())
]

  0%|          | 0/702141 [00:00<?, ?it/s]

In [96]:
# Preview the first ten (sample name, country, year, month, week) rows.
week_month_data[:10]

[('BS000685.1', 'Japan', 2020, 4, 15),
 ('BS000686.1', 'Japan', 2020, 4, 15),
 ('BS000687.1', 'Japan', 2020, 4, 15),
 ('BS000688.1', 'Japan', 2020, 4, 15),
 ('BS000689.1', 'Japan', 2020, 4, 15),
 ('BS000690.1', 'Japan', 2020, 4, 15),
 ('BS000691.1', 'Japan', 2020, 4, 15),
 ('BS000692.1', 'Japan', 2020, 4, 15),
 ('BS000693.1', 'Japan', 2020, 4, 15),
 ('BS000694.1', 'Japan', 2020, 4, 15)]

In [121]:
# NOTE(review): `countries_sorted_months` is not assigned in any cell visible
# here; unless its defining cell exists elsewhere, this raises NameError on a
# fresh kernel — confirm and restore the cell that computes it.
countries_sorted_months

[('USA', 432251),
 ('United Kingdom', 236625),
 ('Australia', 9903),
 ('Slovakia', 9548),
 ('Mexico', 1775),
 ('Hong Kong', 1637),
 ('Netherlands', 1501),
 ('Japan', 1068),
 ('India', 945),
 ('Egypt', 928),
 ('New Zealand', 676),
 ('Bangladesh', 573),
 ('Bahrain', 536),
 ('Chile', 286),
 ('Pakistan', 254),
 ('Nigeria', 210),
 ('Ghana', 185),
 ('Saudi Arabia', 184),
 ('China', 172),
 ('Kenya', 168),
 ('Djibouti', 162),
 ('Austria', 158),
 ('Poland', 152),
 ('Serbia', 141),
 ('Spain', 125),
 ('Dominican Republic', 124),
 ('Peru', 122),
 ('Israel', 120),
 ('Brazil', 109),
 ('Italy', 94),
 ('Greece', 93),
 ('Iraq', 92),
 ('Switzerland', 87),
 ('France', 86),
 ('Sierra Leone', 59),
 ('Mongolia', 58),
 ('Taiwan', 57),
 ('Tunisia', 54),
 ('West Bank', 54),
 ('Germany', 53),
 ('Jamaica', 47),
 ('Turkey', 46),
 ('Malaysia', 43),
 ('Philippines', 40),
 ('Argentina', 34),
 ('Malawi', 32),
 ('Russia', 29),
 ('South Korea', 29),
 ('Myanmar', 28),
 ('Finland', 25),
 ('Czech Republic', 24),
 ('Jordan

In [143]:
# Pretty-print the bins stored for one country.
# NOTE(review): in the visible notebook `bins_string_keys` is only created and
# filled in cells that appear further down, so this cell fails when the
# notebook is run top to bottom — consider moving it after those cells.
pp(bins_string_keys["Germany"])

{'months': {'2020-1': {'counts': 27,
                       'samples': ['LR824570.1',
                                   'MT270101.1',
                                   'MT270102.1',
                                   'MT270103.1',
                                   'MT270104.1',
                                   'MT270105.1',
                                   'MT270108.1',
                                   'MT270109.1',
                                   'MT270112.1',
                                   'MW368440.1',
                                   'OV057770.1',
                                   'OV077247.1',
                                   'OV077441.1',
                                   'OV077446.1',
                                   'OV077780.1',
                                   'OV077782.1',
                                   'OV077786.1',
                                   'OV078074.1',
                                   'OV078093.1',
                                

---

In [146]:
# Destination file for the per-country month/week bins written at the end.
# NOTE(review): absolute, user-specific path; consider a configurable output directory.
output_json = (
    r"C:\Users\Konrad Grudzinski\OneDrive - University of Glasgow"
    r"\Computing\4th Year\Individual Project\Source\covid19\sorted_samples.json"
)

In [147]:
# Maps country -> {"months": {...}, "weeks": {...}}; filled by the loop below.
bins_string_keys = dict()

In [148]:
# Group every sample by country, then by month and by week.
#
# Resulting shape:
#   {country: {"months": {"<year>-<month>": {"counts": int, "samples": [...]}},
#              "weeks":  {"<year>-<week>":  {"counts": int, "samples": [...]}}}}
#
# The accumulator is rebuilt from scratch here so that re-running this cell
# never double-counts samples.
# NOTE(review): the same `year` value is used for both the month key and the
# week key; it is whatever year each row in `week_month_data` carries.
bins_string_keys = {}
for sample, country, year, month, week_n in tqdm(week_month_data):
    country_data = bins_string_keys.setdefault(country, {"months": {}, "weeks": {}})

    # String keys keep the structure directly serialisable to JSON.
    month_key = f"{year}-{month}"
    week_key = f"{year}-{week_n}"

    # Month and week bins are updated identically, so share one code path.
    for granularity, period_key in (("months", month_key), ("weeks", week_key)):
        bucket = country_data[granularity].setdefault(
            period_key, {"counts": 0, "samples": []}
        )
        bucket["counts"] += 1
        bucket["samples"].append(sample)

  0%|          | 0/702141 [00:00<?, ?it/s]

In [149]:
# Serialise before opening the target: opening with "w" truncates the file, so
# a serialisation error raised inside the `with` block would leave an empty
# output file behind.
json_object = json.dumps(bins_string_keys, indent=4)
# Explicit encoding keeps the written bytes independent of the platform default.
with open(output_json, "w", encoding="utf-8") as f:
    f.write(json_object)