Categorises the samples by country and month.

In [1]:
from dateutil.parser import parse
from datetime import datetime
from pprint import pprint as pp
from tqdm.notebook import tqdm, trange
import covid_utilities as cu

In [2]:
data = cu.load_json(cu.COVID_PATH + r"\samples.json")

In [84]:
def get_week(date: datetime) -> (int, int):
    week_num = date.isocalendar().week
    year = date.year
    # first days of a year may belong to last week (number) of previous year
    if week_num > 50 and date.month == 1:
        year -= 1
    return year, week_num

In [85]:
def extract_info(d: dict[str, str]) -> (str, int, int):
    country = d["country"].split(":")[0]
    date = parse(d["collection_date"], yearfirst=True, dayfirst=False)
    year, week = get_week(date)
    return country, year, date.month, week

---

In [86]:
week_month_data = []

In [None]:
for sampleName, info in tqdm(list(data.items())):
    week_month_data.append((sampleName, *extract_info(info)))

In [96]:
week_month_data[:10]

[('BS000685.1', 'Japan', 2020, 4, 15),
 ('BS000686.1', 'Japan', 2020, 4, 15),
 ('BS000687.1', 'Japan', 2020, 4, 15),
 ('BS000688.1', 'Japan', 2020, 4, 15),
 ('BS000689.1', 'Japan', 2020, 4, 15),
 ('BS000690.1', 'Japan', 2020, 4, 15),
 ('BS000691.1', 'Japan', 2020, 4, 15),
 ('BS000692.1', 'Japan', 2020, 4, 15),
 ('BS000693.1', 'Japan', 2020, 4, 15),
 ('BS000694.1', 'Japan', 2020, 4, 15)]

---

In [147]:
bins_string_keys = {}

In [None]:
for sample, country, year, month, week_n in tqdm(week_month_data):
    if country in bins_string_keys:
        country_data = bins_string_keys[country]
    else:
        country_data = {"months": {}, "weeks": {}}
    
    month_key = f"{year}-{month}"
    if month_key not in country_data["months"]:
        country_data["months"][month_key] = {"counts": 0, "samples": []}
    country_data["months"][month_key]["counts"] += 1
    country_data["months"][month_key]["samples"].append(sample)
    
    week_key = f"{year}-{week_n}"
    if week_key not in country_data["weeks"]:
        country_data["weeks"][week_key] = {"counts": 0, "samples": []}
    country_data["weeks"][week_key]["counts"] += 1
    country_data["weeks"][week_key]["samples"].append(sample)

    bins_string_keys[country] = country_data

In [149]:
cu.write_json(filename = cu.COVID_PATH + r"\sorted_samples.json", dictionary = bins_string_keys)

In [143]:
pp(bins_string_keys["Germany"])

{'months': {'2020-1': {'counts': 27,
                       'samples': ['LR824570.1',
                                   'MT270101.1',
                                   'MT270102.1',
                                   'MT270103.1',
                                   'MT270104.1',
                                   'MT270105.1',
                                   'MT270108.1',
                                   'MT270109.1',
                                   'MT270112.1',
                                   'MW368440.1',
                                   'OV057770.1',
                                   'OV077247.1',
                                   'OV077441.1',
                                   'OV077446.1',
                                   'OV077780.1',
                                   'OV077782.1',
                                   'OV077786.1',
                                   'OV078074.1',
                                   'OV078093.1',
                                