In [None]:
import os
from pathlib import Path

import numpy as np
import numpy.lib.recfunctions as rfn

# Project root: the parent of the current working directory, as an absolute path with symlinks resolved.
# All input/ and output/ locations used below are built from it.
root_dir = Path.cwd().resolve().parent

In [None]:
# Per-country CMC counts: the data set that Joe created, downloaded in Dec 2024.
# There is one ";"-separated file per country. Each file lists all CMC classes that have claims in the KCDB
# (by any country) and the number of CMCs that this country has for that class.
#
# Every file is read into a structured array with the fields
# Country, Metrology_Area, Branch, Service, Sub_Service, Individual_Service, Physics_Code, Number_of_CMCs
# and stored in `data` under the country name found in its first row.
# Note: the Turkiye files have been cleaned so that they do not contain any special characters.
data = {}
# glob() yields files in an OS-dependent order; sorting makes the country order (and with it the row order
# of every table derived from `data`) the same on every machine and every run.
for file in sorted((root_dir / "input" / "physics-data-Non-zero").glob("*.cmc")):
    # atleast_1d: genfromtxt returns a 0-d array for a file with a single data row, which cannot be indexed with [0]
    array = np.atleast_1d(np.genfromtxt(file, delimiter=";", names=True, dtype=None, encoding='latin-1'))
    if array.size == 0:
        # a header-only file has no row to take the country name from - fail with the file name
        # rather than with an anonymous IndexError
        raise ValueError(f"{file} contains no data rows")
    data[array["Country"][0]] = array

In [None]:
# Version history of the CMCs: a file sent by Peter Blattner from the SI reference point.
# It lists all current CMCs and their previous versions (as at Dec 2024). For every current CMC it gives the
# publication date of that CMC and of any previous version, so we can see when it was initially claimed.
#
# The tab-separated file is read into one structured array with the fields
# kcdbCode, archiveDate, publicationDate, approvalDate, kcdbServiceCategory, branchValue, serviceValue,
# subServiceValue, individualServiceValue, categoryValue, subCategoryValue, analyteMatrix, analyteValue,
# mediumValue, nuclideValue, id, domainCode, metrologyAreaLabel, rmo, countryValue, nmiCode
# Note: the file has been 'cleaned' so that Turkiye does not have any special characters.
data_SIref = np.genfromtxt(
    root_dir.joinpath("input", "kcdb_data_output_physics.txt"),
    names=True,
    delimiter="\t",
    dtype=None,
    encoding='latin-1',
)

In [None]:
# Dictionary of metrology areas: maps the area code used in the SI-reference (PB) data to the area name
# used in the per-country (JB) data, so that services can be matched between the two data sets.
metrology_area = {"QM": "","AUV": "Acoustics, Ultrasound, Vibration","L":"Length","EM": "Electricity and Magnetism",
                    "RI": "Ionizing Radiation","PR":"Photometry and Radiometry","M":"Mass and related quantities",
                    "T":"Thermometry","TF":"Time and Frequency"}

# Year recorded for a service with no matching CMC publication: assume the earliest
# publication date is in the future, i.e. in 2030.
NO_PUBLICATION_YEAR = 2030

# Step 1: index every service row of every country by its full service key
# (country, area, branch, service, sub-service, individual service).
# Each key maps to the (year array, row index) slots it has to update; a list is used because
# nothing guarantees that a country file never repeats a service.
first_year = {}
rows_by_service = {}
for country, services in data.items():
    years = np.full(len(services), NO_PUBLICATION_YEAR, dtype=int)
    first_year[country] = years
    service_keys = zip(services["Metrology_Area"], services["Branch"], services["Service"],
                       services["Sub_Service"], services["Individual_Service"])
    for i, service_key in enumerate(service_keys):
        rows_by_service.setdefault((country, *service_key), []).append((years, i))

# Step 2: walk once through the SI-reference data and, for every CMC version whose service key matches
# a country row, keep the earliest publication year seen for that row.
# data_SIref itself is only read, never modified, so this cell gives the same result when it is re-run.
for cmc_country, area_code, branch, service, sub_service, individual_service, published in zip(
        data_SIref["countryValue"], data_SIref["metrologyAreaLabel"], data_SIref["branchValue"],
        data_SIref["serviceValue"], data_SIref["subServiceValue"], data_SIref["individualServiceValue"],
        data_SIref["publicationDate"]):
    # an area code missing from metrology_area raises KeyError here: the look-up table needs extending
    cmc_key = (cmc_country, metrology_area[area_code], branch, service, sub_service, individual_service)
    matching_rows = rows_by_service.get(cmc_key)
    if not matching_rows:
        continue
    # the publication year is the last four characters of the date string
    year = int(published[-4:])
    for years, i in matching_rows:
        if year < years[i]:
            years[i] = year

# Step 3: add the earliest year of publication as the column "Initial_Year" to a copy of each country table.
# Now we have, per country, the list of all service categories it might have claims for, the number of CMCs
# it held in 2024, and the earliest year it had a claim for that service.
# The result goes into a new dictionary so that `data` keeps the tables exactly as they were loaded.
data_with_years = {}
for country, services in data.items():
    print(country)
    year_col = np.zeros(len(services), dtype=[("Initial_Year", int)])
    year_col["Initial_Year"] = first_year[country]
    data_with_years[country] = rfn.merge_arrays((services, year_col), asrecarray=True, flatten=True)

In [None]:
# Print all of the arrays into files, one for each country - basically the original (Dec 2024) files from Joe
# with the Initial_Year column added.
outpath = str(root_dir / "output" / "physics-data-Non-zero-first-year")
# exist_ok=True replaces the separate "does it exist?" check and keeps the cell safe to re-run
os.makedirs(outpath, exist_ok=True)

for country in data_with_years:
    print(country)
    # The column names are taken from the array being written, so no particular country has to be present.
    # NOTE(review): the header is comma-separated while the rows are tab-separated; kept unchanged in case
    # existing readers of these files depend on it - confirm, and align the two if they do not.
    h_str = ','.join(data_with_years[country].dtype.names)
    # os.path.join puts the file inside the output directory; plain string concatenation would fuse the
    # directory name and the country name into a single file name next to the directory
    filename = os.path.join(outpath, str(country) + '.txt')
    np.savetxt(filename, data_with_years[country], header=h_str, delimiter='\t', fmt='%s', encoding='latin-1')

In [None]:
def country_subservices_numbers(country,area,n_branches,year):
    """Count, for each branch of one metrology area, the listed services and the services a country holds.

    The rows are read from the module-level ``data_with_years[country]`` table.

    Parameters
    ----------
    country : key of the country table in ``data_with_years``
    area : value the ``Metrology_Area`` field must equal for a row to be considered
    n_branches : length of the two returned count arrays
    year : a service counts as held when the country has at least one CMC for it
        (``Number_of_CMCs >= 1``) and its ``Initial_Year`` is not later than this year

    Returns
    -------
    tuple ``(branches, total_subservices, country_subservices)``
        ``branches`` is the list of distinct branch names of the area (in ``np.unique`` order);
        ``total_subservices[k]`` is the number of rows of branch ``k``;
        ``country_subservices[k]`` is the number of those rows the country holds by ``year``.
        Entries beyond the number of branches found stay zero.
    """
    # one counter per branch, filled in the order the branches are visited
    total_subservices = np.zeros(n_branches,dtype=int)
    country_subservices = np.zeros(n_branches,dtype=int)
    branches = []

    # restrict the country table to the rows of the requested area
    records = data_with_years[country]
    in_area = records[np.logical_and(records["Number_of_CMCs"] >= 0, records["Metrology_Area"] == area)]

    for slot, branch in enumerate(np.unique(in_area["Branch"])):
        of_branch = in_area["Branch"] == branch
        # every row of the branch
        listed = np.logical_and(in_area["Number_of_CMCs"] >= 0, of_branch)
        # rows of the branch with at least one claim, first published no later than `year`
        held = np.logical_and(in_area["Number_of_CMCs"] >= 1,
                              np.logical_and(of_branch, in_area["Initial_Year"] <= year))
        total_subservices[slot] = in_area["Physics_Code"][listed].size
        country_subservices[slot] = in_area["Physics_Code"][held].size
        branches.append(branch)

    return (branches, total_subservices, country_subservices)

In [None]:
def get_subservice_data(year):
    """Build, for one year, the table of services held per branch for every country.

    The branch layout (how many branches there are in total and per metrology area) is taken from the
    "New Zealand" tables in the module-level ``data`` and ``data_with_years`` dictionaries; the per-country
    counts come from ``country_subservices_numbers``.

    Parameters
    ----------
    year : year passed on to ``country_subservices_numbers`` for every country and area

    Returns
    -------
    tuple ``(branches, number_subservices, dataset)``
        ``branches`` : list of branch names, area after area
        ``number_subservices`` : integer array, number of listed services per branch
        ``dataset`` : float array of shape (number of countries, number of branches) with the number of
        services each country holds per branch; rows follow the iteration order of ``data_with_years``
    """
    # New Zealand serves as the representative table for the layout
    reference_raw = data["New Zealand"]
    n_branch_columns = np.size(np.unique(reference_raw["Branch"]))

    # one row per country, one column per branch
    dataset = np.zeros([len(data), n_branch_columns])
    # total number of listed services per branch
    number_subservices = np.zeros(n_branch_columns, dtype=int)
    # branch names, filled in below
    branches = []
    # number of branches in each metrology area
    number_branches_by_area = np.zeros(np.size(np.unique(reference_raw["Metrology_Area"])), dtype=int)

    reference = data_with_years["New Zealand"]
    for a, area in enumerate(np.unique(reference["Metrology_Area"])):
        area_set = np.logical_and(reference["Number_of_CMCs"] >= 0, reference["Metrology_Area"] == area)
        number_branches_by_area[a] = np.size(np.unique(reference[area_set]["Branch"]))

    for c, country in enumerate(data_with_years):
        # first branch column of the area currently being filled
        start = 0
        for a, area in enumerate(np.unique(data_with_years[country]["Metrology_Area"])):
            stop = start + int(number_branches_by_area[a])
            names, totals, held = country_subservices_numbers(country, area, number_branches_by_area[a], year)
            # the names and totals are overwritten with the same kind of data for every country
            branches[start:stop] = names
            number_subservices[start:stop] = totals
            dataset[c, start:stop] = held
            start = stop

    return (branches, number_subservices, dataset)

In [None]:
def main():
    """Write one ';'-separated table per year (2001 to 2024) of services held per branch and per area.

    For every year the table from ``get_subservice_data`` is extended with per-area sums and written to
    ``<root_dir>/output/services-by-branch-all-countries/<year>.txt``:

    * line 1: ``Country;`` followed by the branch names and then the area codes
    * line 2: ``Total;`` followed by the number of listed services per branch and per area
    * one line per country: the country name, then its counts per branch and per area
      (written as floats, e.g. ``3.0``, because they are held in floating-point arrays)

    Every field, including the last one of a line, is followed by ``;``.
    """
    import warnings

    # (area code, first branch column, one-past-last branch column)
    # NOTE(review): these ranges assume that get_subservice_data returns 31 branch columns grouped by
    # area in exactly this order - confirm against the input data whenever it changes.
    area_columns = [("AUV", 0, 3), ("EM", 3, 11), ("L", 11, 13), ("M", 13, 22),
                    ("PR", 22, 26), ("T", 26, 28), ("TF", 28, 31)]
    # names of physics areas
    areas = [code for code, _, _ in area_columns]
    n_branch_columns = area_columns[-1][2]

    # the output directory and the country list do not depend on the year, so set them up once
    out_dir = root_dir / "output" / "services-by-branch-all-countries"
    out_dir.mkdir(parents=True, exist_ok=True)
    countries = list(data)

    # iterate over years
    for year in range(2001,2025):
        # the list of branches, total number of subservices possible, and the array of number of services
        # in each branch for which each country had at least one CMC in that year
        (branches,number_subservices,country_data) = get_subservice_data(year)

        if len(number_subservices) != n_branch_columns:
            # slices past the end are silently truncated by numpy, so a layout change would otherwise
            # produce wrong per-area sums without any error
            warnings.warn(f"expected {n_branch_columns} branch columns but got {len(number_subservices)}; "
                          f"the per-area sums for {year} use fixed column ranges and may be wrong")

        # number of services possible per area
        summed_branches = [np.sum(number_subservices[start:stop]) for _, start, stop in area_columns]
        # append the area names / area totals after the branch names / branch totals
        branches = np.append(branches, areas)
        number_subservices = np.append(number_subservices, summed_branches)

        # number of services per area for each country (one row per country, one column per area)
        summed_country_data = np.empty([len(countries), len(area_columns)])
        for k, (_, start, stop) in enumerate(area_columns):
            summed_country_data[:, k] = np.sum(country_data[:, start:stop], axis=1)

        # write all the data to a file, one for each year; the path is joined to the directory so that
        # the file is created inside it
        with open(out_dir / f"{year}.txt", "w") as f:
            f.write("Country;" + "".join(f"{b};" for b in branches) + "\n")
            f.write("Total;" + "".join(f"{t};" for t in number_subservices) + "\n")
            for c, country in enumerate(countries):
                cells = [country, *country_data[c], *summed_country_data[c]]
                f.write("".join(f"{cell};" for cell in cells) + "\n")

# Run the export: writes the per-year services-by-branch files for all countries.
main()