In [3]:
import sys
sys.path.append("..")
from toolkit.ascfile import read_asc
from glob import glob
import numpy as np, matplotlib.pyplot as plt, os

In [4]:
# Inventory of the raw-data tree: everything found, then regular files only,
# then the ".asc" spectra among them.
raw_entries = glob("../raw_data/200to2350nm--Data/**/*", recursive=True)
print(f"There are {len(raw_entries)} entries.")

# Keep only the entries that are files.
files = list(filter(os.path.isfile, raw_entries))
print(f"There are {len(files)} files.")

asc_files = [path for path in files if path.endswith(".asc")]
print(f"There are {len(asc_files)} perkin spectrums.")

# Files whose base name starts with "Empty" are discarded. This happens after
# the count above is printed, so they are still included in that number.
asc_files = [path for path in asc_files
             if not os.path.basename(path).startswith("Empty")]

There are 1256 entries.
There are 1180 files.
There are 477 perkin spectrums.


In [3]:
import json

# Each file is like
# {quantity_token}{species}{country_token}{number?}.Sample.asc
# The first matching prefix wins, so every token is listed before any shorter
# token it starts with ("Ag" before "A", "RTg" before "RT", ...).
tokens = ["Ag", "RTg", "TDg", "RDg", "RD", "TD", "RT", "TTg", "TT", "A"]
dataset = list()
for filename in asc_files:
    basename = os.path.basename(filename)

    # First token (in list order) that prefixes the file name, or None.
    token = next((t for t in tokens if basename.startswith(t)), None)
    if token is None:
        # Looking the match up with list.index() before checking that one
        # exists raised a bare "True is not in list" ValueError; fail with the
        # descriptive message instead (same exception type).
        raise ValueError(f"This file's token is not identified: {basename}.")

    # {quantity_token}{species}{country_token}{number?}: everything before
    # the first dot of the file name.
    full_name = basename.split('.')[0]

    entry = {}
    entry['filename'] = filename
    # Store {species}{country_token}{number?} and {quantity_token} separately.
    entry['name'] = full_name[len(token):]
    entry['token'] = token
    dataset.append(entry)

with open("../data/dataset.json", "w") as f:
    json.dump(dataset, f)


In [4]:
from enum import IntEnum

# Extract other infos from the name: the species, or the fact that the sample
# is a manuscript ("ms" prefix) whose species is unknown.
tokens = [ "Agneau", "Chevre", "Veau", "ms" ]

class SpectrumKind(IntEnum):
    """Origin of a sample; stored as a plain int when dumped to JSON."""
    MODERN=0
    MANUSCRIPT=1

for entry in dataset:
    name = entry['name']
    # First prefix (in list order) that the name starts with, or None.
    prefix = next((t for t in tokens if name.startswith(t)), None)
    if prefix is None:
        # Unrecognised name: report it and leave the entry without
        # 'kind' / 'animal' / 'country' / 'id'.
        print(name)
        continue

    # Strip the prefix only. str.replace() would also remove any later
    # occurrence of the same text inside the name.
    remainder = name[len(prefix):]

    if prefix == "ms":
        # It's a ms
        entry['kind'] = SpectrumKind.MANUSCRIPT
        entry['id'] = remainder
        entry['animal'] = "Unknown"
        entry['country'] = "Unknown"
    else:
        entry['kind'] = SpectrumKind.MODERN
        entry['animal'] = prefix
        # What follows the species is "{country} {id}"; a name without a
        # space raises IndexError here.
        other_info = remainder.split(' ')
        entry['country'] = other_info[0]
        entry['id'] = other_info[1]

with open("../data/dataset.json", "w") as f:
    json.dump(dataset, f)

In [5]:
# Get the spectrums, put them in database.
# Row 0 of what read_asc returns is used as the wavelength axis and row 1 as
# the measured values; the axis of the first file is the reference (wl).
wl = None
for entry in dataset:
    data = read_asc(entry['filename'])
    x = data[0, :]
    if wl is None: # Get the wavelength
        wl = x

    # Compare shapes first: np.allclose on vectors of different length either
    # fails with a broadcasting error or silently broadcasts a length-1 vector,
    # instead of reporting the problem with the message below.
    # Checked before storing, so an inconsistent file never reaches the dataset.
    assert np.shape(x) == np.shape(wl) and np.allclose(wl, x), "Wavelength vector changed!"

    y = data[1, :]
    entry['spectrum'] = y.tolist()

with open("../data/dataset.json", "w") as f:
    json.dump(dataset, f)

In [6]:
tokens = ["Ag", "RTg", "TDg", "RDg", "RD", "TD", "RT", "TTg", "TT", "A"]

# Some stats: for each quantity token, how many entries have kind == 0
# (SpectrumKind.MODERN).
chart = []  # not filled in by this cell
for t in tokens:
    d = [e for e in dataset if e['token'] == t and e['kind'] == 0]
    print(t, len(d))

Ag 21
RTg 21
TDg 21
RDg 23
RD 27
TD 24
RT 24
TTg 21
TT 31
A 21


In [7]:
import json
# Read the dataset back from disk; from here on `dataset` holds what
# json.load returns rather than the objects built in the cells above.
with open("../data/dataset.json") as dataset_file:
    dataset = json.load(dataset_file)
from copy import copy
def aggregate(data, keys, drops=()):
    """Regroup individuals that share the same values for ``keys``.

    Parameters
    ----------
    data : iterable of dict
        Individual records. They are deep-copied; the input is never modified.
    keys : sequence of str
        Fields that define a group. They are removed from every grouped
        record and stored once per group under ``"commons"``.
    drops : iterable of str, optional
        Additional fields to delete from every grouped record.

    Returns
    -------
    dict
        Maps ``".".join(str(v) for v in key_values)`` to
        ``{"entries": [record, ...], "commons": {key: value, ...}}``.
        Groups appear in order of first occurrence in ``data`` and records
        keep their relative order inside each group.

    Raises
    ------
    KeyError
        If a record lacks one of ``keys`` or ``drops``.
    """
    from copy import deepcopy

    # Single pass: bucket the records by the tuple of their key values.
    # A dict keeps first-seen order, so the result is reproducible between
    # runs (iterating a set of strings is not).
    buckets = {}
    for record in data:
        signature = tuple(record[k] for k in keys)
        buckets.setdefault(signature, []).append(record)

    groups = {}
    for signature, members in buckets.items():
        # Work on copies so the caller's records keep all their fields.
        entries = deepcopy(members)
        commons = {}
        for elem in entries:
            for k in keys:
                commons[k] = elem[k]
                del elem[k]
            for d in drops:
                del elem[d]
        groups[".".join(map(str, signature))] = {"entries": entries, "commons": commons}

    return groups

def filter_keyset(data, key, values):
    """Keep the groups that contain a record for every one of ``values``.

    Parameters
    ----------
    data : dict
        Maps a group name to ``{"entries": [record, ...], "commons": {...}}``
        (the structure returned by ``aggregate``).
    key : str
        Field of the records to look at, e.g. ``"token"``.
    values : iterable
        Values of ``key`` a group must provide to be kept.

    Returns
    -------
    dict
        Same group names, restricted to the groups having all of ``values``.
        In each kept group ``"entries"`` becomes a dict mapping the value of
        ``key`` to the matching record without its ``key`` field (records
        whose value is not in ``values`` are left out; for duplicated values
        the last record wins) and ``"commons"`` is a shallow copy.

    Notes
    -----
    ``data`` and its records are left untouched: the returned records are new
    dicts, so the function can be called repeatedly on the same input.
    """
    from copy import copy

    wanted = set(values)
    out = {}
    for group_name in data:
        group = data[group_name]
        records = group["entries"]

        present_values = {record[key] for record in records}
        if not wanted.issubset(present_values):
            continue

        selected = {}
        for record in records:
            if record[key] in wanted:
                # Build a new dict without `key` instead of deleting it from
                # the caller's record.
                selected[record[key]] = {k: v for k, v in record.items() if k != key}

        out[group_name] = {"entries": selected, "commons": copy(group["commons"])}

    return out
def filter_common_value(data, key, value):
    """Select the groups whose shared field ``key`` equals ``value``.

    ``data`` maps a group name to a dict holding a ``"commons"`` dict. The
    result maps the names of the matching groups to shallow copies of them.
    """
    return {
        group_name: copy(data[group_name])
        for group_name in data
        if data[group_name]["commons"][key] == value
    }
            
        
    

In [8]:
from copy import deepcopy

# From a flat list of spectrum entries to a dict of samples: all the entries
# sharing name, kind, animal, country and id form one sample, whose "entries"
# hold its spectral quantities ("RD", "A", ...).
# (aggregate also accepts drops=["spectrum"] to leave the spectra out.)
g = aggregate(dataset, ["name", "kind", "animal", "country", "id"])
with open("../data/aggregated.json", "w") as f:
    json.dump(g, f)
g_orig = deepcopy(g)

# Three selections are made from the full aggregation. Each filter_keyset call
# gets its own deep copy, so g_orig keeps its "token" fields whatever the
# filter does to its input.
# 1. Manuscript samples having both an "RD" and an "A" spectrum.
g = filter_common_value(
    filter_keyset(deepcopy(g_orig), "token", ["RD", "A"]),
    "kind", SpectrumKind.MANUSCRIPT,
)
# 2. Modern samples having both an "RD" and an "A" spectrum.
g2 = filter_common_value(
    filter_keyset(deepcopy(g_orig), "token", ["RD", "A"]),
    "kind", SpectrumKind.MODERN,
)
# 3. Modern samples having both an "RDg" and an "Ag" spectrum
#    (presumably the grain side, going by the "g" suffix - TODO confirm).
g3 = filter_common_value(
    filter_keyset(deepcopy(g_orig), "token", ["RDg", "Ag"]),
    "kind", SpectrumKind.MODERN,
)

# Concatenate the three selections into one list and save it.
g = [*g.values(), *g2.values(), *g3.values()]
with open("../data/by_name.json", "w") as f:
    json.dump(g, f)
len(g)

81

In [9]:
import csv

# The proteomic animal type has to be entered by a human, through a CSV file:
# write a blank template with one row per manuscript sample.
with open("../data/by_name.json", "r") as f:
    g = json.load(f)

# Names of the samples flagged as manuscripts.
msnames = [
    sample["commons"]["name"]
    for sample in g
    if sample["commons"]["kind"] == SpectrumKind.MANUSCRIPT
]

with open('proteomics_to_fill.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=';',
                        quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # One row per manuscript: its name followed by two placeholder columns.
    writer.writerows([n, "UnKnown", "UnKnown"] for n in msnames)

In [12]:
# Merge the hand-filled proteomics table into the samples: column 0 is the
# sample name, column 1 the animal to record for it.
# newline='' is what the csv module asks for when opening files it reads.
with open('../data/proteomics_filled.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=';',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)

    # name -> animal. Rows with fewer than two fields (e.g. a blank line left
    # by a spreadsheet or editor) are skipped instead of raising IndexError.
    # If a name appears several times, the last row wins.
    animal_by_name = {row[0]: row[1] for row in reader if len(row) >= 2}

# Every sample carrying a listed name is updated (the same name can occur in
# more than one element of g).
for gg in g:
    sample_name = gg["commons"]["name"]
    if sample_name in animal_by_name:
        gg["commons"]["animal"] = animal_by_name[sample_name]

with open("../data/with_proteomics.json", "w") as f:
    json.dump(g, f,indent=4)