In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import yaml
from linkml_runtime.dumpers import json_dumper, yaml_dumper

import uuid
from numpy import sort

import peh

In [None]:
# Input/output locations for the PARC example, relative to the directory the
# notebook is run from: the codebook workbook, the example-data workbook and
# the project YAML file, in that order.
codebook_path, data_path, yaml_file_path = (
    Path(location)
    for location in (
        "../source_tables/PARC/BasicCodebook_v2.3.xlsx",
        "../source_tables/PARC/ExData_BasicCodebook_v2.3.xlsx",
        "../project_examples/PARC/parc.yaml",
    )
)

In [None]:
# Load every sheet of the example workbook (`sheet_name=None` returns a
# {sheet name: DataFrame} mapping) and replace NaN cells with None in each one.
data_dict = {
    sheet_name: frame.replace(np.nan, None)
    for sheet_name, frame in pd.read_excel(data_path, sheet_name=None).items()
}
data_dict

## Entities
The entities we will use for this data are:
- EntityList
    - StudyEntity
        - Study (example-study) 
        - Person
        - Sample
        - PersonGroup
        - SampleCollection (equivalent to one of the tabs)
        - Timepoint

Additionally, we want to extract properties for these entities from the BasicCodebook, but that's for later.

In [None]:
# Scratch exploration of the schema module: which names mention "sampl", what
# attributes the two sampling classes expose, and what empty instances look like.
s = list(filter(lambda name: "sampl" in name.lower(), dir(peh)))
print(s)
print(dir(peh.SamplingResult))
print(dir(peh.SamplingObservation))

empty_result = peh.SamplingResult()
print(empty_result)
blank_observation = peh.SamplingObservation(id="")
print(blank_observation)

In [None]:
# Root container for the conversion: the study and timepoints built in the
# following cells are attached to it, and it is what gets serialised at the end.
entitylist = peh.EntityList()

In [None]:
study_sheet = data_dict["STUDYINFO"]

# The study identifier is read from a fixed cell of the STUDYINFO sheet.
# NOTE(review): confirm that row 11 / column 1 (zero-based positions in the
# DataFrame, i.e. after pandas consumed the header row) is the id cell.
STUDY_ID_ROW, STUDY_ID_COLUMN = 11, 1

# One purely positional lookup. The chained form `.iloc[11][1]` indexes the row
# Series with an integer key, which only works through pandas' deprecated
# positional fallback on label-indexed Series.
study = peh.Study(id=study_sheet.iloc[STUDY_ID_ROW, STUDY_ID_COLUMN])
entitylist.studies = [study]
study

In [None]:
timepoints_sheet = data_dict["TIMEPOINT"]

# Timepoint entities keyed by the raw `id_timepoint` cell value, so later cells
# can look a timepoint up straight from a row of another sheet.
timepoints = {
    record["id_timepoint"]: peh.Timepoint(id=peh.TimepointId(record["id_timepoint"]))
    for _, record in timepoints_sheet.iterrows()
}

# Register the timepoints on the entity list and reference them from the study.
entitylist.timepoints = timepoints
study.timepoint_id_list = [peh.TimepointId(key) for key in timepoints]
timepoints


In [None]:
# SAMPLE sheet: each row yields one Sample entity plus two observations
# (measured sampling values and remaining metadata) on the row's timepoint.
sample_sheet = data_dict["SAMPLE"]

# Sample entities are kept in their own list; they are not attached to `entitylist`.
samples = []

# Nesting used below: SamplingObservation > SamplingResult > ObservedValue
sampling_design = peh.SamplingDesign()
# Identifier columns are only used for linking and never become observed values.
to_ignore = {"id_sample", "id_subject", "id_timepoint"}
# Columns reported as sampling results; every other non-id column is treated as metadata.
special_fields = {"chol", "trigl", "lipid", "lipid_enz", "crt", "sg", "osm", "density", "lipid_enz_harm"}

for _, row in sample_sheet.iterrows():
    sample = peh.Sample(id=peh.SampleId(row["id_sample"]))
    samples.append(sample)

    meta_values = []
    sample_values = []
    for idx, val in row.items():
        if idx in to_ignore:
            continue
        observed = peh.ObservedValue(observable_entity=sample.id, value=val, observable_property=idx)
        if idx in special_fields:
            sample_values.append(observed)
        else:
            meta_values.append(observed)

    meta_res = peh.SamplingResult(observed_values=meta_values)
    sampling_res = peh.SamplingResult(observed_values=sample_values)
    obs = [
        peh.SamplingObservation(
            id=peh.SamplingObservationId(uuid.uuid4()),
            observation_result=sampling_res,
            observation_design=sampling_design,
            observation_type=peh.ObservationType.sampling,
        ),
        peh.SamplingObservation(
            id=peh.SamplingObservationId(uuid.uuid4()),
            observation_result=meta_res,
            observation_design=sampling_design,
            observation_type=peh.ObservationType.metadata,
        ),
    ]
    # extend, not append: appending `obs` would store the list itself as a
    # single element and leave a nested list inside the timepoint's observations.
    timepoints[row["id_timepoint"]].observations.extend(obs)
               
                
    
    

In [None]:
# SUBJECTUNIQUE sheet: each row yields one Person entity and one questionnaire
# observation holding all of the row's non-identifier columns.
subject_unique_sheet = data_dict["SUBJECTUNIQUE"]
subject_design = peh.QuestionnaireDesign()
to_ignore = {"id_subject", "id_participant"}

# Person entities are kept in their own list; they are not attached to `entitylist`.
subjects = []

# One PersonGroup per distinct participant id. `unique()` keeps first-appearance
# order, so the group order is stable between runs (a set's order is not).
subject_groups = {i: peh.PersonGroup(i) for i in subject_unique_sheet["id_participant"].unique()}

# This sheet carries no timepoint column, so its observations are attached to
# the smallest timepoint key. Assumes the keys are mutually comparable
# (alphabetically or numerically). Resolved once, outside the row loop.
if timepoints:
    first_timepoint = timepoints[min(timepoints)]
elif len(subject_unique_sheet):
    raise ValueError("SUBJECTUNIQUE has rows but no timepoints were loaded to attach them to")

for _, row in subject_unique_sheet.iterrows():
    person = peh.Person(id=peh.PersonId(row["id_subject"]))
    subjects.append(person)

    values = [
        peh.ObservedValue(observable_entity=person.id, value=val, observable_property=idx)
        for idx, val in row.items()
        if idx not in to_ignore
    ]

    # NOTE(review): this link is stored on the group and also targets the group's
    # own id (`id_participant`), i.e. the group "is_part_of" itself. Presumably the
    # person (`id_subject`) was meant to be linked to the group -- confirm the
    # intended direction against the schema before changing it.
    subject_groups[row["id_participant"]].study_entity_links.append(
        peh.StudyEntityLink(
            study_entity=peh.StudyEntityId(row["id_participant"]),
            linktype=peh.LinkType.is_part_of,
        )
    )

    res = peh.QuestionnaireResult(observed_values=values)
    obs = [
        peh.QuestionnaireObservation(
            id=peh.QuestionnaireObservationId(uuid.uuid4()),
            observation_result=res,
            observation_design=subject_design,
            observation_type=peh.ObservationType.questionnaire,
        )
    ]
    # extend, not append: appending `obs` would nest a list inside the
    # timepoint's observations.
    first_timepoint.observations.extend(obs)

In [None]:
# SUBJECTTIMEPOINT sheet: each row yields one questionnaire observation for the
# row's subject, attached to the row's timepoint.
subject_timepoint_sheet = data_dict["SUBJECTTIMEPOINT"]
to_ignore = {"id_subject", "id_timepoint"}

for _, row in subject_timepoint_sheet.iterrows():
    # Take the subject from this row. Reusing the `person` variable left over
    # from the SUBJECTUNIQUE loop would attribute every value to the last subject.
    subject_id = peh.PersonId(row["id_subject"])
    values = [
        peh.ObservedValue(observable_entity=subject_id, value=val, observable_property=idx)
        for idx, val in row.items()
        if idx not in to_ignore
    ]
    res = peh.QuestionnaireResult(observed_values=values)
    obs = [
        peh.QuestionnaireObservation(
            id=peh.QuestionnaireObservationId(uuid.uuid4()),
            observation_result=res,
            observation_design=subject_design,
            observation_type=peh.ObservationType.questionnaire,
        )
    ]
    # extend, not append: appending `obs` would nest a list inside the
    # timepoint's observations.
    timepoints[row["id_timepoint"]].observations.extend(obs)

In [None]:
# All sheets whose name starts with "SAMPLETIMEPOINT" hold per-sample marker values.
sampletimepoint_sheets = {k: v for k, v in data_dict.items() if str(k).startswith("SAMPLETIMEPOINT")}

# id_sample -> id_timepoint pairs taken from the SAMPLE sheet; used to find the
# timepoint each measurement row belongs to.
timepoints_lookup = sample_sheet[["id_sample", "id_timepoint"]]

for k, v in sampletimepoint_sheets.items():
    # Nesting used below: SamplingObservation > SamplingResult > ObservedValue
    # Default (inner) merge: rows whose id_sample is absent from SAMPLE are dropped.
    sheet = v.merge(timepoints_lookup, on="id_sample")
    sampling_design = peh.SamplingDesign()

    # A marker is any column stem that has a "<stem>_lod" or "<stem>_loq" column,
    # minus the fields already handled from the SAMPLE sheet. Non-string column
    # labels are skipped, and the stems are sorted so the emitted order is
    # reproducible between runs.
    limit_columns = [c for c in sheet.columns if isinstance(c, str) and c.endswith(("_lod", "_loq"))]
    markers = sorted({c[:-4] for c in limit_columns} - special_fields)

    for _, row in sheet.iterrows():
        sample_id = peh.SampleId(row["id_sample"])
        # Each marker value carries its lod / loq cells as quality data.
        values = [
            peh.ObservedValue(
                observable_entity=sample_id,
                value=row[m],
                observable_property=m,
                quality_data=[
                    peh.QualityData(quality_context_key="lod", quality_value=row[m + "_lod"]),
                    peh.QualityData(quality_context_key="loq", quality_value=row[m + "_loq"]),
                ],
            )
            for m in markers
        ]

        sampling_res = peh.SamplingResult(observed_values=values)
        obs = [
            peh.SamplingObservation(
                id=peh.SamplingObservationId(uuid.uuid4()),
                observation_result=sampling_res,
                observation_design=sampling_design,
                observation_type=peh.ObservationType.sampling,
            )
        ]
        # extend, not append: appending `obs` would nest a list inside the
        # timepoint's observations.
        timepoints[row["id_timepoint"]].observations.extend(obs)

In [None]:
# Write the YAML outputs. `yaml_dumper` is already imported in the notebook's
# import cell, so it is not re-imported here.
output_dir = Path("out/PARC")
# Create the target directory first; on a fresh checkout it does not exist and
# the dump would otherwise fail when opening the output file.
output_dir.mkdir(parents=True, exist_ok=True)

yaml_dumper.dump(entitylist, output_dir / "data.yaml")
# list all samples
yaml_dumper.dump(samples, output_dir / "samples.yaml")
# list all subjects
yaml_dumper.dump(subjects, output_dir / "persons.yaml")
# list all person groups
yaml_dumper.dump(list(subject_groups.values()), output_dir / "person_groups.yaml")

In [None]:
# Write the JSON-LD output. `json_dumper` is already imported in the notebook's
# import cell, so it is not re-imported here.
cwd = Path.cwd()

# Make sure the target directory exists before dumping into it.
(cwd / "out/PARC").mkdir(parents=True, exist_ok=True)

# NOTE(review): the context file out/peh.jsonld is only referenced here, not
# created by this notebook -- presumably it is generated from the schema
# beforehand; confirm.
json_dumper.dump(entitylist, cwd / "out/PARC/data.jsonld", contexts=str(cwd / "out/peh.jsonld"))