In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import yaml
from linkml_runtime.dumpers import json_dumper, yaml_dumper

import uuid
from numpy import sort



import peh

In [None]:
# Input locations, relative to the notebook's working directory.
# Only `data_path` is read in the cells that follow.
codebook_path, data_path, yaml_file_path = (
    Path("../source_tables/PARC/BasicCodebook_v2.3.xlsx"),
    Path("../source_tables/PARC/ExData_BasicCodebook_v2.3.xlsx"),
    Path("../project_examples/PARC/parc.yaml"),
)

In [None]:
# Read every sheet of the workbook at `data_path` (sheet_name=None yields a
# dict of DataFrames keyed by sheet name) and swap NaN cells for None.
data_dict = {
    sheet_name: sheet.replace(np.nan, None)
    for sheet_name, sheet in pd.read_excel(data_path, sheet_name=None).items()
}
data_dict

## Entities
The entities we will use for this data are:
- EntityList
    - StudyEntity
        - Study (example-study) 
        - Person
        - Sample
        - PersonGroup
        - SampleCollection (equivalent to one of the tabs)
        - Timepoint

Additionally, we want to extract properties for these entities from the BasicCodebook, but that's for later.

In [None]:
# Scratch exploration: which names in `peh` relate to sampling, and what do
# the sampling result/observation classes expose?
s = [name for name in dir(peh) if "sampl" in name.lower()]
print(s)
for sampling_cls in (peh.SamplingResult, peh.SamplingObservation):
    print(dir(sampling_cls))
# Default-constructed instances, to see which fields they carry.
print(peh.SamplingResult())
print(peh.SamplingObservation(id=""))

In [None]:
# Root container for this conversion: later cells attach the study and the
# timepoints to it, and the final cell dumps it to YAML.
entitylist = peh.EntityList()

In [None]:
study_sheet = data_dict["STUDYINFO"]
# The study id sits in a fixed cell of the STUDYINFO sheet: row position 11,
# column position 1 (zero-based positions in the loaded DataFrame).
# NOTE(review): this hard-coded cell follows the current workbook layout - confirm
# if the template changes.
# A single `.iloc[row, col]` lookup is purely positional. The previous
# `.iloc[11][1]` applied an integer key to a label-indexed row Series, which pandas
# has deprecated as a positional lookup.
study = peh.Study(id=study_sheet.iloc[11, 1])
entitylist.studies = [study]
study

In [None]:
timepoints_sheet = data_dict["TIMEPOINT"]

# Map each timepoint id from the sheet to its Timepoint entity so later cells
# can look a timepoint up by id and attach observations to it.
timepoints = {}
for _, tp_row in timepoints_sheet.iterrows():
    tp_id = tp_row["id_timepoint"]
    timepoints[tp_id] = peh.Timepoint(id=peh.TimepointId(tp_id))

# Register the timepoints on the entity list and reference them from the study.
entitylist.timepoints = timepoints
study.timepoint_id_list = [peh.TimepointId(tp_id) for tp_id in timepoints]
timepoints


In [None]:
# SAMPLE
# Build one Sample per row of the SAMPLE sheet and attach two observations per
# sample (measured values and metadata) to the timepoint the row refers to.
sample_sheet = data_dict["SAMPLE"]

# TODO: decide where these should be stored; for now the collection is kept
# standalone and dumped to its own file in the final cell.
samples = peh.SampleCollection(id=str(uuid.uuid4()))

# Nesting: SamplingObservation > SamplingResult > ObservedValue
sampling_design = peh.SamplingDesign()
# Identifier columns are not turned into observed values.
to_ignore = {"id_sample", "id_subject", "id_timepoint"}
# Columns reported as sampling results; every other non-id column is metadata.
sample_fields = {"chol", "trigl", "lipid", "lipid_enz", "crt", "sg", "osm", "density"}

for _, row in sample_sheet.iterrows():
    sample = peh.Sample(id=peh.SampleId(row["id_sample"]))
    samples.study_entity_links.append(peh.StudyEntityLink(study_entity=sample))

    meta_values = []
    sample_values = []
    for idx, val in row.items():
        if idx in to_ignore:
            continue
        observed = peh.ObservedValue(observable_entity=sample.id, value=val, observable_property=idx)
        if idx in sample_fields:
            sample_values.append(observed)
        else:
            meta_values.append(observed)

    # Ids are stringified explicitly, matching how the collection id is built above.
    obs = [
        peh.SamplingObservation(
            id=peh.SamplingObservationId(str(uuid.uuid4())),
            observation_result=peh.SamplingResult(observed_values=sample_values),
            observation_design=sampling_design,
            observation_type=peh.ObservationType.sampling,
        ),
        peh.SamplingObservation(
            id=peh.SamplingObservationId(str(uuid.uuid4())),
            observation_result=peh.SamplingResult(observed_values=meta_values),
            observation_design=sampling_design,
            observation_type=peh.ObservationType.metadata,
        ),
    ]
    # extend, not append: appending the list itself would nest a list inside the
    # timepoint's observations instead of adding the two observations to it.
    timepoints[row["id_timepoint"]].observations.extend(obs)

In [None]:


# Build one Person per row of the SUBJECTUNIQUE sheet and record the remaining
# columns as a single questionnaire observation per person.
subject_unique_sheet = data_dict["SUBJECTUNIQUE"]
subject_design = peh.QuestionnaireDesign()
# Identifier columns are not turned into observed values.
to_ignore = {"id_subject", "id_timepoint"}

# TODO: decide where to store these. Presumably this should be a PersonCollection;
# a PersonGroup stands in for it for now.
subjects = peh.PersonGroup(id=str(uuid.uuid4()))

# TODO: a PersonGroupCollection would be the natural home for the groups below;
# it does not exist yet, so they are only kept in a plain dict.
# One PersonGroup per distinct participant id, intended to link persons to each other.
subject_groups = {i: peh.PersonGroup(i) for i in set(subject_unique_sheet["id_participant"].values)}

# No timepoint is linked to a subject here, so every observation goes to the
# first timepoint. Assumes the timepoint ids are alphabetically or numerically
# sortable. Looked up once, since it does not change between rows.
first_timepoint = timepoints[min(timepoints)]

for _, row in subject_unique_sheet.iterrows():
    person = peh.Person(id=peh.PersonId(row["id_subject"]))
    subjects.study_entity_links.append(peh.StudyEntityLink(study_entity=person))

    values = [
        peh.ObservedValue(observable_entity=person.id, value=val, observable_property=idx)
        for idx, val in row.items()
        if idx not in to_ignore
    ]

    observation = peh.QuestionnaireObservation(
        id=peh.QuestionnaireObservationId(str(uuid.uuid4())),
        observation_result=peh.QuestionnaireResult(observed_values=values),
        observation_design=subject_design,
        observation_type=peh.ObservationType.questionnaire,
    )
    # Append the observation itself; wrapping it in a list first would nest a
    # list inside the timepoint's observations.
    first_timepoint.observations.append(observation)

In [None]:
# Serialise the results to YAML. `yaml_dumper` is already imported in the
# notebook's import cell, so it is not re-imported here.

# Make sure the output folder exists so the dumps below do not fail on a fresh checkout.
Path("out").mkdir(parents=True, exist_ok=True)

yaml_dumper.dump(entitylist, Path("out/PARC_data.yaml"))
# list all samples
yaml_dumper.dump(samples, Path("out/PARC_samples.yaml"))
# list all subjects
yaml_dumper.dump(subjects, Path("out/PARC_subjects.yaml"))
# TODO: `subject_groups` is not dumped yet.
# yaml_dumper.dump(subject_groups, Path("out/PARC_subject_groups.yaml"))