# Loading

In [None]:
import pandas as pd
import networkx as nx
from tqdm.notebook import trange, tqdm

In [None]:
# Directory layout of the project; every other path in this notebook is
# derived from base_path.
base_path = "/sc-projects/sc-proj-ukb-cvd/data"
# raw table dump (not read by any of the cells visible here)
data_path = f"{base_path}/0_raw/showcase_48024/tables_220317"
# vocabulary and coding lookup files
mapping_path = f"{base_path}/mapping"
# decoded intermediate tables; results are written here as well
out_path = f"{base_path}/1_decoded"

In [None]:
# vocabulary: concept table, read from the tab-separated files in the
# "athena" folder below mapping_path
vocab_dir = f"{mapping_path}/athena"
concept = pd.read_csv(f"{vocab_dir}/CONCEPT.csv", sep='\t')
concept.head()  # preview

In [None]:
# Further tab-separated vocabulary tables from the same folder.
# Neither frame is used by the cells visible in this part of the notebook.
relationship = pd.read_csv(f"{vocab_dir}/RELATIONSHIP.csv", sep='\t')
vocabulary = pd.read_csv(f"{vocab_dir}/VOCABULARY.csv", sep='\t')

In [None]:
# Pairwise relations between concepts (concept_id_1, relationship_id,
# concept_id_2); this is the edge source for the graph built further down.
concept_relationship = pd.read_csv(f"{vocab_dir}/CONCEPT_RELATIONSHIP.csv", sep='\t')

In [None]:
# coding: lookup table stored as coding3175.tsv
# The file carries a .tsv extension, so it is read with a tab delimiter like
# the other tab-separated tables in this notebook; with read_csv's default
# comma every row would end up in one single column.
# NOTE(review): assumes the file really is tab-separated — confirm.
gp_code_types = pd.read_csv(f"{mapping_path}/codings/coding3175.tsv", sep='\t')

In [None]:
# data: raw GP prescription codes from the decoded output folder
# (not used by the cells visible in this part of the notebook)
gp_scripts = pd.read_feather(f"{out_path}/codes_gp_scripts_raw_220317.feather")


# Graph construction and path search

In [None]:
# Previously stored dm+d device mapping; the cells below use its
# code_origin and id_origin columns.
mapping_dmd_device = pd.read_feather(f"{out_path}/mapping_dmd_device_220330.feather")
mapping_dmd_device.head()  # preview

In [None]:
# Attach the dm+d concept rows to every distinct origin code of the mapping
# (inner join: codes without a dm+d concept are dropped).
_dmd_concepts = concept.query("vocabulary_id=='dm+d'")
_distinct_origin_codes = mapping_dmd_device[["code_origin"]].drop_duplicates()
dmd_devices = _distinct_origin_codes.merge(
    _dmd_concepts,
    left_on="code_origin",
    right_on="concept_code",
)

In [None]:
# Row count per concept class among the matched dm+d concepts.
dmd_devices.value_counts("concept_class_id")

In [None]:
# Inspect the matched concepts whose concept class is VMP.
dmd_devices.query("concept_class_id=='VMP'")

In [None]:
# Search start points: every origin concept id listed in the device mapping.
start_nodes = mapping_dmd_device["id_origin"].tolist()

# Search end points: dm+d concepts of class VMP in the Device domain.
_is_dmd_vmp_device = (
    (concept["vocabulary_id"] == "dm+d")
    & (concept["concept_class_id"] == "VMP")
    & (concept["domain_id"] == "Device")
)
target_nodes = concept.loc[_is_dmd_vmp_device, "concept_id"].tolist()

In [None]:
# Node universe of the graph: SNOMED device concepts with a non-missing
# standard_concept flag, plus every start and target node, reduced to the
# descriptive columns.
_is_standard_snomed_device = (
    (concept["domain_id"] == "Device")
    & (concept["vocabulary_id"] == "SNOMED")
    & concept["standard_concept"].notna()
)
_is_endpoint = (
    concept["concept_id"].isin(start_nodes)
    | concept["concept_id"].isin(target_nodes)
)
_node_columns = [
    'concept_id',
    'concept_name',
    'domain_id',
    'vocabulary_id',
    'concept_class_id',
    'concept_code',
]
devices = concept.loc[_is_standard_snomed_device | _is_endpoint, _node_columns]

In [None]:
# Two renamed copies of the node table: every column gets a "_1" resp. "_2"
# suffix, which lines them up with concept_id_1 / concept_id_2 of the
# relationship table.
devices1 = devices.add_suffix("_1")
devices2 = devices.add_suffix("_2")

In [None]:
# Concept ids of all graph nodes, as a plain list for the filter below.
devices_ids = devices["concept_id"].tolist()

In [None]:
# Keep only relations whose two ends are both part of the node universe.
_both_ends_known = (
    concept_relationship["concept_id_1"].isin(devices_ids)
    & concept_relationship["concept_id_2"].isin(devices_ids)
)
device_relations = concept_relationship.loc[
    _both_ends_known,
    ["concept_id_1", "relationship_id", "concept_id_2"],
]

In [None]:
# Annotate both ends of every relation with the node columns (the merges join
# on the shared column names), order the columns as
# <side 1> | relationship_id | <side 2>, and drop self-relations.
_ordered_columns = [*devices1.columns, "relationship_id", *devices2.columns]
_annotated = device_relations.merge(devices1).merge(devices2)[_ordered_columns]
device_relations = _annotated[_annotated["concept_id_1"] != _annotated["concept_id_2"]]

In [None]:
import networkx as nx
from tqdm.auto import tqdm

In [None]:
# The node table is the device concept table itself (same object, no copy).
nodes = devices

# Edge list in from / relationship_id / to form.
_edge_column_names = {"concept_id_1": "from", "concept_id_2": "to"}
edges = device_relations[["concept_id_1", "relationship_id", "concept_id_2"]].rename(
    columns=_edge_column_names
)

In [None]:
nodes.head()  # preview the node table

In [None]:
edges.head()  # preview the edge list

In [None]:
# Drop target nodes that are also start nodes.
# Membership is tested against a set built once instead of scanning the
# whole start_nodes list for every candidate.
_start_node_set = set(start_nodes)
target_nodes = [node for node in tqdm(target_nodes) if node not in _start_node_set]

In [None]:
# Drop start nodes that are also target nodes (set lookup instead of a list
# scan per candidate).
# NOTE(review): if target_nodes has already been stripped of every start node
# by the preceding cell, this filter cannot remove anything — confirm whether
# the overlap was meant to be removed from both lists.
_target_node_set = set(target_nodes)
start_nodes = [node for node in tqdm(start_nodes) if node not in _target_node_set]

In [None]:
# add dummy nodes: every target node gets an edge to a synthetic sink
# ('DUMMYOUT') and every start node an edge from a synthetic source
# ('DUMMYIN'), so a single source/target pair covers all endpoints.
dummy_edges = pd.DataFrame({'from': target_nodes}).assign(
    **{'to': 'DUMMYOUT', 'relationship_id': 'dummy out'}
)
dummy_edges_in = pd.DataFrame({'to': start_nodes}).assign(
    **{'from': 'DUMMYIN', 'relationship_id': 'dummy in'}
)

# append both dummy edge sets to the real edges (columns are aligned by name)
edges = pd.concat([edges, dummy_edges, dummy_edges_in], axis=0)

In [None]:
# Directed graph from the edge list; each edge carries its relationship_id.
G = nx.from_pandas_edgelist(
    edges,
    source='from',
    target='to',
    edge_attr=['relationship_id'],
    create_using=nx.DiGraph,
)

# Attach the descriptive concept columns to the nodes, keyed by concept_id.
node_attribute_dict = nodes.set_index('concept_id').to_dict(orient='index')
nx.set_node_attributes(G, values=node_attribute_dict)

In [None]:
# Graph summary: class name, node count and edge count.
# nx.info() was deprecated and later removed from NetworkX, so the summary
# string is built from the graph's own counters, which exist in every version.
f"{type(G).__name__} with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges"

In [None]:
# Enumerate every simple path from the synthetic source to the synthetic sink.
# NOTE(review): cutoff=3 limits paths to three edges; two of those are the
# dummy edges, which leaves a single real edge between a start and a target
# node — confirm this depth is intended.
paths = list(
    nx.all_simple_paths(G, source='DUMMYIN', target='DUMMYOUT', cutoff=3)
)

In [None]:
paths  # display every path found

In [None]:
# One row per path found between the dummy nodes.
# Every path starts at 'DUMMYIN' and ends at 'DUMMYOUT', so the real source
# and target sit at positions 1 and -2.
paths_df = pd.DataFrame(
    {
        'source_concept_id': [p[1] for p in paths],
        # Strip the two dummy nodes from each individual path. Slicing the
        # outer list instead (paths[1:-1]) would drop the first and the last
        # *path* and shift this column against the others.
        'full_path': [p[1:-1] for p in paths],
        # number of nodes on the path, dummy nodes included
        'length': [len(p) for p in paths],
        'target_concept_id': [p[-2] for p in paths],
    }
).reset_index()  # keeps the positional 'index' column used by set_index('index') below

In [None]:
# Use the positional 'index' column as row index and add the path length
# without the two dummy nodes.
paths_df_clean = paths_df.set_index('index').assign(
    length_no_dummy=lambda frame: frame['length'] - 2
)

In [None]:
# Preview (reindex() without arguments does not change the frame, so it is
# omitted here).
paths_df_clean.head()

In [None]:
# Persist all paths before duplicates are removed; reset_index() turns the
# row index back into a regular column for the written table.
paths_df_clean.reset_index().to_feather(f'{out_path}/all_paths_210322.feather')

In [None]:
# drop duplicate source/target pairs: only the first row of each pair is kept
# and the rows are renumbered from 0.
# NOTE(review): "first" follows the order in which the paths were enumerated,
# which is not necessarily shortest-first — confirm this is intended.
_pair_columns = ['source_concept_id', 'target_concept_id']
paths_df_clean = (
    paths_df_clean
    .drop_duplicates(subset=_pair_columns, keep='first')
    .reset_index(drop=True)
)


In [None]:
paths_df_clean.head()  # preview after de-duplication

In [None]:
# drop source concepts that are ingredients -> only allow self reference!
# First step: collect every concept of class "Ingredient".
_is_ingredient = concept["concept_class_id"] == "Ingredient"
all_ingredients = concept[_is_ingredient]
all_ingredients.head()


In [None]:
# left join the concepts and clean up:
# the target id is renamed to concept_id so it can serve as the join key, and
# the list-valued full_path column is dropped.
temp = paths_df_clean.rename({'target_concept_id': 'concept_id'}, axis=1).drop('full_path', axis=1)
for c in temp.columns:
    # Convert a column only if every value is numeric. This replaces the
    # deprecated errors='ignore' option of pd.to_numeric with the explicit
    # equivalent: on failure the column is left untouched.
    try:
        temp[c] = pd.to_numeric(temp[c])
    except (ValueError, TypeError):
        pass

# Keep every path; ingredient columns stay empty where the target concept is
# not an ingredient.
paths_with_ingredient_info = pd.merge(temp,
                                      all_ingredients,
                                      on='concept_id', how='left')

paths_with_ingredient_info.head()

In [None]:
# get source concepts that are ingredients:
_ingredient_ids = set(all_ingredients.concept_id.values.tolist())
_source_ids = set(paths_with_ingredient_info.source_concept_id.values.tolist())
og_ingredients = list(_ingredient_ids & _source_ids)

# Split the paths by whether their source is an ingredient; for ingredient
# sources only rows pointing back at the same concept are kept.
_source_is_ingredient = paths_with_ingredient_info['source_concept_id'].isin(og_ingredients)
other_paths = paths_with_ingredient_info[~_source_is_ingredient]
_ingredient_rows = paths_with_ingredient_info[_source_is_ingredient]
ingredient_source_paths = _ingredient_rows[
    _ingredient_rows['source_concept_id'] == _ingredient_rows['concept_id']
]
ingredient_source_paths.head()

In [None]:
# Stack the self-referencing ingredient paths on top of all other paths and
# renumber the rows from 0.
all_paths_clean = pd.concat(
    [ingredient_source_paths, other_paths],
    axis=0,
    ignore_index=True,
)
all_paths_clean.head()

In [None]:
# convert to numeric what is possible:
for c in all_paths_clean.columns:
    # Explicit replacement for the deprecated errors='ignore' option of
    # pd.to_numeric: a column that cannot be converted is left untouched.
    try:
        all_paths_clean[c] = pd.to_numeric(all_paths_clean[c])
    except (ValueError, TypeError):
        pass

# persist the cleaned path table
all_paths_clean.to_feather(
    f'{out_path}/clean_paths_210322.feather')