Add pubmed info to ingests, when present #200

Closed · wants to merge 4 commits
55 changes: 54 additions & 1 deletion kg_covid_19/transform_utils/drug_central/drug_central.py
@@ -50,7 +50,7 @@ def run(self, data_file: Optional[str] = None,
drug_protein_edge_label = "biolink:molecularly_interacts_with"
drug_protein_edge_relation = "RO:0002436" # molecularly interacts with
self.edge_header = ['subject', 'edge_label', 'object', 'relation',
- 'provided_by', 'comment']
+ 'provided_by', 'publication', 'comment']

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
@@ -118,11 +118,64 @@ def run(self, data_file: Optional[str] = None,
protein_id,
drug_protein_edge_relation,
self.source_name,
get_pub_info_from_dict(items_dict),
items_dict['ACT_COMMENT']])

return None


def get_pub_info_from_dict(items_dict,
pubmed_prefix="PMID",
uri_match='http://www.ncbi.nlm.nih.gov/pubmed/'
) -> str:
pubs = []
if 'ACT_SOURCE_URL' in items_dict and re.match(uri_match,
items_dict['ACT_SOURCE_URL']):
pubs.append(
items_dict['ACT_SOURCE_URL'].replace(uri_match, pubmed_prefix + ":"))
if 'MOA_SOURCE_URL' in items_dict and re.match(uri_match,
items_dict['MOA_SOURCE_URL']):
pubs.append(
items_dict['MOA_SOURCE_URL'].replace(uri_match, pubmed_prefix + ":"))
return "|".join(pubs)


def tsv_to_dict(input_file: str, col_for_key: str) -> dict:
this_dict: dict = defaultdict(list)
with open(input_file) as file:
reader = csv.DictReader(file, delimiter='\t')
for row in reader:
this_dict[row[col_for_key]] = row
return this_dict


def unzip_and_get_tclin_tchem(zip_file: str, output_dir: str) -> List[str]:
unzip_to_tempdir(zip_file, output_dir)
# get tclin filename
tclin_files = \
[f for f in os.listdir(output_dir) if re.match(r'tclin_.*\.tsv', f)]
if len(tclin_files) > 1:
raise RuntimeError("Found more than one tclin file:\n%s" %
"\n".join(tclin_files))
elif len(tclin_files) < 1:
raise RuntimeError("Couldn't find tclin file in zipfile %s" % zip_file)
else:
tclin_file: str = os.path.join(output_dir, tclin_files[0])

# get tchem filename
tchem_files = \
[f for f in os.listdir(output_dir) if re.match(r'tchem_.*\.tsv', f)]
if len(tchem_files) > 1:
raise RuntimeError("Found more than one tchem file:\n%s" %
"\n".join(tchem_files))
elif len(tchem_files) < 1:
raise RuntimeError("Couldn't find tchem file in zipfile %s" % zip_file)
else:
tchem_file: str = os.path.join(output_dir, tchem_files[0])

return [tclin_file, tchem_file]


def parse_drug_central_line(this_line: str, header_items: List) -> Dict:
"""Methods processes a line of text from Drug Central.

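For orientation, here is a minimal standalone sketch of the conversion that the new get_pub_info_from_dict helper performs, using example URLs taken from the tests further down in this PR. The function and constant names in the sketch (pub_info, PUBMED_URI) are illustrative stand-ins, not part of the PR itself.

import re

PUBMED_URI = 'http://www.ncbi.nlm.nih.gov/pubmed/'  # same prefix the PR matches on

def pub_info(items_dict: dict, pubmed_prefix: str = "PMID") -> str:
    """Collect PubMed CURIEs from ACT_SOURCE_URL / MOA_SOURCE_URL, pipe-delimited."""
    pubs = []
    for key in ('ACT_SOURCE_URL', 'MOA_SOURCE_URL'):
        if key in items_dict and re.match(PUBMED_URI, items_dict[key]):
            pubs.append(items_dict[key].replace(PUBMED_URI, pubmed_prefix + ":"))
    return "|".join(pubs)

# The ChEMBL URL does not match the PubMed prefix, so only the first URL is kept:
print(pub_info({
    'ACT_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/17275317',
    'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749',
}))  # -> 'PMID:17275317'

The loop over the two URL columns is just a compaction of the two if-blocks in the diff above; the behavior is the same.
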
23 changes: 20 additions & 3 deletions kg_covid_19/transform_utils/scibite_cord/scibite_cord.py
@@ -57,8 +57,10 @@ def run(self, data_file: Optional[str] = None) -> None:
else:
data_files.extend(data_files)

- self.node_header = ['id', 'name', 'category', 'description', 'provided_by']
- self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by']
+ self.node_header = ['id', 'name', 'category', 'description']
+ self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by',
+                     'publications']

node_handle = open(self.output_node_file, 'w')
edge_handle = open(self.output_edge_file, 'w')
node_handle.write("\t".join(self.node_header) + "\n")
@@ -191,7 +193,8 @@ def parse_annotation_doc(self, node_handle, edge_handle, doc: Dict, subset: str
f"biolink:related_to",
f"CORD:{paper_id}",
"SIO:000255",
- provided_by
+ provided_by,
+ f"CORD:{paper_id}"
]
)

@@ -282,6 +285,7 @@ def parse_cooccurrence_record(self, node_handle: Any, edge_handle: Any, record:
# simplified generation of edges between OntologyClass and the publication where
# OntologyClass -> correlated_with -> Publication
# with the edge having relation RO:0002610

if (curie, paper_curie) not in self.seen:
write_node_edge_item(
fh=edge_handle,
@@ -296,6 +300,19 @@ def parse_cooccurrence_record(self, node_handle: Any, edge_handle: Any, record:
)
self.seen.add((curie, paper_curie))

write_node_edge_item(
fh=edge_handle,
header=self.edge_header,
data=[
f"{curie}",
"biolink:correlated_with",
f"{paper_curie}",
f"RO:0002610", # 'correlated with'
f"{self.source_name} co-occurrences",
paper_curie
]
)

# This is an earlier style of modeling that involves an InformationContentEntity for every instance of
# co-occurrence between a Publication and a set of OntologyClass
#
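To make the new publications column concrete, here is a small sketch of what one co-occurrence edge row looks like once written out as TSV. The write_row helper, the example CURIEs, and the provided_by string are hypothetical stand-ins; the real row is produced by the project's write_node_edge_item, whose implementation may differ.

import io
from typing import List

def write_row(fh, header: List[str], data: List[str]) -> None:
    # Hypothetical stand-in for write_node_edge_item: one tab-separated row per edge.
    assert len(header) == len(data)
    fh.write("\t".join(data) + "\n")

edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by',
               'publications']
fh = io.StringIO()
write_row(fh, edge_header, [
    "CHEBI:15377",                      # illustrative ontology term
    "biolink:correlated_with",
    "CORD:abc123",                      # illustrative paper CURIE
    "RO:0002610",                       # 'correlated with'
    "SciBite CORD-19 co-occurrences",   # illustrative provided_by value
    "CORD:abc123",                      # the paper itself backs the edge
])
print(fh.getvalue(), end="")

The effect of the change is that the paper CURIE now appears both as the edge's object and in the publications column, so downstream consumers can read provenance directly instead of inferring it from the object.
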
32 changes: 29 additions & 3 deletions tests/test_drug_central.py
@@ -4,7 +4,7 @@
import pandas as pd
from kg_covid_19.transform_utils.drug_central import DrugCentralTransform
from kg_covid_19.transform_utils.drug_central.drug_central import \
- parse_drug_central_line
+ parse_drug_central_line, get_pub_info_from_dict
from kg_covid_19.utils.transform_utils import parse_header
from parameterized import parameterized

@@ -79,12 +79,38 @@ def test_nodes_are_not_repeated(self):
unique_nodes = list(set(nodes))
self.assertCountEqual(nodes, unique_nodes)

@parameterized.expand([
('', ''),
({'ACT_SOURCE_URL': '',
'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749'},
''
),
({'ACT_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749',
'MOA_SOURCE_URL': ''},
''
),
({'ACT_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749',
'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749'},
''
),
({'ACT_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/17275317',
'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749'},
'PMID:17275317'
),
({'ACT_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/17275317',
'MOA_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/3207986'},
'PMID:17275317|PMID:3207986'
),
])
def test_get_pub_info_from_dict(self, this_dict, expected_pub_info) -> None:
self.assertEqual(expected_pub_info, get_pub_info_from_dict(this_dict))

def test_edges_file(self):
self.drug_central.run(data_file='drug.target.interaction_SNIPPET.tsv.gz')
edge_file = os.path.join(self.dc_output_dir, "edges.tsv")
self.assertTrue(os.path.isfile(edge_file))
edge_df = pd.read_csv(edge_file, sep="\t", header=0)
- self.assertEqual((21, 6), edge_df.shape)
+ self.assertEqual((21, 7), edge_df.shape)
self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
- 'comment'],
+ 'publication', 'comment'],
list(edge_df.columns))
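
Beyond the unit tests, a quick way to eyeball the new column after running the DrugCentral transform is to load the generated edges.tsv with pandas; the output path below is an assumption and depends on how the transform is configured.

import pandas as pd

# Path is illustrative; point it at the transform's actual output directory.
edge_df = pd.read_csv("data/transformed/drug_central/edges.tsv", sep="\t")
print(edge_df.columns.tolist())                 # expect 'publication' between 'provided_by' and 'comment'
print(edge_df['publication'].dropna().head())   # e.g. 'PMID:17275317' or 'PMID:17275317|PMID:3207986'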