Add pubmed info to ingests, when present #200

Closed · wants to merge 4 commits
55 changes: 54 additions & 1 deletion kg_covid_19/transform_utils/drug_central/drug_central.py
@@ -50,7 +50,7 @@ def run(self, data_file: Optional[str] = None,
drug_protein_edge_label = "biolink:molecularly_interacts_with"
drug_protein_edge_relation = "RO:0002436" # molecularly interacts with
self.edge_header = ['subject', 'edge_label', 'object', 'relation',
- 'provided_by', 'comment']
+ 'provided_by', 'publication', 'comment']

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
@@ -118,11 +118,64 @@ def run(self, data_file: Optional[str] = None,
protein_id,
drug_protein_edge_relation,
self.source_name,
get_pub_info_from_dict(items_dict),
items_dict['ACT_COMMENT']])

return None


def get_pub_info_from_dict(items_dict,
pubmed_prefix="PMID",
uri_match='http://www.ncbi.nlm.nih.gov/pubmed/'
) -> str:
pubs = []
if 'ACT_SOURCE_URL' in items_dict and re.match(uri_match,
items_dict['ACT_SOURCE_URL']):
pubs.append(
items_dict['ACT_SOURCE_URL'].replace(uri_match, pubmed_prefix + ":"))
if 'MOA_SOURCE_URL' in items_dict and re.match(uri_match,
items_dict['MOA_SOURCE_URL']):
pubs.append(
items_dict['MOA_SOURCE_URL'].replace(uri_match, pubmed_prefix + ":"))
return "|".join(pubs)


def tsv_to_dict(input_file: str, col_for_key: str) -> dict:
this_dict: dict = defaultdict(list)
with open(input_file) as file:
reader = csv.DictReader(file, delimiter='\t')
for row in reader:
this_dict[row[col_for_key]] = row
return this_dict


def unzip_and_get_tclin_tchem(zip_file: str, output_dir: str) -> List[str]:
unzip_to_tempdir(zip_file, output_dir)
# get tclin filename
tclin_files = \
[f for f in os.listdir(output_dir) if re.match(r'tclin_.*\.tsv', f)]
if len(tclin_files) > 1:
raise RuntimeError("Found more than one tclin file:\n%s" %
"\n".join(tclin_files))
elif len(tclin_files) < 1:
raise RuntimeError("Couldn't find tclin file in zipfile %s" % zip_file)
else:
tclin_file: str = os.path.join(output_dir, tclin_files[0])

# get tchem filename
tchem_files = \
[f for f in os.listdir(output_dir) if re.match(r'tchem_.*\.tsv', f)]
if len(tchem_files) > 1:
raise RuntimeError("Found more than one tchem file:\n%s" %
"\n".join(tchem_files))
elif len(tchem_files) < 1:
raise RuntimeError("Couldn't find tchem file in zipfile %s" % zip_file)
else:
tchem_file: str = os.path.join(output_dir, tchem_files[0])

return [tclin_file, tchem_file]


def parse_drug_central_line(this_line: str, header_items: List) -> Dict:
"""Methods processes a line of text from Drug Central.

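For orientation, here is a minimal standalone sketch of the conversion that the new get_pub_info_from_dict helper performs, using example URLs taken from the tests further down in this PR. The function and constant names in the sketch (pub_info, PUBMED_URI) are illustrative stand-ins, not part of the PR itself.

import re

PUBMED_URI = 'http://www.ncbi.nlm.nih.gov/pubmed/'  # same prefix the PR matches on

def pub_info(items_dict: dict, pubmed_prefix: str = "PMID") -> str:
    """Collect PubMed CURIEs from ACT_SOURCE_URL / MOA_SOURCE_URL, pipe-delimited."""
    pubs = []
    for key in ('ACT_SOURCE_URL', 'MOA_SOURCE_URL'):
        if key in items_dict and re.match(PUBMED_URI, items_dict[key]):
            pubs.append(items_dict[key].replace(PUBMED_URI, pubmed_prefix + ":"))
    return "|".join(pubs)

# The ChEMBL URL does not match the PubMed prefix, so only the first URL is kept:
print(pub_info({
    'ACT_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/17275317',
    'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749',
}))  # -> 'PMID:17275317'

The loop over the two URL columns is just a compaction of the two if-blocks in the diff above; the behavior is the same.
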
23 changes: 20 additions & 3 deletions kg_covid_19/transform_utils/scibite_cord/scibite_cord.py
@@ -57,8 +57,10 @@ def run(self, data_file: Optional[str] = None) -> None:
else:
data_files.extend(data_files)

- self.node_header = ['id', 'name', 'category', 'description', 'provided_by']
- self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by']
+ self.node_header = ['id', 'name', 'category', 'description']
+ self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by',
+                     'publications']

node_handle = open(self.output_node_file, 'w')
edge_handle = open(self.output_edge_file, 'w')
node_handle.write("\t".join(self.node_header) + "\n")
@@ -191,7 +193,8 @@ def parse_annotation_doc(self, node_handle, edge_handle, doc: Dict, subset: str
f"biolink:related_to",
f"CORD:{paper_id}",
"SIO:000255",
- provided_by
+ provided_by,
+ f"CORD:{paper_id}"
]
)

@@ -282,6 +285,7 @@ def parse_cooccurrence_record(self, node_handle: Any, edge_handle: Any, record:
# simplified generation of edges between OntologyClass and the publication where
# OntologyClass -> correlated_with -> Publication
# with the edge having relation RO:0002610

if (curie, paper_curie) not in self.seen:
write_node_edge_item(
fh=edge_handle,
@@ -296,6 +300,19 @@ def parse_cooccurrence_record(self, node_handle: Any, edge_handle: Any, record:
)
self.seen.add((curie, paper_curie))

write_node_edge_item(
fh=edge_handle,
header=self.edge_header,
data=[
f"{curie}",
"biolink:correlated_with",
f"{paper_curie}",
f"RO:0002610", # 'correlated with'
f"{self.source_name} co-occurrences",
paper_curie
]
)

# This is an earlier style of modeling that involves an InformationContentEntity for every instance of
# co-occurrence between a Publication and a set of OntologyClass
#
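To make the new publications column concrete, here is a small sketch of what one co-occurrence edge row looks like once written out as TSV. The write_row helper, the example CURIEs, and the provided_by string are hypothetical stand-ins; the real row is produced by the project's write_node_edge_item, whose implementation may differ.

import io
from typing import List

def write_row(fh, header: List[str], data: List[str]) -> None:
    # Hypothetical stand-in for write_node_edge_item: one tab-separated row per edge.
    assert len(header) == len(data)
    fh.write("\t".join(data) + "\n")

edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by',
               'publications']
fh = io.StringIO()
write_row(fh, edge_header, [
    "CHEBI:15377",                      # illustrative ontology term
    "biolink:correlated_with",
    "CORD:abc123",                      # illustrative paper CURIE
    "RO:0002610",                       # 'correlated with'
    "SciBite CORD-19 co-occurrences",   # illustrative provided_by value
    "CORD:abc123",                      # the paper itself backs the edge
])
print(fh.getvalue(), end="")

The effect of the change is that the paper CURIE now appears both as the edge's object and in the publications column, so downstream consumers can read provenance directly instead of inferring it from the object.
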
32 changes: 29 additions & 3 deletions tests/test_drug_central.py
@@ -4,7 +4,7 @@
import pandas as pd
from kg_covid_19.transform_utils.drug_central import DrugCentralTransform
from kg_covid_19.transform_utils.drug_central.drug_central import \
- parse_drug_central_line
+ parse_drug_central_line, get_pub_info_from_dict
from kg_covid_19.utils.transform_utils import parse_header
from parameterized import parameterized

@@ -79,12 +79,38 @@ def test_nodes_are_not_repeated(self):
unique_nodes = list(set(nodes))
self.assertCountEqual(nodes, unique_nodes)

@parameterized.expand([
('', ''),
({'ACT_SOURCE_URL': '',
'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749'},
''
),
({'ACT_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749',
'MOA_SOURCE_URL': ''},
''
),
({'ACT_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749',
'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749'},
''
),
({'ACT_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/17275317',
'MOA_SOURCE_URL': 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749'},
'PMID:17275317'
),
({'ACT_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/17275317',
'MOA_SOURCE_URL': 'http://www.ncbi.nlm.nih.gov/pubmed/3207986'},
'PMID:17275317|PMID:3207986'
),
])
def test_get_pub_info_from_dict(self, this_dict, expected_pub_info) -> None:
self.assertEqual(expected_pub_info, get_pub_info_from_dict(this_dict))

def test_edges_file(self):
self.drug_central.run(data_file='drug.target.interaction_SNIPPET.tsv.gz')
edge_file = os.path.join(self.dc_output_dir, "edges.tsv")
self.assertTrue(os.path.isfile(edge_file))
edge_df = pd.read_csv(edge_file, sep="\t", header=0)
- self.assertEqual((21, 6), edge_df.shape)
+ self.assertEqual((21, 7), edge_df.shape)
self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
- 'comment'],
+ 'publication', 'comment'],
list(edge_df.columns))
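
Beyond the unit tests, a quick way to eyeball the new column after running the DrugCentral transform is to load the generated edges.tsv with pandas; the output path below is an assumption and depends on how the transform is configured.

import pandas as pd

# Path is illustrative; point it at the transform's actual output directory.
edge_df = pd.read_csv("data/transformed/drug_central/edges.tsv", sep="\t")
print(edge_df.columns.tolist())                 # expect 'publication' between 'provided_by' and 'comment'
print(edge_df['publication'].dropna().head())   # e.g. 'PMID:17275317' or 'PMID:17275317|PMID:3207986'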