Skip to content

Commit

Permalink
Merge pull request #334 from Knowledge-Graph-Hub/add_progress_info_to_scibite_run_jenkins
Browse files Browse the repository at this point in the history

Add progress indicators/message to scibite ingest
  • Loading branch information
deepakunni3 committed Sep 17, 2020
2 parents 8b1b69e + cd704b1 commit 7524772
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions kg_covid_19/transform_utils/scibite_cord/scibite_cord.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import json
import os
import re
- import uuid
+ from tqdm import tqdm # type: ignore
from typing import List, Dict, Any, Set, Optional
from zipfile import ZipFile
import pandas as pd # type: ignore
Expand Down Expand Up @@ -87,15 +87,19 @@ def parse_annotations(self, node_handle: Any, edge_handle: Any,
None.
"""
+ pbar = tqdm(total=2, desc="Unzipping files")
with ZipFile(data_file1, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)
+ pbar.update(1)
with ZipFile(data_file2, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)
+ pbar.update(1)
+ pbar.close()

subsets = ['pmc_json', 'pdf_json']
for subset in subsets:
subset_dir = os.path.join(self.input_base_dir, subset)
- for filename in os.listdir(subset_dir):
+ for filename in tqdm(os.listdir(subset_dir)):
file = os.path.join(subset_dir, filename)
doc = json.load(open(file))
self.parse_annotation_doc(node_handle, edge_handle, doc)
Expand Down Expand Up @@ -444,7 +448,7 @@ def load_gene_info(self, input_dir: str, output_dir: str, species_id: List = Non
file_path = os.path.join(self.input_base_dir, 'gene_info.gz')

with gzip.open(file_path, 'rt') as FH:
- for line in FH:
+ for line in tqdm(FH, desc="Loading gene info"):
records = line.split('\t')
if records[0] not in species_id:
continue
Expand All @@ -464,7 +468,7 @@ def load_country_code(self, input_dir: str, output_dir: str) -> None:
file_path = os.path.join(input_dir, 'wikidata_country_codes.tsv')
if os.path.exists(file_path):
with open(file_path, 'r') as FH:
- for line in FH:
+ for line in tqdm(FH, desc="Loading country codes"):
if line.startswith('item'):
continue
records = line.rstrip().split('\t')
Expand Down

0 comments on commit 7524772

Please sign in to comment.