
Add feature to download google drive datasets #138

Merged
37 commits merged on Aug 5, 2021
Changes from 2 commits
37 commits
237d4a9
Add feature to download google drive datasets
ajlail98 Mar 25, 2021
c6fd46d
Add gdown to requirements.txt
ajlail98 Mar 25, 2021
751bd7c
:art: pulling from upstream dev
ajlail98 Apr 29, 2021
b65d14e
:art: Formatted script according to template, renamed variables
ajlail98 May 25, 2021
d7afab6
:art: Changed permissions
ajlail98 May 25, 2021
0f35181
:art: Added unique filenames for each file size
ajlail98 May 25, 2021
fbb6c73
:art: Moved to external folder
ajlail98 May 25, 2021
72f0c99
Moved script to validation and renamed
ajlail98 Jun 4, 2021
bd92f5e
Rename function and add type hints
ajlail98 Jun 4, 2021
ad70968
Add file containing fileIDs to reference
ajlail98 Jun 4, 2021
b7df5f5
Add user input options for files/folders
ajlail98 Jun 9, 2021
0abe3a6
Reformat with black
ajlail98 Jun 9, 2021
df63e97
Change targets variable name
ajlail98 Jun 10, 2021
79484a5
Change "folder" to "dataset"
ajlail98 Jun 10, 2021
662d5bf
Update column names
ajlail98 Jun 10, 2021
7678155
Condense logic into one function
ajlail98 Jun 11, 2021
3ffd397
Change logic to input multiple files and multiple output dirs
ajlail98 Jun 11, 2021
46eafc2
Add logger warnings
ajlail98 Jun 15, 2021
d21f825
Add datasets.py info to setup.py
ajlail98 Jun 15, 2021
54d151d
Change internet_is_connected into an import
ajlail98 Jun 24, 2021
3dd9e63
Add internet connection checker and error message
ajlail98 Jun 24, 2021
2a45ab2
Directory structure to organize downloads
ajlail98 Jul 13, 2021
b7c2048
Change variable names and clean up extra bits
ajlail98 Jul 13, 2021
9a932d5
Add __init__.py to validation
ajlail98 Jul 13, 2021
98e356b
Add error for non-existent dir_path
ajlail98 Jul 13, 2021
0d1274b
Add detail to internet_is_connected failure
ajlail98 Jul 14, 2021
7af3c95
Added NotImplementedError
ajlail98 Jul 16, 2021
df317b0
Only read csv once
ajlail98 Jul 16, 2021
85c9387
Change strategy for filtering df
ajlail98 Jul 16, 2021
12afe4b
Using df.loc to retrieve file_id
ajlail98 Jul 16, 2021
e7da939
Argparse and var name refinements
ajlail98 Jul 16, 2021
dceb0f5
Add ability to ping custom IP
ajlail98 Jul 20, 2021
622d934
Reformatting
ajlail98 Jul 20, 2021
ac89c06
Hardcode fileID csv hosted on google drive
ajlail98 Jul 22, 2021
af931bb
Reformatting
ajlail98 Jul 22, 2021
33f75e1
Remove gdown_fileIDs.csv
ajlail98 Jul 22, 2021
7fdf590
Add verbose error message and dockerfile entrypoint
ajlail98 Jul 30, 2021
2 changes: 1 addition & 1 deletion Dockerfile
@@ -46,4 +46,4 @@ RUN echo "Checking autometa entrypoints" \
&& autometa-markers -h > /dev/null \
&& autometa-taxonomy -h > /dev/null \
&& autometa-binning -h > /dev/null \
&& autometa-unclustered-recruitment -h > /dev/null
&& autometa-unclustered-recruitment -h > /dev/null
17 changes: 14 additions & 3 deletions autometa/binning/recursive_dbscan.py
@@ -931,7 +931,10 @@ def main():
default="bacteria",
)
parser.add_argument(
"--verbose", action="store_true", default=False, help="log debug information",
"--verbose",
action="store_true",
default=False,
help="log debug information",
)
args = parser.parse_args()
kmers_df = kmers.embed(
@@ -940,11 +943,19 @@

cov_df = pd.read_csv(args.coverages, sep="\t", index_col="contig")
master_df = pd.merge(
kmers_df, cov_df[["coverage"]], how="left", left_index=True, right_index=True,
kmers_df,
cov_df[["coverage"]],
how="left",
left_index=True,
right_index=True,
)
gc_content_df = pd.read_csv(args.gc_content, sep="\t", index_col="contig")
master_df = pd.merge(
master_df, gc_content_df, how="left", left_index=True, right_index=True,
master_df,
gc_content_df,
how="left",
left_index=True,
right_index=True,
)

markers_df = load_markers(args.markers)
3 changes: 2 additions & 1 deletion autometa/binning/summary.py
@@ -303,7 +303,8 @@ def main():
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"workspace", help="Path to Autometa results workspace directory",
"workspace",
help="Path to Autometa results workspace directory",
)
parser.add_argument(
"--write",
4 changes: 3 additions & 1 deletion autometa/binning/unclustered_recruitment.py
@@ -400,7 +400,9 @@ def get_confidence_filtered_predictions(
raise NotImplementedError(classifier)

df = pd.DataFrame(
predictions, index=test_data.index, columns=train_data.target_names,
predictions,
index=test_data.index,
columns=train_data.target_names,
)
# Filter predictions by confidence threshold
confidence_threshold = num_classifications * confidence
5 changes: 4 additions & 1 deletion autometa/common/kmers.py
@@ -673,7 +673,10 @@ def main():
default=False,
)
parser.add_argument(
"--cpus", help=f"num. processors to use.", default=cpus, type=int,
"--cpus",
help=f"num. processors to use.",
default=cpus,
type=int,
)
parser.add_argument(
"--seed",
11 changes: 9 additions & 2 deletions autometa/common/markers.py
@@ -170,7 +170,11 @@ def get(

if not os.path.exists(out) or not os.path.getsize(out):
out = hmmer.filter_markers(
infpath=scans, outfpath=out, cutoffs=cutoffs, orfs=orfs, force=force,
infpath=scans,
outfpath=out,
cutoffs=cutoffs,
orfs=orfs,
force=force,
)
return load(fpath=out, format=format)

@@ -236,7 +240,10 @@ def main():
type=int,
)
parser.add_argument(
"--seed", help="Seed to set random state for hmmscan.", default=42, type=int,
"--seed",
help="Seed to set random state for hmmscan.",
default=42,
type=int,
)
args = parser.parse_args()

8 changes: 6 additions & 2 deletions autometa/common/metagenome.py
@@ -443,7 +443,8 @@ def main():
"assembly", help="Path to metagenome assembly (nucleotide fasta)."
)
parser.add_argument(
"out", help="Path to output length-filtered assembly fasta file.",
"out",
help="Path to output length-filtered assembly fasta file.",
)
parser.add_argument(
"--cutoff",
@@ -471,7 +472,10 @@
args = parser.parse_args()
dirpath = os.path.dirname(os.path.realpath(args.assembly))
raw_mg = Metagenome(
assembly=args.assembly, outdir=dirpath, prot_orfs_fpath="", nucl_orfs_fpath="",
assembly=args.assembly,
outdir=dirpath,
prot_orfs_fpath="",
nucl_orfs_fpath="",
)

filtered_mg = raw_mg.length_filter(
15 changes: 12 additions & 3 deletions autometa/config/databases.py
@@ -99,7 +99,11 @@ class Databases:
}

def __init__(
self, config=DEFAULT_CONFIG, dryrun=False, nproc=mp.cpu_count(), update=False,
self,
config=DEFAULT_CONFIG,
dryrun=False,
nproc=mp.cpu_count(),
update=False,
):
"""

@@ -719,7 +723,9 @@ def main():
"into default databases directory.",
)
parser.add_argument(
"--config", help="</path/to/input/database.config>", default=DEFAULT_FPATH,
"--config",
help="</path/to/input/database.config>",
default=DEFAULT_FPATH,
)
parser.add_argument(
"--dryrun",
@@ -769,7 +775,10 @@

config = get_config(args.config)
dbs = Databases(
config=config, dryrun=args.dryrun, nproc=args.nproc, update=args.update,
config=config,
dryrun=args.dryrun,
nproc=args.nproc,
update=args.update,
)

compare_checksums = False
4 changes: 3 additions & 1 deletion autometa/config/environ.py
@@ -237,7 +237,9 @@ def get_versions(program=None):
try:
return globals()[exe_name]()
except TypeError:
logger.warning(f"{exe_name} not found. This may impact a stage of the Autometa pipeline.")
logger.warning(
f"{exe_name} not found. This may impact a stage of the Autometa pipeline."
)
return "Not found"
versions = {}
executables = find_executables()
3 changes: 1 addition & 2 deletions autometa/config/project.py
@@ -130,8 +130,7 @@ def new_metagenome_num(self):
return mg_num

def save(self):
"""Save project config in project directory
"""
"""Save project config in project directory"""
put_config(self.config, self.config_fpath)

def new_metagenome_directory(self):
7 changes: 6 additions & 1 deletion autometa/taxonomy/majority_vote.py
@@ -298,7 +298,12 @@ def majority_vote(
filename, __ = os.path.splitext(os.path.basename(orfs))
outdir = os.path.dirname(os.path.realpath(out))
lca_out = os.path.join(outdir, ".".join([filename, "lca.tsv"]))
lca_fpath = lca.blast2lca(orfs=orfs, out=lca_out, blast=blast, force=force,)
lca_fpath = lca.blast2lca(
orfs=orfs,
out=lca_out,
blast=blast,
force=force,
)
# retrieve lca taxids for each contig
classifications = lca.parse(lca_fpath=lca_fpath, orfs_fpath=orfs)
# Vote for majority lca taxid from contig lca taxids
47 changes: 47 additions & 0 deletions autometa/validation/download_dataset.py
@@ -0,0 +1,47 @@
# in response to issue #110
# Intended: fetch data similarly to scikit-learn API
# pulling data from google drive folder with simulated or synthetic communities

# use gdown to download data from google drive to output directory specified by the user
# create a dictionary of the databases in the google drive
# allow the user to call them based on size (eg '78', '156'...)
# allow the user to specify <some/directory>
# find that corresponding file and download it to <some/directory>

# goal: autometa-download-dataset --community 78 --output <some/directory>

# prepare dependencies
import gdown
import argparse

# take in commands that user input
# including test file for now
parser = argparse.ArgumentParser(prog='autometa-download-dataset', description='Download a simulated community file from google drive to a specified directory')
parser.add_argument('--community',
help='specify a size of simulated community in MB',
choices=['78', '156', '312', '625', '1250', '2500', '5000', '10000', 'test'],
required=True)
parser.add_argument('--output',
help='specify the directory to download the file',
required=True)
args = parser.parse_args()

# provide list of database options as a dictionary with file_ids from google
simulated = {
'test': '1fy3M7RnS_HGSQVKidCy-rAwXuxldyOOv',
'78': '15CB8rmQaHTGy7gWtZedfBJkrwr51bb2y',
'156': '13bkwFBIUhdWVWlAmVCimDODWF-7tRxgI',
'312': '1qyAu-m6NCNuVlDFFC10waOD28j15yfV-',
'625': '1FgMXSD50ggu0UJbZd1PM_AvLt-E7gJix',
'1250': '1KoxwxBAYcz8Xz9H2v17N9CHOZ-WXWS5m',
'2500': '1wKZytjC4zjTuhHdNUyAT6wVbuDDIwk2m',
'5000': '1IX6vLfBptPxhL44dLa6jePs-GRw2XJ3S',
'10000': '1ON2vxEWC5FHyyPqlfZ0znMgnQ1fTirqG'
}
Collaborator @Sidduppal commented on Mar 25, 2021:

Looks like a great start. You can try the items() method (link) to loop through the dictionary and get the respective "ID" needed to download.
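A minimal sketch of the reviewer's suggestion — looping over the dictionary with items() to resolve the requested community size to its Google Drive file ID. The helper name and error message below are illustrative only and do not appear in the PR:

    # Illustrative sketch of the items() approach suggested above; the helper
    # name and KeyError message are assumptions, not code from this PR.
    def lookup_file_id(community: str, datasets: dict) -> str:
        for size, file_id in datasets.items():
            if size == community:
                return file_id
        raise KeyError(f"No simulated community registered for size: {community}")

    # Equivalent to the direct lookup the script uses: simulated[args.community]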


# construct file id into a url to put into gdown
file_id = simulated[args.community]
url = f'https://drive.google.com/uc?id={file_id}'

# download the specified file with gdown
gdown.download(url, args.output)
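Later commits in this PR ("Rename function and add type hints", "Condense logic into one function", "Add error for non-existent dir_path") move this script toward a single callable function. A rough sketch of that shape, with the function name, signature, and output filename assumed rather than taken from the merged diff:

    # Hypothetical sketch only: the merged code differs; the function name,
    # signature, and downloaded filename below are assumptions.
    import os
    import gdown

    def download_dataset(community: str, dir_path: str, file_ids: dict) -> str:
        """Download the simulated community of size `community` (in MB) into `dir_path`."""
        if community not in file_ids:
            raise KeyError(f"Unknown community size: {community}")
        if not os.path.isdir(dir_path):
            raise FileNotFoundError(f"Output directory does not exist: {dir_path}")
        url = f"https://drive.google.com/uc?id={file_ids[community]}"
        outpath = os.path.join(dir_path, f"{community}.tar.gz")  # filename assumed
        gdown.download(url, outpath)
        return outpath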
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -20,7 +20,7 @@
autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"]

# fmt: off
import parse_argparse
import parse_argparse

# -- Project information -----------------------------------------------------

1 change: 1 addition & 0 deletions requirements.txt
@@ -18,3 +18,4 @@ requests
umap-learn
hdbscan
attrs
gdown
1 change: 1 addition & 0 deletions tests/unit_tests/test_samtools.py
@@ -32,6 +32,7 @@ def test_sort_missing_file():
with pytest.raises(FileNotFoundError):
samtools.sort(sam="sam", bam="bam")


@pytest.mark.parametrize("cpus", [2.9, -2])
def test_sort_invalid_cpu_input(cpus):
with pytest.raises(TypeError):
3 changes: 2 additions & 1 deletion tests/unit_tests/test_summary.py
@@ -202,7 +202,8 @@ def return_metabin_taxonomies(*args, **kwargs):

@pytest.mark.skip
def test_get_metabin_taxonomies(
mock_rank_taxids, bin_df,
mock_rank_taxids,
bin_df,
):
mock_ncbi = return_mock_ncbi()
df = summary.get_metabin_taxonomies(bin_df=bin_df, ncbi=mock_ncbi)
3 changes: 2 additions & 1 deletion tests/unit_tests/test_unclustered_recruitment.py
@@ -113,7 +113,8 @@ def test_get_kmer_features(kmers, dimensions):


def test_get_features_no_taxa(
kmers, coverage,
kmers,
coverage,
):
df = unclustered_recruitment.get_features(kmers=kmers, coverage=coverage)
assert isinstance(df, pd.DataFrame)
36 changes: 29 additions & 7 deletions tests/unit_tests/test_vote.py
@@ -91,30 +91,43 @@ def test_add_ranks(ncbi, votes, tmp_path):
@pytest.mark.skip
def test_vote_assign(blastp, ncbi_dir, prot_orfs, tmp_path):
out = tmp_path / "votes.tsv"
votes = vote.assign(out=out, prot_orfs=prot_orfs, blast=blastp, ncbi_dir=ncbi_dir,)
votes = vote.assign(
out=out,
prot_orfs=prot_orfs,
blast=blastp,
ncbi_dir=ncbi_dir,
)
assert isinstance(votes, pd.DataFrame)
assert votes.index.name == "contig"
assert "taxid" in votes.columns


def test_get(ncbi, votes_fpath):
df = vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
df = vote.get(
filepath_or_dataframe=votes_fpath,
kingdom="bacteria",
ncbi=ncbi,
)
# canonical ranks should have been added to table if they were not already in place.
assert df.shape == (2, 8)


def test_get_none_recovered(ncbi, votes_fpath):
with pytest.raises(KeyError):
vote.get(
filepath_or_dataframe=votes_fpath, kingdom="archaea", ncbi=ncbi,
filepath_or_dataframe=votes_fpath,
kingdom="archaea",
ncbi=ncbi,
)


def test_get_empty_votes(ncbi_dir, tmp_path):
fpath = tmp_path / "votes.tsv"
with pytest.raises(FileNotFoundError):
vote.get(
filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi_dir,
filepath_or_dataframe=fpath,
kingdom="archaea",
ncbi=ncbi_dir,
)


@@ -127,13 +140,19 @@ def return_df(*args, **kwargs):
monkeypatch.setattr(vote, "add_ranks", return_df, raising=True)
with pytest.raises(TableFormatError):
vote.get(
filepath_or_dataframe=fpath, kingdom="archaea", ncbi=ncbi,
filepath_or_dataframe=fpath,
kingdom="archaea",
ncbi=ncbi,
)


@pytest.fixture(name="ranks_added_votes", scope="module")
def fixture_ranks_added_votes(votes_fpath, ncbi):
return vote.get(filepath_or_dataframe=votes_fpath, kingdom="bacteria", ncbi=ncbi,)
return vote.get(
filepath_or_dataframe=votes_fpath,
kingdom="bacteria",
ncbi=ncbi,
)


@pytest.mark.parametrize(
@@ -191,7 +210,10 @@ def test_write_ranks_no_taxonomy_columns(tmp_path, votes):
assembly = dirpath / "assembly.fna"
with pytest.raises(KeyError):
vote.write_ranks(
taxonomy=votes, assembly=assembly, outdir=dirpath, rank="superkingdom",
taxonomy=votes,
assembly=assembly,
outdir=dirpath,
rank="superkingdom",
)
