In [1]:
# Import client library.
from igvf_client import IgvfApi

In [2]:
# Instantiate new instance of IGVF API.
api = IgvfApi()

# Basic usage

In [3]:
# Search for all types that match 'abc'.
results = api.search(query='abc')

In [4]:
# Print total number of results.
results.total

20

In [5]:
# Print result count by type.
results.facets

[SearchFacet(var_field='type', title='Data Type', terms=[{'key': 'Gene', 'doc_count': 16}, {'key': 'Software', 'doc_count': 2}, {'key': 'Source', 'doc_count': 2}])]

In [6]:
# Limit results to Software.
results = api.search(query='abc', type=['Software'])

In [7]:
# Print total number of Software results.
results.total

2

In [8]:
# Print results.
results

SearchResults(graph=[SearchResultItem(actual_instance=Software(release_timestamp='2023-03-14T00:12:36.014029+00:00', publications=None, lab='/labs/jesse-engreitz/', award='/awards/HG011972/', status='released', schema_version='6', uuid='f37503db-d59c-455d-99f0-7065c9afadfb', notes=None, aliases=None, creation_timestamp='2023-03-13T23:29:35.927480+00:00', submitted_by='/users/05f44ea6-5f00-440c-99e3-6b54f5792028/', submitter_comment=None, description='Target gene prediction (element)', name='abc', title='ABC', source_url='https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction', used_by=None, id='/software/abc/', type=['Software', 'Item'], summary='ABC', versions=[])), SearchResultItem(actual_instance=Software(release_timestamp='2023-03-14T00:14:14.620564+00:00', publications=None, lab='/labs/jesse-engreitz/', award='/awards/HG011972/', status='released', schema_version='6', uuid='48a9ccf4-9f3f-4c96-98c8-b390713163ae', notes=None, aliases=None, creation_timestamp='2023-03-13T23:2

In [9]:
# Print query URL.
results.id

'/search?query=abc&type=Software&frame=object'

In [10]:
# Get first item in results list (@graph).
item = results.graph[0]

In [11]:
# Print SearchResultItem.
item

SearchResultItem(actual_instance=Software(release_timestamp='2023-03-14T00:12:36.014029+00:00', publications=None, lab='/labs/jesse-engreitz/', award='/awards/HG011972/', status='released', schema_version='6', uuid='f37503db-d59c-455d-99f0-7065c9afadfb', notes=None, aliases=None, creation_timestamp='2023-03-13T23:29:35.927480+00:00', submitted_by='/users/05f44ea6-5f00-440c-99e3-6b54f5792028/', submitter_comment=None, description='Target gene prediction (element)', name='abc', title='ABC', source_url='https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction', used_by=None, id='/software/abc/', type=['Software', 'Item'], summary='ABC', versions=[]))

In [12]:
# Unwrap Software item from SearchResultItem.
software = item.actual_instance

In [13]:
# Print Software item.
software

Software(release_timestamp='2023-03-14T00:12:36.014029+00:00', publications=None, lab='/labs/jesse-engreitz/', award='/awards/HG011972/', status='released', schema_version='6', uuid='f37503db-d59c-455d-99f0-7065c9afadfb', notes=None, aliases=None, creation_timestamp='2023-03-13T23:29:35.927480+00:00', submitted_by='/users/05f44ea6-5f00-440c-99e3-6b54f5792028/', submitter_comment=None, description='Target gene prediction (element)', name='abc', title='ABC', source_url='https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction', used_by=None, id='/software/abc/', type=['Software', 'Item'], summary='ABC', versions=[])

In [14]:
# Print @id field. 
software.id

'/software/abc/'

In [15]:
# Print status field.
software.status

'released'

In [16]:
# Print lab @id associated with the software.
software.lab

'/labs/jesse-engreitz/'

In [17]:
# Get Lab item with @id. 
item = api.get_by_id(software.lab)

In [18]:
# Print Item.
item

Item(actual_instance=Lab(status='current', url='https://www.engreitzlab.org/', schema_version='3', uuid='083882e6-c0fb-4cee-ae94-c1138a2195e7', notes=None, aliases=None, creation_timestamp='2023-01-27T01:25:29.847716+00:00', submitted_by='/users/1e75d989-a438-4d77-a451-8a297fd3636e/', submitter_comment=None, description=None, name='jesse-engreitz', pi='/users/6c75dbfa-635e-4a7a-8ec8-9e51e9c380a4/', awards=['/awards/HG011972/'], institute_label='Stanford', id='/labs/jesse-engreitz/', type=['Lab', 'Item'], summary='Jesse Engreitz, Stanford', title='Jesse Engreitz, Stanford'))

In [19]:
# Unwrap underlying Lab item from Item.
lab = item.actual_instance

In [20]:
# Print Lab item.
lab

Lab(status='current', url='https://www.engreitzlab.org/', schema_version='3', uuid='083882e6-c0fb-4cee-ae94-c1138a2195e7', notes=None, aliases=None, creation_timestamp='2023-01-27T01:25:29.847716+00:00', submitted_by='/users/1e75d989-a438-4d77-a451-8a297fd3636e/', submitter_comment=None, description=None, name='jesse-engreitz', pi='/users/6c75dbfa-635e-4a7a-8ec8-9e51e9c380a4/', awards=['/awards/HG011972/'], institute_label='Stanford', id='/labs/jesse-engreitz/', type=['Lab', 'Item'], summary='Jesse Engreitz, Stanford', title='Jesse Engreitz, Stanford')

In [21]:
# Print fields.
lab.id

'/labs/jesse-engreitz/'

In [22]:
lab.pi

'/users/6c75dbfa-635e-4a7a-8ec8-9e51e9c380a4/'

In [23]:
lab.title

'Jesse Engreitz, Stanford'

In [24]:
lab.awards

['/awards/HG011972/']

In [25]:
# Get title of Award.
api.awards(id=lab.awards).graph[0].title

'Stanford Center for Connecting DNA Variants to Function and Phenotype'

In [26]:
# Find all the Labs that share Award.
labs_that_share_award = api.search(field_filters={'awards.@id': lab.awards}, type=['Lab'])

In [27]:
labs_that_share_award.total

7

In [28]:
# Collect the title of each Lab in the results.
# The loop variable is deliberately not called `lab`, so it cannot be confused
# with the `lab` item fetched earlier in the notebook.
lab_titles = [shared_lab.title for shared_lab in labs_that_share_award.graph]
lab_titles

['Anshul Kundaje, Stanford',
 'Michael Bassik, Stanford',
 'Lars Steinmetz, Stanford',
 'Thomas Quertermous, Stanford',
 'Marlene Rabinovitch, Stanford',
 'Will Greenleaf, Stanford',
 'Jesse Engreitz, Stanford']

In [29]:
# Collect the @id of each Lab in the results.
# The loop variable is deliberately not called `lab`, so it cannot be confused
# with the `lab` item fetched earlier in the notebook.
lab_ids = [shared_lab.id for shared_lab in labs_that_share_award.graph]
lab_ids

['/labs/anshul-kundaje/',
 '/labs/michael-bassik/',
 '/labs/lars-steinmetz/',
 '/labs/thomas-quertermous/',
 '/labs/marlene-rabinovitch/',
 '/labs/will-greenleaf/',
 '/labs/jesse-engreitz/']

In [30]:
# Search User collection for all Users that belong to Labs.
users_in_labs = api.users(lab=lab_ids)

In [31]:
users_in_labs.total

23

In [32]:
# Collection endpoints return the collection item directly (i.e. User). Results are not wrapped in SearchResultItem or Item.
users_in_labs.graph[0]

User(status='current', schema_version=None, uuid='f9e7cd82-a407-4a20-89ee-f4dc668b0cbe', notes=None, aliases=None, creation_timestamp=None, submitted_by=None, submitter_comment=None, description=None, email=None, first_name=None, last_name=None, lab='/labs/jesse-engreitz/', submits_for=['/labs/jesse-engreitz/', '/labs/thomas-quertermous/', '/labs/anshul-kundaje/', '/labs/michael-bassik/', '/labs/will-greenleaf/', '/labs/marlene-rabinovitch/', '/labs/lars-steinmetz/'], groups=None, viewing_groups=['IGVF'], job_title=None, id='/users/f9e7cd82-a407-4a20-89ee-f4dc668b0cbe/', type=['User', 'Item'], summary=None, title='Michael Montgomery')

In [33]:
# Collect the @id of every User returned by the collection query.
# NOTE(review): `graph` holds only the items actually returned; per the
# "Limit number of results" section, most endpoints cap results at 25 by
# default. All 23 matches fit here, but a larger result set would presumably
# need an explicit `limit` on the query above — confirm.
user_ids = [user.id for user in users_in_labs.graph]
user_ids

['/users/f9e7cd82-a407-4a20-89ee-f4dc668b0cbe/',
 '/users/e5d430d2-0b7c-41cb-91be-ce0ddd787af9/',
 '/users/ddb425b6-3b56-4b7b-9887-15aadd20ed99/',
 '/users/c7d0d753-635a-4bd1-a8b3-06f6fdcf826e/',
 '/users/b266291e-129f-4e0e-ba75-fe0dfe6eec4e/',
 '/users/a138ff68-2623-439d-ada7-94f409cd1469/',
 '/users/7c7ca375-a30f-4237-a212-100be64fbaed/',
 '/users/7c48e124-578f-4cb9-97b6-eb80fb1588c3/',
 '/users/6c75dbfa-635e-4a7a-8ec8-9e51e9c380a4/',
 '/users/68b90f31-de28-41ed-acd6-0e72fae2073c/',
 '/users/61a18fdf-6b2a-4353-96dd-d83d1b28e4b2/',
 '/users/5e5d0a09-9206-4e07-abcb-a5079e84cf6e/',
 '/users/4bf06b84-583d-47ff-a1c2-c0b99ced51c9/',
 '/users/4b2083b4-24c1-41d8-9daa-e79299f00d6e/',
 '/users/3e25d6c1-85e5-4871-8242-4997a3da72bb/',
 '/users/3bd6f74c-8cf3-4c35-a94d-0c7c11654815/',
 '/users/35da1760-050c-433c-93ba-97082a53cd72/',
 '/users/3506e8e6-b017-4724-99bd-e89d47e6f355/',
 '/users/2fcea56e-9fd5-4dde-8e46-21996bf82f70/',
 '/users/12693ee2-fa46-421f-8c44-831d9e13ec10/',
 '/users/100f06c6-83

In [34]:
# This is equivalent to searching for User items with lab field.
users_in_lab = api.search(type=['User'], field_filters={'lab': lab_ids})

In [35]:
users_in_lab.total

23

In [36]:
# But results from the search endpoint are wrapped in SearchResultItem, since results could be any item type.
users_in_lab.graph[0]

SearchResultItem(actual_instance=User(status='current', schema_version=None, uuid='f9e7cd82-a407-4a20-89ee-f4dc668b0cbe', notes=None, aliases=None, creation_timestamp=None, submitted_by=None, submitter_comment=None, description=None, email=None, first_name=None, last_name=None, lab='/labs/jesse-engreitz/', submits_for=['/labs/jesse-engreitz/', '/labs/thomas-quertermous/', '/labs/anshul-kundaje/', '/labs/michael-bassik/', '/labs/will-greenleaf/', '/labs/marlene-rabinovitch/', '/labs/lars-steinmetz/'], groups=None, viewing_groups=['IGVF'], job_title=None, id='/users/f9e7cd82-a407-4a20-89ee-f4dc668b0cbe/', type=['User', 'Item'], summary=None, title='Michael Montgomery'))

In [37]:
# Note that fields are accessible without unwrapping the underlying instance first.
users_in_lab.graph[0].submits_for

['/labs/jesse-engreitz/',
 '/labs/thomas-quertermous/',
 '/labs/anshul-kundaje/',
 '/labs/michael-bassik/',
 '/labs/will-greenleaf/',
 '/labs/marlene-rabinovitch/',
 '/labs/lars-steinmetz/']

In [38]:
# But unwrapping with actual_instance allows for tab completion of fields.
users_in_lab.graph[0].actual_instance.submits_for

['/labs/jesse-engreitz/',
 '/labs/thomas-quertermous/',
 '/labs/anshul-kundaje/',
 '/labs/michael-bassik/',
 '/labs/will-greenleaf/',
 '/labs/marlene-rabinovitch/',
 '/labs/lars-steinmetz/']

In [39]:
# Search for all SequenceFiles that were submitted by one of the users.
sequence_files = api.search(type=['SequenceFile'], field_filters={'submitted_by.@id': user_ids})

In [40]:
sequence_files.total

8865

In [41]:
# Could also search for SequenceFiles that were submitted as part of the Award.
sequence_files = api.search(type=['SequenceFile'], field_filters={'award.@id': lab.awards})

In [42]:
sequence_files.total

8865

In [43]:
sequence_files.graph[0].actual_instance

SequenceFile(controlled_access=False, anvil_url=None, release_timestamp='2023-12-21T00:06:35.899510+00:00', documents=None, lab='/labs/jesse-engreitz/', award='/awards/HG011972/', accession='IGVFFI1165AJSO', alternate_accessions=None, collections=None, status='released', revoke_detail=None, schema_version='14', uuid='fffcd64e-af02-4675-8953-7352459ee06a', notes=None, aliases=['jesse-engreitz:220829_8merInsertion-Jurkat_Stimulation-Amp_PPIF_promoter-BioRep2-FFrep1-BinD-PCRrep1_I2'], creation_timestamp='2023-12-05T19:58:18.462157+00:00', submitted_by='/users/f9e7cd82-a407-4a20-89ee-f4dc668b0cbe/', submitter_comment=None, description=None, analysis_step_version=None, content_md5sum='fce2b00b7c260827c76467aada46e759', content_type='reads', dbxrefs=None, derived_from=None, file_format='fastq', file_format_specifications=None, file_set='/measurement-sets/IGVFDS2192NCTH/', file_size=3060311, md5sum='d8c1e1b377534e0338d15286e3d22eea', submitted_file_name='220829_pPPIF_JurkatVFF-Jurkat-PE2A-BFP

In [44]:
sequence_files.graph[0].actual_instance.file_format

'fastq'

# Schema endpoints

In [45]:
# Get JSONSchema definition for SequenceFile.
schema = api.schema_for_item_type('SequenceFile')

In [46]:
schema.keys()

dict_keys(['title', '$id', '$schema', 'description', 'required', 'identifyingProperties', 'additionalProperties', 'mixinProperties', 'dependentSchemas', 'type', 'properties', 'fuzzy_searchable_fields', 'exact_searchable_fields', 'changelog', '@type'])

In [47]:
# Print fields.
schema['properties'].keys()

dict_keys(['controlled_access', 'anvil_url', 'release_timestamp', 'documents', 'lab', 'award', 'accession', 'alternate_accessions', 'collections', 'status', 'revoke_detail', 'schema_version', 'uuid', 'notes', 'aliases', 'creation_timestamp', 'submitted_by', 'submitter_comment', 'description', 'analysis_step_version', 'content_md5sum', 'content_type', 'dbxrefs', 'derived_from', 'file_format', 'file_format_specifications', 'file_set', 'file_size', 'md5sum', 'submitted_file_name', 'upload_status', 'validation_error_detail', 'flowcell_id', 'lane', 'read_count', 'minimum_read_length', 'maximum_read_length', 'mean_read_length', 'sequencing_platform', 'sequencing_kit', 'sequencing_run', 'illumina_read_type', 'index', 'base_modifications', '@id', '@type', 'summary', 'integrated_in', 'input_file_for', 'gene_list_for', 'loci_list_for', 'href', 's3_uri', 'upload_credentials', 'seqspecs'])

In [48]:
# Print schema definition for content_md5sum.
schema['properties']['content_md5sum']

{'title': 'Content MD5sum',
 'description': 'The MD5sum of the uncompressed file.',
 'type': 'string',
 'permission': 'admin_only',
 'format': 'hex',
 'maxLength': 32,
 'pattern': '[a-f\\d]{32}|[A-F\\d]{32}',
 'readonly': True}

In [49]:
# Get JSONSchema for all item types.
schemas = api.schemas()

In [50]:
schemas.keys()

dict_keys(['AccessKey', 'AnalysisStep', 'AnalysisStepVersion', 'Award', 'Biomarker', 'Document', 'HumanDonor', 'RodentDonor', 'AlignmentFile', 'ConfigurationFile', 'GenomeBrowserAnnotationFile', 'ImageFile', 'MatrixFile', 'ModelFile', 'ReferenceFile', 'SequenceFile', 'SignalFile', 'TabularFile', 'AnalysisSet', 'AuxiliarySet', 'ConstructLibrarySet', 'CuratedSet', 'MeasurementSet', 'ModelSet', 'PredictionSet', 'Gene', 'Image', 'InstitutionalCertificate', 'Lab', 'CrisprModification', 'DegronModification', 'AssayTerm', 'PhenotypeTerm', 'PlatformTerm', 'SampleTerm', 'OpenReadingFrame', 'Page', 'PhenotypicFeature', 'Publication', 'InVitroSystem', 'MultiplexedSample', 'PrimaryCell', 'TechnicalSample', 'Tissue', 'WholeOrganism', 'Software', 'SoftwareVersion', 'Source', 'Treatment', 'User', 'Workflow', '_subtypes', '@type', '_hierarchy'])

In [51]:
# Keep only item-type names by dropping the metadata keys
# ('_subtypes', '@type', '_hierarchy'), which contain '_' or '@'.
# Reuse the `schemas` mapping fetched above rather than calling
# api.schemas() again, which would repeat the same network request.
item_types = [name for name in schemas if '_' not in name and '@' not in name]
item_types

['AccessKey',
 'AnalysisStep',
 'AnalysisStepVersion',
 'Award',
 'Biomarker',
 'Document',
 'HumanDonor',
 'RodentDonor',
 'AlignmentFile',
 'ConfigurationFile',
 'GenomeBrowserAnnotationFile',
 'ImageFile',
 'MatrixFile',
 'ModelFile',
 'ReferenceFile',
 'SequenceFile',
 'SignalFile',
 'TabularFile',
 'AnalysisSet',
 'AuxiliarySet',
 'ConstructLibrarySet',
 'CuratedSet',
 'MeasurementSet',
 'ModelSet',
 'PredictionSet',
 'Gene',
 'Image',
 'InstitutionalCertificate',
 'Lab',
 'CrisprModification',
 'DegronModification',
 'AssayTerm',
 'PhenotypeTerm',
 'PlatformTerm',
 'SampleTerm',
 'OpenReadingFrame',
 'Page',
 'PhenotypicFeature',
 'Publication',
 'InVitroSystem',
 'MultiplexedSample',
 'PrimaryCell',
 'TechnicalSample',
 'Tissue',
 'WholeOrganism',
 'Software',
 'SoftwareVersion',
 'Source',
 'Treatment',
 'User',
 'Workflow']

In [52]:
# Print total number of types.
len(item_types)

51

In [53]:
# Count the items of each type with the search endpoint.
# limit=0 asks for the total only, so no actual results are returned.
for item_type in item_types:
    print(f'{item_type} {api.search(type=[item_type], limit=0).total}')

AccessKey 0
AnalysisStep 0
AnalysisStepVersion 0
Award 27
Biomarker 0
Document 12
HumanDonor 16
RodentDonor 11
AlignmentFile 0
ConfigurationFile 767
GenomeBrowserAnnotationFile 0
ImageFile 0
MatrixFile 2
ModelFile 0
ReferenceFile 709
SequenceFile 9369
SignalFile 7
TabularFile 45
AnalysisSet 13
AuxiliarySet 292
ConstructLibrarySet 10
CuratedSet 60
MeasurementSet 615
ModelSet 0
PredictionSet 0
Gene 10272
Image 44
InstitutionalCertificate 0
Lab 102
CrisprModification 6
DegronModification 0
AssayTerm 22
PhenotypeTerm 275
PlatformTerm 24
SampleTerm 81
OpenReadingFrame 0
Page 15
PhenotypicFeature 1
Publication 73
InVitroSystem 753
MultiplexedSample 8
PrimaryCell 0
TechnicalSample 0
Tissue 8
WholeOrganism 0
Software 49
SoftwareVersion 1
Source 109
Treatment 4
User 228
Workflow 0


# Report endpoint

In [54]:
# Report endpoint is equivalent to search endpoint, but returns full list of results formatted as TSV with default fields as columns.
api.search(type=['HumanDonor']).total

16

In [55]:
api.report(type=['HumanDonor'])

'2024-08-29 00:28:05.631971\thttps://api.data.igvf.org/multireport/?type=HumanDonor&limit=all\r\nID\tUUID\tAccession\tAlternate Accessions\tAliases\tTaxa\tSex\tAward\tEthnicities\tHuman Donor Identifiers\tLab\tStatus\tSubmitted By\tCollections\tPhenotypic Features\tVirtual\r\n/human-donors/IGVFDO1756PPKO/\t7cccb0b9-99e6-425d-b684-152adc4b95e5\tIGVFDO1756PPKO\t\tigvf:donor_of_WTC11\tHomo sapiens\tmale\t/awards/HG012012/\tJapanese\tGM25256 cell line donor,WTC11 cell line donor\t/labs/j-michael-cherry/\treleased\t/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/\t\t\tFalse\r\n/human-donors/IGVFDO3459DFTW/\tc364e744-7b69-45e3-a6f1-55f423548632\tIGVFDO3459DFTW\t\tigvf:donor_of_teloHAEC\tHomo sapiens\tfemale\t/awards/HG012012/\t\tTeloHAEC cell line donor\t/labs/j-michael-cherry/\treleased\t/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/\t\t\tFalse\r\n/human-donors/IGVFDO0965QSBA/\tde29ea95-6778-46c4-92ce-15537a94896a\tIGVFDO0965QSBA\t\tigvf:donor_of_HUDEP-2\tHomo sapiens\tunspecified\t/awards/HG0120

In [56]:
import pandas as pd
from io import StringIO

In [57]:
# Can read directly into pandas dataframe.
# Specify tab separation.
# Skip first row, which holds the report timestamp and URL rather than column names.
df = pd.read_csv(StringIO(api.report(type=['HumanDonor'])), sep='\t', skiprows=1)
df

Unnamed: 0,ID,UUID,Accession,Alternate Accessions,Aliases,Taxa,Sex,Award,Ethnicities,Human Donor Identifiers,Lab,Status,Submitted By,Collections,Phenotypic Features,Virtual
0,/human-donors/IGVFDO1756PPKO/,7cccb0b9-99e6-425d-b684-152adc4b95e5,IGVFDO1756PPKO,,igvf:donor_of_WTC11,Homo sapiens,male,/awards/HG012012/,Japanese,"GM25256 cell line donor,WTC11 cell line donor",/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
1,/human-donors/IGVFDO3459DFTW/,c364e744-7b69-45e3-a6f1-55f423548632,IGVFDO3459DFTW,,igvf:donor_of_teloHAEC,Homo sapiens,female,/awards/HG012012/,,TeloHAEC cell line donor,/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
2,/human-donors/IGVFDO0965QSBA/,de29ea95-6778-46c4-92ce-15537a94896a,IGVFDO0965QSBA,,igvf:donor_of_HUDEP-2,Homo sapiens,unspecified,/awards/HG012012/,Japanese,HUDEP-2 cell line donor,/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
3,/human-donors/IGVFDO6638HIAD/,1a69b34d-f8f8-4364-9676-aeb935461d5d,IGVFDO6638HIAD,,igvf:donor_of_H1,Homo sapiens,male,/awards/HG012012/,,H1 cell line donor,/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
4,/human-donors/IGVFDO1080XFGV/,e3c78ee7-8f9b-42e5-aee0-dcbd80946c60,IGVFDO1080XFGV,,igvf:donor_of_Jurkat,Homo sapiens,male,/awards/HG012012/,,"FHCRC-11 cell line donor,Jurkat cell line donor",/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
5,/human-donors/IGVFDO8315PGTI/,fad338b1-c1c1-44f9-a925-0d4e540e2cd8,IGVFDO8315PGTI,,igvf:donor_HUES8,Homo sapiens,male,/awards/HG012051/,,"HUES8 cell line donor,HVRDe008-A cell line donor",/labs/j-michael-cherry/,released,/users/1e75d989-a438-4d77-a451-8a297fd3636e/,,,False
6,/human-donors/IGVFDO7411YYYE/,c8a71ffe-0207-4359-af58-8c058af26061,IGVFDO7411YYYE,,igvf:donor_of_h7,Homo sapiens,female,/awards/HG012012/,,"H7.s14 cell line donor,H7 cell line donor",/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
7,/human-donors/IGVFDO2718IDDQ/,9409e5ba-79dd-4a4f-9967-3bdbdaacf60c,IGVFDO2718IDDQ,,igvf:donor_of_H9,Homo sapiens,female,/awards/HG012012/,,H9 cell line donor,/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
8,/human-donors/IGVFDO7359LVPI/,cb89fae6-0fef-40cf-a17b-4a371a548786,IGVFDO7359LVPI,,igvf:donor_of_THP1,Homo sapiens,male,/awards/HG012012/,,THP1 cell line donor,/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False
9,/human-donors/IGVFDO9571CSNU/,74fae408-6fc9-4169-b026-465f0694e094,IGVFDO9571CSNU,,igvf:donor_of_U2OS,Homo sapiens,female,/awards/HG012012/,,"U2OS cell line donor,2T cell line donor",/labs/j-michael-cherry/,released,/users/43f2f757-5cbf-490a-9787-a1ee85a4cdcd/,,,False


In [58]:
df.shape

(16, 16)

In [59]:
# The report can be restricted to specific fields, either to reduce data
# transfer or to request fields that are not returned by default.
# For the correct field names, refer to the fields in the model
# (e.g. https://github.com/IGVF-DACC/igvf-python-client/blob/main/docs/HumanDonor.md)
# or to the properties in the schema.
df = pd.read_csv(
    StringIO(
        api.report(
            type=['HumanDonor'],
            include_fields=['@id', 'accession', 'award', 'human_donor_identifiers'],
        )
    ),
    sep='\t',
    skiprows=1,
)
df

Unnamed: 0,ID,Accession,Award,Human Donor Identifiers
0,/human-donors/IGVFDO1756PPKO/,IGVFDO1756PPKO,/awards/HG012012/,"GM25256 cell line donor,WTC11 cell line donor"
1,/human-donors/IGVFDO3459DFTW/,IGVFDO3459DFTW,/awards/HG012012/,TeloHAEC cell line donor
2,/human-donors/IGVFDO0965QSBA/,IGVFDO0965QSBA,/awards/HG012012/,HUDEP-2 cell line donor
3,/human-donors/IGVFDO6638HIAD/,IGVFDO6638HIAD,/awards/HG012012/,H1 cell line donor
4,/human-donors/IGVFDO1080XFGV/,IGVFDO1080XFGV,/awards/HG012012/,"FHCRC-11 cell line donor,Jurkat cell line donor"
5,/human-donors/IGVFDO8315PGTI/,IGVFDO8315PGTI,/awards/HG012051/,"HUES8 cell line donor,HVRDe008-A cell line donor"
6,/human-donors/IGVFDO7411YYYE/,IGVFDO7411YYYE,/awards/HG012012/,"H7.s14 cell line donor,H7 cell line donor"
7,/human-donors/IGVFDO2718IDDQ/,IGVFDO2718IDDQ,/awards/HG012012/,H9 cell line donor
8,/human-donors/IGVFDO7359LVPI/,IGVFDO7359LVPI,/awards/HG012012/,THP1 cell line donor
9,/human-donors/IGVFDO9571CSNU/,IGVFDO9571CSNU,/awards/HG012012/,"U2OS cell line donor,2T cell line donor"


# Download file

The download endpoint returns the raw bytes of a file, given a file UUID or @id. Most files are big and should probably be saved directly to disk. In the case of a gzipped TSV, we can read it directly into a pandas dataframe.

In [60]:
# Find TSV file.
tsv_files = api.search(type=['File'], field_filters={'file_format': 'tsv'})

In [61]:
tsv_files.total

49

In [62]:
tsv_file = tsv_files.graph[0].actual_instance

In [63]:
tsv_file

TabularFile(cell_type_annotation=None, controlled_access=False, anvil_url=None, assembly='GRCh38', release_timestamp='2024-07-10T21:20:03.378713+00:00', file_format_type=None, transcriptome_annotation=None, documents=None, lab='/labs/jesse-engreitz/', award='/awards/HG011972/', accession='IGVFFI8092FZKL', alternate_accessions=None, collections=None, status='released', revoke_detail=None, schema_version='11', uuid='fdbdc159-e5b9-40a8-b788-3f72c9886b03', notes=None, aliases=['jesse-engreitz:locus_file_CRUDO_MYC'], creation_timestamp='2024-05-31T21:02:44.602697+00:00', submitted_by='/users/100f06c6-83a7-4b4d-9dfd-053a1829773b/', submitter_comment=None, description=None, analysis_step_version=None, content_md5sum='4aa6e40c55b54a3cc40d1cb1ef9093a3', content_type='elements reference', dbxrefs=None, derived_from=None, file_format='tsv', file_format_specifications=None, file_set='/curated-sets/IGVFDS2476WFVW/', file_size=4732, md5sum='ea8051cc2571447ca8f413aa99a4764a', submitted_file_name='/oa

In [64]:
tsv_file.id

'/tabular-files/IGVFFI8092FZKL/'

In [65]:
tsv_file.summary

'GRCh38 elements reference'

In [66]:
# File size in bytes.
tsv_file.file_size

4732

In [67]:
from io import BytesIO

In [68]:
# Read raw gzipped bytes directly into dataframe.
# BytesIO wraps the downloaded bytes in a file-like object that pandas can read.
# header=None: the first line is read as data (presumably the file has no header row — confirm).
pd.read_csv(BytesIO(api.download(tsv_file.id)), compression='gzip', sep='\t', header=None)

Unnamed: 0,0,1,2
0,chr8,124741787,124742287
1,chr8,124751476,124751976
2,chr8,124784449,124784949
3,chr8,124799434,124799934
4,chr8,124810348,124810848
...,...,...,...
572,chr8,130681801,130682301
573,chr8,130692145,130692645
574,chr8,130701645,130702145
575,chr8,130716836,130717336


# Collection endpoints

The search endpoint will search across most item types by default, or can be filtered to only include select item types. If you already know the collection type you are interested in, you can use its collection endpoint, which has explicitly defined field filters as input (refer to the collection endpoint documentation for specific fields). Note that the periods in embedded fields such as `files.file_size` are replaced with underscores (i.e. `files_file_size`), and the `@` is dropped (e.g. `award.@id` becomes `award_id`), so that the filters are valid Python keyword arguments.

In [69]:
# Find total number of MeasurementSets
api.measurement_sets().total

615

In [70]:
# Refer to e.g. https://github.com/IGVF-DACC/igvf-python-client/blob/main/docs/IgvfApi.md#measurement_sets for filter fields.

In [71]:
api.measurement_sets(status=['released']).total

615

In [72]:
# Filter on award.@id
api.measurement_sets(status=['released'], award_id=['/awards/HG011972/']).total

549

In [73]:
# Filter on files.@id
api.measurement_sets(files_id=['/sequence-files/IGVFFI9534FGRQ/']).total

1

In [74]:
# Print the files of the MeasurementSet matched by the files.@id filter.
api.measurement_sets(files_id=['/sequence-files/IGVFFI9534FGRQ/']).graph[0].files

['/sequence-files/IGVFFI9534FGRQ/',
 '/sequence-files/IGVFFI5460ASSY/',
 '/sequence-files/IGVFFI4839QVTC/',
 '/sequence-files/IGVFFI8111MPTE/',
 '/sequence-files/IGVFFI2837YQLJ/',
 '/sequence-files/IGVFFI6606YZTP/',
 '/sequence-files/IGVFFI5238KZBQ/',
 '/sequence-files/IGVFFI3272DDWD/',
 '/sequence-files/IGVFFI1090FYAA/',
 '/sequence-files/IGVFFI9275BZMG/',
 '/sequence-files/IGVFFI9043ACMD/',
 '/sequence-files/IGVFFI3461WBDK/',
 '/configuration-files/IGVFFI4106VMGU/',
 '/configuration-files/IGVFFI7942WNXM/',
 '/configuration-files/IGVFFI7894CKVE/',
 '/configuration-files/IGVFFI2982KSVM/']

In [75]:
# Filter on samples.sample_terms.term_name with a single value.
api.measurement_sets(samples_sample_terms_term_name=['HCT116']).total

160

In [76]:
# Search on samples.sample_terms.term_name
api.measurement_sets(samples_sample_terms_term_name=['HCT116', 'K562']).total

164

In [77]:
# Equivalent to search with dotted embedded field.
api.search(type=['MeasurementSet'], field_filters={'samples.sample_terms.term_name': ['HCT116', 'K562']}).total

164

In [78]:
# Use dotted fields with search/report endpoints when specifying field filters and fields to include in response.
df = pd.read_csv(
    StringIO(
        api.report(
            type=['MeasurementSet'],
            field_filters={'samples.sample_terms.term_name': ['HCT116', 'K562']},
            include_fields=['@id', 'preferred_assay_title', 'assay_term.term_name', 'samples.taxa', 'targeted_genes.symbol', 'samples.treatments.summary']
        )
    ),
    sep='\t', 
    skiprows=1
)
df

Unnamed: 0,ID,Preferred Assay Title,assay_term.term_name,samples.taxa,Assay Targeted Genes,samples.treatments.summary
0,/measurement-sets/IGVFDS6323AUPJ/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,MYC,
1,/measurement-sets/IGVFDS3328PNPV/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,KITLG,Treatment of 1 μM 5-Phenyl-1H-indole-3-acetic ...
2,/measurement-sets/IGVFDS2241NBBY/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,CCND1,
3,/measurement-sets/IGVFDS9546RHPY/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,ITPRID2,
4,/measurement-sets/IGVFDS4476RLFR/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,KITLG,
...,...,...,...,...,...,...
159,/measurement-sets/IGVFDS4959UVNG/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,KITLG,Treatment of 1 μM 5-Phenyl-1H-indole-3-acetic ...
160,/measurement-sets/IGVFDS4798YRBK/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,ITPRID2,Treatment of 1 μM 5-Phenyl-1H-indole-3-acetic ...
161,/measurement-sets/IGVFDS2213NJWP/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,KITLG,Treatment of 1 μM 5-Phenyl-1H-indole-3-acetic ...
162,/measurement-sets/IGVFDS6273UIFG/,CRISPR FlowFISH,CRISPR perturbation screen followed by flow cy...,Homo sapiens,FAM3C,


In [79]:
df.shape

(164, 6)

In [80]:
df['assay_term.term_name'].value_counts()

assay_term.term_name
CRISPR perturbation screen followed by flow cytometry and FISH    120
CRISPR screen                                                      44
Name: count, dtype: int64

In [81]:
# Default columns are mapped to titles that are different than the underlying field name.
df['Assay Targeted Genes'].value_counts()

Assay Targeted Genes
KITLG      48
CCND1      36
MYC        24
ITPRID2    24
FAM3C      24
Name: count, dtype: int64

# Limit number of results

By default most endpoints return at most 25 results. Use the `limit` keyword to specify the maximum number of results to return, or `'all'` to return all results. Be careful if you are returning thousands of results. Usually the report endpoint with select fields is the most efficient for data transfer/serialization, and always returns all results (make sure to narrow the search query down as much as possible before fetching the report).

In [82]:
results = api.tabular_files()

In [83]:
# Total number of results matching collection.
results.total

45

In [84]:
# Total number of results actually returned.
len(results.graph)

25

In [85]:
# Return at most one result.
results = api.tabular_files(limit=1)
len(results.graph)

1

In [86]:
# A limit above the total number of matches returns all of them (45 here).
results = api.tabular_files(limit=100)
len(results.graph)

45

In [87]:
# Use limit='all' to return every match.
results = api.tabular_files(limit='all')
len(results.graph)

45