# Test base/niagads/metadata_validator_tool

## Test running the metadata validator tool script directly  

Script under test: `../../bases/niagads/metadata_validator_tool/core.py`

In [1]:
# imports / helpers

import subprocess
import json

def pretty_print(resultStr):
    """Parse the validator's captured output as JSON and print it indented.

    Every occurrence of the line ``running`` (with its newline) and every
    backslash-escaped double quote is removed from ``resultStr`` before the
    remainder is handed to ``json.loads``; the parsed value is then printed
    with a 4-space indent.

    Raises:
        json.JSONDecodeError: if the cleaned text is not valid JSON.
    """
    cleaned = resultStr.replace('running\n', '')
    cleaned = cleaned.replace('\\"', '')
    parsed = json.loads(cleaned)
    print(json.dumps(parsed, indent=4))
    
# Base command shared by the script tests: the `python` executable followed by
# the validator script path. The path is relative, so it resolves against the
# current working directory.
CMD_ROOT = ['python', '../../bases/niagads/metadata_validator_tool/core.py']

In [2]:
# Run the script with --help and echo both captured streams (stderr first).
help_cmd = [*CMD_ROOT, '--help']
result = subprocess.run(help_cmd, capture_output=True, text=True)
for captured in (result.stderr, result.stdout):
    print(captured)


usage: core.py [-h] [--template TEMPLATE] --metadataFileType
               {BIOSOURCE_PROPERTIES,FILE_MANIFEST} [--caseInsensitive]
               [--log] [--failOnError] [--schemaDir SCHEMADIR]
               [--metadataFilePrefix METADATAFILEPREFIX]
               [--metadataFile METADATAFILE] [--schemaFile SCHEMAFILE]
               [--idField IDFIELD]

NIAGADS JSON Schema based metadata validation. This tool allows the user to
perform [JSON Schema](https://json-schema.org/)-based validation of a sample
or file manifest metadata file arranged in tabular format (with a header row
that has field names matching the validation schema). The tool works for
delimited text files (.tab, .csv., .txt) as well as excel (.xls, .xlsx) files.
This tool can be run as a script or can also be imported as a module. When run
as a script, results are piped to STDOUT unless the `--log` option is
specified.

options:
  -h, --help            show this help message and exit
  --template TEMPLATE   templat

In [3]:
# File manifest test: schema and metadata file paths are passed explicitly.

schemaFile = './schemas/file_manifest.json'
metadataFile = './metadata_files/test_file_manifest.tab'
args = [
    '--metadataFileType', 'file_manifest',
    '--schemaFile', schemaFile,
    '--metadataFile', metadataFile,
]
result = subprocess.run([*CMD_ROOT, *args], capture_output=True, text=True)
print(result.stderr)  # show anything written to stderr before parsing stdout
pretty_print(result.stdout)


{
    "errors": [],
}


In [4]:
# File manifest test against a metadata file expected to produce validation
# errors.

schemaFile = './schemas/file_manifest.json'
metadataFile = './metadata_files/test_file_manifest_with_errors.tab'
args = [
    '--metadataFileType', 'file_manifest',
    '--schemaFile', schemaFile,
    '--metadataFile', metadataFile,
]
result = subprocess.run([*CMD_ROOT, *args], capture_output=True, text=True)
print(result.stderr)  # show anything written to stderr before parsing stdout
pretty_print(result.stdout)


{
    "errors": [
        {
            "file": [
                "Additional properties are not allowed ('analysis_category' was unexpected)"
            ]
        },
        {
            "1": [
                {
                    "md5sum": "'c6779ec2960296ed9a8d67f64422' does not match '^[a-fA-F0-9]{32}$'"
                },
                {
                    "data_type": "'sequence' is not one of ['imputation', 'sample annotation', 'genetic association', 'expression', 'sequencing', 'variant calling', 'gene annotation', 'variant annotation', 'genomic region annotation', 'README', 'data dictionary', 'other']"
                }
            ]
        },
        {
            "2": [
                {
                    "data_type": "'sequence' is not one of ['imputation', 'sample annotation', 'genetic association', 'expression', 'sequencing', 'variant calling', 'gene annotation', 'variant annotation', 'genomic region annotation', 'README', 'data dictionary', 'other']"
           

In [5]:
# Case-insensitive file manifest test: same schema, with the
# --caseInsensitive flag appended to the command line.
schemaFile = './schemas/file_manifest.json'
metadataFile = './metadata_files/ci_file_manifest.tab'
args = [
    '--metadataFileType', 'file_manifest',
    '--schemaFile', schemaFile,
    '--metadataFile', metadataFile,
    '--caseInsensitive',
]
result = subprocess.run([*CMD_ROOT, *args], capture_output=True, text=True)
print(result.stderr)  # show anything written to stderr before parsing stdout
pretty_print(result.stdout)


{
    "errors": [
        {
            "4": [
                {
                    "file_format": "'TEXT' is not one of (case-insensitive) ['free text', 'delimited text', 'XML', 'YAML', 'JSON', 'Excel', 'BAM', 'CRAM', 'BED', 'BIM (PLINK)', 'FAM (PLINK)', 'MAP (PLINK)', 'PED (PLINK)', 'BGEN', 'FASTA', 'FASTQ', 'Folder archive', 'matrix', 'index', 'sample', 'other']"
                }
            ]
        }
    ],
}


In [6]:
# Templated file manifest test: instead of explicit file paths, pass a
# template name together with a schema directory and a metadata file prefix.

schemaDir = 'schemas'
pattern = 'metadata_files/test_'
args = [
    '--metadataFileType', 'file_manifest',
    '--template', 'file_manifest',
    '--schemaDir', schemaDir,
    '--metadataFilePrefix', pattern,
]
result = subprocess.run([*CMD_ROOT, *args], capture_output=True, text=True)
print(result.stderr)  # show anything written to stderr before parsing stdout
pretty_print(result.stdout)


{
    "errors": [],
}


In [7]:
# Templated participant info test: biosource properties validation with an
# explicit ID field.

schemaDir = 'schemas'
pattern = 'metadata_files/test_'
idField = 'participant_id'

# flag -> value, in the order the flags should appear on the command line
cli_options = {
    '--metadataFileType': 'biosource_properties',
    '--template': 'participant_info',
    '--schemaDir': schemaDir,
    '--metadataFilePrefix': pattern,
    '--idField': idField,
}
args = [token for flag_and_value in cli_options.items() for token in flag_and_value]
result = subprocess.run([*CMD_ROOT, *args], capture_output=True, text=True)
print(result.stderr)  # show anything written to stderr before parsing stdout
pretty_print(result.stdout)


{
    "errors": [],
}


## Test as an imported package

```python
import niagads.metadata_validator_tool.core
```

In [2]:
# `json` is imported here so the cell also runs on a fresh kernel, without
# depending on an earlier cell having imported it.
import json

import niagads.metadata_validator_tool.core as vm

schemaDir = 'schemas'
pattern = 'metadata_files/test_'
idField = 'participant_id'
template = 'participant_info'

# get files from template and path variables
schemaFile = vm.get_templated_schema_file(schemaDir, template)
print(f'Schema File: {schemaFile}')

metadataFile = vm.get_templated_metadata_file(pattern, template)
print(f'Metadata File: {metadataFile}')

# straight run
validation_result = vm.run(metadataFile, schemaFile, 'biosource_properties', idField)
print(f'Straight Run result: {validation_result}')

# get an initialized validator object
validator = vm.initialize_validator(metadataFile, schemaFile, 'biosource_properties', idField)
print(f'Validator type: {type(validator)}')
print(f'Schema: {json.dumps(validator.get_schema(as_json=True), indent=4)}')
print(f'Parsed Metadata: {json.dumps(validator.get_metadata(), indent=4)}')
print(f'Biosource IDs: {validator.get_biosource_ids()}')
# Double-quoted outer string: reusing the same quote character inside an
# f-string expression is a SyntaxError before Python 3.12.
print(f"Race: {validator.get_field_values('race')}")
validation_result = validator.run()
print(f'Validation Result: {validation_result}')

Schema File: schemas/participant_info.json
Metadata File: metadata_files/test_participant_info.tab
Validator type: <class 'niagads.metadata_validator.core.BiosourcePropertiesValidator'>
Schema: {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "title": "Participant Information",
    "description": "description of required fields and field values for DSS Data Submission Participant Info Metadata",
    "$comment": "enums for disease, diagnosis, APOE, age",
    "properties": {
        "participant_id": {
            "type": "string"
        },
        "cohort": {
            "type": "string"
        },
        "consent": {
            "type": [
                "string",
                "null"
            ],
            "$comment": "allowing nulls because the submitter may not know the consent information"
        },
        "sex": {
            "type": [
                "string",
                "null"
            ],
            "enum": [
              

## Test as an imported package, continued
- case-insensitive matching against the schema's controlled vocabularies

In [4]:
import niagads.metadata_validator_tool.core as vm

# Resolve the schema from its template name; the metadata file is given directly.
schemaDir = 'schemas'
template = "file_manifest"
schemaFile = vm.get_templated_schema_file(schemaDir, template)
metadataFile = "metadata_files/ci_file_manifest.tab"

# First pass: default (strict-case) validator.
validator = vm.initialize_validator(metadataFile, schemaFile, 'file_manifest')
validation_result = validator.run()
strict_report = json.dumps(validation_result, indent=4)
print(f'Strict Case Validation Result: {strict_report}')

# Second pass: same inputs with case_insensitive=True. `validator` is left
# bound to this instance when the cell finishes.
validator = vm.initialize_validator(
    metadataFile, schemaFile, 'file_manifest', case_insensitive=True
)
validation_result = validator.run()
relaxed_report = json.dumps(validation_result, indent=4)
print(f'\nCase Insensitive Validation Result: {relaxed_report}')

Strict Case Validation Result: {
    "errors": [
        {
            "1": [
                {
                    "sample_id": "unexpected value; check for an error in a related field or set to an empty string (text/EXCEL) or `null` (json)"
                },
                {
                    "data_type": "'Sequencing' is not one of ['imputation', 'sample annotation', 'genetic association', 'expression', 'sequencing', 'variant calling', 'gene annotation', 'variant annotation', 'genomic region annotation', 'README', 'data dictionary', 'other']"
                },
                {
                    "file_category": "'Single-sample' is not one of ['single-sample', 'multi-sample', 'sample-independent', 'documentation']"
                }
            ]
        },
        {
            "2": [
                {
                    "file_format": "'fasta' is not one of ['free text', 'delimited text', 'XML', 'YAML', 'JSON', 'Excel', 'BAM', 'CRAM', 'BED', 'BIM (PLINK)', 'FAM (PLINK)', '

In [7]:
# Render the metadata held by `validator` (set in the previous cell) as text
# with normalize=True, and print the returned value.
normalized_text = validator.to_text(path_or_buf=None, normalize=True)
print(f"{normalized_text}")



file_category	file_name	data_type	file_format	sample_id	md5sum	package_file_manifest	comment
single-sample	sample1.fasta.gz	sequencing	FASTA	SAMPLE1	c6779ec2960296ed9a04f08d67f64422		
single-sample	sample2.fasta.gz	sequencing	FASTA	SAMPLE2	c6779ec2960296ed9a04f08d67f64422		
sample-independent	annotation.tar.gz	sample annotation	Folder archive		c6779ec2960296ed9a04f08d67f64422	sample3_file_manifest.txt	
single-sample	sample4.txt	genetic association	TEXT	SAMPLE4	c6779ec2960296ed9a04f08d67f64422		
single-sample	sample5.csv	genetic association	delimited text	SAMPLE5	c6779ec2960296ed9a04f08d67f64422		
single-sample	sample3.fasta.gz	sequencing	FASTA	SAMPLE3	c6779ec2960296ed9a04f08d67f64422		

