# Notebook Run Requirements

1. The data files and data dictionaries must exist in the /data/{study_id} directory.
2. The data files and data dictionaries must be defined in the {study_id}_study.yaml.
3. Edit the 'study_id' variable below.

In [None]:
# Notebook arguments: the study to validate. Edit this value before running.
study_id = 'cmg_yale'

In [None]:
import pandas as pd
from pathlib import Path
import sys

pd.set_option('display.max_colwidth', None)
pd.set_option("display.width", 0)
pd.set_option('future.no_silent_downcasting', True)

project_root = Path().resolve().parent
sys.path.append(str(project_root))
from scripts.general import *

# Arguments that most likely won't change
org_id = 'anvil'
dbt_repo = 'anvil_dbt_project'
tgt_model_id = 'tgt_consensus_a' # Required by get_all_paths
# NOTE(review): raw_schema is not referenced by the cells visible here —
# confirm it is still needed.
raw_schema = 'main'

# Resolve the paths for this study, then load the study configuration from
# the location stored under the "study_yml_path" key.
paths = get_all_paths(study_id, dbt_repo, org_id, tgt_model_id, src_data_path=None)
study_config = read_file(paths["study_yml_path"])

# Run Reports

In [None]:
# Compile and format the source inputs consumed by the report cells below.
# NOTE(review): judging by the helper names, these hold the source data
# dictionaries, the data-file names listed in the study config, and the
# unioned source tables — confirm against scripts.general.
src_dds_dict = study_config_dds_to_dict(study_config, paths)
src_df_names_dict = study_config_df_lists_to_dict(study_config)
unioned_dfs_dict = union_tables(src_df_names_dict, paths)

# Enumerations Validation
- Compares the 'summary data dictionary' defined 'enumerations' with enums from the data files.
- Not all columns are included. The report only shows rows where the enumerations in the data dictionary and the data files were not a 100% match.
- Enumerations shown in red text in 'df_enum_missing_from_dd' indicate a failure: these strings appear in the data files and are expected to be listed in 'dd_enums'. All other fields help to pinpoint the origin of the error.


In [None]:
# Build the enumeration comparison report for the study's data dictionaries
# and data files.
report = enum_report_by_file(src_dds_dict, src_df_names_dict, paths)

# Keep every row that is not a 100% match, i.e. where a mismatch exists in
# EITHER direction. Requiring both columns to be non-null (the previous `&`)
# would hide one-sided mismatches, including rows whose only problem is a
# data-file value missing from the data dictionary — the failure case this
# report exists to surface.
has_mismatch = (
    report['df_enum_missing_from_dd'].notna()
    | report['dd_enum_missing_from_df'].notna()
)
filtered_report = report[has_mismatch].reset_index(drop=True)

# Apply the project's format_not_nulls styling row-wise, restricted to the
# two columns of interest.
styled_report = filtered_report.style.apply(
    format_not_nulls,
    axis=1,
    subset=['df_enum_missing_from_dd', 'dd_enums'],
)

styled_report

# Schema Validation
- Ignore the field 'subject.subject_id2' for now. It was added by the harmonizer for the last pipeline run; the harmonization process will not require this type of dd edit in the future.
- Ignore the field 'ingest_provenance'. It is not in the original dds; it is currently being added to the dds by the harmonizer. The src field is added to the source data automatically by the TDR. The 'src_data_column' exists for each of the files, but it does not show in the characterization report because it is the column that is used to pivot the data (column names). It is kept in the validation for posterity.

In [None]:
# Compare the unioned source tables against the source data dictionaries;
# the result is iterated below as a mapping of table name -> report DataFrame.
schema_reports_dict = schema_comparison(unioned_dfs_dict, src_dds_dict)

# Render one styled report per table, each preceded by a header line.
for report_table, schema_report in schema_reports_dict.items():
    print(f"\n\n\n\n Schema validation. Table:{report_table}")

    # Apply the project's format_nulls styling row-wise to 'dd_column' only.
    styled_report = schema_report.style.apply(
        format_nulls,
        axis=1,
        subset=['dd_column'],
    )
    display(styled_report)