diff --git a/multiqc/modules/dragen/dragen.py b/multiqc/modules/dragen/dragen.py index 3980199ff5..8ea4be946e 100755 --- a/multiqc/modules/dragen/dragen.py +++ b/multiqc/modules/dragen/dragen.py @@ -10,6 +10,7 @@ from .ploidy_estimation_metrics import DragenPloidyEstimationMetrics from .rna_quant_metrics import DragenRnaQuantMetrics from .rna_transcript_cov import DragenRnaTranscriptCoverage +from .roh_metrics import DragenROHMetrics from .sc_atac_metrics import DragenScAtacMetrics from .sc_rna_metrics import DragenScRnaMetrics from .time_metrics import DragenTimeMetrics @@ -20,6 +21,7 @@ class MultiqcModule( + DragenROHMetrics, DragenMappingMetics, DragenFragmentLength, DragenPloidyEstimationMetrics, @@ -73,6 +75,9 @@ def __init__(self): samples_found |= self.add_ploidy_estimation_metrics() # <sample>.ploidy_estimation_metrics.csv - add just Ploidy estimation into gen stats + samples_found |= self.add_roh_metrics() + # <sample>.roh_metrics.csv + self.collect_overall_mean_cov_data() # <sample>._overall_mean_cov.csv # This data will be used by in the DragenCoverageMetrics. diff --git a/multiqc/modules/dragen/roh_metrics.py b/multiqc/modules/dragen/roh_metrics.py new file mode 100644 index 0000000000..5fb0ebd7d3 --- /dev/null +++ b/multiqc/modules/dragen/roh_metrics.py @@ -0,0 +1,510 @@ +'''"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +This module gathers ROH metrics data and prepares it for the output report. + +It relies on the following official sources: +https://support.illumina.com/help/DRAGEN_Germline_OLH_1000000083701/Content/Source/Informatics/Apps/ROHMetrics_appDRAG.htm +https://support-docs.illumina.com/SW/DRAGEN_v38/Content/SW/RegionsOfHomozygosity_fDG.htm +https://support-docs.illumina.com/SW/DRAGEN_v41/Content/SW/ROHCaller.htm +''' + +import logging +import re +from collections import OrderedDict, defaultdict + +from multiqc.modules.base_module import BaseMultiqcModule +from multiqc.plots import bargraph, table +from multiqc.utils.util_functions import write_data_file + +from .utils import check_duplicate_samples, flatten_4D_data, make_gen_stats_headers, make_log_report, make_own_headers + +# Initialise the logger. +log = logging.getLogger(__name__) + + +NAMESPACE = "DRAGEN ROH" + + +'''"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +The SINGLE_HEADER is used to define common settings for all ROH headers: +https://multiqc.info/docs/development/plots/#creating-a-table +''' +SINGLE_HEADER = { + "namespace": NAMESPACE, # Name for grouping. Prepends desc and is in Config Columns modal + "title": None, # Short title, table column title + "description": None, # Longer description, goes in mouse hover text + "max": None, # Maximum value in range, for bar / colour coding + "min": 0, # Minimum value in range, for bar / colour coding + "ceiling": None, # Maximum value for automatic bar limit + "floor": None, # Minimum value for automatic bar limit + "minRange": None, # Minimum range for automatic bar + "scale": "GnBu", # Colour scale for colour coding. False to disable. + "bgcols": None, # Dict with values: background colours for categorical data. + "colour": "0, 0, 255", # Colour for column grouping + "suffix": None, # Suffix for value (eg. "%") + "format": "{:,.1f}", # Value format string - default 1 decimal place + "cond_formatting_rules": None, # Rules for conditional formatting table cell values.
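+ # Hypothetical example (not part of this patch): {"red": [{"lt": 0}]}, paired with + # "cond_formatting_colours": [{"red": "#ff0000"}], would colour negative values red.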
+ "cond_formatting_colours": None, # Styles for conditional formatting of table cell values + "shared_key": None, # See the link for description + "modify": None, # Lambda function to modify values + "hidden": True, # Set to True to hide the column in the general table on page load. + "hidden_own": False, # For non-general plots in the ROH section. + "exclude": True, # True to exclude all headers from the general html table. +} +'''"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +The EXTRA_HEADER contains the logical keys of the SINGLE_HEADER. These are just extensions, +but can be used as if they were a part of the SINGLE_HEADER. Extra configurations are not +set by default. Only the SINGLE_HEADER is used as a basis for a header. But you can always +append an extra config(eg hidden_own, exclude) to the SINGLE_HEADER to overcome this issue. + +Hints and rules: +- "hidden" is used for own ROH section if "hidden_own" is not provided. + +- "exclude" and "exclude_own" may be useful in cases where html size matters. + Eg. many samples but only few metrics of interest. + +- "order_priority" can be used to arrange table's columns. Only int and float are valid. +''' +EXTRA_HEADER = { + "hidden_own": None, # For non-general plots in the ROH section. + "exclude": None, # True to exclude metric from the general html table. + "exclude_own": None, # True to exclude from own ROH section html table. + "order_priority": None, # Used to specify columns' order in all tables. +} + +'''""""""""""""""""""""""""""""""""""""""""""""""" +Officially defined file format could not be found. +According to examined real data, there are 4 columns in total: +Section Void-column Metric Value + +The only section found in data was 'VARIANT CALLER' +It is unknown if it is the only valid section or others already exist/will be added. +The second column does not contain anything. +It is unknown if it is the final constant structure or it was reserved for future use. + +Due to this uncertainty the following dict structure was chosen to store parsed ROH data: +{sample: {section: {region_or_sample: {metric: value}}}} + +Might look overcomplicated, but it conforms to the standard. + +If you wish to add new sections, take a look at the make_consistent_section. +''' + +## Sections: +VARIANT_CALLER = "VARIANT CALLER" + +## RG/Sample aka second column: +EMPTY = "" +WGS = "WGS" # Not detected/used. Just an example. + +## Special case. Token for any section/region/sample. +ANY = object() + +""" +Only INCLUDED_SECTIONS will be shown in html table. +The parser supports only the first value (4th column). +Can be used as a quick but unreliable way to control html table. +""" +INCLUDED_SECTIONS = { + VARIANT_CALLER: [EMPTY], +} + +# INCLUDED_SECTIONS = {ANY:[ANY]} # Include all sections/regions/samples. + +# Used to detect and report what is not properly supported by the module. +# Works as INCLUDED_SECTIONS. +SUPPORTED_SECTIONS = { + VARIANT_CALLER: [EMPTY], +} + +'''""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +The STD_METRICS is the container for abstract representations of the standard metrics. +They are designed to be seen as simple identifiers of the well known metrics. +You can set any real/virtual configuration defined in the SINGLE_HEADER. + +Some rules: + +- "modify" must conform to the following general structure: + lambda x: (arbitrary expression) if isinstance(x, str) else (math expression) + The x is either number or string. 
The latter is some value, which could not be + converted to int or float. NA for instance. Such values can be modified in the + (arbitrary expression) if needed. If not then just return the input. + +- If you wish to add new metrics, take a look at the make_consistent_metric. +''' + +STD_METRICS = { + "number of large roh ( >= 3000000)": { + "order_priority": 1, + "exclude": False, + "title": "LargeROH", + "description": "The number of large ROH detected by the small variant caller.", + "scale": "Oranges", + "colour": "255, 255, 0", + "format": "{:,.0f}", + }, + "percent snvs in large roh ( >= 3000000)": { + "order_priority": 2, + "exclude": False, + "title": "%SNVsInLargeROH", + "description": "The percentage of SNVs present in large ROH.", + "colour": "255, 255, 0", + "format": "{:,.3f}", + }, +} + +# The TABLE_CONFIG defines configs for the whole table in own section. +TABLE_CONFIG = { + "namespace": NAMESPACE, # Name for grouping. Prepends desc and is in Config Columns modal + "id": "dragen_roh_metrics_table", # ID used for the table + "table_title": "ROH", # Title of the table. Used in the column config modal + "save_file": False, # Whether to save the table data to a file + "raw_data_fn": None, # File basename to use for raw data file + "sortRows": True, # Whether to sort rows alphabetically + "only_defined_headers": True, # Only show columns that are defined in the headers config + "col1_header": None, # The header used for the first column with sample names. + "no_beeswarm": False, # Force a table to always be plotted (beeswarm by default if many rows) +} + + +""" Define the bargraph's settings """ + +CATS = OrderedDict() +CATS["number of large roh ( >= 3000000)"] = { + "name": "Number of large ROH ( >= 3000000)", + "color": "#00c3a9", +} +CATS["percent snvs in large roh ( >= 3000000)"] = { + "name": "Percent SNVs in large ROH ( >= 3000000)", + "color": "#d2004d", +} + +# More configs: https://multiqc.info/docs/development/plots/#bar-graphs +BARGRAPH_CONFIG = { + "id": "dragen_roh_metrics_bargraph", + "title": "Dragen: ROH Caller", + "cpswitch": False, # Just show the 2 metrics. + "yDecimals": False, # Looks a little bit better with integers. + "ylab": "", # Just to pass the lint test. The graph is pretty self-explanatory. + "tt_decimals": -1, # Preserve values: https://api.highcharts.com/class-reference/Highcharts#.NumberFormatterCallbackFunction + "tt_percentages": False, # No need for percentages. + "stacking": None, # Both metrics are not parts of something larger. +} + + +class DragenROHMetrics(BaseMultiqcModule): + """Public members of the DragenROHMetrics module are available to other dragen modules. + To avoid cluttering up the global dragen space, only one method is defined.""" + + def add_roh_metrics(self): + """The main function of the dragen ROH metrics module. + Public members of the BaseMultiqcModule and dragen modules defined in + MultiqcModule are available within it. Returns keys with sample names + or an empty set if no samples were found or all samples were ignored.""" + + roh_data = {} + + # Stores cleaned sample names with references to file handlers. + # Used later to check for duplicates and inform about found ones. + all_samples = defaultdict(list) + + # Metric IDs with original consistent metric strings. 
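+ # Eg {"number of large roh ( >= 3000000)": "Number of large ROH ( >= 3000000)"}.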
+ all_metrics = {} + + for file in self.find_log_files("dragen/roh_metrics"): + out = roh_parser(file) + if out["success"]: + self.add_data_source(file, section="stats") + cleaned_sample = self.clean_s_name(out["sample_name"], file) + all_samples[cleaned_sample].append(file) + roh_data[cleaned_sample] = out["data"] # Add/overwrite the sample. + all_metrics.update(out["metric_IDs"]) + + # Filter to strip out ignored sample names. + roh_data = self.ignore_samples(roh_data) + if not roh_data: + return set() + + check_duplicate_samples(all_samples, log, "dragen/roh_metrics") + + roh_headers = make_headers(all_metrics) + + txt_out_data = flatten_4D_data( + roh_data, + metric_IDs=all_metrics, # Original strings can be used instead of somewhat ugly metric IDs. + hide_single_section=True, # True to exclude common single section from metric ID. + ) + self.write_data_file(txt_out_data, "dragen_roh") # Write data to file. + + roh_data_table = make_data_for_table(roh_data) # Prepare data for tables. + table_data, table_headers = flatten_4D_data( + roh_data_table, + headers=roh_headers, # Adjust headers' metrics to data's structure. + metric_IDs=all_metrics, # Original strings can be used instead of somewhat ugly IDs. + hide_single_section=True, # True to exclude common single section from metric ID. + ) + gen_headers = make_gen_stats_headers(table_headers) + # If gen_headers is empty, then all metrics from data will be included in table. + if gen_headers: + self.general_stats_addcols(table_data, gen_headers, namespace=NAMESPACE) + + # Add single own table section. + own_table_headers = make_own_headers(table_headers) + self.add_section( + name="ROH Caller", + anchor="dragen-roh-caller-table", + description="Metrics are reported on a per sample level. Regions of homozygosity (ROH) " + "are detected as part of the small variant caller. Press the `Help` button for details.", + helptext="The caller detects and outputs the runs of homozygosity from whole genome " + "calls on autosomal human chromosomes. ROH output allows downstream tools to screen " + "for and predict consanguinity between the parents of the proband subject.\n\n" + "* DRAGEN v3.3 - 3.8: Sex chromosomes are ignored.\n\n" + "* DRAGEN v3.9 - 4.1: Sex chromosomes are ignored unless the sample sex karyotype is XX, " + "as specified on the command line or determined by the Ploidy Estimator.\n\n" + "A region is defined as consecutive variant calls on the chromosome with no large gap in " + "between these variants. In other words, regions are broken by chromosome or by large gaps " + "with no SNV calls. The gap size is set to 3 Mbases.\n\nThe ROH algorithm runs on the small " + "variant calls. The algorithm excludes variants with multiallelic sites, indels, complex " + "variants, non-PASS filtered calls, and homozygous reference sites. The variant calls are " + "then filtered further using a block list BED, and finally depth filtering is applied " + "after the block list filter. The default value for the fraction of filtered calls is 0.2, " + "which filters the calls with the highest 10% and lowest 10% in DP values. " + "The algorithm then uses the resulting calls to find regions.\n\n" + "Addition for DRAGEN v3.7 - 4.1: " + "The ROH algorithm first finds seed regions that contain at least 50 consecutive homozygous " + "SNV calls with no heterozygous SNV or gaps of 500,000 bases between the variants. 
The regions " + "can be extended using a scoring system that functions as follows.\n\n" + "* Score increases with every additional homozygous variant (0.025) and decreases with a " + "large penalty (1–0.025) for every heterozygous SNV. This provides some tolerance of " + "presence of heterozygous SNV in the region.\n\n" + "* Each region expands on both ends until the regions reach the end of a chromosome, " + "a gap of 500,000 bases between SNVs occurs, or the score becomes too low (0).\n\n" + "Overlapping regions are merged into a single region. Regions can be merged across gaps of " + "500,000 bases between SNVs if a single region would have been called from the beginning of " + "the first region to the end of the second region without the gap. There is no maximum size " + "for regions, but regions always end at chromosome boundaries.", + plot=table.plot( + table_data, own_table_headers, {config: val for config, val in TABLE_CONFIG.items() if val is not None} + ), + ) + bargraph_data = make_bargraph_data(roh_data) + if bargraph_data: # Append only if not empty. + self.add_section( + name="Large ROH / SNVs in large ROH", + anchor="dragen-roh-caller-bargraph", + description="Different perspective for the 'Number of large ROH ( >= 3000000)' " + "and 'Percent SNVs in large ROH ( >= 3000000)' metrics from the ROH Caller.", + plot=bargraph.plot( + bargraph_data, + CATS, + BARGRAPH_CONFIG, + ), + ) + # Report found info/warnings/errors. You can disable it anytime, if it is not wanted. + make_log_report(log_data, log, "roh_metrics") + + return roh_data.keys() + + +def make_data_for_table(DATA): + """Modify parsed data before storing in tables.""" + + # Inner samples can be concatenated with file samples + # and replaced by empty string in the second column. + + # Sections can be analyzed and restructured. + + # Use INCLUDED_SECTIONS as inclusion checklist. + return { + sample: { + section: { + region: {metric: value for metric, value in data.items()} + for region, data in regions.items() + if (ANY in INCLUDED_SECTIONS and ANY in INCLUDED_SECTIONS[ANY]) + or ( + section in INCLUDED_SECTIONS + and (region in INCLUDED_SECTIONS[section] or ANY in INCLUDED_SECTIONS[section]) + ) + } + for section, regions in sections.items() + if ANY in INCLUDED_SECTIONS or section in INCLUDED_SECTIONS + } + for sample, sections in DATA.items() + } + + +def make_bargraph_data(data): + data_out = defaultdict(dict) + included_metrics = ["number of large roh ( >= 3000000)", "percent snvs in large roh ( >= 3000000)"] + for sample in data: + # If section is "VARIANT CALLER" and second column is "" + if VARIANT_CALLER in data[sample] and EMPTY in data[sample][VARIANT_CALLER]: + metrics = data[sample][VARIANT_CALLER][EMPTY] + for metric in included_metrics: + # Crash prevention. Though very unlikely that metric is not present. + if metric in metrics: + data_out[sample][metric] = metrics[metric] + return data_out + + +'''""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +The log_data is the container for found info, warnings and errors. +Collected data is stored in groups to make the output log file more readable. + +warnings: +- invalid_file_names, which do not conform to the: + .roh_metrics.csv + +debug/info: +- invalid_file_lines, which do not conform to: +
<section>,<RG/sample>,<metric>,<value> +- unknown_sections are not present in the SUPPORTED_SECTIONS. +- unknown_second_columns are not present in the SUPPORTED_SECTIONS. +- unknown_metrics are not present in the STD_METRICS. +- unusual_values are values other than int and float. +''' +log_data = { + "invalid_file_names": defaultdict(list), + "invalid_file_lines": defaultdict(lambda: defaultdict(list)), + "unknown_sections": set(), + "unknown_second_columns": defaultdict(set), + "unknown_metrics": [], + "unusual_values": defaultdict(lambda: defaultdict(dict)), +} + + +def make_headers(metric_IDs): + """Build headers from metric_IDs.""" + headers = {} + for metric_id in metric_IDs: + original_metric = metric_IDs[metric_id] + configs = {config: val for config, val in SINGLE_HEADER.items() if val is not None} + if metric_id in STD_METRICS: + configs.update(STD_METRICS[metric_id]) + else: + log_data["unknown_metrics"].append(original_metric) + configs["title"] = original_metric + configs["description"] = original_metric + headers[metric_id] = configs + return headers + + +def create_parser(): + """Isolation for the parsing codeblock. Returns the parse_roh_metrics_file closure.""" + + def make_consistent_metric(metric): + """Tries to guarantee consistency to a certain degree. + Metric-specific peculiarities can be handled here.""" + # metric will be stored in the .txt output. + metric = re.sub(r"\s+", " ", metric).strip() + # metric_id will be used for internal storage. + metric_id = metric.lower() + return metric, metric_id + + def make_consistent_section(section): + """Tries to guarantee consistency for the section string.""" + return re.sub(r"\s+", " ", section).strip().upper() + + # The official/accepted structure of the input file: + # <sample>.roh_metrics.csv + FILE_RGX = re.compile(r"(.+)\.roh_metrics\.csv$") + + # No official general line structure could be found. + # The following regex is used to check input lines. + LINE_RGX = re.compile(r"^([^,]+),([^,]*),([^,]+),([^,]+)$") + + def parse_roh_metrics_file(file_handler): + """ + Parser for *.roh_metrics.csv files. + + Input: file_handler with the necessary info - file name/content/root. + + Output: {"success": False} if the file name test failed. Otherwise: + {"success": False/True, + "sample_name": <sample name>, + "data": {section: {RG/Sample: {metric: value}}}, + "metric_IDs": {metric_ID: original_metric} + }, where success is False if all of the file's lines are invalid. + + File examples: + + sample_1.roh_metrics.csv + VARIANT CALLER,,Percent SNVs in large ROH ( >= 3000000),0.000 + VARIANT CALLER,,Number of large ROH ( >= 3000000),0 + + sample_2.roh_metrics.csv + VARIANT CALLER,,Percent SNVs in large ROH ( >= 3000000),0.082 + VARIANT CALLER,,Number of large ROH ( >= 3000000),1 + """ + file, root = file_handler["fn"], file_handler["root"] + + file_match = FILE_RGX.search(file) + if not file_match: + log_data["invalid_file_names"][root].append(file) + return {"success": False} + + sample = file_match.group(1) + + success = False + data = defaultdict(lambda: defaultdict(dict)) + metric_IDs = {} + + for line in file_handler["f"].splitlines(): + # Check the general line structure. + line_match = LINE_RGX.search(line) + + # If the line is fine then extract the necessary fields. + if line_match: + success = True + section, region_sample, metric, value = line_match.groups() + + # Otherwise check if the line is empty. If not then report it. Go to the next line. + else: + if not re.search(r"^\s*$", line): + log_data["invalid_file_lines"][root][file].append(line) + continue + + # Modify extracted parts to improve consistency.
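+ # Eg " Variant Caller " becomes "VARIANT CALLER"; the metric keeps its case for display, while its lower-cased copy serves as the internal metric ID.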
+ consistent_section = make_consistent_section(section) + consistent_metric, metric_id = make_consistent_metric(metric) + + # Check support for sections/regions. + if not (ANY in SUPPORTED_SECTIONS or consistent_section in SUPPORTED_SECTIONS): + log_data["unknown_sections"].add(section) + if not ( + ANY in SUPPORTED_SECTIONS + and ANY in SUPPORTED_SECTIONS[ANY] + or ( + consistent_section in SUPPORTED_SECTIONS + and ( + region_sample in SUPPORTED_SECTIONS[consistent_section] + or ANY in SUPPORTED_SECTIONS[consistent_section] + ) + ) + ): + log_data["unknown_second_columns"][section].add(region_sample) + + # Check the extracted value. Shall be int or float. + try: + value = int(value) + except ValueError: + try: + value = float(value) + except ValueError: + log_data["unusual_values"][root][file][metric] = value + + metric_IDs[metric_id] = consistent_metric + data[consistent_section][region_sample][metric_id] = value + + # Return the results of parsing the input file. + return { + "success": success, + "sample_name": sample, + "data": data, + "metric_IDs": metric_IDs, + } + + # Return the parser closure. + return parse_roh_metrics_file + + +roh_parser = create_parser() diff --git a/multiqc/modules/dragen/utils.py b/multiqc/modules/dragen/utils.py index 429aba926a..c7f6b7ae7b 100644 --- a/multiqc/modules/dragen/utils.py +++ b/multiqc/modules/dragen/utils.py @@ -1,4 +1,102 @@ -from collections import OrderedDict +'''""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +Below is a set of common tools, which might help to write DRAGEN QC Metrics modules. + +This comment describes some construction components of a single dragen module. +It also includes common patterns and observations noticed so far. + + +Officially defined QC metrics output table format: +Section RG/Sample Metric Count/Ratio/Time Percentage/Seconds + +Source: https://support-docs.illumina.com/SW/DRAGEN_v41/Content/SW/DRAGEN/QCMetricsCoverageReports.htm + +Real files' formats can vary: + +- Sections vary in composition and in how static they are. + * Can be constant: coverage_metrics with 'COVERAGE SUMMARY' + * Or quite variable: + * subsections/prefixes/suffixes (eg vc_metrics, mapping_metrics). + * phenotypes as extra prefixes: 'TUMOR', 'NORMAL' + * Some don't have a section: + Eg overall_mean_cov_metrics has the 'Average alignment coverage over <source>' metric in the 1st column. + But the 'TUMOR' and 'NORMAL' prefixes can also be appended to that metric. + +- RG/Sample can be either: + * DRAGEN-defined: empty or some static region (eg WGS). + * User-defined: sample names. + * Non-existent (eg overall_mean_cov_metrics). + +- Metrics are also quite varied: + * Substrings: + * Varying regions/values/metainfo (eg mapping_metrics, coverage_metrics). + * Space padding within metrics (eg coverage_metrics). + * DRAGEN version dependency as stated in mapping_metrics.py: + "Dragen 3.9 has replaced 'rRNA filtered reads' with 'Adjustment of reads matching filter contigs'" + * User-defined sample names instead of metrics (eg cnv_metrics). + * Probably something else. + +- Values are mostly int and float. NA, inf or karyotype (eg XX/XY/X0) are also present. + Extra padded space can be included (eg overall_mean_cov_metrics). + + +DRAGEN QC Metrics Module includes: + +Main components: + + - Main class: + an interface to features provided by the BaseMultiqcModule plus the module's own functionality. + Methods and properties of DRAGEN modules defined in MultiqcModule (dragen.py) are also + available from the main dragen class through self.
Unfortunately it is not possible + to completely isolate dragen modules from each other, so some caution is needed: + * the main class can overwrite public members of main classes from other modules. + * try to create as few methods/properties as possible. Especially those which can + consume a lot of space, because references to such members will still exist after + a single module's execution has finished. + * use only the explicitly defined internal dragen interface (eg getter methods) to query data. + + - CSV Parser: + extracts data from a specific CSV file with metrics. + The structure of the collected data depends on the file's format. + + - Configurations: + Highcharts plots: headers for tables, configs for whole tables ... + +Optional components: + + - Data transformer: + tools (eg util functions, own methods/global funcs) to restructure parsed data. + It might happen that a single file contains differently structured rows. + Internal data restructuring could be necessary in order to properly and nicely + present metrics in the html table. + - Data flattener: + Transforms data from a higher dimension to 2D - {sample: {metric: value}} + May be useful in writing a prototype. + - Transformation for storing in the txt output. + - Transformation for tables/linegraphs/etc + + - Headers creation: + the only plot which is available by default and can represent data completely is the table. + In some cases metrics/files can have a varying and rather complicated structure. It might be + cumbersome to manually define configs in such cases, so additional functionality could help. + + - Logging: + Collect and report found debug/info/warnings/errors according to some predefined schema. + Direct reporting can also be considered a "component" of sorts, albeit a distributed one. + It is much simpler than the former and reports with the module's own name, but can produce a chaotic log. + + - JSON Parser: + JSON data can be collected separately from CSV. After that it can be used to improve the html output. + + - Data checker: + It might happen that collected data needs to be thoroughly examined to guarantee data quality. + This component may be useful if many specific data aspects have to be checked. + Currently it is part of the parser. + +This structuring is just an attempt to organize and simplify the whole dragen module, which +could improve readability and make modules more extensible/maintainable in the long run. +''' + +from collections import OrderedDict, defaultdict from multiqc import config @@ -15,6 +113,9 @@ # base_format += ' ' + config.base_count_prefix +"""______________________ Headers ______________________""" + + class Metric: def __init__( self, @@ -139,63 +240,44 @@ def make_headers(parsed_metric_ids, metrics): return genstats_headers, own_tabl_headers -def exist_and_number(data, *metrics): - return all(isinstance(data.get(m, None), int) or isinstance(data.get(m, None), float) for m in metrics) - - -def check_duplicate_samples(sample_names, logger, module): - """Check samples for duplicate names. Warn about found ones.""" - message = "" - line1 = "\n {} was built from the following samples:" - line2 = "\n {} in {}" - for clean_sample_name in sample_names: - if len(sample_names[clean_sample_name]) > 1: - message += line1.format(clean_sample_name) - for file_handler in sample_names[clean_sample_name]: - message += line2.format(file_handler["fn"], file_handler["root"]) - if message: - message = "\n\nDuplicate sample names were found. The last one overwrites previous data."
+ message - logger.debug(message) - - def order_headers(headers): - """Produces a shallow copy with ordered single headers.""" + """Produces a shallow copy with ordered single headers from the input. + Or returns the input if no header contains the 'order_priority' config.""" """ Collected "order_priority" values are stored as groups to resolve conflicts, which may arise if several metrics have the same priority. The order of headers within the same priority group is not specified. """ indexes = set() - ordered_headers = {} + ordered_headers = defaultdict(list) not_ordered_headers = {} for metric in headers: + configs = headers[metric].copy() if "order_priority" in headers[metric] and isinstance(headers[metric]["order_priority"], (int, float)): order_priority = headers[metric]["order_priority"] indexes.add(order_priority) - if order_priority not in ordered_headers: - ordered_headers[order_priority] = [] - ordered_headers[order_priority].append((metric, headers[metric].copy())) + ordered_headers[order_priority].append((metric, configs)) else: - not_ordered_headers[metric] = headers[metric].copy() + not_ordered_headers[metric] = configs # Convert to list and sort in ascending order. if indexes: indexes = list(indexes) indexes.sort() + # Otherwise there is nothing to order. Return the input. else: return headers output_headers = OrderedDict() for index in indexes: - for metric in ordered_headers[index]: - output_headers[metric[0]] = metric[1] + for metric, configs in ordered_headers[index]: + output_headers[metric] = configs output_headers.update(not_ordered_headers) return output_headers -# STD_TABLE_CONFIGS contains all standard table configurations from: -# https://github.com/ewels/MultiQC/blob/master/docs/plots.md#creating-a-table +# STD_TABLE_CONFIGS contains all standard table configurations: STD_TABLE_CONFIGS = [ "namespace", "title", @@ -220,6 +302,7 @@ def order_headers(headers): def clean_headers(headers): + """Include configurations only from STD_TABLE_CONFIGS. Extension-configurations will be excluded.""" cleaned_headers = OrderedDict() for metric in headers: cleaned_headers[metric] = { @@ -228,6 +311,185 @@ def clean_headers(headers): return cleaned_headers +def make_gen_stats_headers(headers): + gen_headers = { + metric: configs.copy() # Copy to be able to modify them. + for metric, configs in headers.items() + if metric in + # Header is included only if "exclude" is not present or False/False-equivalent. + [metric for metric in headers if not ("exclude" in headers[metric] and headers[metric]["exclude"])] + } + return clean_headers(order_headers(gen_headers)) + + +def make_own_headers(headers): + """Creates table's headers for non-general section.""" + own_headers = { + metric: configs.copy() # Copy to be able to modify them. + for metric, configs in headers.items() + if metric in + # Header is included only if "exclude_own" is not present or False/False-equivalent. + [metric for metric in headers if not ("exclude_own" in headers[metric] and headers[metric]["exclude_own"])] + } + for metric in own_headers: + # "hidden_own" is an extension and must be stored in "hidden" + if "hidden_own" in own_headers[metric]: + own_headers[metric]["hidden"] = own_headers[metric]["hidden_own"] + + unhide_single_metric(own_headers) + return clean_headers(order_headers(own_headers)) + + +"""______________________ Data transformation ______________________""" + + +def flatten_3D_data(data, **extra): + """ + Reduces amount of dimensions in data to 2D by combining layers. 
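+ Eg {"s1": {"VARIANT CALLER": {"m": 1}}} -> {"s1": {"VARIANT CALLER m": 1}}; the section is prepended unless it is a single common section hidden via hide_single_section.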
+ + Input: + * data: + dictionary with parsed metrics from coverage reports in form: + {sample: {section: {metric_ID: value}}} + + * extra: + includes additional parameters: + - hide_single_section: boolean to hide single common section. + - metric_IDs: dictionary with parsed metrics in form {metric_ID: original_metric} + - headers: dictionary in form {metric_ID: {configuration: value}} + + Output: + * 2D data if key parameter headers is not provided. + * 2D data and headers if key parameter headers is provided. + """ + out_data = defaultdict(dict) + headers = extra["headers"] if "headers" in extra else None + out_headers = {} if headers is not None else None + + metric_IDs = extra["metric_IDs"] if "metric_IDs" in extra else None + hide_single_section = extra["hide_single_section"] if "hide_single_section" in extra else False + sections = len({section for sample in data for section in data[sample]}) + + for sample in data: + for section in data[sample]: + for metric in data[sample][section]: + original_metric = metric + # Try to extract the original metric. + if metric_IDs and metric in metric_IDs: + original_metric = metric_IDs[metric] + # Create new metric-ID. + if hide_single_section and sections == 1: + new_id = original_metric + else: + # If section is not empty. + if section: + new_id = section + " " + original_metric + else: + new_id = original_metric + out_data[sample][new_id] = data[sample][section][metric] + + if out_headers is None: + continue + + if metric in headers: + out_headers[new_id] = headers[metric] + + if out_headers is not None: + return out_data, out_headers + else: + return out_data + + +def flatten_4D_data(data, **extra): + """ + Reduces amount of dimensions in data to 2D by combining layers. + + Input: + * data: + dictionary with parsed metrics from coverage reports in form: + {sample: {section: {region: {metric_ID: value}}}} + + * extra: + includes additional parameters: + - hide_single_section: boolean to hide single common section. + - metric_IDs: dictionary with parsed metrics in form {metric_ID: original_metric} + - headers: dictionary in form {metric_ID: {configuration: value}} + + Output: + * 2D data if key parameter headers is not provided. + * 2D data and headers if key parameter headers is provided. + """ + out_data = defaultdict(dict) + headers = extra["headers"] if "headers" in extra else None + out_headers = {} if headers is not None else None + + metric_IDs = extra["metric_IDs"] if "metric_IDs" in extra else None + hide_single_section = extra["hide_single_section"] if "hide_single_section" in extra else False + sections = len({section for sample in data for section in data[sample]}) + + for sample in data: + for section in data[sample]: + for region in data[sample][section]: + for metric in data[sample][section][region]: + original_metric = metric + # Try to extract the original metric. + if metric_IDs and metric in metric_IDs: + original_metric = metric_IDs[metric] + # Create new metric-ID. + if hide_single_section and sections == 1: + # If region is not empty. + if region: + new_id = region + " " + original_metric + else: + new_id = original_metric + else: + # Section and region are not empty. + if section and region: + new_id = section + " " + region + " " + original_metric + # Region is empty. + elif section and not region: + new_id = section + " " + original_metric + # Section is empty. + elif not section and region: + new_id = region + " " + original_metric + # Both are empty. 
+ else: + new_id = original_metric + out_data[sample][new_id] = data[sample][section][region][metric] + + if out_headers is None: + continue + + if metric in headers: + out_headers[new_id] = headers[metric] + + if out_headers is not None: + return out_data, out_headers + else: + return out_data + + +"""______________________ Logging component ______________________""" + + +def check_duplicate_samples(sample_names, logger, module): + """Check samples for duplicate names. Warn about found ones.""" + message = "" + line1 = "\n {} was built from the following samples:" + line2 = "\n {} in {}" + for clean_sample_name in sample_names: + if len(sample_names[clean_sample_name]) > 1: + message += line1.format(clean_sample_name) + for file_handler in sample_names[clean_sample_name]: + message += line2.format(file_handler["fn"], file_handler["root"]) + if message: + # Draw user's attention with error. + logger.error(module + ": duplicates found.") + # But print full info only in report. + message = "\n\nDuplicate sample names were found. The last one overwrites previous data." + message + logger.debug(message) + + # Used to define module-specific texts for the make_parsing_log_report. # Comments are used to separate keys. Though empty lines look better, they are removed by the "Black" formatter. DRAGEN_MODULE_TEXTS = { @@ -239,19 +501,73 @@ def clean_headers(headers): "overall_mean_cov_metrics": "\n\nThe file names must conform to the following structure:\n" "<sample>_overall_mean_cov.csv\n\n" "The following files are not valid:\n", + # ------------- # + "cnv_metrics": "\n\nThe file names must conform to the following structure:\n" + "<sample>.cnv_metrics.csv\n\n" + "The following files are not valid:\n", + # ------------- # + "ploidy_metrics": "\n\nThe file names must conform to the following structure:\n" + "<sample>.ploidy_estimation_metrics.csv\n" + "The following files are not valid:\n", + # ------------- # + "roh_metrics": "\n\nThe file names must conform to the following structure:\n" + "<sample>.roh_metrics.csv\n" + "The following files are not valid:\n", + # ------------- # + "sv_metrics": "\n\nThe file names must conform to the following structure:\n" + "<sample>.sv_metrics.csv\n" + "The following files are not valid:\n", + # ------------- # + "vc_hethom_ratio_metrics": "\n\nThe file names must conform to the following structure:\n" + "<sample>.vc_hethom_ratio_metrics.csv\n" + "The following files are not valid:\n", }, "invalid_file_lines": { "coverage_metrics": "\n\nThe lines in files must be:\n" "COVERAGE SUMMARY,,<metric>,<value> or " - "COVERAGE SUMMARY,,<metric>,<value>,<percentage>\n\n" + "COVERAGE SUMMARY,,<metric>,<value>,<percentage>\n" + "The following files contain invalid lines:\n", + # ------------- # + "cnv_metrics": "\n\nThe lines in files must be:\n" + "<section>,<RG/sample>,<metric>,<value> or " + "<section>,<RG/sample>,<metric>,<value>,<percentage>\n" + "The following files contain invalid lines:\n", + # ------------- # + "ploidy_metrics": "\n\nThe lines in files must be:\n" + "<section>,<RG/sample>,<metric>,<value>\n" + "The following files contain invalid lines:\n", + # ------------- # + "roh_metrics": "\n\nThe lines in files must be:\n" + "<section>,<RG/sample>,<metric>,<value>\n" + "The following files contain invalid lines:\n", + # ------------- # + "sv_metrics": "\n\nThe lines in files must be:\n" + "<section>,<RG/sample>,<metric>,<value> or " + "<section>,<RG/sample>,<metric>,<value>,<percentage>\n" + "The following files contain invalid lines:\n", + # ------------- # + "vc_hethom_ratio_metrics": "\n\nThe lines in files must be:\n" + "<section>,<RG/sample>,<metric>,<value>\n" + "The following files contain invalid lines:\n", }, + "unknown_sections": {}, + "unknown_second_columns": {}, "unknown_metrics": { "coverage_metrics": "\n\nThe following metrics could not be recognized. " "Their headers might be incomplete, hence uninformative and ugly table's columns.\n", # ------------- # "overall_mean_cov_metrics": "\n\nOnly the 'Average alignment coverage over <source>' metric is supported.\n" "The following metrics could not be recognized and are not included in the report.\n", + # ------------- # + "ploidy_metrics": "\n\nThe following metrics could not be recognized. " + "Their headers are incomplete, which makes for uninformative table columns.\n", + # ------------- # + "roh_metrics": "\n\nThe following metrics could not be recognized. " + "Their headers are incomplete, which makes for uninformative table columns.\n", + # ------------- # + "sv_metrics": "\n\nThe following metrics are not included in the report:\n", + # ------------- # + "vc_hethom_ratio_metrics": "\n\nThe following metrics are not included in the report:\n", }, "unusual_values": { "coverage_metrics": "\n\nAll metrics' values except for int, float and NA are non-standard.\n" @@ -259,13 +575,29 @@ def clean_headers(headers): "overall_mean_cov_metrics": "\n\nAll metrics' values except for float are non-standard.\n" "The following files contain non-standard values:\n", + # ------------- # + "cnv_metrics": "\n\nThe CNV SUMMARY section shall contain only int and float.\n" + "The second value of the SEX GENOTYPER section shall be float.\n" + "The following files contain non-standard values:\n", + # ------------- # + "ploidy_metrics": "\n\nEach value shall be either int, float or a combination of X/Y/0 for the karyotype.\n" + "The following files contain non-standard values:\n", + # ------------- # + "roh_metrics": "\n\nEach value shall be either int or float.\n" + "The following files contain non-standard values:\n", + # ------------- # + "sv_metrics": "\n\nEach value shall be either int or float.\n" + "The following files contain non-standard values:\n", + # ------------- # + "vc_hethom_ratio_metrics": "\n\nEach value shall be either int, float or inf.\n" + "The following files contain non-standard values:\n", }, } def make_log_report(log_data, logger, module): - """The only purpose of this function is to create a readable and informative log output - about found info/warnings/errors, which were found at the time of executing a parser.""" + """The only purpose of this function is to create a readable and + informative log output about found debug/info/warnings/errors.""" if "invalid_file_names" in log_data and log_data["invalid_file_names"]: if module in DRAGEN_MODULE_TEXTS["invalid_file_names"]: @@ -278,7 +610,41 @@ for file in log_data["invalid_file_names"][root]: log_message += " " + file + "\n" - logger.warning(log_message + "\n") + # Draw user's attention with error. + logger.error("dragen/" + module + ": invalid file names found.") + # But print full info only in report. + logger.debug(log_message + "\n") + + if "unknown_sections" in log_data and log_data["unknown_sections"]: + if module in DRAGEN_MODULE_TEXTS["unknown_sections"]: + log_message = DRAGEN_MODULE_TEXTS["unknown_sections"][module] + else: + log_message = "\n\nThe following sections could not be recognized.\n" + + for section in log_data["unknown_sections"]: + log_message += " " + section + "\n" + + # Draw user's attention with warning.
+ logger.warning("dragen/" + module + ": unknown sections found.") + # But print full info only in report. + logger.debug(log_message + "\n") + + if "unknown_second_columns" in log_data and log_data["unknown_second_columns"]: + if module in DRAGEN_MODULE_TEXTS["unknown_second_columns"]: + log_message = DRAGEN_MODULE_TEXTS["unknown_second_columns"][module] + else: + log_message = "\n\nThe following RG/Samples could not be recognized.\n" + + for section in log_data["unknown_second_columns"]: + log_message += " " + section + ":\n" + for rg_sample in log_data["unknown_second_columns"][section]: + rg_sample = rg_sample if rg_sample else "Empty region" + log_message += " " + rg_sample + "\n" + + # Draw user's attention with warning. + logger.warning("dragen/" + module + ": unknown RG/Samples found.") + # But print full info only in report. + logger.debug(log_message + "\n") if "invalid_file_lines" in log_data and log_data["invalid_file_lines"]: if module in DRAGEN_MODULE_TEXTS["invalid_file_lines"]: @@ -293,6 +659,9 @@ def make_log_report(log_data, logger, module): for line in log_data["invalid_file_lines"][root][file]: log_message += " " + line + "\n" + # Draw user's attention with warning. + logger.warning("dragen/" + module + ": invalid file lines found.") + # But print full info only in report. logger.debug(log_message + "\n") if "unknown_metrics" in log_data and log_data["unknown_metrics"]: @@ -304,6 +673,9 @@ def make_log_report(log_data, logger, module): for metric in log_data["unknown_metrics"]: log_message += " " + metric + "\n" + # Draw user's attention with warning. + logger.warning("dragen/" + module + ": unknown metrics found.") + # But print full info only in report. logger.debug(log_message + "\n") if "unusual_values" in log_data and log_data["unusual_values"]: @@ -319,4 +691,22 @@ def make_log_report(log_data, logger, module): for metric in log_data["unusual_values"][root][file]: log_message += " " + metric + " = " + log_data["unusual_values"][root][file][metric] + "\n" + # Draw user's attention with warning. + logger.warning("dragen/" + module + ": unusual values found.") + # But print full info only in report. logger.debug(log_message + "\n") + + +"""______________________ Miscellaneous ______________________""" + + +def exist_and_number(data, *metrics): + return all(isinstance(data.get(m, None), int) or isinstance(data.get(m, None), float) for m in metrics) + + +def unhide_single_metric(headers): + """Solve a table bug: if table has only 1 metric and this metric is hidden, then there + is no user-friendly way to show it, because the "Configure columns" button is absent.""" + if len(headers) == 1: + metric = list(headers.keys())[0] + headers[metric]["hidden"] = False diff --git a/multiqc/utils/search_patterns.yaml b/multiqc/utils/search_patterns.yaml index 66a8f210ff..5795c50140 100644 --- a/multiqc/utils/search_patterns.yaml +++ b/multiqc/utils/search_patterns.yaml @@ -281,6 +281,8 @@ dragen/vc_metrics: fn: "*.vc_metrics.csv" dragen/ploidy_estimation_metrics: fn: "*.ploidy_estimation_metrics.csv" +dragen/roh_metrics: + fn_re: '.*\.roh_metrics\.csv' dragen/wgs_contig_mean_cov: fn_re: '.*\.wgs_contig_mean_cov_?(tumor|normal)?\.csv' dragen/overall_mean_cov_metrics: