Refactor: fix unescaped regex strings (#2384)
* Fix unescaped regex strings

* [automated] Update CHANGELOG.md

---------

Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io>
vladsavelyev and multiqc-bot committed Feb 26, 2024
1 parent 41b7908 commit cedb55f
Showing 9 changed files with 34 additions and 33 deletions.
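
The change is mechanical but worth spelling out: regex shorthands such as `\d`, `\s`, and `\(` are not valid Python string escape sequences, so leaving pattern literals un-prefixed causes the interpreter to emit deprecation warnings (syntax warnings on recent releases) when the source is compiled, even though the patterns still match because the backslash is passed through unchanged. Below is a minimal sketch of the equivalence, not part of the commit itself; the sample input line is illustrative.

```python
import re

# Minimal sketch (not part of this commit) of the issue being fixed.
# In a plain string literal, "\s" and "\d" are unrecognised escape sequences:
# CPython keeps the backslash, so the regex still works, but compiling the
# source emits a DeprecationWarning (a SyntaxWarning on newer interpreters).
# An r"" raw string spells out the same pattern without the warning.
raw_pattern = r"Total reads:\s*(\d*)"       # style adopted throughout this commit
escaped_pattern = "Total reads:\\s*(\\d*)"  # equivalent form with doubled backslashes

line = "Total reads:   48213"
assert raw_pattern == escaped_pattern         # identical strings, identical regex
print(re.search(raw_pattern, line).group(1))  # -> 48213
```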
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@
- Generic font family for Plotly ([#2368](https://github.com/MultiQC/MultiQC/pull/2368))
- Violin plot: filter Inf values ([#2380](https://github.com/MultiQC/MultiQC/pull/2380))
- Fix use of the `no_violin`/`no_beeswarm` table config flag ([#2376](https://github.com/MultiQC/MultiQC/pull/2376))
+- Refactor: fix unescaped regex strings ([#2384](https://github.com/MultiQC/MultiQC/pull/2384))

### New modules

6 changes: 3 additions & 3 deletions multiqc/modules/rockhopper/rockhopper.py
@@ -116,15 +116,15 @@ def parse_rockhopper_summary(self, f):

# Get total number of reads read by rockhopper
if line.startswith("Total reads:"):
-results["total-reads"] = int(re.search("Total reads:\s*(\d*)", line).group(1))
+results["total-reads"] = int(re.search(r"Total reads:\s*(\d*)", line).group(1))

# Get number of reads aligned to each genome
elif line.startswith("Successfully aligned reads"):
# Get number of aligned reads
-genome_reads = int(re.search("Successfully aligned reads:\s*(\d*)", line).group(1))
+genome_reads = int(re.search(r"Successfully aligned reads:\s*(\d*)", line).group(1))

# Get percent of reads in each category
-stats = [int(re.search("(\d+)\%", subline).group(1)) for subline in lines[i + 1 : i + 10]]
+stats = [int(re.search(r"(\d+)\%", subline).group(1)) for subline in lines[i + 1 : i + 10]]
for name, val in zip(stats_index, stats):
# Convert percentages to true number of reads in each category
results[name] += int(round(val * genome_reads / 100))
8 changes: 4 additions & 4 deletions multiqc/modules/sambamba/markdup.py
@@ -72,10 +72,10 @@ def parse_markdup_stats(self, f):
"""

regexes = {
-"sorted_end_pairs": "sorted (\d+) end pairs",
-"single_ends": "and (\d+) single ends",
-"single_unmatched_pairs": "among them (\d+) unmatched",
-"duplicate_reads": "found (\d+) duplicates",
+"sorted_end_pairs": r"sorted (\d+) end pairs",
+"single_ends": r"and (\d+) single ends",
+"single_unmatched_pairs": r"among them (\d+) unmatched",
+"duplicate_reads": r"found (\d+) duplicates",
}
d = {}
for key, regex in regexes.items():
4 changes: 2 additions & 2 deletions multiqc/modules/samblaster/samblaster.py
@@ -82,8 +82,8 @@ def parse_samblaster(self, f):
r"samblaster: (Removed|Marked)\s+(\d+)\s+of\s+(\d+) \((\d+.\d+)%\)\s*(total)?\s*read ids as duplicates"
)

-input_file_regex = "samblaster: Opening (\S+) for read."
-rgtag_name_regex = "\\\\tID:(\S*?)\\\\t"
+input_file_regex = r"samblaster: Opening (\S+) for read."
+rgtag_name_regex = r"\\\\tID:(\S*?)\\\\t"
data = {}
s_name = None
version = None
6 changes: 3 additions & 3 deletions multiqc/modules/seqyclean/seqyclean.py
@@ -159,9 +159,9 @@ def _clean_keys(self, keys):
"""Given a list of keys, make them easier to read for plot labels"""
cats = {}
for k in keys:
-nice_name = re.sub("([a-z])([A-Z])", "\g<1> \g<2>", k) # CamelCase > Camel Case
-nice_name = re.sub("([PS]E\d?)", "\g<1> ", nice_name) # PE1Label > PE1 Label
-nice_name = re.sub("W([A-Z])", "W \g<1>", nice_name) # WContam > W Contam
+nice_name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", k) # CamelCase > Camel Case
+nice_name = re.sub(r"([PS]E\d?)", r"\g<1> ", nice_name) # PE1Label > PE1 Label
+nice_name = re.sub(r"W([A-Z])", r"W \g<1>", nice_name) # WContam > W Contam
nice_name = nice_name.replace("_", " ") # tags_found > tags found
nice_name = nice_name.title() # Title Case
nice_name = nice_name.replace("Pe", "PE").replace("Se", "SE")
12 changes: 6 additions & 6 deletions multiqc/modules/sickle/sickle.py
@@ -50,13 +50,13 @@ def parse_logs(f):
"""Parse the Sickle standard output"""
regexes = [
# Paired-end
-["reads_paired_kept", re.compile("FastQ paired records kept: ([\d,]+) .*")],
-["reads_single_kept", re.compile("FastQ single records kept: ([\d,]+).*")],
-["reads_paired_discarded", re.compile("FastQ paired records discarded: ([\d,]+) .*")],
-["reads_single_discarded", re.compile("FastQ single records discarded: ([\d,]+) .*")],
+["reads_paired_kept", re.compile(r"FastQ paired records kept: ([\d,]+) .*")],
+["reads_single_kept", re.compile(r"FastQ single records kept: ([\d,]+).*")],
+["reads_paired_discarded", re.compile(r"FastQ paired records discarded: ([\d,]+) .*")],
+["reads_single_discarded", re.compile(r"FastQ single records discarded: ([\d,]+) .*")],
# Single-end
-["reads_single_kept", re.compile("FastQ records kept: ([\d,]+)")],
-["reads_single_discarded", re.compile("FastQ records discarded: ([\d,]+)")],
+["reads_single_kept", re.compile(r"FastQ records kept: ([\d,]+)")],
+["reads_single_discarded", re.compile(r"FastQ records discarded: ([\d,]+)")],
]
data = {}
for line in f.splitlines():
18 changes: 9 additions & 9 deletions multiqc/modules/skewer/skewer.py
@@ -89,16 +89,16 @@ def parse_skewer_log(self, f):
"""Go through log file looking for skewer output"""
fh = f["f"]
regexes = {
-"fq1": "Input file:\s+(.+)",
-"fq2": "Paired file:\s+(.+)",
-"r_processed": "(\d+) read|reads pairs? processed",
-"r_short_filtered": "(\d+) \(\s*\d+.\d+%\) short read",
-"r_empty_filtered": "(\d+) \(\s*\d+.\d+%\) empty read",
-"r_avail": "(\d+) \(\s*\d+.\d+%\) read",
-"r_trimmed": "(\d+) \(\s*\d+.\d+%\) trimmed read",
-"r_untrimmed": "(\d+) \(\s*\d+.\d+%\) untrimmed read",
+"fq1": r"Input file:\s+(.+)",
+"fq2": r"Paired file:\s+(.+)",
+"r_processed": r"(\d+) read|reads pairs? processed",
+"r_short_filtered": r"(\d+) \(\s*\d+.\d+%\) short read",
+"r_empty_filtered": r"(\d+) \(\s*\d+.\d+%\) empty read",
+"r_avail": r"(\d+) \(\s*\d+.\d+%\) read",
+"r_trimmed": r"(\d+) \(\s*\d+.\d+%\) trimmed read",
+"r_untrimmed": r"(\d+) \(\s*\d+.\d+%\) untrimmed read",
}
-regex_hist = "\s?(\d+)\s+(\d+)\s+(\d+.\d+)%"
+regex_hist = r"\s?(\d+)\s+(\d+)\s+(\d+.\d+)%"

data = dict()
for k, v in regexes.items():
8 changes: 4 additions & 4 deletions multiqc/modules/sortmerna/sortmerna.py
@@ -72,7 +72,7 @@ def parse_sortmerna(self, f):

for line in f["f"]:
if "Reads file" in line:
-parts = re.split(r":|=", line)
+parts = re.split(r"[:=]", line)
s_name = self.clean_s_name(parts[-1], f)
self.sortmerna[s_name] = dict()
if "Results:" in line and not post_results_start: # old versions
@@ -83,13 +83,13 @@
continue
if post_results_start and not post_database_start:
if "Total reads =" in line:
-m = re.search("\d+", line)
+m = re.search(r"\d+", line)
if m:
self.sortmerna[s_name]["total"] = int(m.group())
else:
err = True
elif "Total reads passing" in line:
-m = re.search("\d+", line)
+m = re.search(r"\d+", line)
if m:
self.sortmerna[s_name]["rRNA"] = int(m.group())
self.sortmerna[s_name]["rRNA_pct"] = (
@@ -98,7 +98,7 @@
else:
err = True
elif "Total reads failing" in line:
-m = re.search("\d+", line)
+m = re.search(r"\d+", line)
if m:
self.sortmerna[s_name]["non_rRNA"] = int(m.group())
self.sortmerna[s_name]["non_rRNA_pct"] = (
4 changes: 2 additions & 2 deletions multiqc/modules/supernova/supernova.py
@@ -470,9 +470,9 @@ def parse_report(content):
data = {}
# Find the sample ID
sid = ""
-sid_pat = re.compile("- \[(.+)\]")
+sid_pat = re.compile(r"- \[(.+)\]")
# [number, unit, category]
-stat_pat = re.compile("-\s+(\d+\.\d+)\s+(\S+|.)\s+= (.+) =")
+stat_pat = re.compile(r"-\s+(\d+\.\d+)\s+(\S+|.)\s+= (.+) =")

for line in content.splitlines():
sid_m = re.match(sid_pat, line)
