Refactor: fix unescaped regex strings (#2384)
* Fix unescaped regex strings

* [automated] Update CHANGELOG.md

---------

Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io>
vladsavelyev and multiqc-bot committed Feb 26, 2024
1 parent 41b7908 commit cedb55f
Showing 9 changed files with 34 additions and 33 deletions.
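
The change is mechanical but worth spelling out: regex shorthands such as `\d`, `\s`, and `\(` are not valid Python string escape sequences, so leaving pattern literals un-prefixed causes the interpreter to emit deprecation warnings (syntax warnings on recent releases) when the source is compiled, even though the patterns still match because the backslash is passed through unchanged. Below is a minimal sketch of the equivalence, not part of the commit itself; the sample input line is illustrative.

```python
import re

# Minimal sketch (not part of this commit) of the issue being fixed.
# In a plain string literal, "\s" and "\d" are unrecognised escape sequences:
# CPython keeps the backslash, so the regex still works, but compiling the
# source emits a DeprecationWarning (a SyntaxWarning on newer interpreters).
# An r"" raw string spells out the same pattern without the warning.
raw_pattern = r"Total reads:\s*(\d*)"       # style adopted throughout this commit
escaped_pattern = "Total reads:\\s*(\\d*)"  # equivalent form with doubled backslashes

line = "Total reads:   48213"
assert raw_pattern == escaped_pattern         # identical strings, identical regex
print(re.search(raw_pattern, line).group(1))  # -> 48213
```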
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@
- Generic font family for Plotly ([#2368](https://github.com/MultiQC/MultiQC/pull/2368))
- Violin plot: filter Inf values ([#2380](https://github.com/MultiQC/MultiQC/pull/2380))
- Fix use of the `no_violin`/`no_beeswarm` table config flag ([#2376](https://github.com/MultiQC/MultiQC/pull/2376))
+- Refactor: fix unescaped regex strings ([#2384](https://github.com/MultiQC/MultiQC/pull/2384))

### New modules

6 changes: 3 additions & 3 deletions multiqc/modules/rockhopper/rockhopper.py
@@ -116,15 +116,15 @@ def parse_rockhopper_summary(self, f):

# Get total number of reads read by rockhopper
if line.startswith("Total reads:"):
-results["total-reads"] = int(re.search("Total reads:\s*(\d*)", line).group(1))
+results["total-reads"] = int(re.search(r"Total reads:\s*(\d*)", line).group(1))

# Get number of reads aligned to each genome
elif line.startswith("Successfully aligned reads"):
# Get number of aligned reads
-genome_reads = int(re.search("Successfully aligned reads:\s*(\d*)", line).group(1))
+genome_reads = int(re.search(r"Successfully aligned reads:\s*(\d*)", line).group(1))

# Get percent of reads in each category
-stats = [int(re.search("(\d+)\%", subline).group(1)) for subline in lines[i + 1 : i + 10]]
+stats = [int(re.search(r"(\d+)\%", subline).group(1)) for subline in lines[i + 1 : i + 10]]
for name, val in zip(stats_index, stats):
# Convert percentages to true number of reads in each category
results[name] += int(round(val * genome_reads / 100))
8 changes: 4 additions & 4 deletions multiqc/modules/sambamba/markdup.py
@@ -72,10 +72,10 @@ def parse_markdup_stats(self, f):
"""

regexes = {
-"sorted_end_pairs": "sorted (\d+) end pairs",
-"single_ends": "and (\d+) single ends",
-"single_unmatched_pairs": "among them (\d+) unmatched",
-"duplicate_reads": "found (\d+) duplicates",
+"sorted_end_pairs": r"sorted (\d+) end pairs",
+"single_ends": r"and (\d+) single ends",
+"single_unmatched_pairs": r"among them (\d+) unmatched",
+"duplicate_reads": r"found (\d+) duplicates",
}
d = {}
for key, regex in regexes.items():
4 changes: 2 additions & 2 deletions multiqc/modules/samblaster/samblaster.py
@@ -82,8 +82,8 @@ def parse_samblaster(self, f):
r"samblaster: (Removed|Marked)\s+(\d+)\s+of\s+(\d+) \((\d+.\d+)%\)\s*(total)?\s*read ids as duplicates"
)

-input_file_regex = "samblaster: Opening (\S+) for read."
-rgtag_name_regex = "\\\\tID:(\S*?)\\\\t"
+input_file_regex = r"samblaster: Opening (\S+) for read."
+rgtag_name_regex = r"\\\\tID:(\S*?)\\\\t"
data = {}
s_name = None
version = None
6 changes: 3 additions & 3 deletions multiqc/modules/seqyclean/seqyclean.py
@@ -159,9 +159,9 @@ def _clean_keys(self, keys):
"""Given a list of keys, make them easier to read for plot labels"""
cats = {}
for k in keys:
-nice_name = re.sub("([a-z])([A-Z])", "\g<1> \g<2>", k) # CamelCase > Camel Case
-nice_name = re.sub("([PS]E\d?)", "\g<1> ", nice_name) # PE1Label > PE1 Label
-nice_name = re.sub("W([A-Z])", "W \g<1>", nice_name) # WContam > W Contam
+nice_name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", k) # CamelCase > Camel Case
+nice_name = re.sub(r"([PS]E\d?)", r"\g<1> ", nice_name) # PE1Label > PE1 Label
+nice_name = re.sub(r"W([A-Z])", r"W \g<1>", nice_name) # WContam > W Contam
nice_name = nice_name.replace("_", " ") # tags_found > tags found
nice_name = nice_name.title() # Title Case
nice_name = nice_name.replace("Pe", "PE").replace("Se", "SE")
12 changes: 6 additions & 6 deletions multiqc/modules/sickle/sickle.py
@@ -50,13 +50,13 @@ def parse_logs(f):
"""Parse the Sickle standard output"""
regexes = [
# Paired-end
-["reads_paired_kept", re.compile("FastQ paired records kept: ([\d,]+) .*")],
-["reads_single_kept", re.compile("FastQ single records kept: ([\d,]+).*")],
-["reads_paired_discarded", re.compile("FastQ paired records discarded: ([\d,]+) .*")],
-["reads_single_discarded", re.compile("FastQ single records discarded: ([\d,]+) .*")],
+["reads_paired_kept", re.compile(r"FastQ paired records kept: ([\d,]+) .*")],
+["reads_single_kept", re.compile(r"FastQ single records kept: ([\d,]+).*")],
+["reads_paired_discarded", re.compile(r"FastQ paired records discarded: ([\d,]+) .*")],
+["reads_single_discarded", re.compile(r"FastQ single records discarded: ([\d,]+) .*")],
# Single-end
-["reads_single_kept", re.compile("FastQ records kept: ([\d,]+)")],
-["reads_single_discarded", re.compile("FastQ records discarded: ([\d,]+)")],
+["reads_single_kept", re.compile(r"FastQ records kept: ([\d,]+)")],
+["reads_single_discarded", re.compile(r"FastQ records discarded: ([\d,]+)")],
]
data = {}
for line in f.splitlines():
18 changes: 9 additions & 9 deletions multiqc/modules/skewer/skewer.py
@@ -89,16 +89,16 @@ def parse_skewer_log(self, f):
"""Go through log file looking for skewer output"""
fh = f["f"]
regexes = {
-"fq1": "Input file:\s+(.+)",
-"fq2": "Paired file:\s+(.+)",
-"r_processed": "(\d+) read|reads pairs? processed",
-"r_short_filtered": "(\d+) \(\s*\d+.\d+%\) short read",
-"r_empty_filtered": "(\d+) \(\s*\d+.\d+%\) empty read",
-"r_avail": "(\d+) \(\s*\d+.\d+%\) read",
-"r_trimmed": "(\d+) \(\s*\d+.\d+%\) trimmed read",
-"r_untrimmed": "(\d+) \(\s*\d+.\d+%\) untrimmed read",
+"fq1": r"Input file:\s+(.+)",
+"fq2": r"Paired file:\s+(.+)",
+"r_processed": r"(\d+) read|reads pairs? processed",
+"r_short_filtered": r"(\d+) \(\s*\d+.\d+%\) short read",
+"r_empty_filtered": r"(\d+) \(\s*\d+.\d+%\) empty read",
+"r_avail": r"(\d+) \(\s*\d+.\d+%\) read",
+"r_trimmed": r"(\d+) \(\s*\d+.\d+%\) trimmed read",
+"r_untrimmed": r"(\d+) \(\s*\d+.\d+%\) untrimmed read",
}
-regex_hist = "\s?(\d+)\s+(\d+)\s+(\d+.\d+)%"
+regex_hist = r"\s?(\d+)\s+(\d+)\s+(\d+.\d+)%"

data = dict()
for k, v in regexes.items():
8 changes: 4 additions & 4 deletions multiqc/modules/sortmerna/sortmerna.py
@@ -72,7 +72,7 @@ def parse_sortmerna(self, f):

for line in f["f"]:
if "Reads file" in line:
-parts = re.split(r":|=", line)
+parts = re.split(r"[:=]", line)
s_name = self.clean_s_name(parts[-1], f)
self.sortmerna[s_name] = dict()
if "Results:" in line and not post_results_start: # old versions
@@ -83,13 +83,13 @@
continue
if post_results_start and not post_database_start:
if "Total reads =" in line:
-m = re.search("\d+", line)
+m = re.search(r"\d+", line)
if m:
self.sortmerna[s_name]["total"] = int(m.group())
else:
err = True
elif "Total reads passing" in line:
-m = re.search("\d+", line)
+m = re.search(r"\d+", line)
if m:
self.sortmerna[s_name]["rRNA"] = int(m.group())
self.sortmerna[s_name]["rRNA_pct"] = (
@@ -98,7 +98,7 @@
else:
err = True
elif "Total reads failing" in line:
-m = re.search("\d+", line)
+m = re.search(r"\d+", line)
if m:
self.sortmerna[s_name]["non_rRNA"] = int(m.group())
self.sortmerna[s_name]["non_rRNA_pct"] = (
4 changes: 2 additions & 2 deletions multiqc/modules/supernova/supernova.py
@@ -470,9 +470,9 @@ def parse_report(content):
data = {}
# Find the sample ID
sid = ""
-sid_pat = re.compile("- \[(.+)\]")
+sid_pat = re.compile(r"- \[(.+)\]")
# [number, unit, category]
-stat_pat = re.compile("-\s+(\d+\.\d+)\s+(\S+|.)\s+= (.+) =")
+stat_pat = re.compile(r"-\s+(\d+\.\d+)\s+(\S+|.)\s+= (.+) =")

for line in content.splitlines():
sid_m = re.match(sid_pat, line)
