# Basic functions

In [None]:
import re

text = 'the dog barked at the cat chasing the mouse'
regex = 'the'

re.findall(regex, text)

In [None]:
matches = re.finditer(regex, text)
print(matches)

for match in matches:
    start, end = match.start(), match.end()
    print('%d..%d: %s' % (start, end, text[start:end]))
    print(match.group())

In [None]:
# 'search' gives only the first result (or None if the regex isn't found),
# so check for None before calling any method on the match object.
match = re.search(regex, text)
if match is not None:
    print(match.start(), match.end(), match.group())
else:
    print('no match')

In [None]:
re.split(regex, text)

In [None]:
re.sub('the', 'this', text)

# Compiling a regex

In [None]:
# When the same regex is used many times, compile it once and reuse the pattern object.
# (the re module caches recently compiled patterns, so the gain comes mostly from skipping the cache lookup on every call)

regex = 'dog'
text1 = 'the dog barked at the cat chasing the mouse'
text2 = 'the white dog barked at the black dog'

# Option 1
print(re.findall(regex, text1))
print(re.findall(regex, text2))

# Option 2
pattern = re.compile(regex)
print(pattern.findall(text1))
print(pattern.findall(text2))

In [None]:
%timeit re.findall(regex, text1)
%timeit pattern.findall(text1)

In [None]:
# The regex object has the exact same methods
print(pattern.split(text1))
print(pattern.sub('horse', text1))

# Regex syntax

In [None]:
dna_seq = 'AATGCCCTGGCCATTTTTTTTTCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGATTC'

# . matches any single character (except a newline, unless the DOTALL flag is used)
print(re.findall(r'A.C', dna_seq))

# Matches are not overlapping: the next match attempt will start only after the end of the previous match.
print(re.findall(r'A.C', 'AACC'))

In [None]:
# Repetitions

print(re.findall(r'A{3}', dna_seq))

# {} applies to the previous regex, whatever it is:
print(re.findall(r'A..', dna_seq))
print(re.findall(r'A.{2}', dna_seq)) # The last regex is '.', and so that's what {2} applies to.

# Between 1 and 3 repetitions of T
print(re.findall(r'AT{1,3}C', dna_seq))

In [None]:
# + matches any positive number of repetitions
print(re.findall(r'AT+C', dna_seq))

# * matches any number (including zero) of repetitions
print(re.findall(r'AT*C', dna_seq))

# ? matches 0 or 1 repetitions
print(re.findall(r'AT?C', dna_seq))

In [None]:
dna_seqs = 'ATGGATGAATG' + '\n' + 'GGGACT' + '\n' + 'AAAATTT'

# .* matches everything, except new lines.
print(re.findall(r'.*', dna_seqs))
print(re.findall(r'A.*T', dna_seqs))

# We can make '.' also match newlines with the DOTALL flag
print(re.findall(r'G.*', dna_seqs, flags = re.DOTALL)) 

# .* also matches empty strings.
print(re.findall(r'.*', ''))
print(re.findall(r'.*', '\n'))

In [None]:
# Square brackets specify a set of alternative characters (exactly one character from the set is matched)
print(re.findall(r'[AT]GG', dna_seq))
print(re.findall(r'[AT][GC]{2}', dna_seq))
print(re.findall(r'[ATG]+', dna_seq)) # finds any char except C (essentially splits the string using C as a delimiter)

In [None]:
# Can specify a range of chars (by ASCII)

text = r'NC000011.10 (5225466..5227071, complement)'
print(re.findall(r'[a-z]+', text)) # any lower case sequence
print(re.findall(r'[A-Z]+', text))
print(re.findall(r'[0-9]+', text))
print(re.findall(r'[a-zA-Z0-9]+', text))

In [None]:
# Inside square brackets, a leading ^ stands for "not" (any character except the listed ones).
print(re.findall(r'[^C]+', dna_seq))
print(re.findall(r'[ATG]+', dna_seq))

In [None]:
# Use \ to escape special characters (so they are interpreted literally).
text = 'AATA.T'
print(re.findall(r'A.T', text))
print(re.findall(r'A\.T', text))

In [None]:
text = r'AAA\BBB\CCC'
print(text)

# Make sure to write regexes using raw strings (r in front of the string).
print(re.findall(r'.\\.', text))
# Otherwise expect an escaping nightmare (without r, every backslash also has to be escaped for Python).
# The regex engine needs two backslashes to match one literal backslash (the first escapes the second),
# and in a regular Python string literal each of those has to be doubled again, making it four in total.
print(re.findall('.\\\\.', text))

# Special regex chars

In [None]:
song = '''
On the 4th of July 1806
We set sail from the sweet cove of Cork
We were sailing away with a cargo of bricks
For the grand city hall in New York
'Twas a wonderful craft, she was rigged fore-and-aft
And oh, how the wild winds drove her.
She'd got several blasts, she'd 27 masts
And we called her the Irish Rover.

We had 1000000 bales_of_the_best_Sligo_rags
We had 2000000 barrels_of_stones
We had 3000000 sides_of_old_blind_horses_hides,
We had 4000000 barrels_of_bones.
We had 5000000 hogs, we had 6000000 dogs,
7000000 barrels of porter.
We had 8000000 bails_of_old_nanny_goats'_tails,
In the hold of the Irish Rover.
'''

In [None]:
# \s matches any whitespace character (space, tab, newline, ...)
print(re.findall(r'\s+', song))

In [None]:
# \d matches any digit (same as [0-9] for ASCII text; for str patterns it also matches other Unicode digits unless re.ASCII is set)
print(re.findall(r'\d', song))
print(re.findall(r'\d+', song))

In [None]:
# \w matches any alphanumeric character and the underscore (same as [a-zA-Z0-9_] for ASCII text; for str patterns it also matches non-ASCII letters and digits unless re.ASCII is set)
print(re.findall(r'\w+', song))

In [None]:
# \S, \D and \W are the NOT versions.
print(re.findall(r'\S+', song)) # Non-whitespace characters
print(re.findall(r'\D+', song)) # Non-digit characters
print(re.findall(r'\W+', song)) # Non-alphanumeric characters

# Regex is usually greedy

In [None]:
dna_seq = 'ACGTCGGGGGGGACCGT'
print(re.findall(r'A.*T', dna_seq)) # Will try to give the longest string possible, not just ACGT

In [None]:
# Use *?, +?, ?? and {}? for non-greedy versions

print(re.findall(r'A.*T', dna_seq))
print(re.findall(r'A.*?T', dna_seq))

print('*' * 50)

print(re.findall(r'A.?C', dna_seq)) # either one or no chars
print(re.findall(r'A.??C', dna_seq)) # prefer no chars

# Flags

In [None]:
dna_seq = 'ACGTAAtaggtagtcgtAGTGACGTA'
# Regex is case-sensitive by default.
print(re.findall(r'A.T', dna_seq))
print(re.findall(r'A.T', dna_seq, flags = re.IGNORECASE))

In [None]:
# Use the | operator to add multiple flags
dna_seqs = 'ACGTAAtaggtagtcgtAGTGACGTA' + '\n' + 'TAAAAGT'
print(re.findall(r'A.T', dna_seqs, flags = re.IGNORECASE | re.DOTALL))

# Selecting specific parts

In [None]:
dna_seq = 'AATGCCCTGGCCATATTTTTTTTTCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGATTC'

# Use parentheses to select a specific part of the regex
print(re.findall(r'A.{3}T', dna_seq))
print(re.findall(r'A(.{3})T', dna_seq))

In [None]:
# Can select multiple parts
print(re.findall(r'A.{3}T[CG]{2}', dna_seq))
print(re.findall(r'A(.{3})T([CG]{2})', dna_seq))

In [None]:
# Group numbering is 1-based: groups 1, 2, ... are the parenthesized parts, and group 0 is the whole match
for match in re.finditer(r'A(.{3})T([CG]{2})', dna_seq):
    print(match.group(1)) # First group
    print(match.group(2)) # Second group
    print(match.group(0)) # Everything
    print(match.group()) # Same as 0
    print('*' * 10)

In [None]:
# Can also name the groups with: (?P<name>...)

for match in re.finditer(r'A(?P<triple>.{3})T(?P<pair>[CG]{2})', dna_seq):
    print(match.groupdict())

In [None]:
# Alternation with |, and how capturing vs. non-capturing groups change what findall returns.
# | specifies OR
print(re.findall(r'A.T|T.A', dna_seq))

# Can use with parentheses, but then findall only returns what's inside them.
# So the parentheses here have 2 functions: they set the limits of the OR operator, and they restrict the output to what is inside them
print(re.findall(r'(A.T|T.A)G', dna_seq))

# (?:...) is a non-capturing version that overcomes the problem
print(re.findall(r'(?:A.T|T.A)G', dna_seq))

# This can be also useful when we want to apply a special modifier to a complex regex but don't want to capture it
# as a separate group.
print(re.findall(r'AT+', dna_seq))
print(re.findall(r'(?:AT)+', dna_seq)) # the non-capturing parentheses make + apply to the whole 'AT', and the full match is still returned

# Substitution

In [None]:
dna_seq = 'ACGTCGGGGGGGACCGT'
print(re.sub(r'ACG', r'ATG', dna_seq))

In [None]:
print(re.sub(r'\s+', '-', song))

In [None]:
# Reference groups by \1, \2, etc.

# Swap the nucleotides immediately before and after CCG
print(re.sub(r'(.)CCG(.)', r'\2CCG\1', dna_seq))

# Reference named groups by \g<name>
print(re.sub(r'(?P<prefix>.)CCG(?P<suffix>.)', r'\g<suffix>CCG\g<prefix>', dna_seq))

In [None]:
# .sub() is really flexible and can also take a function as a parameter

def switch(match):
    """Swap the two captured groups of a match, unless the first group is 'A'.

    Intended as the replacement callable for ``re.sub``: the text that the
    pattern matched between group 1 and group 2 is copied through unchanged.
    """
    head = match.group(1)
    if head == 'A':
        # Matches that start with 'A' are left exactly as they were.
        return match.group(0)
    # Whatever sits between the end of group 1 and the start of group 2.
    between = match.string[match.end(1):match.start(2)]
    return match.group(2) + between + head

print(re.sub(r'(.)CCG(.)', switch, 'ACCGT TCCGA'))

# Example - parsing GENCODE's extra fields in the GTF file

In [None]:
# Last time we did this parsing with a hand-written function,
# but it can be done with a regex in a single line of code

gtf_extra_fields = r'gene_id "ENSG00000183186.7"; transcript_id "ENST00000332235.7"; gene_type "protein_coding"; gene_name ' + \
        '"C2CD4C"; transcript_type "protein_coding"; transcript_name "C2CD4C-001"; level 2; protein_id "ENSP00000328677.4"; ' + \
        'transcript_support_level "2"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS45890.1"; havana_gene ' + \
        '"OTTHUMG00000180534.3"; havana_transcript "OTTHUMT00000451789.3";'
print(gtf_extra_fields)
print('*' * 50)

print(dict(re.findall(r'(\w+) "(.*?)";', gtf_extra_fields)))
# With the first parentheses we select the key: one or more ('+') alphanumeric/underscore characters ('\w'),
# followed by a space.
# With the second parentheses we select the value: any character ('.') repeated any number of times ('*'),
# non-greedily ('?'), so that each value stops at the first closing '";' instead of running on to the last one.
# Note: unquoted fields (level 2) are not matched, and a repeated key (tag) keeps only its last value in the dict.

# Example - finding a motif

The LexA binding motif:

![Logo](https://upload.wikimedia.org/wikipedia/commons/8/85/LexA_gram_positive_bacteria_sequence_logo.png)

In [None]:
dna_seq = 'AAAAGTGAGTGAGTTAGAACAAATGTTCGAGATGAGTGAGTGGGGGATGA'
motif_regex = r'.[AC]GAA[CA][AG]..[TC][GTA]TT[CT][GT].'

print(re.findall(motif_regex, dna_seq))

# Limitations

Limitations of regex:
1. Matches are not overlapping.
2. Can't parse recursive formats (XML, JSON, etc.).
3. Regexes can quickly become "write only" (hard to understand for others)
    * Use `re.VERBOSE` and `(?#...)` when appropriate.
    * There are tools to examine cryptic or problematic regexes, such as https://regex101.com/

When parsers are available, prefer using them rather than writing your own.

Some people, when confronted with a problem, think "I know, I'll use regular expressions." Now they have two problems.

    -- Jamie Zawinski