# Advanced Regex

## Regular Expression review

- A powerful way to match text

In [1]:
import pandas as pd
import numpy as np
import re

https://regexper.com/

In [2]:
# Sample sentence searched by the literal-string and set examples below.
text = 'That person wears marvelous trousers.'

### `Literal strings` vs `sets`

In [3]:
# Literal pattern: with no metacharacters, 'person' matches exactly that
# sequence of characters wherever it occurs in the text.
pattern = "person"
re.compile(pattern).findall(text)

['person']

In [4]:
# A literal must match in full: the extra trailing 'a' means nothing is found.
pattern = "persona"
re.compile(pattern).findall(text)

[]

In [5]:
# Substitute every occurrence of the literal pattern with 'man'.
pattern = "person"
re.compile(pattern).sub("man", text)

'That man wears marvelous trousers.'

In [6]:
# Sets: square brackets match a single character that is any one of the
# listed characters (p, e, r, s, o or n), so every hit is one letter long.
pattern = "[person]"
print(re.compile(pattern).findall(text))

['p', 'e', 'r', 's', 'o', 'n', 'e', 'r', 's', 'r', 'e', 'o', 's', 'r', 'o', 's', 'e', 'r', 's']


In [7]:
# Ten different spellings of the same city name, separated by single spaces.
text = 'São Paulo Sao Paulo Sáo Paulo Sun Paulo seu paulo san paolo sao paulo são paolo sAo Paolo sao_paulo'

# One set per character position that varies between spellings.
# The second set lists the accepted vowel in lower case (a, ã, á, à, â) and
# upper case (A, Ã, Á, À, Â), plus the 'e' and 'u' typos seen in the sample.
# The original listed lower-case 'â' twice; the second one is now 'Â'.
pattern = '[Ss][ãaáàâAÃÁÀÂeu][oun][ _][Pp]a[uob]lo'
print(re.sub(pattern, 'São Paulo\n', text))

São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo



In [8]:
text = 'Is it spelled gray or grey?'

# The set [ae] accepts either vowel in the third position, so both spellings
# are found by a single pattern.
pattern = "gr[ae]y"
re.compile(pattern).findall(text)

['gray', 'grey']

> So anything within brackets `[ ]` is considered a `set` in RegEx: a set of characters, any one of which may match at that position.

## Since it is a set, you can look for complete sets

For example: The set of upper-case letters from A to C.

In [9]:
text = 'This is an A and B conversation, so C your way out of it, or Even F.'

# A dash inside a set denotes a range: any single upper-case letter from A to C.
pattern = "[A-C]"
re.compile(pattern).findall(text)

['A', 'B', 'C']

In [10]:
# Widen the range to cover every upper-case letter from A to Z.
pattern = "[A-Z]"
re.compile(pattern).findall(text)

['T', 'A', 'B', 'C', 'E', 'F']

In [11]:
text = "I'm not going to 0A the party because 1) Karen is going, 2) I don't like her, and 3) 3B I already have a headache."

# Ranges work for digits too: any single character from 1 to 3 (0 is excluded).
pattern = "[1-3]"
re.compile(pattern).findall(text)

['1', '2', '3', '3']

In [12]:
# Any single digit, 0 through 9.
pattern = "[0-9]"
re.compile(pattern).findall(text)

['0', '1', '2', '3', '3']

In [13]:
# Two ranges inside ONE set: a single character that is a digit or an
# upper-case letter.
pattern = "[0-9A-Z]"
re.compile(pattern).findall(text)

['I', '0', 'A', '1', 'K', '2', 'I', '3', '3', 'B', 'I']

In [14]:
# Long-hand form of the range-based set '[0-9A-Z]' used above, with every
# member written out; kept commented out for reference only.
# pattern = '[0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ]'
# re.findall(pattern, text)

In [15]:
# Two consecutive sets: a digit immediately followed by an upper-case letter,
# so every hit is two characters long.
pattern = "[0-9][A-Z]"
re.compile(pattern).findall(text)

['0A', '3B']

Some useful sets: 

* [a-z]: Any lowercase letter between a and z.
* [A-Z]: Any uppercase letter between A and Z.
* [0-9]: Any numeric character between 0 and 9.

In [16]:
# A leading ^ negates the set: any single character that is NOT a digit,
# a space, or a lower-case letter.
pattern = "[^0-9 a-z]"
re.compile(pattern).findall(text)

['I', "'", 'A', ')', 'K', ',', ')', 'I', "'", ',', ')', 'B', 'I', '.']

# Meta characters - They mean something different from the literal character they are written with.

* `.` : Match **any character** except newline (`\n`)
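* `^` : If used as the first character within a `set`, negates the set (similar to `~` in python)
> Careful, this character also represents another thing: If used <u>outside a set</u>, it represents `match if at the beginning of the line`. With Python's default flags that means only the beginning of the whole string; pass `re.MULTILINE` to match at the start of every line.
* `$` : Match if at end of the line. With Python's default flags that means only the end of the whole string (or just before a final newline); pass `re.MULTILINE` to match at the end of every line.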
* `|` : "OR" operator

## OR

In [17]:
# The same name twice, differing only in the case of the first letter.
text = "Andre andre"

In [18]:
# Set form: a single character that is either 'A' or 'a'.
pattern = "[Aa]"
re.compile(pattern).findall(text)

['A', 'a']

In [19]:
# Alternation form of the same idea: 'A' OR 'a'.
pattern = "A|a"
re.compile(pattern).findall(text)

['A', 'a']

In [20]:
# Three short lines; the leading and trailing newlines are part of the string.
text = """
I like penguins
I like lions
I like penguins and lions
"""

# Alternation between two whole words: each hit is either 'penguins' or 'lions'.
pattern = "penguins|lions"
re.compile(pattern).findall(text)

['penguins', 'lions', 'penguins', 'lions']

In [21]:
# Same three lines as the alternation example.
text = """
I like penguins
I like lions
I like penguins and lions
"""

# Contrast with 'penguins|lions': inside brackets the letters form a SET, so
# this matches one character at a time, not whole words.
pattern = "[penguinslions]"
re.compile(pattern).findall(text)

['l',
 'i',
 'e',
 'p',
 'e',
 'n',
 'g',
 'u',
 'i',
 'n',
 's',
 'l',
 'i',
 'e',
 'l',
 'i',
 'o',
 'n',
 's',
 'l',
 'i',
 'e',
 'p',
 'e',
 'n',
 'g',
 'u',
 'i',
 'n',
 's',
 'n',
 'l',
 'i',
 'o',
 'n',
 's']

## Match any character

In [22]:
# Two-line text: the newline between the sentences is part of the string.
text = '''My boss asked me to turn in my TPS reports. 
I told him they were done, but they are not.'''

# '.' matches any character except the newline, so the newline is added as an
# explicit alternative; together they match every character of the text.
pattern = ".|\n"
print(re.compile(pattern).findall(text))

['M', 'y', ' ', 'b', 'o', 's', 's', ' ', 'a', 's', 'k', 'e', 'd', ' ', 'm', 'e', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'i', 'n', ' ', 'm', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'e', 'p', 'o', 'r', 't', 's', '.', ' ', '\n', 'I', ' ', 't', 'o', 'l', 'd', ' ', 'h', 'i', 'm', ' ', 't', 'h', 'e', 'y', ' ', 'w', 'e', 'r', 'e', ' ', 'd', 'o', 'n', 'e', ',', ' ', 'b', 'u', 't', ' ', 't', 'h', 'e', 'y', ' ', 'a', 'r', 'e', ' ', 'n', 'o', 't', '.']


## Match everything not in specific set

In [23]:
# Two-line sample text (with an embedded newline) for the negated-set example.
text = """My boss asked me to turn in my TPS reports. 
I told him they were done, but they are not."""

In [24]:
# Negated range: every single character that is NOT a lower-case letter
# between a and m (spaces, punctuation and the newline are matched too).
pattern = "[^a-m]"
print(re.compile(pattern).findall(text))

['M', 'y', ' ', 'o', 's', 's', ' ', 's', ' ', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'n', ' ', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'p', 'o', 'r', 't', 's', '.', ' ', '\n', 'I', ' ', 't', 'o', ' ', ' ', 't', 'y', ' ', 'w', 'r', ' ', 'o', 'n', ',', ' ', 'u', 't', ' ', 't', 'y', ' ', 'r', ' ', 'n', 'o', 't', '.']


## Match sentences `beginning with pattern`

In [25]:
# Two-line sample text for the `^` examples: the first line starts with
# 'My boss', the second with 'The boss'.
text = '''My boss asked me to turn in my TPS reports. 
The boss told him they were done, but they are not.'''

In [26]:
# Outside a set, ^ anchors the match at the beginning of the string.
pattern = "^My boss"
print(re.compile(pattern).findall(text))

['My boss']


In [27]:
# 'The boss' opens the SECOND line. No flags are passed, so ^ only anchors at
# the start of the whole string and nothing is found.
pattern = "^The boss"
print(re.compile(pattern).findall(text))

[]


In [28]:
# 'turn' occurs in the middle of a line, never at the start: no match.
pattern = "^turn"
print(re.compile(pattern).findall(text))

[]


In [29]:
# Two-line sample text for the `$` examples: 'reports.' ends the first line,
# 'are not.' ends the whole string.
text = '''My boss asked me to turn in my TPS reports. 
The boss told him they were done, but they are not.'''

In [30]:
# No flags are passed, so `$` only anchors at the end of the whole string;
# 'reports.' sits on the first line and is therefore not found.
# The dot is escaped so that it stands for a literal full stop rather than
# "any character", consistent with the next example.
pattern = r'reports\.$'
print(re.findall(pattern, text))

[]


In [31]:
# 'are not.' is the very end of the string, so `$` matches here.
# Raw string (r'...'): the backslash in \. reaches the regex engine untouched
# instead of being treated as an (invalid) Python string escape.
pattern = r'are not\.$'
print(re.findall(pattern, text))

['are not.']


## Character classes

* `\d`: numeric characters
* `\w`: alphanumeric characters (letters and digits) plus the underscore `_`
* `\s`: whitespace characters (spaces, tabs, newlines)
* `\D`: not numeric characters

In [32]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.1 áéóãà'

# \d matches one numeric character at a time.
# Raw string (r'...'): keeps the backslash for the regex engine instead of
# relying on Python passing the invalid string escape '\d' through.
pattern = r'\d'
print(re.findall(pattern, text))

['3', '2', '3', '2', '1', '3', '3', '1']


In [33]:
# \D is the negated digit class: every character that is not numeric.
# It is shorthand for the negated set shown in the commented-out line.
# pattern = r'[^\d]'
pattern = r'\D'  # raw string so the backslash is passed to the regex engine as-is

print(re.findall(pattern, text))

['A', 'n', 'd', 'r', 'e', ' ', 'a', 'n', 'd', 'r', 'e', ' ', 'a', 'o', 'i', 'j', 'o', ' ', '(', ' ', ' ', '$', ' ', 'p', ' ', 'i', 'o', ' ', 'x', ' ', '-', 'o', ' ', '=', ' ', ' ', ' ', '™', '¡', '¡', '™', '£', '¡', 'Ω', 'å', ' ', '.', ' ', 'á', 'é', 'ó', 'ã', 'à']


# Quantifiers 

* `*`: Matches the previous character (or set/group) 0 or more times
* `+`: Matches the previous character (or set/group) 1 or more times
* `?`: Matches the previous character (or set/group) 0 or 1 times (optional)
* `{}`: Matches the previous character (or set/group) however many times specified within:
    * `{n}` : Exactly n times
    * `{n,}` : At least n times
    * `{n,m}` : Between n and m times

## \d* --> Matches any numeric character that appears 0 or more times.

In [34]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.11648 áéóãà 1'

# \d* also succeeds with ZERO digits, so findall reports an empty string at
# every position where no digit starts, in between the real runs of digits.
# Raw string (r'...'): keeps the backslash for the regex engine.
pattern = r'\d*'
print(re.findall(pattern, text))

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '3232', '', '13', '', '', '', '', '', '', '', '', '', '', '3', '', '11648', '', '', '', '', '', '', '', '1', '']


In [35]:
## \d+ --> Matches a run of one or more consecutive numeric characters.

In [36]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.1 áéóãà'

# \d+ needs at least one digit, so only real runs of digits are returned
# (no empty strings). '3.1' is split in two because '.' is not a digit.
# Raw string (r'...'): keeps the backslash for the regex engine.
pattern = r'\d+'
print(re.findall(pattern, text))

['3232', '13', '3', '1']


In [37]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.1 áéóãà'

# Integers and decimals: one or more digits, optionally followed by a dot and
# more digits. The decimal part is wrapped in a NON-capturing group (?:...) so
# that findall still returns the whole number.
# The previous form, \d+\.?\d+, required at least two digits in total and
# therefore skipped single-digit numbers such as '7'.
pattern = r'\d+(?:\.\d+)?'
print(re.findall(pattern, text))

['3232', '13', '3.1']


## Application of previous example of `$` using one of the most useful quantifiers `*`

In [38]:
# Two-line sample text; both lines start with 'My boss' and the string ends
# with 'are not.'.
text = '''My boss asked me to turn in my TPS reports. 
My boss told him they were done, but they are not.'''

In [39]:
# Starting point: only the literal words at the very end of the string.
# Raw string (r'...'): the backslash in \. is passed to the regex engine as-is.
pattern = r'are not\.$'
print(re.findall(pattern, text))

['are not.']


In [40]:
# Prefixing .* pulls in everything before 'are not.' on the same line
# ('.' does not match the newline, so the match cannot reach the first line).
# Raw string (r'...'): the backslash in \. is passed to the regex engine as-is.
pattern = r'.*are not\.$'
print(re.findall(pattern, text))

['My boss told him they were done, but they are not.']


In [41]:
# Matching the newline explicitly lets the pattern span both lines:
# a non-empty first line, the newline, then the rest up to 'are not.'.
# Raw string (r'...'): \n and \. are interpreted by the regex engine, which
# treats \n as a newline just like the Python string escape did.
pattern = r'.+\n.*are not\.$'
print(re.findall(pattern, text))

['My boss asked me to turn in my TPS reports. \nMy boss told him they were done, but they are not.']


In [42]:
# Echo the string itself so that the embedded newline shows up as \n.
text

'My boss asked me to turn in my TPS reports. \nMy boss told him they were done, but they are not.'

In [43]:
# Start the match at a comma and run to 'are not.' at the end of the string.
# Raw string (r'...'): the backslash in \. is passed to the regex engine as-is.
pattern = r',.*are not\.$'
print(re.findall(pattern, text))

[', but they are not.']


In [44]:
# Variant of the sample text with several commas on the last line.
text = '''My boss asked, me to turn in my TPS reports. 
My boss told, him they were done, but they, are not.'''

In [45]:
# With several commas on the last line, the match starts at the FIRST one:
# the regex takes the leftmost possible start and .* then runs greedily.
# The final dot is escaped (as in the previous examples) so that it only
# matches a literal full stop rather than any character.
pattern = r',.*are not\.$'
print(re.findall(pattern, text))

[', him they were done, but they, are not.']


In [46]:
# Variant of the sample text with two parenthesised fragments on the last line.
text = '''My boss asked, me to turn in my TPS reports. 
My boss (told him they) were done (but they) are not.'''

In [47]:
# Literal parentheses (escaped with a backslash) around any run of word
# characters and spaces.
# Raw string (r'...'): \( \w and \) are passed to the regex engine as-is
# instead of being treated as (invalid) Python string escapes.
pattern = r'\([\w ]*\)'
print(re.findall(pattern, text))

['(told him they)', '(but they)']


In [48]:
# Same result with a lazy quantifier: .*? takes as few characters as possible,
# so each match stops at the first closing parenthesis.
# Raw string (r'...'): \( and \) are passed to the regex engine as-is.
pattern = r'\(.*?\)'
print(re.findall(pattern, text))

['(told him they)', '(but they)']


In [49]:
# Same parenthesised sample text, re-declared for the capturing-group example.
text = '''My boss asked, me to turn in my TPS reports. 
My boss (told him they) were done (but they) are not.'''

In [50]:
# Escaped parentheses \( \) match the literal characters; the unescaped pair
# in between is a capturing group, so findall returns only the inner text.
# Raw string (r'...'): the backslashes are passed to the regex engine as-is.
pattern = r'\((.*?)\)'
print(re.findall(pattern, text))

['told him they', 'but they']


# Capturing group

What if I wanted to capture only the text up to a delimiter such as the comma (`,`), but without including the delimiter itself?

I would have to use a capturing group to specify exactly which part of the match I want to capture.

In [51]:
# Variant of the sample text with two fragments delimited by dashes.
text = '''My boss asked, me to turn in my TPS reports. 
My boss -told him they- were done -but they- are not.'''

In [52]:
# Without a group, each hit includes the two delimiting dashes.
pattern = "-.*?-"
print(re.compile(pattern).findall(text))

['-told him they-', '-but they-']


In [53]:
# With a capturing group, findall returns only what is inside the
# parentheses, i.e. the text between the dashes.
pattern = "-(.*?)-"
print(re.compile(pattern).findall(text))

['told him they', 'but they']


In [54]:
# Variant of the sample text with several commas on the last line, used to
# compare two ways of capturing comma-delimited fragments.
text = '''My boss asked, me to turn in my TPS reports. 
My boss told, him they were done, but they, are ,not.'''

In [55]:
# Capture the text between a pair of commas. Both commas are consumed by each
# match, so the fragment that follows a closing comma is skipped.
pattern = ",(.*?),"
print(re.compile(pattern).findall(text))

[' him they were done', ' are ']


In [56]:
# Capture the run of word characters and spaces that follows EACH comma.
# Only the opening comma is consumed, so no fragment is skipped.
# Raw string (r'...'): \w is passed to the regex engine as-is instead of being
# treated as an (invalid) Python string escape.
pattern = r',([\w ]*)'
print(re.findall(pattern, text))

[' me to turn in my TPS reports', ' him they were done', ' but they', ' are ', 'not']


In [57]:
text = 'TerraPower, a nuclear-energy company founded by Bill Gates, is unlikely to follow through on building a demonstration reactor in China, due largely to the Trump administration’s crackdown on the country.'

# A capital letter followed by one or more letters of either case:
# every capitalised word, one hit per word.
pattern = "[A-Z][a-zA-Z]+"
print(re.compile(pattern).findall(text))

['TerraPower', 'Bill', 'Gates', 'China', 'Trump']


In [58]:
text = 'TerraPower, a nuclear-energy company founded by Bill Gates, is unlikely to follow through on building a demonstration reactor in China, due largely to the Trump administration’s crackdown on the country.'

# Two capitalised chunks in a row, separated by an optional single space:
# finds two-part names only ('TerraPower', 'Bill Gates').
pattern = "[A-Z][a-z]+ ?[A-Z][a-z]+"
print(re.compile(pattern).findall(text))

['TerraPower', 'Bill Gates']


In [59]:
# Two alternatives, each in its own capturing group. Alternatives are tried
# left to right, so the two-part form gets the first chance to match.
pattern = (
    '([A-Z][a-zA-Z]+ ?[A-Z][a-zA-Z]+)'  # group 1: two capitalised chunks, optional space between
    '|'
    '([A-Z][a-z]+)'                      # group 2: a single capitalised word
)

In [60]:
# With two capturing groups, findall returns one (group 1, group 2) tuple per
# match; the group that did not take part in the match is an empty string.
print(re.compile(pattern).findall(text))

[('TerraPower', ''), ('Bill Gates', ''), ('', 'China'), ('', 'Trump')]


In [61]:
# NOTE: a notebook cell only echoes the value of its final expression, so the
# first list (group 1 of the matches whose group 2 is empty) is built and then
# discarded; only the second list (group 2 of the matches whose group 1 is
# empty) is displayed.
[combined for combined, simple in re.findall(pattern, text) if simple == '']
[simple for combined, simple in re.findall(pattern, text) if combined == '']

['China', 'Trump']

In [62]:
# Split the (group 1, group 2) tuples into two lists, keeping only the
# non-empty entry of each tuple.
simple_names = [simple for _combined, simple in re.findall(pattern, text) if simple]
combined_names = [combined for combined, _simple in re.findall(pattern, text) if combined]

In [63]:
# Show both lists, one per line: single-word names first, then two-part names.
print(simple_names, combined_names, sep='\n')

['China', 'Trump']
['TerraPower', 'Bill Gates']


In [64]:
# Original order of the alternatives: two-part names first, single words second.
pattern = (
    '([A-Z][a-zA-Z]+ ?[A-Z][a-zA-Z]+)'  # group 1: two capitalised chunks, optional space between
    '|'
    '([A-Z][a-z]+)'                      # group 2: a single capitalised word
)

In [65]:
# Same two alternatives with their order swapped. Alternatives are tried left
# to right, so the single-word form now wins wherever it can match.
pattern = (
    '([A-Z][a-z]+)'                      # group 1: a single capitalised word
    '|'
    '([A-Z][a-zA-Z]+ ?[A-Z][a-zA-Z]+)'  # group 2: two capitalised chunks, optional space between
)

In [66]:
# Every match now comes from the first alternative, so group 2 is always empty
# and two-part names are split into separate words.
print(re.compile(pattern).findall(text))

[('Terra', ''), ('Power', ''), ('Bill', ''), ('Gates', ''), ('China', ''), ('Trump', '')]


# Important Regex Concept: Greediness


What will this match?

In [67]:
text = "You are yelling! So I will yell too! Let me yell!."

# Greedy: .* grabs as much as it can, so the match runs up to the LAST
# exclamation point and the whole text comes back as a single hit.
pattern = '.*!'
print(re.compile(pattern).findall(text))

['You are yelling! So I will yell too! Let me yell!']


In [68]:
# Lazy: adding ? after the quantifier makes .* take as little as possible, so
# each match stops at the NEXT exclamation point.
pattern = '.*?!'
re.compile(pattern).findall(text)

['You are yelling!', ' So I will yell too!', ' Let me yell!']

In [69]:
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

# {2} applies to the preceding 'w' only: an 'a' followed by exactly two 'w'.
# Longer runs still match, but only their first three characters are returned.
pattern = 'aw{2}'
print(re.compile(pattern).findall(text))

['aww', 'aww', 'aww', 'aww']


In [70]:
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

# {2,} means "at least two": each run of 'w' is matched in full.
pattern = 'aw{2,}'
print(re.compile(pattern).findall(text))

['aww', 'awww', 'awwww', 'awwwww']


In [71]:
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

# {2,3} means "between two and three": runs longer than three 'w' are cut
# off after the third one.
pattern = 'aw{2,3}'
print(re.compile(pattern).findall(text))

['aww', 'awww', 'awww', 'awww']


In [72]:
text = 'Ooooooiiiii gente'

# Two spellings of the same pattern: explicit {min,} counts first, then the
# equivalent shorthand quantifiers (+ is {1,} and * is {0,}). The second
# assignment replaces the first, so the shorthand form is the one searched.
pattern = '[Oo]{1,}i{1,}e{0,}'
pattern = '[Oo]+i+e*'
print(re.compile(pattern).findall(text))

['Ooooooiiiii']


In [73]:
text = "If you tell the truth 12504 time, you don't have to remember anything 2 times."

# \w+ matches runs of word characters, so numbers are returned as "words" and
# the apostrophe splits "don't" into 'don' and 't'.
# Raw string (r'...'): \w is passed to the regex engine as-is instead of being
# treated as an (invalid) Python string escape.
pattern = r'\w+'
print(re.findall(pattern, text))

['If', 'you', 'tell', 'the', 'truth', '12504', 'time', 'you', 'don', 't', 'have', 'to', 'remember', 'anything', '2', 'times']


In [74]:
## word length
# Runs of at least four letters; digits are not in the set, so numbers are
# left out.
pattern = "[A-Za-z]{4,}"
print(re.compile(pattern).findall(text))

['tell', 'truth', 'time', 'have', 'remember', 'anything', 'times']


https://phoneregex.com/

In [75]:
phone='(61) 9.94536852'
pattern='(^|\()?\s*(\d{2})\s*(\s|\))*(9?\.?\d{4})(\s|-)?(\d{4})($|\n)'
print(re.findall(pattern,phone))

[('(', '61', ' ', '9.9453', '', '6852', '')]


In [76]:
# Notes on phone-number variations (presumably cases the pattern above should cover -- TODO confirm):
# dot after the 9
# hyphen between the area code (DDD) and the number
# 55 (presumably the country code -- TODO confirm)
# parentheses around the area code (DDD)
