# REGular EXpression - INTRO TO REGEX

In [1]:
import re

1. `re.search`: Scans a string and returns a match object for the first occurrence of an expression, or `None` if there is no match.
2. `re.findall`: Finds all instances of an expression in a string and returns them as a list.
3. `re.split`: Splits a string based on a specified delimiter.
4. `re.sub`: Substitutes a string/substring with another.

Website to visually see what your regular expressions look like: https://regexper.com/

In [31]:
# A plain pattern matches its characters literally, in exactly this order.
text = "My neighbor, Mr. Rogers, has 5 dogs."
pattern = "neigh"

re.findall(pattern, text)

['neigh']

In [32]:
# findall reports every non-overlapping occurrence, even inside longer words.
text = "My neighbor, Mr. neighrogers, has 5 dogs."
pattern = "neigh"

re.findall(pattern, text)

['neigh', 'neigh']

In [33]:
# With no occurrence in the string, findall returns an empty list.
text = "My , Mr. Rogers, has 5 dogs."
pattern = "neigh"

re.findall(pattern, text)

[]

## Introducing Sets

In [34]:
# Square brackets define a set: each match is ONE character out of n, e, i, g, h.
text = "My neighbor, Mr. Rogers, has 5 dogs."
pattern = "[neigh]"

re.findall(pattern, text)

['n', 'e', 'i', 'g', 'h', 'g', 'e', 'h', 'g']

In [35]:
text = "My neighbor, Mr. Rogers, has 5 rogers."

# The set [Rr] accepts either case for the first letter,
# so the pattern matches both 'Rogers' and 'rogers'.
pattern = "[Rr]ogers"

re.findall(pattern, text)

['Rogers', 'rogers']

In [38]:
# Without regex, each capitalization needs its own str.replace call.
# Both calls must insert the same replacement ('Guilherme').
text.replace('Rogers', 'Guilherme').replace('rogers', 'Guilherme')

'My neighbor, Mr. Andre, has 5 Andre.'

In [36]:
text = "My neighbor, Mr. Rogers, has 5 rogers."

# One pattern covers both capitalizations, so a single substitution is enough.
pattern = "[Rr]ogers"

re.sub(pattern, "Guilherme", text)

'My neighbor, Mr. Andre, has 5 Andre.'

In [51]:
# Several spellings of the same city name, separated by spaces.
text = "Sáo Paulo São Paulo Sao Paulo Sao Paolo San Pablo sao paulo sao Paulo são Paulo sao-paulo são paulo São Paulo Saon Paulo"

# One set per position that is allowed to vary; the separator may be a space or a hyphen.
pattern = "[Ss][áãa][on][ -][Pp]a[uob]lo"

re.findall(pattern, text)

['Sáo Paulo',
 'São Paulo',
 'Sao Paulo',
 'Sao Paolo',
 'San Pablo',
 'sao paulo',
 'sao Paulo',
 'são Paulo',
 'sao-paulo',
 'são paulo',
 'São Paulo']

In [52]:
# Display the current value of `text`.
text

'Sáo Paulo São Paulo Sao Paulo Sao Paolo San Pablo sao paulo sao Paulo são Paulo sao-paulo são paulo São Paulo Saon Paulo'

In [49]:
# Rewrite every spelling that `pattern` matches to the canonical form.
re.sub(pattern, "São Paulo", text)

'São Paulo São Paulo São Paulo São Paulo São Paulo São Paulo São Paulo São Paulo São Paulo São Paulo São Paulo'

# Pattern sets:

Range

1. [a-z]: Any lowercase letter between a and z.
2. [A-Z]: Any uppercase letter between A and Z.
3. [0-9]: Any numeric character between 0 and 9.

In [53]:
text = "My neighbor, Mr. Rogers, has 5 rogers."
# A range inside a set: any single lowercase letter from a through e.
pattern = "[a-e]"

re.findall(pattern, text)

['e', 'b', 'e', 'a', 'e']

In [56]:
# Every uppercase letter from A through Z.
re.findall("[A-Z]", text)

['M', 'M', 'R']

In [55]:
# A narrower range: only uppercase letters from A through N.
re.findall("[A-N]", text)

['M', 'M']

In [57]:
# Listing every letter explicitly; equivalent to the range [e-o].
re.findall("[efghijklmno]", text)

['n', 'e', 'i', 'g', 'h', 'o', 'o', 'g', 'e', 'h', 'o', 'g', 'e']

In [58]:
# The same set written compactly as a range.
re.findall("[e-o]", text)

['n', 'e', 'i', 'g', 'h', 'o', 'o', 'g', 'e', 'h', 'o', 'g', 'e']

In [61]:
# Any single digit from 0 through 9.
re.findall("[0-9]", text)

['5']

In [62]:
# Ranges can be placed side by side in one set:
# uppercase letters, lowercase letters and digits.
re.findall("[A-Za-z0-9]", text)

['M',
 'y',
 'n',
 'e',
 'i',
 'g',
 'h',
 'b',
 'o',
 'r',
 'M',
 'r',
 'R',
 'o',
 'g',
 'e',
 'r',
 's',
 'h',
 'a',
 's',
 '5',
 'r',
 'o',
 'g',
 'e',
 'r',
 's']

In [64]:
# Display the current value of `text`.
text

'My neighbor, Mr. Rogers, has 5 rogers.'

In [65]:
# Inside a set the comma is a literal character, not a separator,
# so this matches uppercase letters, commas and digits.
re.findall("[A-Z,0-9]", text)

['M', ',', 'M', 'R', ',', '5']

The opposite: 
- `^` as the first character inside a set (`[^...]`) negates it: the set matches any single character that is **not** listed.

In [66]:
# Negated set: any single character that is NOT a lowercase letter a-z.
pattern = "[^a-z]"
re.findall(pattern, text)

['M', ' ', ',', ' ', 'M', '.', ' ', 'R', ',', ' ', ' ', '5', ' ', '.']

In [67]:
# Several ranges can be combined inside one negated set: here everything that
# is not a letter, a digit or a space, i.e. the punctuation.
# `\s` matches any whitespace character (space, tab, newline, ...), not only
# the space, so r'[^a-zA-Z0-9\s]' is the more general way to write this.
# It must be a raw string: '\s' is not a valid Python string escape.
pattern = r'[^a-zA-Z0-9 ]'
re.findall(pattern, text)


[',', '.', ',', '.']

In [68]:
# A set containing only the space character.
re.findall("[ ]", text)

[' ', ' ', ' ', ' ', ' ', ' ']

In [69]:
# `\s` matches any whitespace character. Raw string: '\s' is not a valid
# Python string escape and triggers a warning in a normal string literal.
re.findall(r'[\s]', text)

[' ', ' ', ' ', ' ', ' ', ' ']

# Meta Characters:

Characters that have a special meaning in a pattern instead of matching themselves literally.

1. `\w`: Any word character: letters, digits, and the underscore (`_`).
2. `\d`: Any numeric character (digit).
3. `.` : Any character except newline (\n).

In [78]:
# Sample with punctuation, digits and non-ASCII letters for the metacharacter demos.
text = "My neighbor, Mr. Rogers, ] has 5 - dogs 10. α π"

In [79]:
# `\w` matches one word character at a time.
# Raw string: '\w' is not a valid Python string escape.
pattern = r'\w'
print(re.findall(pattern, text))

['M', 'y', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', 'M', 'r', 'R', 'o', 'g', 'e', 'r', 's', 'h', 'a', 's', '5', 'd', 'o', 'g', 's', '1', '0', 'α', 'π']


In [80]:
# `\d` matches one digit at a time. For the ASCII digits in this text it is
# equivalent to the set '[0-9]'.
# Raw string: '\d' is not a valid Python string escape.
print(re.findall(r'\d', text))

['5', '1', '0']


In [81]:
# The dot matches every single character except a newline.
print(re.findall(".", text))

['M', 'y', ' ', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', ',', ' ', 'M', 'r', '.', ' ', 'R', 'o', 'g', 'e', 'r', 's', ',', ' ', ']', ' ', 'h', 'a', 's', ' ', '5', ' ', '-', ' ', 'd', 'o', 'g', 's', ' ', '1', '0', '.', ' ', 'α', ' ', 'π']


## Quantifiers

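1. `*`: 0 or more repetitions of the preceding element
2. `?`: 0 or 1 repetition of the preceding element (makes it optional)
3. `+`: 1 or more repetitions of the preceding element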

In [82]:
# Single-line sample, so an ordinary string literal is sufficient.
text = "My neighbor, Mr. Rogers, has 5 - dogs and 100 cats and β sheeps."

In [84]:
# `*` also accepts zero repetitions, so an empty match is reported at every
# position where no digit starts.
# Raw string: '\d' is not a valid Python string escape.
print(re.findall(r'\d*', text))

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '5', '', '', '', '', '', '', '', '', '', '', '', '', '100', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [85]:
# `+` requires at least one digit, so only the actual numbers are returned.
# Raw string: '\d' is not a valid Python string escape.
print(re.findall(r'\d+', text))

['5', '100']


In [87]:
# A hyphen followed by any run of characters: everything from the hyphen to the end of the line.
print(re.findall("-.*", text))

['- dogs and 100 cats and β sheeps.']


In [88]:
# One or more word characters in a row: splits the text into words and numbers.
# Raw string: '\w' is not a valid Python string escape.
print(re.findall(r'\w+', text))

['My', 'neighbor', 'Mr', 'Rogers', 'has', '5', 'dogs', 'and', '100', 'cats', 'and', 'β', 'sheeps']


In [111]:
# Sample containing a name followed by numbers in several formats.
my_string = (
    'Andre Park and andré 21, Andre Aguiar andré 21,65 are part of '
    'Ironhack"s andré 21.65 Da Silva andré 32 Sauro'
)


In [112]:
# A word, a space, then a number with an optional '.' or ',' and optional
# further digits.
# Raw string: '\w', '\d' and '\.' are not valid Python string escapes.
re.findall(r'\w+ \d+[\.,]?\d*', my_string)

['andré 21,', 'andré 21,65', 'andré 21.65', 'andré 32']

In [113]:
# City-name variants, now including one without a separator and one with an extra 'n'.
text = "Sáo Paulo São Paulo Sao Paulo Sao Paolo San Pablo sao paulo sao Paulo são Paulo sao-paulo são paulo São SãoPaulo Saon Paulo"

# `n?` allows an optional extra 'n' and ` ?` makes the space optional.
pattern = "[Ss][ãaáàâä][on]n? ?[Pp]a[buo]lo"

re.findall(pattern, text)

['Sáo Paulo',
 'São Paulo',
 'Sao Paulo',
 'Sao Paolo',
 'San Pablo',
 'sao paulo',
 'sao Paulo',
 'são Paulo',
 'são paulo',
 'SãoPaulo',
 'Saon Paulo']

In [114]:
text = "This colonel has the colour or color blue"

# `u?` makes the 'u' optional, covering both spellings.
re.findall("colou?r", text)

['colour', 'color']

In [115]:
text = "These apples are beautiful and the apple is blue."

# `s?` makes the trailing 's' optional: singular and plural both match.
re.findall("apples?", text)

['apples', 'apple']

In [116]:
text = 'Andre Aguiar and Andre Park are from the Ironhack team'

# Any word directly followed by ' Aguiar'.
# Raw string: '\w' is not a valid Python string escape.
re.findall(r'\w+ Aguiar', text)

['Andre Aguiar']

In [117]:
# Replace whichever word precedes ' Park'.
# Raw string: '\w' is not a valid Python string escape.
re.sub(r'\w+ Park', 'Joao Park', text)

'Andre Aguiar and Joao Park are from the Ironhack team'

# Other methods for regular expressions

In [125]:
# Sample reused by the sub / split demos that follow.
text = "My neighbor, Mr. Rogers, ] has 5 - rogers 1000,"

In [130]:
# Replace both capitalizations in a single call.
re.sub("[Rr]ogers", "Andre", text)

'My neighbor, Mr. Andre, ] has 5 - Andre 1000,'

In [131]:
# Replace every run of digits with '-1'.
# Raw string: '\d' is not a valid Python string escape.
re.sub(r'\d+', '-1', text)

'My neighbor, Mr. Rogers, ] has -1 - rogers -1,'

In [132]:
# str.split only accepts a fixed delimiter, so lowercase 'rogers' is not split on.
text.split("Rogers")

['My neighbor, Mr. ', ', ] has 5 - rogers 1000,']

In [133]:
# re.split takes a pattern as delimiter, so both capitalizations split the text.
print(re.split("[Rr]ogers", text))

['My neighbor, Mr. ', ', ] has 5 - ', ' 1000,']


In [134]:
# Split on every run of one or more digits.
print(re.split("[0-9]+", text))

['My neighbor, Mr. Rogers, ] has ', ' - rogers ', ',']


# Examples

Find the regexes that: 
1. Matches “Dan” and “Ban” (first letter can be "D" or "B").
2. Matches “Dan”, “Ban”, “Tan”, and “Pan”.
3. Matches “Dan” and “Dag” (last letter can be "n" or "g").
4. Matches Dan followed by lower case "and"


In [135]:
# Input string for the exercises listed above.
text = "Dan and Ban and Tan Dah andSan And Dag wasodkpDanksdpofk i9as09riPan i 09wie"

In [None]:
# Exercise 1: the first letter may be 'D' or 'B', followed by 'an'.
pattern = "[DB]an"
re.findall(pattern, text)

In [None]:
# Placeholder: the pattern is still empty. An empty pattern matches the empty
# string at every position, so this returns only empty strings.
# NOTE(review): presumably left blank to be filled in for the next exercise — confirm.
pattern = ''
re.findall(pattern, text)

In [None]:
# Placeholder: the pattern is still empty. An empty pattern matches the empty
# string at every position, so this returns only empty strings.
# NOTE(review): presumably left blank to be filled in for a later exercise — confirm.
pattern = ''
re.findall(pattern, text)