# Regular expressions (Regex)

In [1]:
import re

### Guía rápida

https://www.debuggex.com/cheatsheet/regex/python

- Before you can use regular expressions in your program, you must import the library using "import re"
- You can use re.search() to see if a string matches a regular expression, similar to using the find() method for strings
- You can use re.findall() to extract portions of a string that match your regular expression, similar to a combination of find() and slicing: var[5:10]

In [2]:
# Short sample string used to show plain str.find() before moving on to regex.
x = "asdf"

In [3]:
# str.find returns the index of the first occurrence of the substring.
x.find("s")

1

### Búsqueda de patrones

In [4]:
# Two sample sequences: the pattern searched below occurs in seq1 but not in seq0.
seq0 = "AAACCCTTTGGG"
seq1 = "AAGCGTTGGG"

In [5]:
# Literal pattern: no metacharacters, so it matches exactly the text "GTT".
pat = "GTT"

In [6]:
# re.search scans the whole string and returns a match object, or None if nothing matches.
match = re.search(pat, seq0)

In [7]:
# Show the search result; a None result produces no cell output.
match

In [8]:
# True means re.search found no occurrence of the pattern.
print(match is None)

True


In [9]:
# Same search against the second sequence.
match = re.search(pat, seq1)

In [10]:
# False means a match object was returned.
print(match is None)

False


In [11]:
# Index of the first character of the match.
match.start()

4

In [12]:
# Index one past the last matched character (usable directly as a slice end).
match.end()

7

In [13]:
# The text that was actually matched.
match.group()

'GTT'

-----

In [14]:
# Sample text containing two e-mail addresses mixed with other words.
s = "purple alice-b@google.com max joe leo@uchicago.edu"

In [17]:
# Raw string so that \w reaches the regex engine without an invalid-escape warning.
# \w does not match "-" or ".", so only part of the address is captured.
re.search(r"\w+@\w+", s).group()

'b@google'

In [18]:
# Raw string avoids the invalid-escape warning for \w.
# Without "+" after the "@", only a single word character is taken on the right.
re.search(r"\w+@\w", s).group()

'b@g'

In [19]:
# Raw string avoids the invalid-escape warning for \w.
# Without "+" before the "@", only a single word character is required on the left.
re.search(r"\w@\w+", s).group()

'b@google'

In [27]:
# Raw string avoids the invalid-escape warning for \w.
# Character classes add "-" (local part) and "." / "-" (domain); the trailing \w
# forces the match to end on a word character.
re.search(r"[\w-]+@[\w.-]+\w", s).group()

'alice-b@google.com'

In [30]:
# Raw string avoids the invalid-escape warning for \w.
# Same as the previous pattern without the trailing \w requirement.
re.search(r"[\w-]+@[\w.-]+", s).group()

'alice-b@google.com'

In [32]:
# Raw string avoids the invalid-escape warning for \w.
# ".+" is greedy and matches any character except a newline, so it runs to the end of the line.
re.search(r"[\w-]+@.+", s).group()

'alice-b@google.com max joe leo@uchicago.edu'

In [33]:
# Raw string avoids the invalid-escape warning for \w.
# A lone "." matches exactly one character after the "@".
re.search(r"[\w-]+@.", s).group()

'alice-b@g'

### Group matching

In [36]:
# Raw string avoids the invalid-escape warning for \w.
# Parentheses create capture groups: group 1 is the local part, group 2 the domain.
m = re.search(r"([\w-]+)@([\w.-]+)", s)

In [37]:
# Tuple with the text captured by each parenthesised group.
m.groups()

('alice-b', 'google.com')

In [38]:
# Group 0 is the entire match.
m.group(0)

'alice-b@google.com'

In [39]:
# First capture group (the part before the "@").
m.group(1)

'alice-b'

In [40]:
# Second capture group (the part after the "@").
m.group(2)

'google.com'

In [42]:
# Raw string avoids the invalid-escape warning for \w.
# With no capture groups, findall returns every full match as a string.
re.findall(r"[\w-]+@[\w.-]+", s)

['alice-b@google.com', 'leo@uchicago.edu']

In [43]:
# Raw string avoids the invalid-escape warning for \w.
# With several capture groups, findall returns one tuple of groups per match.
re.findall(r"([\w-]+)@([\w.-]+)", s)

[('alice-b', 'google.com'), ('leo', 'uchicago.edu')]

Usualmente lo que queremos es usar "findall".

In [45]:
# Load the whole text into memory; the context manager closes the file handle
# as soon as the read finishes instead of leaving it open.
with open("bible.txt") as bible_file:
    bible = bible_file.read()

In [46]:
# Two capture groups: the word before " loved " and the word after it (letters only).
pat = "([a-zA-Z]+) loved ([a-zA-Z]+)"

In [47]:
# Reuse the pattern assigned to `pat` in the previous cell instead of repeating the
# literal, matching how the later cells call findall.
re.findall(pat, bible)

[('he', 'her'),
 ('Isaac', 'Esau'),
 ('Rebekah', 'Jacob'),
 ('Jacob', 'Rachel'),
 ('he', 'also'),
 ('he', 'the'),
 ('Israel', 'Joseph'),
 ('father', 'him'),
 ('he', 'thy'),
 ('LORD', 'you'),
 ('God', 'thee'),
 ('he', 'the'),
 ('he', 'a'),
 ('he', 'Hannah'),
 ('he', 'him'),
 ('Jonathan', 'him'),
 ('he', 'him'),
 ('Judah', 'David'),
 ('daughter', 'David'),
 ('daughter', 'him'),
 ('he', 'him'),
 ('he', 'him'),
 ('he', 'his'),
 ('LORD', 'him'),
 ('David', 'her'),
 ('had', 'her'),
 ('Solomon', 'the'),
 ('LORD', 'Israel'),
 ('Solomon', 'many'),
 ('hath', 'his'),
 ('God', 'Israel'),
 ('Rehoboam', 'Maachah'),
 ('he', 'husbandry'),
 ('king', 'Esther'),
 ('I', 'are'),
 ('have', 'the'),
 ('he', 'cursing'),
 ('have', 'thee'),
 ('hath', 'him'),
 ('have', 'strangers'),
 ('they', 'to'),
 ('have', 'thee'),
 ('hast', 'a'),
 ('I', 'him'),
 ('have', 'you'),
 ('thou', 'us'),
 ('I', 'Jacob'),
 ('him', 'him'),
 ('she', 'much'),
 ('so', 'the'),
 ('men', 'darkness'),
 ('Jesus', 'Martha'),
 ('he', 'him'),
 ('t

In [48]:
# Require both words to start with an uppercase letter; "*" allows zero further
# letters, so a one-letter word such as "I" still matches.
pat = "([A-Z][a-zA-Z]*) loved ([A-Z][a-zA-Z]*)"

In [49]:
# Apply the capitalised-words pattern to the full text.
re.findall(pat, bible)

[('Isaac', 'Esau'),
 ('Rebekah', 'Jacob'),
 ('Jacob', 'Rachel'),
 ('Israel', 'Joseph'),
 ('Judah', 'David'),
 ('LORD', 'Israel'),
 ('God', 'Israel'),
 ('Rehoboam', 'Maachah'),
 ('I', 'Jacob'),
 ('Jesus', 'Martha')]

In [50]:
pat = "([A-Z][a-zA-Z]+) loved ([A-Z][a-zA-Z]*)"
# Note the difference between + and *: with "+" the first word needs at least
# two letters, so a one-letter word such as "I" no longer matches.

In [52]:
# Re-run the search with the stricter pattern for the first word.
re.findall(pat, bible)

[('Isaac', 'Esau'),
 ('Rebekah', 'Jacob'),
 ('Jacob', 'Rachel'),
 ('Israel', 'Joseph'),
 ('Judah', 'David'),
 ('LORD', 'Israel'),
 ('God', 'Israel'),
 ('Rehoboam', 'Maachah'),
 ('Jesus', 'Martha')]

---

In [53]:
# "+" groups consecutive digits, so each number comes back as one string.
x = 'My 2 favorite numbers are 19 and 427'
y = re.findall(pattern='[0-9]+', string=x)
y

['2', '19', '427']

In [54]:
# Without a quantifier the class matches one character at a time: one item per digit.
x = 'My 2 favorite numbers are 19 and 427'
y = re.findall(pattern='[0-9]', string=x)
y

['2', '1', '9', '4', '2', '7']

In [55]:
# Narrower range: only the digits 0 through 4 are accepted.
x = 'My 2 favorite numbers are 19 and 427'
y = re.findall(pattern='[0-4]', string=x)
y

['2', '1', '4', '2']

In [56]:
# A character class works on single characters, so ranges only make sense from
# 0 to 9: '[0-430]' is the range 0-4 plus the literals '3' and '0', not "0 to 430".
x = 'My 2 favorite numbers are 19 and 427'
y = re.findall(pattern='[0-430]', string=x)
y

['2', '1', '4', '2']

In [57]:
# findall returns an empty list when nothing matches; the class is case-sensitive,
# so only uppercase vowels would count (x is the sentence from the previous cells).
y = re.findall('[AEIOU]+', x)
y

[]

---

In [58]:
# Greedy ".+" consumes as much as possible, so the match extends to the last ":".
x = 'From: Using the : character'
y = re.findall(pattern='F.+:', string=x)
y


['From: Using the :']

In [59]:
# Non-greedy ".+?" consumes as little as possible, so the match stops at the first ":".
x = 'From: Using the : character'
y = re.findall(pattern='F.+?:', string=x)
y


['From:']

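"Greedy" vs "Non-greedy": a greedy quantifier (`.+`) matches as much text as possible, while its non-greedy form (`.+?`) matches as little as possible.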

---

In [62]:
# Raw string so that \S reaches the regex engine without an invalid-escape warning.
# \S+ matches a run of non-whitespace characters on each side of the "@".
x = "From: stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008"
y = re.findall(r'\S+@\S+', x)
y

['stephen.marquard@uct.ac.za']

In [63]:
# Raw string avoids the invalid-escape warning for \S.
# With no capture group, the literal "From: " prefix is part of the returned match.
y = re.findall(r'From: \S+@\S+', x)
y

['From: stephen.marquard@uct.ac.za']

In [64]:
# Raw string avoids the invalid-escape warning for \S.
# The prefix must still be present, but findall returns only the parenthesised part.
y = re.findall(r'From: (\S+@\S+)', x)
y

['stephen.marquard@uct.ac.za']

---

In [66]:
# "[^ ]" is a negated class (any character except a space); the group captures
# the run of non-space characters that follows the "@".
y = re.findall('@([^ ]+)', x)
y

['uct.ac.za']

In [67]:
# "^" anchors the match at the start of the string; the greedy ".*" runs up to the
# last "@", and the group captures the non-space characters after it.
y = re.findall('^From: .*@([^ ]*)', x)
y

['uct.ac.za']

---

In [68]:
# Raw string so that the escaped "\$" reaches the regex engine without an
# invalid-escape warning; the backslash makes "$" a literal dollar sign.
x = 'We just received $10.00 for cookies.'
y = re.findall(r'\$[0-9.]+', x)
y

['$10.00']

In [69]:
# Counter-example: an unescaped "$" is the end-of-string anchor, not a literal
# dollar sign, so nothing can follow it and the result is empty.
x = 'We just received $10.00 for cookies.'
y = re.findall('$[0-9.]+',x)
y

[]

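A veces queremos buscar caracteres especiales; para eso hay que escaparlos con `\` (sin escapar, `$` significa "fin de línea" y por eso la segunda búsqueda no encuentra nada).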

---

http://regexr.com/

http://www.regular-expressions.info/

http://en.wikipedia.org/wiki/Regular_expression