In [None]:
"""
Tutorial 1: Regular expressions
This notebook showcases how regular expressions work through a series of short examples.
Author: Yoanna Koleva
"""

In [46]:
import regex as re

## Simplest form of regular expressions

In [14]:
# A plain character is the simplest possible pattern: it matches itself.
# findall returns every occurrence of "a" in the text as a list of strings.
text = "I am learning about regular expressions"
letter_a = re.compile("a")
match = letter_a.findall(text)
print(match)

['a', 'a', 'a', 'a']


In [28]:
# Literal characters written one after another match that exact sequence,
# so the pattern "string" is found inside the text "strings".
text = "strings"
match = re.compile("string").findall(text)
print(match)

['string']


## Metacharacters


In [18]:
# Find all words which end in "ain".
#   \w*  -> zero or more word characters (the letters in front of "ain")
#   ain  -> the literal ending we are looking for
#   \b   -> a word boundary, so "ain" must be the END of the word
# Without \w* only the fragment "ain" itself would be returned, and without
# \b a word such as "mainly" would also produce a match.
text = "The train travels through the mountain in the rain."
pattern = r"\w*ain\b"
match = re.findall(pattern, text)
print(match)


['train', 'mountain', 'rain']


## Sets

In [19]:
# A set [...] matches any ONE of the characters listed inside it.
# The trailing "+" repeats the set, so an unbroken run of a/b/c characters
# is returned as a single match.
text = "abbbbbbcaabb"
pattern = r"[abc]+"
match = re.findall(pattern, text)
print(match)

['abbbbbbcaabb']


In [21]:
# Inside a set, "a-m" is a range: it matches any single character from a to m.
# There is no quantifier here, so every character is reported separately.
# Exercise: append "zzz" to the text and run the cell again.
text = "abbbbbbcaabb"
match = [hit.group() for hit in re.finditer(r"[a-m]", text)]
print(match)

['a', 'b', 'b', 'b', 'b', 'b', 'b', 'c', 'a', 'a', 'b', 'b']


In [24]:
# Inside a set, "*" and "+" lose their special meaning and are matched as
# ordinary characters, so [*/+] means "one of *, / or +".
# Exercise: try also with \ at the end.
text = "*///+++*"
symbols = re.compile(r"[*/+]")
match = symbols.findall(text)
print(match)

['*', '/', '/', '/', '+', '+', '+', '*']


In [25]:
# Match any two-digit number.
#   [0-9][0-9] -> two consecutive digits; both sets must span 0-9, otherwise
#                 numbers such as 60-99 would be missed
#   \b ... \b  -> word boundaries, so that only stand-alone two-digit numbers
#                 match and not the first two digits of a longer number (123)
text = "I have 22 apples and 40 oranges in my bag."
pattern = r"\b[0-9][0-9]\b"
match = re.findall(pattern, text)
print(match)

['22', '40']


In [26]:
# A "^" as the first character of a set negates it: [^abc] matches any single
# character EXCEPT a, b or c (spaces, digits and punctuation included).
text = "I have 22 apples and 40 oranges in my bag."
not_abc = re.compile(r"[^abc]")
match = not_abc.findall(text)
print(match)


['I', ' ', 'h', 'v', 'e', ' ', '2', '2', ' ', 'p', 'p', 'l', 'e', 's', ' ', 'n', 'd', ' ', '4', '0', ' ', 'o', 'r', 'n', 'g', 'e', 's', ' ', 'i', 'n', ' ', 'm', 'y', ' ', 'g', '.']


## Useful functions and properties

In [33]:
# search scans the text and stops at the FIRST place where the pattern matches.
# It returns a match object; start() gives the index where that match begins.
text = "The train travels through the mountain in the rain."
match = re.search("ain", text)
first_position = match.start()
print("The first 'ain' match is found in position:", first_position)

The first 'ain' match is found in position: 6


In [31]:
# findall collects ALL matches of the pattern, scanning from left to right.
# \w matches one word character, so \w\w matches pairs of them.
text = "abcdef"
pattern = r"\w\w"
match = re.findall(pattern, text)
print(match)

['ab', 'cd', 'ef']


#### Note: findall returns non-overlapping matches. Once a character has been included in a match, it is not reused in the following one, so findall returns "ab", "cd", "ef" here rather than "ab", "bc", "cd", etc.

In [36]:
# Extracting "groups" from matches
# Parentheses (...) create a capture group. When a pattern contains a group,
# findall returns only the text captured by the group instead of the whole
# match, so we get the name itself without the "Name: " label.
#   Name:  -> the literal label
#   \s     -> one whitespace character
#   (\w*)  -> the captured name
#   \s     -> the whitespace that ends the name
text = "Name: John Age: 22 Grade: 9.5."
pattern = r"Name:\s(\w*)\s"
match = re.findall(pattern, text)
print(match)


['John']


# Greedy and non-greedy quantifying operators
Quantifiers indicate how many times the preceding character or pattern may be repeated. The main quantifiers are "*" (0 or more), "+" (1 or more), and "?" (0 or 1). By default they are greedy: they match as much text as possible.
Appending "?" to a quantifier ("*?", "+?") makes it non-greedy (lazy), so it matches as little text as possible. The cells below showcase each greedy operator, followed by the non-greedy variants.

### The greedy operator "*"

In [37]:
# "*" -> 0 or more repetitions of the preceding element.
# Being greedy, ".*" grabs as many characters as it can, so the match runs
# from the first "a" all the way to the LAST "b" in the text.
text = "acabab"
pattern = r"a.*b"
found = re.compile(pattern).findall(text)
print(found)


['acabab']


### The greedy operator "+"


In [6]:
# "+" -> 1 or more repetitions of the preceding element.
# "ab+" therefore needs an "a" followed by at least one "b".
# Exercise: try also the texts "a" and "abbbbb".
text = "ab"
pattern = r"ab+"
found = [hit.group() for hit in re.finditer(pattern, text)]
print(found)


['ab']


### The greedy operator "?"


In [10]:
# "?" -> 0 or 1 repetitions of the preceding element.
# "ab?" therefore matches an "a" that is optionally followed by one "b".
# Exercise: try also the texts "a", "acb" and "ba".
text = "ab"
pattern = r"ab?"
found = re.compile(pattern).findall(text)
print(found)


['ab']


### The non-greedy operator "*?"


In [45]:
# "*?" -> 0 or more repetitions (non-greedy).
# The lazy ".*?" takes as few characters as possible, so every match stops
# at the first "b" that follows an "a".
text = "acabab"
# Exercise: try also the pattern r"a.b*?"
pattern = r"a.*?b"
found = [hit.group() for hit in re.finditer(pattern, text)]
print(found)


['acab', 'ab']


### The non-greedy operator "+?"

In [42]:
# "+?" -> 1 or more repetitions (non-greedy).
# At least one "b" is required, but the lazy "+?" stops as soon as it has
# that one, leaving the remaining "b" characters unmatched.
text = "abbbbbbbbbbbb"
pattern = r"ab+?"
found = re.compile(pattern).findall(text)
print(found)


['ab']
