In [None]:
from IPython.display import Image
from IPython.display import clear_output
from IPython.display import FileLink, FileLinks

## Introduction to

![title](../img/python-logo-master-flat.png)

### with Application to Bioinformatics

#### - Day 5

## Review Day 4

- Controlling loops
- Scope
- Importing a module
- Documenting your own code

## TODAY

- Formatting
- Regular expressions


## String formatting


Putting your values into tables or other nice looking strings.

In [None]:
"A string with formatting instructions".format(some, values, to, put, in, string)

In [None]:
"The movie '{}' got {} votes".format('Beauty and the Beast', 10)

In [None]:
valueA = 'Beauty and the Beast'
valueB = '10'
# valueC is referenced by several formatting cells further down but was never
# assigned, so they failed with NameError on a fresh kernel.
# NOTE(review): placeholder value - replace with whatever the slides intend.
valueC = '3'

In [None]:
valueD = 'It'
valueE = '10000'
# valueF is referenced by the alignment cells further down but was never
# assigned, so they failed with NameError on a fresh kernel.
# NOTE(review): placeholder value - replace with whatever the slides intend.
valueF = '7'

In [None]:
def pretty(val1, val2):
    """Print one movie/votes sentence built by plain string concatenation.

    Both arguments must already be strings: ``+`` does not convert other
    types, so passing e.g. an int raises ``TypeError``.
    """
    sentence = "The movie '" + val1
    sentence = sentence + "' got " + val2
    sentence += " votes."
    print(sentence)

pretty('Beauty and the Beast', '10')
pretty('It', '10000')

In [None]:
def pretty(val1, val2):
    """Print a movie/votes sentence with fixed-width fields.

    ``val1`` is placed in a 20-character field and ``val2`` is centred in a
    10-character field, so consecutive calls line up in columns.
    """
    template = "The movie '{:20}' got {:^10} votes."
    line = template.format(val1, val2)
    print(line)

    
pretty(valueA, valueB)
pretty(valueD, valueE)

In [None]:
"I've read the story '{} and the {}' {} times".format(valueA, valueB, valueC)

### Positional arguments

In [None]:
"The movie '{}' got {} votes.".format("Beauty and the Beast", "10")

In [None]:
"The movie '{0}' got {1} votes.".format("Beauty and the Beast", "10")

In [None]:
"The movie '{1}' got {1} votes.".format("Beauty and the Beast", "10")

In [None]:
"The movie '{3}' got {43} votes.".format("Beauty and the Beast", "10")

In [None]:
"I've read the story '{1} and the {0}' {2} times".format('Beauty', 'Beast', 10)

In [None]:
"I've read the story '{11} and the {20}' {22} times".format('Beauty', 'Beast', 10)

### Keyword arguments

In [None]:
"The movie '{movie}' got {number} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{movie}' got {movie} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{0}' got {1} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{0}' got {number} votes.".format("Beauty and the Beast", number="10")

In [None]:
"I've read the story '{princess} and the {beast}' {num} times".format(princess='Beauty', beast='Beast', num=10)

In [None]:
"I've read the story '{0} and the {1}' {2} times".format(princess='Beauty', beast='Beast', num=10)

Positional arguments come first, keyword arguments after!

In [None]:
"I've read the story '{0} and the {1}' {num} times".format('Beauty', 'Beast', num=10)

### Field width

In [None]:
"The movie '{movie:30}' got {number:10} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"I've read the story '{0:30} and the {1:20}' {2:10} times".format('Beauty', 'Beast', '10')

In [None]:
"I've read the story '{0:30} and the {1:20}' {2:10} times".format('Beauty', 'Beast', 10)

### Alignment

In [None]:
"The movie '{movie:<30}' got {number:>10} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{movie:^30}' got {number:^10} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"I've read the story '{0:<30} and the {1:>20}' {2:^10} times".format('Beauty', 'Beast', 10)

In [None]:
print("I've read the story '{0:30} and the {1:20}' {2:^20} times".format(valueA, valueB, valueC))
print("I've read the story '{0:30} and the {1:20}' {2:^20} times".format(valueD, valueE, valueF))

In [None]:
print("I've read the story '{0:^30} and the {1:>20}' {2:>20} times".format(valueA, valueB, valueC))
print("I've read the story '{0:^30} and the {1:>20}' {2:>20} times".format(valueD, valueE, valueF))

In [None]:
def pretty(val1, val2):
    """Print the movie name in a 20-wide field and the vote count centred in 10.

    The vote count may be a string or a number; it is centred either way
    because the format spec asks for ``^`` alignment explicitly.
    """
    fields = (val1, val2)
    print("The movie '{:20}' got {:^10} votes.".format(*fields))

    
pretty('Beauty and the Beast', 10)
pretty('It', 10000)

### Filling

In [None]:
"The movie '{movie:_<30}' got {number:*^10} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
print("I've read the story '{0:_^30} and the {1:_>20}' {2:_>20} times".format(valueA, valueB, valueC))

In [None]:
"|{0:-^20}|{1:-^20}|".format(' Movie ',' Votes ')

In [None]:
"|{0:-^20}|{1:-^20}|{2:-^20}|".format(' Lines ',' Words ',' Characters ')

### Conversion

In [None]:
"int: {0:d};  hex: {0:x};  oct: {0:o};  bin: {0:b}".format(42)

In [None]:
"int: {0:d};  hex: {0:#x};  oct: {0:#o};  bin: {0:#b}".format(42)

### Rounding

In [None]:
points = 19
total = 23
'Score: {}'.format( points / total )

In [None]:
'Score: {:.2f}'.format( points / total )

In [None]:
'Score: {:.2%}'.format( points / total )

### Formatting dates

In [None]:
import datetime
d = datetime.datetime(2010, 7, 4, 12, 15, 58)
d

In [None]:
'{:%Y-%m-%d %H:%M:%S}'.format(d)

In [None]:
now = datetime.datetime.now()
'{:%Y-%m-%d week:%W %A %H:%M:%S}'.format(now)

Learn more: [strftime.org](http://strftime.org)


### Printing a full table

In [None]:
# movies :: [(name, votes, total score)]
movies = [('Beauty and the Beast', 10, 55), ('It', 10000, 30450)]

# Header row: column titles centred in their fields and padded with dashes.
print('|{:-^30}|{:-^10}|{:-^10}|'.format('Movie', 'Votes', 'Average'))
# The loop variable is `total_score` (not `total`) so that running this cell
# does not overwrite the `total` assigned in the Rounding example.
for movie, votes, total_score in movies:
    # Average score = total score / number of votes, shown with two decimals.
    print('|{:30}|{:^10}|{:^10.2f}|'.format(movie, votes, total_score / votes))

Learn more from the [Python docs](https://docs.python.org/3.4/library/string.html#format-string-syntax)!


### Older syntax

In [None]:
print("I've read the story '%s' %s times" % ('Alice in Wonderland', 10))


### Alternative syntax: `f-string`

In [None]:
story = "Alice in Wonderland"
num = 10
print(f"I've read the story '{story}' {num} times")

## Regular Expressions

Search for patterns in text.



- A formal language for defining search patterns

- Why?

 - Example: find information in a biographical dictionary, [skbl.se](https://skbl.se)

- In most programming languages, text editors...

 - Search/replace
   - becuase &rarr; because

### Common operations
- `.` matches anything (once)
- `?` repeat previous pattern 0 or 1 times
- `*` repeat previous pattern 0 or more times
- `+` repeat previous pattern 1 or more times

`.*` matches everything!


<center><code>"h?.itfeld?t?"</code></center>



<font color="green"><center>hvitfeldt</center>
   
<center>witfelt</center></font>


<center><font color="red">oitfel</font></center>
<center><font color="red">8itfel</font></center>

</center>

- `\d` matches any digit
- `\s` matches any whitespace (spaces, tabs, ...)
- `[abc]` matches a single character defined in this set {a, b, c}
- `[^abc]` matches a single character that is **not** a, b or c

`[a-z]` matches any single lowercase letter between `a` and `z` (the English alphabet).

<center><code>"h?[uwv]itfel[dt]+"</code></center>

<font color="green"><center>hvitfeldt</center>
    
<center>witfelt</center></font>


<center><strike><font color="red">oitfel</font></strike></center>
<center><strike><font color="red">8itfel</font></strike></center>



## Exercise 1

<b>&rarr; Notebook Day_5_Exercise_1  (~15 minutes) </b>

### Regular expressions in Python

In [None]:
import re

In [None]:
p = re.compile('ab*')
p

In [None]:
p.search('abc')

In [None]:
print(p.search('cb'))

### Searching

In [None]:
p = re.compile('HELLO')
m = p.search('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]')

print(m)

### Matching

Like `search`, but anchored at the beginning of the string.

In [None]:
p = re.compile('[a-z]+')  # any letter but at least one


In [None]:
p = re.compile('[a-z]+')  # any letter but at least one

result = p.match('hello world!')

result

In [None]:
print(result)

In [None]:
result = p.match('Hello world')
print(result)

In [None]:
result = p.search('Hello world')
print(result)

### Case insensitiveness

In [None]:
p = re.compile('[a-z]+', re.IGNORECASE)
result = p.match('Hello World')
print(result)

### The match object

In [None]:
print('The result {} is of type {}'.format(result, type(result)))

`result.group()`: Return the string matched by the expression

`result.start()`: Return the starting position of the match

`result.end()`: Return the ending position of the match

`result.span()`: Return both (start, end)

In [None]:
result.group()

In [None]:
result.start()

In [None]:
result.end()

In [None]:
result.span()

In [None]:
p = re.compile('.*HELLO.*')

In [None]:
m = p.match('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]')

In [None]:
m.group()

The `*` is **greedy**.

### Finding all the matching patterns

In [None]:
# Raw string: `\d` is a regex class, not a valid Python string escape, so a
# plain '...' literal triggers an invalid-escape warning on newer Pythons.
p = re.compile(r'\d+')  # one or more digits
p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping')

In [None]:
p = re.compile('HELLO')
matches = p.findall('gsdfgsdfgs  HeLLo  __!@£§≈[|ÅÄÖ‚…’ﬁ]  HELLO  ...ÖQ!§<>kds')

In [None]:
print(matches)

In [None]:
matches = p.findall('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]  HELLO  ...ÖQ!§<>kds')
print(matches)

In [None]:
for m in matches:
    print('Found {0:30} at position {1}'.format(m.group(), m.start()) )

In [None]:
p = re.compile('HELLO')
objects = p.finditer('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]  HELLO  ...ÖQ!§<>kds')
print(objects)

In [None]:
for m in objects:
    print('Found {0:30} at position {1}'.format(m.group(), m.start()) )

In [None]:
objects = p.finditer('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]  HELLO  ...ÖQ!§<>kds')
for m in objects:
    print('Found {0:^30} at position {1}'.format(m.group(), m.span()) )

### How to find a full stop?

In [None]:
txt = "The first full stop is here: ."
p = re.compile('.')

p.search(txt).group()

In [None]:
# Escape the dot so it matches a literal full stop; the raw string keeps the
# backslash intact for the regex engine without an invalid-escape warning.
p = re.compile(r'\.')

p.search(txt).group()

### More operations
- `\` escaping a character
- `^` beginning of the string
- `$` end of string
- `|` boolean `or`

### Capturing groups

Find the domain name in an email address:

kalle.larsson@**gmail**.com pigelin@**uu**.se

In [None]:
email1 = "kalle.larsson@gmail.com"
email2 = "pigelin@uu.se"
# anything, '@', anything, a literal dot, anything.
# Raw string: `\.` is a regex escape, not a valid Python string escape.
p = re.compile(r'.*@.*\..*')

p.search(email1).group()


In [None]:
# Group 1 captures what sits between '@' and the last dot (the domain name).
# Raw string: `\.` is a regex escape, not a valid Python string escape.
p = re.compile(r'.*@(.*)\..*')
p.match(email1).group(1)


In [None]:
# Group 1 captures the domain name, group 2 what follows the last dot.
# Raw string: `\.` is a regex escape, not a valid Python string escape.
p = re.compile(r'.*@(.*)\.(.*)')
p.match(email1).groups()


### Substitution

In [None]:
p = re.compile('gmail')
p.sub('hotmail', email1)

In [None]:
# Matches from '@' up to and including the last dot.
# Raw string: `\.` is a regex escape, not a valid Python string escape.
p = re.compile(r'@.*\.')
e1 = p.sub('@hotmail.', email1)
e2 = p.sub('@hotmail.', email2)
print(e1)
print(e2)

#### Finally, we can fix our spelling mistakes!

In [None]:
txt = "Do it becuase I say so, not because you want!"

In [None]:
p = re.compile('becuase')
p.sub('because', txt)

### Backreference

A reference to a previously captured group. Finds repetitions!

In [None]:
p = re.compile(r'(.)TG\1')
p.search('CTGACCATGAG')

In [None]:
email3 = "kalle.kalle@gmail.com"
email4 = "lisa-lisa@hotmail.com"
email5 = "lisa-lisen@uu.se"

p = re.compile(r'(.*)[._-]\1@.*')

In [None]:
p.search(email3)

In [None]:
p.search(email4)

In [None]:
p.search(email5)

Remember to use *raw strings* for all backreferences! `r'\1'`

### Substitution + backreference!


In [None]:
p = re.compile(r'(.*)@')
p.sub(r'\1-\1@', email2)

In [None]:

p = re.compile(r'(.*)@(.*)\.')
p.sub(r'\1-\1@\2.\2.', email2)


### Substitution + backreference


**Date formats**

Swedish format: `year-month-day`

UK format:      `day/month/year`

US format:      `month/day/year`

In [None]:
swedish_date = '2018-04-11'

p = re.compile(r'(\d\d\d\d)-(\d\d)-(\d\d)')

In [None]:
uk_date = p.sub(r'\3/\2/\1', swedish_date)
print('UK:', uk_date)

In [None]:
us_date = p.sub(r'\2/\3/\1', swedish_date)
print('US:', us_date)

**Typical code structure:**

```python
p = re.compile( ... )
m = p.match('string goes here')
if m:
    print('Match found: ', m.group())
else:
    print('No match')
```

## Exercise 2

Read more: full documentation https://docs.python.org/3.6/library/re.html
         
         
<br/>
<b>&rarr; Notebook Day_5_Exercise_2  (~30 minutes) </b>
