In [None]:
from IPython.display import Image
from IPython.display import clear_output
from IPython.display import FileLink, FileLinks

## Introduction to

![title](../img/python-logo-master-flat.png)

### with Application to Bioinformatics

#### - Day 5

## Review Day 4

- More control!

  - `break`, `continue`, `pass`
  - keyword arguments
  
- Higher level: working with modules

  - importing code
  - documentation: reading and writing

#### Control loops

- `break` a loop => stop it

<center>
<img src="../img/break.png" alt="break" width="30%"/>
</center>


#### Control loops

- `continue` => go on to the next iteration

<center>
<img src="../img/continue.png" alt="continue" width="30%"/>
</center>

#### Control loops

- `pass`  => do nothing

```py
for line in file:
    if len(line) > 40:
        # TODO find out what to do here
        pass
    do_something(line)
```

#### Keyword arguments

```py
open(filename, encoding="utf-8")
```

```py
open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None)
```


#### Keyword arguments

- programmer: set default values
- user: ignore parameters
- better overview

#### Using code

- Python standard modules
- Your colleague's code


- `import datetime`
- `import xml.etree.ElementTree as ET`
- `from collections import defaultdict`


- `help(datetime)`

#### Using code

**`Counter`** - a handy type of dictionary.

```py
help(Counter)
```


> ... Elements are stored as dictionary keys and their counts are stored as dictionary values.
...


In [None]:
from collections import Counter

In [None]:
text = 'lettercountingstring'

In [None]:
c = Counter(text)
c

In [None]:
c.most_common(3)

In [None]:
# Add every non-space character of the sentence to the existing tally in `c`.
for letter in "a nice story about books":
    # Compare by value: `is not` tests object identity and only appears to
    # work for short strings because of interning (SyntaxWarning since 3.8).
    if letter != " ":
        c[letter] += 1

In [None]:
c.most_common(3)

https://docs.python.org/3/py-modindex.html

#### Your code being used

- write comments       `# why do I do this?`
- write documentation  `"""what is this? how do you use it?"""`

#### Your code being used

```py
def f(a, b):
    for c in open(a):
        if c.startswith(b):
            print(c)
```

==>

```py
def print_lines(filename, start):
    for line in open(filename):
        if line.startswith(start):
            print(line)
```

**<center>Care about the names of your variables and functions</center>**

#### Your code being used

```py
def main(input):
    ...
    
    
if __name__ == "__main__":
    main(sys.argv[1])
```

## TODAY

- Formatting
- Regular expressions
- Sum up of the course

## String formatting


Putting your values into tables or other nice looking strings.

In [None]:
"A string with formatting instructions".format(some, values, to, put, in, string)

In [None]:
"The movie '{}' got {} votes".format('Beauty and the Beast', 10)

In [None]:
def pretty(val1, val2):
    """Print a vote summary built by plain string concatenation.

    Both arguments must already be strings, since `+` does not convert them.
    """
    sentence = "The movie '" + val1 + "' got " + val2 + " votes."
    print(sentence)


pretty('Beauty and the Beast', '10')
pretty('It', '10000')

In [None]:
def pretty(val1, val2):
    """Print a vote summary with both values in fixed-width columns.

    val1 -- movie title, padded to a 20-character field
    val2 -- number of votes, centred in a 10-character field
    """
    print("The movie '{:20}' got {:^10} votes.".format(val1, val2))


# Concrete example values so that the cell can be executed as it stands.
pretty('Beauty and the Beast', '10')
pretty('It', '10000')

### Positional arguments

In [None]:
"The movie '{}' got {} votes.".format("Beauty and the Beast", "10")

In [None]:
"The movie '{0}' got {1} votes.".format("Beauty and the Beast", "10")

In [None]:
"The movie '{1}' got {1} votes.".format("Beauty and the Beast", "10")

In [None]:
"The movie '{3}' got {43} votes.".format("Beauty and the Beast", "10")

### Keyword arguments

In [None]:
"The movie '{movie}' got {number} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{movie}' got {movie} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{0}' got {1} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{0}' got {number} votes.".format("Beauty and the Beast", number="10")

Positional arguments come first, keyword arguments after!

### Field width

In [None]:
"The movie '{movie:30}' got {number:10} votes.".format(movie="Beauty and the Beast", number="10")

### Alignment

In [None]:
"The movie '{movie:<30}' got {number:>10} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"The movie '{movie:^30}' got {number:^10} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
def pretty(val1, val2):
    """Print a vote summary: title in a 20-wide field, votes centred in 10."""
    template = "The movie '{:20}' got {:^10} votes."
    print(template.format(val1, val2))


pretty('Beauty and the Beast', 10)
pretty('It', 10000)

### Filling

In [None]:
"The movie '{movie:_<30}' got {number:*^10} votes.".format(movie="Beauty and the Beast", number="10")

In [None]:
"|{0:-^20}|{1:-^20}|".format(' Movie ',' Votes ')

### Rounding

In [None]:
points = 19
total = 23
'Score: {}'.format( points / total )

In [None]:
'Score: {:.2f}'.format( points / total )

In [None]:
'Score: {:.2%}'.format( points / total )

### Formatting dates

In [None]:
import datetime
d = datetime.datetime(2010, 7, 4, 12, 15, 58)
d

In [None]:
'{:%Y-%m-%d %H:%M:%S}'.format(d)

In [None]:
now = datetime.datetime.now()
'{:%Y-%m-%d week:%W %A %H:%M:%S}'.format(now)

Learn more: [strftime.org](http://strftime.org)


### Printing a  full table

In [None]:
# movies :: [(name, votes, total score)]
movies = [
    ('Beauty and the Beast', 12, 55),
    ('It', 10000, 30450),
]

# Header cells are centred and filled with dashes; the data rows reuse the
# same column widths (30, 10, 10) so that the vertical bars line up.
header_row = '|{:-^30}|{:-^10}|{:-^10}|'
data_row = '|{:30}|{:^10}|{:^10.2f}|'

print(header_row.format('Movie', 'Votes', 'Average'))
for movie, votes, total in movies:
    average = total/votes
    print(data_row.format(movie, votes, average))

Learn more from the [Python docs](https://docs.python.org/3.4/library/string.html#format-string-syntax)!


### Older syntax

In [None]:
print("I've read the story '%s' %s times" % ('Alice in Wonderland', 10))


### Alternative syntax: `f-string`

In [None]:
story = "Alice in Wonderland"
num = 10
print(f"I've read the story '{story}' {num} times")

(*Python version 3.6 or later*)

## Regular Expressions

Search for patterns in text.



- A formal language for defining search patterns

- Why?

 - Example: find information in a biographical dictionary [skbl.se](https://skbl.se)

- In most programming languages, text editors...

 - Search/replace
   - becuase &rarr; because

### Defining a search pattern

<center>
<img src="../img/regex.png" alt="regex" width="50%"/>
</center>

#### Common operations
- `.` matches anything (once)
- `?` repeat previous pattern 0 or 1 times
- `*` repeat previous pattern 0 or more times
- `+` repeat previous pattern 1 or more times

`.*` matches everything!


<center><code>"h?.it+feld?t?"</code></center>



<font color="green"><center>hvitfeldt</center>
   
<center>wittfelt</center></font>


<center><font color="red">oitfel</font></center>
<center><font color="red">8itfel</font></center>

#### More common operations

- `\d` matches any digit
- `\s` matches any whitespace (spaces, tabs, ...)
- `\S` matches any non-whitespace
- `[abc]` matches a single character defined in this set {a, b, c}
- `[^abc]` matches a single character that is **not** a, b or c

#### `[a-z]` matches all letters between `a` and `z` (the English alphabet).

#### `[a-z]+` matches any (lowercased) English word.

<center><code>"h?[uwv]it+fel[dt]+"</code></center>

<font color="green"><center>hvitfeldt</center>
    
<center>wittfelt</center></font>


<center><strike><font color="red">oitfel</font></strike></center>
<center><strike><font color="red">8itfel</font></strike></center>



## Exercise 1

<b>&rarr; Notebook Day_5_Exercise_1  (~15 minutes) </b>

### Regular expressions in Python

In [None]:
import re

In [None]:
p = re.compile('ab*')
p

### Searching

In [None]:
p = re.compile('ab*')

p.search('abc')

In [None]:
print(p.search('cb'))

In [None]:
p = re.compile('HELLO')
m = p.search('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]')

print(m)

### Matching

Like `search`, but anchored at the beginning of the string.

In [None]:
p = re.compile('[a-z]+')  # one or more lowercase letters (a-z only)

result = p.match('hello world!')

result

In [None]:
result = p.match('Hello world')
result

In [None]:
result = p.search('Hello world')
result

### Case insensitiveness

In [None]:
p = re.compile('[a-z]+', re.IGNORECASE)
result = p.match('Hello World')
result

### The match object

In [None]:
print('The result {} is of type {}'.format(result, type(result)))

`result.group()`: Return the string matched by the expression

`result.start()`: Return the starting position of the match

`result.end()`: Return the ending position of the match

`result.span()`: Return both (start, end)

In [None]:
result.group()

In [None]:
result.start()

In [None]:
result.end()

In [None]:
result.span()

#### Zero or more...?

In [None]:
p = re.compile('.*HELLO.*')

In [None]:
m = p.match('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]')

In [None]:
m.group()

The `*` is **greedy**.

### Finding all the matching patterns

In [None]:
p = re.compile('HELLO')
objects = p.finditer('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]  HELLO  ...ÖQ!§<>kds')
print(objects)

In [None]:
for m in objects:
    print('Found {0:30} at position {1}'.format(m.group(), m.start()) )

In [None]:
objects = p.finditer('gsdfgsdfgs  HELLO  __!@£§≈[|ÅÄÖ‚…’ﬁ]  HELLO  ...ÖQ!§<>kds')
for m in objects:
    print('Found {0:^30} at position {1}'.format(m.group(), m.span()) )

### How to find a full stop?

In [None]:
txt = "The first full stop is here: ."
p = re.compile('.')

p.search(txt).group()

In [None]:
# Raw string: the backslash reaches the regex engine untouched, so `\.` means
# a literal full stop (in a plain string '\.' is an invalid escape sequence).
p = re.compile(r'\.')

p.search(txt).group()

### More operations
- `\` escaping a character
- `^` beginning of the string
- `$` end of string
- `|` boolean `or`

### Capturing groups

Find the domain name in an email address:

kalle.larsson@**gmail**.com pigelin@**uu**.se

In [None]:
email1 = "kalle.larsson@gmail.com"
email2 = "pigelin@uu.se"
# Anything, an @, anything, a literal dot, anything.
# Raw string so that `\.` is handed to the regex engine exactly as written.
p = re.compile(r'.*@.*\..*')

p.search(email1).group()


In [None]:
# The parentheses capture the part between the @ and the last dot.
# Raw string so that `\.` is handed to the regex engine exactly as written.
p = re.compile(r'.*@(.*)\..*')
p.match(email1).group(1)

In [None]:
# Two groups: the domain name and whatever follows the last dot.
# Raw string so that `\.` is handed to the regex engine exactly as written.
p = re.compile(r'.*@(.*)\.(.*)')
p.match(email1).groups()


### Capturing groups

**Structure**

- Each pair of parentheses creates a group
- `group()` returns the whole match, as usual
- `group(1)` returns the first group, `group(2)` the second...
- `groups()` returns all groups as a tuple

### Substitution

In [None]:
p = re.compile('gmail')
p.sub('hotmail', email1)

In [None]:
# Matches from the @ up to and including the last dot.
# Raw string so that `\.` is handed to the regex engine exactly as written.
p = re.compile(r'@.*\.')
e1 = p.sub('@hotmail.', email1)
e2 = p.sub('@hotmail.', email2)
print(e1)
print(e2)

#### Finally, we can fix our spelling mistakes!

In [None]:
txt = "Do it becuase I say so, not becuase you want!"

In [None]:
p = re.compile('becuase')
p.sub('because', txt)

#### Overview

 - Construct regular expressions
 
     ```py
     p = re.compile()```
     
 - Searching
 
     ```py
     p.search(text)
     p.match(text)```
     
 - Replacements
 
     ```py
     p.sub(replacement, text)
     ```

**Typical code structure:**

```python
p = re.compile( ... )
m = p.search('string goes here')
if m:
    print('Match found: ', m.group())
else:
    print('No match')
```

### Backreference

A reference to a previously captured group. Finds repetitions!

In [None]:
p = re.compile(r'(.)TG\1')

<center>
<img src="../img/backref.png" alt="backreference" width="20%"/>
</center>

In [None]:
# Keep the search as the cell's last expression so that the notebook displays
# the resulting match object rather than the compiled pattern.
p.search('CTGACCATGAG')

<center>CTGACC<u><b>A</b>TG<b>A</b></u>G</center>

In [None]:
# `\1` must repeat exactly what group 1 captured, so the text before the
# hyphen has to appear again between the hyphen and the @.
p = re.compile(r'(.*)-\1@(.*)')

# Sample addresses to try the pattern on.
kalle = "kalle-kalle@gmail.com"
lisa = "lisa-lisa@hotmail.com"
lisen = "lisa-lisen@uu.se"

In [None]:
p.search(kalle)

In [None]:
p.search(lisa)

In [None]:
p.search(lisen)

### Backreference

**Structure**

- Each pair of parentheses creates a group, which can be referred to
- `\1` refers to the first group, `\2` to the second...
- Remember to use *raw strings* for all backreferences! `r'\1'`
- In the match object, the groups can be found with `group(n)` and `groups()`, as usual

### Substitution + backreference!


In [None]:
email2

In [None]:
p = re.compile(r'(.*)@')
p.sub(r'\1-\1@', email2)

In [None]:
p = re.compile(r'(.*)@(.*)\.')
p.sub(r'\1-\1@\2.\2.', email2)

### Substitution + backreference


**Date formats**

<table border="0" width="80%" style="font-size:80%">
<tr>
    <td >Swedish format:</td><td> <code>year-month-day</code></td><td>   2018-11-23</td>
</tr><tr>
    <td>UK format:</td><td>      <code>day/month/year</code></td><td>   23/11/2018</td>
</tr><tr>
    <td>US format:</td><td>      <code>month/day/year</code></td><td>   11/23/2018</td>
</tr>
</table>

In [None]:
swedish_date = '2018-11-23'

# Three capture groups: year, month, day.
# Raw string so that `\d` is handed to the regex engine exactly as written
# (in a plain string '\d' is an invalid escape sequence).
p = re.compile(r'(\d\d\d\d)-(\d\d)-(\d\d)')

In [None]:
uk_date = p.sub(r'\3/\2/\1', swedish_date)
print('UK:', uk_date)

In [None]:
us_date = p.sub(r'\2/\3/\1', swedish_date)
print('US:', us_date)

## Exercise 2

Read more: full documentation https://docs.python.org/3.6/library/re.html
         
         
<br/>
<b>&rarr; Notebook Day_5_Exercise_2  (~30 minutes) </b>


### Sum up!