# Lambda (anonymous) functions
Defined without a name.
Used when we need a "small" function for a short amount of time...

In [None]:
def f(x):
    """Return x raised to the second power."""
    squared = x ** 2
    return squared

print(f(5))

In [None]:
# The same squaring function, written as a lambda expression and bound to the name f.
f = lambda x: x ** 2
print(f(5))

In [None]:
# you don't actually have to give it a name, hence the "anonymous"
# can be used one time

(lambda x: x ** 2)(5)

In [None]:
(lambda x, y: x + y)(1, 2)

So why bother with an unnamed lambda? Why not simply use 1+2?

...

We can save time by passing a lambda function as an argument to another function (one that expects a function)...

In [None]:
def is_valid_seq(seq, is_valid_letter_func):
    """Return True if every letter of seq is accepted by is_valid_letter_func.

    seq is iterated letter by letter and is_valid_letter_func is called on
    each one; checking stops at the first rejected letter. An empty seq is
    considered valid.
    """
    return all(is_valid_letter_func(letter) for letter in seq)

dna_seq = 'ACGTTGACGT'
print(is_valid_seq(dna_seq, lambda nt: nt in 'ACGT'))
print(is_valid_seq(dna_seq, lambda nt: nt in 'ACGU'))

# so we can specify the behavior of the required function "on the go", without having to predefine it

In [None]:
# another example of usage

pairs = [(1, 6), (4, 2), (8, 1), (7, 6)]
print(sorted(pairs, key = lambda pair: pair[0] - pair[1]))

Note: if your lambda function becomes too long (and hardly readable), then define the function separately...

Note: a lambda function is suitable when the function only returns something; if the function also changes something else (i.e. it has side effects, e.g. in the global environment), then do not use a lambda...

# filter & map

In [None]:
my_list = [1 ,5, -2, -4, 0, 3, -2]
filtered_list = []

for number in my_list:
    if number > 0:
        filtered_list.append(number)
        
print(filtered_list)

In [None]:
# filter takes a function and an iterable (e.g. a list of elements)
# it yields only those elements for which the function returns True (wrap it in list() to get a list)

print(list(filter(lambda x: x > 0, my_list)))

In [None]:
digits = []

for digit in range(10):
    digits.append(str(digit))
    
print(digits)
print(''.join(digits))

In [None]:
# map function takes a function and a sequence
# applies the function to every element of the sequence and returns an iterator over the results (wrap it in list() to get a list)

digits = list(map(str, range(10)))
print(digits)
print(''.join(digits))

In [None]:
# another example

print(list(map(lambda x: x ** 2, range(10))))

In [None]:
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
dna_seq = 'AATGCGATGCAGTGAGTAAGTCAAAAGTAA'
print(''.join(map(complement.get, dna_seq[::-1])))

# List/dict/set comprehension

In [None]:
print(list(map(lambda x: x ** 2, range(10))))
print([x ** 2 for x in range(10)])

In [None]:
my_list = [1 ,5, -2, -4, 0, 3, -2]
print(list(filter(lambda x: x > 0, my_list)))
print([x for x in my_list if x > 0])

In [None]:
print(list(map(lambda x: x ** 2, filter(lambda x: x > 0, my_list))))
print([x ** 2 for x in my_list if x > 0])

In [None]:
dna_seq = 'AATGCGATGCAGTGAGTAAGTCAAAAGTAA'
codons = [dna_seq[i:(i + 3)] for i in range(0, len(dna_seq), 3)]
print(codons)

In [None]:
numbers = ['one', 'two', 'three', 'four']
numbers_dict = {number: i + 1 for i, number in enumerate(numbers)}
print(numbers_dict)

In [None]:
# additional examples - skip

from collections import Counter

dna_seq = 'AATGCGATGCAGTGAGTAAGTCAAAAGTAA'
nt_counts = Counter(dna_seq)
print(nt_counts)

nt_freqs = {nt: count / len(dna_seq) for nt, count in nt_counts.items()}
print(nt_freqs)

In [None]:
# additional examples - skip

print([i % 6 for i in range(5, 20, 2)])
print({i % 6 for i in range(5, 20, 2)})

# if we wanted a dict: {i: i % 6 for...}

In [None]:
# additional examples - skip

nested_list = [[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]]
flat_list = [i for sublist in nested_list for i in sublist]
print(flat_list)
# can be expanded, e.g.:
# flat_list = [i ** 2 for sublist in nested_list for i in sublist if len(sublist) < 3 and i < 8]

# JSON

In [None]:
import json

# Load the genome description from a JSON file.
# The `with` block guarantees the file is closed even if json.load raises.
with open(r'C:\Users\mirza\OneDrive\Desktop\Fakultet\Biological Data Analysis with Python\Code\\3 herpesvirus_genome.json', 'r') as f:
    data = json.load(f)

print(type(data))

In [None]:
# It's safer to check how big something is before printing it.
print(len(data))

In [None]:
for key, value in data.items():
    print(key, type(value))

In [None]:
for key, value in data.items():
    if key != 'coding_regions':
        print('%s: %s' % (key, value))

In [None]:
coding_regions = data['coding_regions']
print(type(coding_regions))
print(len(coding_regions))

In [None]:
coding_region = coding_regions[0]
print(type(coding_region))

In [None]:
print(len(coding_region))

In [None]:
for key, value in coding_region.items():
    print(key, type(value))

In [None]:
for key, value in coding_region.items():
    if key != 'intervals':
        print('%s: %s' % (key, value))

In [None]:
print(len(coding_region['intervals']))

In [None]:
interval, = coding_region['intervals']
print(type(interval))

In [None]:
print(len(interval))

In [None]:
for key, value in interval.items():
    print(key, type(value))

In [None]:
print(interval)

In [None]:
print(coding_region)

In [None]:
# Collect the 'product' entry of every coding region into one list.
products = []

for coding_region in coding_regions:
    products.append(coding_region['product'])
    
print(products)

In [None]:
# Group translation lengths by keywords found in the product name, then
# report the count and the average length of every group (plus all entries).
lengths_per_group = {'envelope': [], 'membrane': [], 'capsid': []}
all_lengths = []

for coding_region in coding_regions:

    # lower-case the product name so the keyword match is case-insensitive
    product_name = coding_region['product'].lower()
    length = len(coding_region['translation'])

    # a product is counted in every group whose keyword occurs in its name
    for group_name, group_lengths in lengths_per_group.items():
        if group_name in product_name:
            group_lengths.append(length)

    all_lengths.append(length)

lengths_per_group['all'] = all_lengths

for group_name, group_lengths in sorted(lengths_per_group.items()):
    if not group_lengths:
        # no matching product: the average is undefined, so do not divide by zero
        print('%s: # = 0, avg. = n/a' % group_name)
        continue
    avg = sum(group_lengths) / len(group_lengths)
    print('%s: # = %d, avg. = %.2f aa' % (group_name, len(group_lengths), avg))

In [None]:
# Write the grouped lengths to a JSON file.
# The `with` block closes (and flushes) the file even if json.dump raises.
with open(r'C:\Users\mirza\OneDrive\Desktop\Fakultet\Biological Data Analysis with Python\Code\protein_lengths_per_group.json', 'w') as f:
    json.dump(lengths_per_group, f)

In [None]:
# serialize the original (Python) data into a JSON-formatted string
raw_json = json.dumps(lengths_per_group)
print(type(raw_json))
print(raw_json)

In [None]:
# parse the JSON string back into the original (Python) data
data = json.loads(raw_json)
print(type(data))
print(data)

# CSV

In [None]:
# Reading human gene annotations from gencode.v29lift37.annotation.gtf.gz at:
# https://www.gencodegenes.org/human/release_29lift37.html
# (download Comprehensive gene annotation in GTF format)

import gzip

# Open the gzip-compressed file in text mode ('rt') and show its first 1000 characters.
# The `with` block closes the file even if reading fails.
with gzip.open(r'C:\Users\mirza\OneDrive\Desktop\Fakultet\Biological Data Analysis with Python\Code\3 gencode.v29lift37.annotation_partial.gtf.gz', 'rt') as f:
    print(f.read(1000))

In [None]:
# When using gzip with Python 3, you need to explicitly ask for text (t) mode
# (gzip by default reads data in binary mode, so this cell prints a bytes object)

with gzip.open(r'C:\Users\mirza\OneDrive\Desktop\Fakultet\Biological Data Analysis with Python\Code\3 gencode.v29lift37.annotation_partial.gtf.gz', 'r') as f:
    print(f.read(1000))

In [None]:
import csv

from itertools import islice

# Number of header lines to skip and number of annotation rows to load
n_header_lines = 5
n_annotations = 100

# The `with` block closes the file even if reading or parsing fails
with gzip.open(r'C:\Users\mirza\OneDrive\Desktop\Fakultet\Biological Data Analysis with Python\Code\3 gencode.v29lift37.annotation_partial.gtf.gz', 'rt') as f:

    # Default delimiter is comma (,); this file is read as tab-separated
    csv_reader = csv.reader(f, delimiter = '\t')

    # Skip the first 5 header lines, then take at most the next 100 rows
    # (islice simply stops early on a shorter file instead of raising StopIteration)
    annotations = list(islice(csv_reader, n_header_lines, n_header_lines + n_annotations))

print(len(annotations))
print(annotations[:5])

In [None]:
# If you want to go over all lines, just iterate over the csv reader with a for loop
# (no need to use the 'next' function)
# (this will take too long if you are working on a full file)
# A csv reader can only be read while its file is open, and the handle used
# earlier has been closed, so the file is opened again here.

with gzip.open(r'C:\Users\mirza\OneDrive\Desktop\Fakultet\Biological Data Analysis with Python\Code\3 gencode.v29lift37.annotation_partial.gtf.gz', 'rt') as f:
    csv_reader = csv.reader(f, delimiter = '\t')

    for line in csv_reader:
        # Do something...
        pass

In [None]:
# https://www.gencodegenes.org/human/release_29lift37.html
# Documentation -> Data Format

genes = []

def parse_extra_fields(raw_extra_fields):
    """Parse a 'key "value"; key value; ...' string into a dict.

    Fields are separated by ';'. Within a field, the key is everything up to
    the first space and the rest is the value, with surrounding double quotes
    removed. All values are returned as strings. If a key occurs more than
    once, the last occurrence wins.

    Empty fragments (e.g. after a trailing ';' or from an empty input) are
    skipped, a missing trailing ';' is tolerated, and values may contain
    spaces.
    """
    extra_fields = {}

    for raw_extra_field in raw_extra_fields.split(';'):
        raw_extra_field = raw_extra_field.strip()

        # nothing between two separators (or after the last one)
        if not raw_extra_field:
            continue

        # split on the first space only, so values containing spaces stay intact
        key, _, raw_value = raw_extra_field.partition(' ')
        extra_fields[key] = raw_value.strip().strip('"')

    return extra_fields

# Each annotation row is unpacked into 9 fields; we only need some of them,
# so the ones we ignore are bound to '_'

for a_chr, _, a_type, a_start, a_end, _, _, _, raw_extra_fields in annotations:
    # keep only the rows whose type field is 'gene'
    if a_type == 'gene':
        extra_fields = parse_extra_fields(raw_extra_fields)
        # one record per gene: gene_name, gene_type, a_chr, start and end (both converted to int)
        genes.append([extra_fields['gene_name'], extra_fields['gene_type'], a_chr, int(a_start), int(a_end)])

print(len(genes))
print(genes)

##### Exercise

In [None]:
# Modify the above code (copy it first below) so that it also extracts
# the strand on which the gene is located
# Add that extra output at the end of the info in "genes" variable...

In [None]:
# we use newline = '' in order to avoid writing empty lines in between our data rows
# the `with` block closes (and flushes) the file even if writing fails
with open(r'c://downloads/genes.csv', 'w', newline = '') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(genes)

Notes:
* `newline = ''` is for avoiding blank lines between each two content lines
* `writerows` expects a list of lists of strings (or objects to convert to strings)
* You can write one row at a time using `writerow`, which expects a list of strings

##### Exercise

In [None]:
# Read the new file "genes.csv" using csv library and print out 
# every gene name, its length, and strand orientation...
# e.g. "Gene X is 1000 nucleotides long, and it is located on a + strand of a chromosome."
