# Code optimization (examples)

Some are probably obvious...

### Print on the same line if you are tracking progress

In [None]:
import time

# Overwrite one status line instead of printing a new line per iteration:
# end="\r" moves the cursor back to the start of the line, so the next
# print replaces the previous text.
for i in range(10):
    # flush=True forces the text out immediately; because no newline is
    # written, a buffered stdout could otherwise hold the updates back.
    print("Current iteration is:", i + 1, end="\r", flush=True)
    time.sleep(1)  # one-second pause between updates

### Use appropriate data structures

In [None]:
# Store the same 6**10 (about 60 million) integers once as a list and once
# as a set. list(range(...)) / set(range(...)) fill the containers directly,
# without a Python-level comprehension that merely copies each element.
large_list = list(range(6**10))
large_set = set(range(6**10))

# Both containers hold the same number of elements.
print(len(large_list))
print(len(large_set))

In [None]:
# Time a single membership test on each container: a list lookup is O(n),
# a set lookup is O(1).
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() is wall-clock time and can be
# adjusted while the measurement is running.
list_start = time.perf_counter()
print(60000000 in large_list)
list_end = time.perf_counter()

set_start = time.perf_counter()
print(60000000 in large_set)
set_end = time.perf_counter()

# Elapsed time of each lookup, in seconds.
print("List search time:", list_end - list_start)
print("Set search time:", set_end - set_start)

In [None]:
# Release large objects as soon as they are no longer needed:
# together these two occupy several GB of RAM.

del large_list
del large_set

### Be conservative with loops (especially in large files)

In [None]:


# If you have to do multiple analyses, include all of them in a single loop
# E.g. count the number of ncRNAs in a file and calculate the average length of mRNAs
# NOTE: `file`, `ncRNA` and `mRNA` are not defined in this notebook; they
# appear to be placeholders (pseudo-code) -- confirm before running.

# Don't: the file is traversed twice, once per analysis

ncRNAs_count = 0

# First pass over the file: count the transcripts that match ncRNA
for transcript in file:
    if transcript == ncRNA:
        ncRNAs_count += 1

print("ncRNAs_count:", ncRNAs_count)
        
mRNAs_length = []

# Second pass over the same file: collect the lengths of the mRNA transcripts
# NOTE(review): if `file` is a one-shot iterator (e.g. an open file handle),
# this second loop would see no items -- confirm what `file` is.
for transcript in file:
    if transcript == mRNA:
        mRNAs_length.append(len(transcript))

import numpy as np
print("Mean of mRNA transcripts:", np.mean(mRNAs_length))

In [None]:
# Do: a single pass over the file feeds both analyses

import numpy as np

ncRNAs_count = 0
mRNAs_length = []

# NOTE: `file`, `ncRNA` and `mRNA` are placeholders that are not defined in
# this notebook.
for transcript in file:
    if transcript == ncRNA:
        ncRNAs_count += 1
    # elif: a transcript already counted as ncRNA is not compared against
    # mRNA again, which saves one comparison per ncRNA transcript.
    elif transcript == mRNA:
        mRNAs_length.append(len(transcript))

print("ncRNAs_count:", ncRNAs_count)
print("Mean of mRNA transcripts:", np.mean(mRNAs_length))

### Avoid unnecessary computation and variable definition
(unless it is better for readability, or saves computation)

In [None]:
# E.g. calculate the average GC content of mRNA transcripts
# NOTE: `file`, `mRNA` and `GC` are not defined in this notebook; they
# appear to be placeholders (pseudo-code) -- confirm before running.

# Don't: GC() is called for every transcript, although the result is only
# kept for those that match mRNA

GC_content = []

for transcript in file:
    GC_perc = GC(transcript)  # computed even when it is discarded below
    if transcript == mRNA:
        GC_content.append(GC_perc)

In [None]:
# Do: GC() is only called for transcripts that pass the mRNA check

GC_content = [GC(transcript) for transcript in file if transcript == mRNA]

In [None]:
# Or: keep the intermediate variable, but compute it only after the mRNA check

GC_content = []

for transcript in file:
    if transcript == mRNA:
        GC_perc = GC(transcript)  # worth naming only if GC_perc is needed again later in the code
        GC_content.append(GC_perc)

### Avoid creating large variables if you can

In [None]:
# E.g. count nucleotide frequency in all transcripts combined
# NOTE: `file` is not defined in this notebook and `transcript.sequence` is
# presumably a string of nucleotides -- confirm before running.

# Don't: every sequence is first concatenated into one large string, which
# has to be held in memory before anything is counted

from collections import Counter

transcripts_combined = ''

for transcript in file:
    transcripts_combined += transcript.sequence
    
# Count the elements of the combined string in one go
nuc_freq = Counter(transcripts_combined)

In [None]:
# Do: add each transcript to a running counter, so that no combined string
# is ever built

from collections import Counter

nuc_freq = Counter()

# NOTE: `file` is a placeholder that is not defined in this notebook.
for transcript in file:
    # update() has to be called on the nuc_freq instance (not on the Counter
    # class); it adds the counts of this sequence's elements to the totals.
    nuc_freq.update(transcript.sequence)

### Which one is better?

sum(x ** 4 for x in range(10 ** 7))

or

sum([x ** 4 for x in range(10 ** 7)])

In [None]:
# time difference?
# (another way to measure execution time; doesn't require import time)
# %time is an IPython magic (available in notebooks / IPython, not in plain
# Python scripts); it times a single run of the statement that follows it.

# generator expression: sum() consumes the values one at a time
%time sum(x ** 4 for x in range(10 ** 7))

# list comprehension: the whole list is built first, then summed
%time sum([x ** 4 for x in range(10 ** 7)])

In [None]:
# btw, if you want to measure a really short execution time...
# %time times one single run; %timeit repeats the statement many times and
# reports timing statistics over the runs, which is more reliable for code
# that finishes very quickly.

%time sum(x for x in range(1000))
%timeit sum(x for x in range(1000))

In [None]:
# resources?
# generator expression: values are handed to sum() one at a time, so no
# list of 10 ** 8 results is ever held in memory

sum(x ** 4 for x in range(10 ** 8))

In [None]:
# resources?
# list comprehension: the full list of 10 ** 8 results is built in memory
# before sum() starts adding them up

sum([x ** 4 for x in range(10 ** 8)])