# Files - Exercises VI

## 1. Counting Severe Acute Respiratory Syndrome coronavirus 2 genome bps

In [20]:
filename = 'sequence.fasta'

try:
    with open(filename, 'r') as f:
         seq = ''.join([line.strip() for line in f if '>' not in line])

except FileNotFoundError as e:
    print(f'{type(e).__name__} - {e}')

print(f'Number of nucleotides (bps): {len(seq):,}')

Number of nucleotides (bps): 29,903


## 2. Start codons in the SARS-CoV-2 genome

In [21]:
start_codon = 'ATG'
stop_codons = ('TAA', 'TAG', 'TGA')

start_codon_index = seq.find(start_codon)

if start_codon_index > -1:
    print(f'Start codon {start_codon} found at:\t{start_codon_index}')
else:
    print('Start codon not found')

for codon in stop_codons:
    stop_codon_index = seq.find(codon, start_codon_index + 3)

    if stop_codon_index > -1:
        print(f'Stop codon {codon} found at:\t{stop_codon_index}')
        break
else:
    print('Stop codon not found.')

Start codon ATG found at:	106
Stop codon TAA found at:	129


## 3. SARS-CoV-2 genomic variations

In [25]:
US_filename = 'USA.txt'
Wuhan_filename = 'Wuhan-Hu-1.txt'

def get_seq(filename):
    try:
        with open(filename, 'r') as f:
             return ''.join([line.strip() for line in f if '>' not in line])
    
    except FileNotFoundError as e:
        print(f'{type(e).__name__} - {e}')

US = get_seq(US_filename)
Wu = get_seq(Wuhan_filename)

print('W -> U     Index')
print('-' * 16)

for i, n in enumerate(zip(Wu, US)):
    w, u = n

    if u != w:
        print(f'{w} -> {u} at: {i:}')

W -> U     Index
----------------
C -> T at: 8516
C -> T at: 17795
G -> T at: 21294
A -> T at: 21295
G -> T at: 21296
A -> G at: 21297
G -> T at: 21298
C -> T at: 21299
C -> T at: 21300
G -> C at: 21303
C -> T at: 21305
C -> G at: 21306
C -> T at: 21307
G -> T at: 21309
G -> T at: 21310
T -> A at: 21311
C -> G at: 21314
A -> C at: 21315
A -> C at: 21316
C -> A at: 21317
G -> C at: 21318
A -> T at: 21319
G -> A at: 21320
A -> G at: 21321
A -> T at: 21322
A -> C at: 21323
A -> T at: 21324
A -> T at: 21326
C -> A at: 21327
A -> G at: 21328
C -> T at: 21329
G -> C at: 21330
T -> A at: 21331
C -> G at: 21332
C -> T at: 21333
A -> G at: 21334
A -> T at: 21335
C -> G at: 21336
C -> T at: 21338
G -> A at: 21340
T -> C at: 21342
G -> T at: 21344
C -> A at: 21345
T -> A at: 21347
G -> A at: 21348
T -> C at: 21349
T -> C at: 21350
T -> A at: 21351
T -> G at: 21352
C -> A at: 21354
A -> C at: 21355
G -> T at: 21356
G -> C at: 21357
T -> A at: 21358
T -> A at: 21359
C -> T at: 21360
G -> T at: 2136

## 4. BLAST (Basic Local Alignment Search Tool)

In [None]:
Wuhan-Hu-1: NC_045512.2 (considered the Reference) & India: MT050493.1

Query  8781   GCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCCATTGATTGCTGCAGTCATAA  8840
Sbjct  8761   .T..........................................................  8820

Query  28101  CCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTTACCTTTTACAATTAAT  28160
Sbjct  28081  ...........................................C................  28140

## 5. Logistic growth model

In [2]:
p = 0.43
r = 3.1
y = 12

with open('Logistic growth model.txt', 'w') as f:
    for i in range(y + 1):
        f.write(f'At year {i:>2}, the population density is {p:.2f}\n')
        p = r * p * (1 - p)

## 6. Copying a file

In [None]:
in_file = ''
out_file = ''

from shutil import copyfile

try:
    copyfile(in_file, out_file)
except e:
    print(f'{type(e).__name__} - {e}')

## 7. Comparing k-mers in viral genomes

### 7.1

In [2]:
filename = 'Dengue virus 1.fasta'

from collections import Counter

try:
    with open(filename, 'r') as f:
         seq = ''.join(line.strip() for line in f if '>' not in line)

except FileNotFoundError as e:
    print(f'{type(e).__name__} - {e}')

k = 9

kmers = [seq[i: i + k] for i in range(len(seq) - k + 1)]

del seq

kcounts = Counter(kmers)

for k, v in kcounts.items():
    print(f'{k} : {v}')

ATGAACAAC : 1
TGAACAACC : 1
GAACAACCA : 2
AACAACCAA : 1
ACAACCAAC : 1
CAACCAACG : 1
AACCAACGG : 1
ACCAACGGA : 1
CCAACGGAA : 1
CAACGGAAA : 1
AACGGAAAA : 1
ACGGAAAAA : 1
CGGAAAAAG : 1
GGAAAAAGA : 1
GAAAAAGAC : 1
AAAAAGACG : 1
AAAAGACGG : 1
AAAGACGGG : 1
AAGACGGGT : 2
AGACGGGTC : 1
GACGGGTCG : 1
ACGGGTCGA : 1
CGGGTCGAC : 1
GGGTCGACC : 1
GGTCGACCG : 1
GTCGACCGT : 1
TCGACCGTC : 1
CGACCGTCT : 1
GACCGTCTT : 1
ACCGTCTTT : 1
CCGTCTTTC : 1
CGTCTTTCA : 1
GTCTTTCAA : 1
TCTTTCAAT : 1
CTTTCAATA : 1
TTTCAATAT : 1
TTCAATATG : 1
TCAATATGC : 1
CAATATGCT : 1
AATATGCTG : 1
ATATGCTGA : 1
TATGCTGAA : 1
ATGCTGAAA : 1
TGCTGAAAC : 1
GCTGAAACG : 1
CTGAAACGC : 1
TGAAACGCG : 1
GAAACGCGC : 1
AAACGCGCG : 1
AACGCGCGA : 1
ACGCGCGAG : 1
CGCGCGAGA : 1
GCGCGAGAA : 1
CGCGAGAAA : 1
GCGAGAAAC : 1
CGAGAAACC : 1
GAGAAACCG : 1
AGAAACCGC : 1
GAAACCGCG : 1
AAACCGCGT : 1
AACCGCGTG : 1
ACCGCGTGT : 1
CCGCGTGTC : 1
CGCGTGTCA : 1
GCGTGTCAA : 1
CGTGTCAAC : 1
GTGTCAACT : 1
TGTCAACTG : 1
GTCAACTGT : 1
TCAACTGTT : 1
CAACTGTTT : 2
AACTGT

### 7.2

In [4]:
vgfilename_1 = 'Zika.fasta'
vgfilename_2 = 'Mutant Zaire ebolavirus.fasta'

from collections import Counter

def get_seq(filename):
    try:
        with open(filename, 'r') as f:
             return ''.join(line.strip() for line in f if '>' not in line)

    except FileNotFoundError as e:
        print(f'{type(e).__name__} - {e}')

vg1 = get_seq(vgfilename_1)
vg2 = get_seq(vgfilename_2)

k = 9

kvg1 = [vg1[i: i + k] for i in range(len(vg1) - k + 1)]
kvg2 = [vg2[i: i + k] for i in range(len(vg2) - k + 1)]

del vg1, vg2

counts1 = Counter(kvg1)
counts2 = Counter(kvg1)

del kvg1, kvg2

common = counts1.keys() & counts2.keys()

print(f'Count: {len(common)}')
print(f'Common {k}-mers:\n{common}')

Count: 9776
Common 9-mers:
{'CTGGCCATT', 'GGAGGTCCC', 'ATGGATGGT', 'CCATATGGA', 'GCCTGGGCT', 'GTGGACAAG', 'GCTGCTCAG', 'AGCTCGACG', 'CTCATGAAG', 'TGCTAGTGT', 'TGGAGTGGC', 'TGGCTGGGA', 'TCAGCAGGA', 'CCACTAGCT', 'CCCGTGTAA', 'ACCATAATG', 'TAAGGTCAG', 'TCCCTCGTG', 'AGAGAAAAG', 'GGGAAGGAG', 'CTCACATGC', 'AAGTGAAAA', 'GGAGTGGCC', 'GCTTGCTAA', 'CTAAAACCA', 'GAGGGCATG', 'CACGCTTAC', 'GAAGCTTAG', 'ATCGTTTCG', 'CACTCTGCT', 'GGGACCTCA', 'TGGCTGGAC', 'ACCACTGAG', 'TGGGAGTAA', 'ATTATGCTC', 'GGTTCTCAG', 'GCCTATCAG', 'ATTTCGTTT', 'ATCGGACAT', 'CAGATCCCC', 'GTCTCACCA', 'GAGGACCAT', 'GCCTGGTTA', 'TCGGACAGT', 'TACACGAAG', 'GCACAAAGA', 'CATGGCTGC', 'TTGGACACC', 'AGTTACAGG', 'GCCTTGAGA', 'GATAAGGTA', 'TCACCAATC', 'CGGGTACAC', 'GACACTGCT', 'TTACGCCTA', 'TTCAGAGGA', 'TGTATGTGA', 'GTAAACCCC', 'GCTGCTGGT', 'AAAGATTGC', 'GGAAGCTGT', 'TACAGAGTG', 'TGGAATGGA', 'CCAGATGGT', 'AGCCTTGGG', 'GAGAGCCTG', 'ACTGTGAGA', 'GAAGCTCTG', 'AAGCCATAA', 'AGGACGGGG', 'AAAGAATGG', 'TAGAGGACC', 'TGTGATTCT', 'TCAGACCAG', 'CTCAGAGGA