In [38]:
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO

### Generate di-, tri-, and tetranucleotide lookup tables

In [39]:
# The four DNA bases; every k-mer table below is built by extending the
# previous table with one more base, so the ordering follows this list.
nucl = ["A", "T", "G", "C"]

# Count tables (k-mer -> count), initialised to zero.
di_nucl = dict.fromkeys((first + second for first in nucl for second in nucl), 0)
tri_nucl = dict.fromkeys((prefix + base for prefix in di_nucl for base in nucl), 0)

# Tetranucleotides are kept as an ordered list; positions in this list are
# used as array indices by the functions further down.
tetra_nucl = [prefix + base for prefix in tri_nucl for base in nucl]

In [80]:
def count_occurrences(subsequence, sequence):
    """
    Count occurrences of a substring in a string
    (including overlapping occurrences).

    Inputs:
    sequence - sequence to search; converted with str() once, so any object
               whose str() is the sequence text is accepted
    subsequence - subsequence to be found in sequence (also passed through str())

    Output: number of (possibly overlapping) occurrences of subsequence in
    sequence. An empty subsequence yields len(sequence), which is what the
    earlier slice-based implementation returned.
    """
    # Convert once up front: the scan below then works on plain strings
    # instead of slicing the input object at every position.
    text = str(sequence)
    pattern = str(subsequence)

    if not pattern:
        # Kept for backward compatibility with the slice-comparison version,
        # where the empty slice matched at every index of the sequence.
        return len(text)

    # Start positions past this index cannot hold a full-length match.
    last_start = len(text) - len(pattern)

    # Step one position at a time so overlapping matches are all counted.
    return sum(
        1 for start in range(last_start + 1) if text.startswith(pattern, start)
    )
    

def exp_tnf(tetra_nucl, tri_nucl, di_nucl):
    """
    Compute the expected count of each tetranucleotide from observed
    tri- and dinucleotide counts.

    For a tetranucleotide n1n2n3n4 the expected count is

        N(n1n2n3) * N(n2n3n4) / N(n2n3)

    Inputs:
    tetra_nucl - ordered sequence of 4-character tetranucleotide strings
    tri_nucl - mapping trinucleotide -> observed count
    di_nucl - mapping dinucleotide -> observed count

    Output: numpy float array with one expected count per entry of
    tetra_nucl, in the same order. Where the central dinucleotide count
    N(n2n3) is zero the expected count is reported as 0.0.
    """
    expected_tnf = np.zeros(len(tetra_nucl))
    for idx, tn in enumerate(tetra_nucl):
        n23 = di_nucl[tn[1:3]]
        if n23 == 0:
            # Avoid dividing by zero: a dinucleotide that was never observed
            # leaves the ratio undefined, so the entry keeps its 0.0 default.
            continue
        expected_tnf[idx] = (tri_nucl[tn[0:3]] * tri_nucl[tn[1:4]]) / n23

    return expected_tnf


def app_variance(expected_tnf, tetra_nucl, tri_nucl, di_nucl):
    """
    Compute an approximate variance for each expected tetranucleotide count.

    For a tetranucleotide n1n2n3n4 with expected count E the value is

        E * (N(n2n3) - N(n1n2n3)) * (N(n2n3) - N(n2n3n4)) / N(n2n3)**2

    Inputs:
    expected_tnf - expected counts, indexable by position and aligned with
                   tetra_nucl (e.g. the array returned by exp_tnf)
    tetra_nucl - ordered sequence of 4-character tetranucleotide strings
    tri_nucl - mapping trinucleotide -> observed count
    di_nucl - mapping dinucleotide -> observed count

    Output: numpy float array with one variance per entry of tetra_nucl, in
    the same order. Where the central dinucleotide count N(n2n3) is zero the
    variance is reported as 0.0.
    """
    app_var = np.zeros(len(tetra_nucl))
    for idx, tn in enumerate(tetra_nucl):
        n23 = di_nucl[tn[1:3]]
        if n23 == 0:
            # Avoid dividing by zero: with no observations of the central
            # dinucleotide the ratio is undefined, so keep the 0.0 default.
            continue
        n123 = tri_nucl[tn[0:3]]
        n234 = tri_nucl[tn[1:4]]
        app_var[idx] = expected_tnf[idx] * (((n23 - n123) * (n23 - n234)) / (n23 * n23))

    return app_var

In [43]:
contigs = SeqIO.parse("./test/test.fasta", 'fasta')

# The count tables and tetra_freq are overwritten on every pass, so once the
# loop finishes they describe only the last record read from the file.
for record in contigs:
    contig_seq = record.seq

    # Refresh the dinucleotide counts in place for this record.
    for kmer in di_nucl:
        di_nucl[kmer] = count_occurrences(kmer, contig_seq)

    # Refresh the trinucleotide counts in place for this record.
    for kmer in tri_nucl:
        tri_nucl[kmer] = count_occurrences(kmer, contig_seq)

    # Observed tetranucleotide counts, aligned with the order of tetra_nucl.
    tetra_freq = np.array(
        [count_occurrences(kmer, contig_seq) for kmer in tetra_nucl],
        dtype=float,
    )
        


In [70]:
# Expected tetranucleotide counts derived from the current tri- and
# dinucleotide count tables.
expected_tnf = exp_tnf(
    tetra_nucl=tetra_nucl,
    tri_nucl=tri_nucl,
    di_nucl=di_nucl,
)

In [81]:
# Approximate variance of each expected tetranucleotide count, computed from
# the same count tables that produced expected_tnf.
app_var = app_variance(
    expected_tnf=expected_tnf,
    tetra_nucl=tetra_nucl,
    tri_nucl=tri_nucl,
    di_nucl=di_nucl,
)

In [82]:
# Display the approximate variance computed for each tetranucleotide
# (one value per entry of tetra_nucl, in the same order).
app_var

array([12.76938805, 11.85026571,  9.83674653, 11.12209282,  9.95094866,
       12.63966295,  8.18861607, 10.26496429,  7.08706189,  7.41860377,
        6.07826779,  7.52345034,  9.04910036, 10.83510701,  7.63111932,
        8.71895974, 10.66829031, 10.3873731 ,  6.16967693,  9.14687423,
       13.9791508 , 17.49015823,  9.53494263, 14.32165131,  6.77163064,
        8.03783703,  4.91041018,  7.33370275, 11.09167698,  9.55671564,
        6.9715176 , 10.65580057,  5.92729332,  6.48950271,  4.28082295,
        5.23390173,  6.1062125 ,  8.06158193,  5.00271975,  6.01411773,
        4.35077843,  5.42969487,  3.54637919,  4.45292436,  6.00883083,
        7.89974476,  5.2451925 ,  5.00717537,  9.49250947,  9.07577532,
        5.6481369 ,  8.18188056,  8.85191511, 13.42710568,  6.55156156,
        9.55709912,  5.09029648,  7.3933557 ,  5.65802735,  6.54497295,
        8.4515491 ,  7.31754378,  7.31754378,  7.51011072, 12.6838883 ,
       11.77092011,  9.77088281, 11.04762283, 10.33610149, 13.12