In [1]:
import os
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from Bio import SeqIO
import seaborn as sns
plt.style.use('dark_background')
plt.rcParams['figure.figsize'] = [8, 6]
#%matplotlib inline
#%matplotlib

### Functions

In [2]:
def nucl_lists():
    """
    Generate lists of possible nucleotides
    
    Outputs:
    di_list - list of possible dinuclotides
    tri_list - list of possible trinuclotides
    tetra_list - list of possible tetranuclotides
    """
    
    nucl = ["A","T","C","G"]
    di_list = [i+j for i in nucl for j in nucl]
    tri_list = [i+j for i in di_list for j in nucl]
    tetra_list = [i+j for i in tri_list for j in nucl]

    return (di_list, tri_list, tetra_list)
  
    
def rev_comp(string):
    """
    Return reverse complement of DNA strand
    
    Input - sequence
    
    Output - reverse complement
    """
    
    complementary = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 
    comp = ''
    for i in string:
        comp += complementary[i]

    return comp[::-1]

    
def count_occurrences(subsequence, sequence):
    """
    Count occurrences of substring in a string
    (including overlaping substrings)
    
    Inputs:
    sequence - sequence
    subsequence - subsequence to be found in sequence
    
    Output: number of occurrences of subsequence in sequence
    """
    number_of_occurrences = 0
    for i in range(len(sequence)):
        if sequence[i:i+len(subsequence)] == subsequence:
            number_of_occurrences += 1
            
    return number_of_occurrences


def get_frequencies(sequence):
    """
    Count occurrences of oligonuclotides in sequence
    
    Input - sequence
    
    Output - array of tetranuclotides counts
    """
    encoding = {'A':0, 'T':1, 'C':2, 'G':3}
    
    seq = np.zeros(4)
    tetra_freq = np.ones(256)
    
    non_nucl = 0
    
    n = 1
    for i in sequence:

        if i in ['A','T','C','G']:
            seq[0] = seq[1]
            seq[1] = seq[2]
            seq[2] = seq[3]
            seq = seq*4
            seq[3] = encoding[i]

            if n > 3:
                tetra_freq[int(seq.sum())] += 1
            n +=1
            
        else:
            non_nucl += 1 
    
    if non_nucl > 0:
        print("Warning: %i characters other than A T C G found" % non_nucl)
    return(tetra_freq)


def get_23(tetra_list, tetra_freq, tri_list, di_list):
    """
    Extract tri and di nucleotides frequencies 
    from tetranucleotides frequencies 
    
    imputs:
    di_list - list of possible dinuclotides
    tri_list - list of possible trinuclotides
    tetra_list - list of possible tetranuclotides
    tetra_freq - array of tetranuclotides counts
    
    Outputs:
    tri_nucl - dictionary of trinuclotides frequency
    di_nucl - dictionary of dinuclotides frequency
    """
    #Calculate 3 and 2-mers based on 4-mers
    di_nucl = {di_list[i]:0 for i in range(len(di_list))}
    tri_nucl = {tri_list[i]:0 for i in range(len(tri_list))}
    
    for i in range(len(tetra_list)):
        tri_nucl[tetra_list[i][0:3]] += tetra_freq[i]
    
    for i in tri_nucl:
        di_nucl[i[0:2]] += tri_nucl[i]

    return(tri_nucl, di_nucl)


def rev_comp_freq(oligo_list, oligo_freq):
    """
    Calculate frequencies of oligonuclotides from the complementary strand
    
    Inputs:
    oligo_list - list of oligonuclotides
    oligo_freq - list of oligonuclotides counts
    
    Output - updated frequencies
    """
    
    r_dir = {oligo_list[i]:oligo_freq[i] for i in range(len(oligo_list))}
    
    for i in range(len(oligo_list)):
        oligo_freq[i] += r_dir[rev_comp(oligo_list[i])]
    
    return oligo_freq


def exp_tnf(tetra_list, tri_nucl, di_nucl):
    """
    Calculate expected tetranucleotide frequency
    
    Inputs:
    tetra_list - list of tetranuclotides
    tri_nucl - dictionary of trinuclotides frequency
    di_nucl - dictionary of dinuclotides frequency
    
    Output: expected tetranuclotide frequency
    """
    
    expected_tnf = np.zeros(len(tetra_list))
    for i in range(len(tetra_list)):
        tn = tetra_list[i]
        expected_tnf[i] = (tri_nucl[tn[0:3]]*tri_nucl[tn[1:4]])/di_nucl[tn[1:3]]
            
    return expected_tnf


def app_variance(expected_tnf, tetra_list, tri_nucl, di_nucl):
    """
    Approximated variance of tetranuclotide frequency
    
    Inputs:
    expected_tnf - expected tetranuclotide frequency
    tetra_list - list of tetranuclotides
    tri_nucl - dictionary of trinuclotides frequency
    di_nucl - dictionary of dinuclotides frequency
    
    Output - approximated variance of tetranuclotide frequency
    """
    
    app_var = np.zeros(len(tetra_list))
    for i in range(len(tetra_list)):
        tn = tetra_list[i]
        n23 = di_nucl[tn[1:3]]
        n123 = tri_nucl[tn[0:3]]
        n234 = tri_nucl[tn[1:4]]
        app_var[i] = expected_tnf[i] * (((n23-n123)*(n23-n234))/(n23*n23))
            
    return app_var


def tnf_z_score(tetra_freq, expected_tnf, app_var):
    """
    Calculate tetranuclotide z-score
    
    Inputs:
    tetra_freq - observed tetranuclotide frequency
    expected_tnf - expected tetranuclotide frequency
    app_var - approximated variance of tetranuclotide frequency
    
    Output - tetranuclotide z-score
    """
    
    z_scores = (tetra_freq-expected_tnf)/np.sqrt(app_var)
    return z_scores


def get_z_scores(sequence, di_list, tri_list, tetra_list):
    """
    Calculate z-scores from sequence
    
    Inputs:
    sequence - DNA sequence
    di_list - list of possible dinuclotides
    tri_list - list of possible trinuclotides
    tetra_list - list of possible tetranuclotides
    
    Output - np array of z-scores
    """
    
    tetra_freq = get_frequencies(sequence)
    tetra_freq = rev_comp_freq(tetra_list, tetra_freq)
    tri_nucl, di_nucl = get_23(tetra_list, tetra_freq, tri_list, di_list)
    
    expected_tnf = exp_tnf(tetra_list, tri_nucl, di_nucl)
    app_var = app_variance(expected_tnf, tetra_list, tri_nucl, di_nucl)
    z_scores = tnf_z_score(tetra_freq, expected_tnf, app_var)
    
    return z_scores


def inter_correlations(all_z_scores):
    """
    Calcualte Person's correlation coefficient between fragment and median
    
    Input - np array of z-scores for all fragments
    
    Output - np array of correlation coeeficients
    """
    
    median_z = np.median(all_z_scores, axis=0)

    correlations = np.zeros(len(all_z_scores))
    for i in range(len(all_z_scores)):
        correlations[i] = np.corrcoef(all_z_scores[i], median_z)[1,0]
        
    return correlations


def split_fragment(sequence, n):
    """
    Split sequence into fragments no longer than n
    
    Inputs:
    sequence - sequence
    n - max length of fragment
    
    Output - fragments
    """
    for i in range(0, len(sequence), n):
        yield sequence[i:i + n]
        
def opt_len(s,l):
    return math.ceil(len(s)/round(len(s)/l))

### Tetranucleotide Z-scores for bin

In [5]:
contigs = SeqIO.parse("./test/bin.5.fa", 'fasta')
di_list, tri_list, tetra_list = nucl_lists()

leng =[]
contigs_z_scores = []
for record in contigs:  
    if len(record.seq) > 2500:
        for fragment in split_fragment(record.seq, opt_len(record.seq, 5000)):
            leng.append(len(fragment))
            z_scores = get_z_scores(fragment, di_list, tri_list, tetra_list)
            contigs_z_scores.append(z_scores)
    
contigs_z_scores = np.array(contigs_z_scores)
contig_corr = inter_correlations(contigs_z_scores)



In [6]:
sns.distplot(contig_corr, kde=False)

NameError: name 'sns' is not defined

### Similarity of bins 

In [10]:
bins_path = "./contamination/s_marcescens/"
di_list, tri_list, tetra_list = nucl_lists()

all_z = []
all_corr = []
names = []

for fasta in os.listdir(bins_path):
    
    path = bins_path+fasta
    print(path)
    names.append(fasta)
    
    contigs = SeqIO.parse(path, 'fasta')
    di_list, tri_list, tetra_list = nucl_lists()

    contigs_z_scores = []
    for record in contigs:  
        if len(record.seq) > 2500:
            for fragment in split_fragment(record.seq, opt_len(record.seq, 5000)):
                z_scores = get_z_scores(fragment, di_list, tri_list, tetra_list)
                contigs_z_scores.append(z_scores)

    contigs_z_scores = np.array(contigs_z_scores)
    contig_corr = inter_correlations(contigs_z_scores)
    
    all_z.append(contigs_z_scores)
    all_corr.append(contig_corr)
    

./contamination/s_marcescens/contaminated_0.05
./contamination/s_marcescens/contaminated_0.4
./contamination/s_marcescens/contaminated_0.15
./contamination/s_marcescens/contaminated_0.35
./contamination/s_marcescens/contaminated_0.3
./contamination/s_marcescens/contaminated_0.25
./contamination/s_marcescens/contaminated_0.1
./contamination/s_marcescens/contaminated_0.2


In [12]:
#Make easy to handle dataframes and save them for later 
frames = []
for i in range(len(all_z)):
    df = pd.DataFrame(all_z[i])
    df['contamination'] = names[i].split('_')[1]
    #df['genome'] = names[i]
    frames.append(df)

z = pd.concat(frames,ignore_index=True)

z.to_csv("s_marcescens_z.csv",header=True, index=False)


#Make Dataframe of correlations and save it for later
vec = []
for i in range(len(all_corr)):
    for j in all_corr[i]:
        vec.append(np.array([j,names[i].split('_')[1]]))
        #vec.append(np.array([j,names[i]]))
corr = pd.DataFrame(vec, columns=['corr', 'contamination'])

corr.to_csv("s_marcescens_corr.csv",header=True, index=False)



## Raw TN counts

In [6]:
bins_path = "./contamination/s_marcescens/"
di_list, tri_list, tetra_list = nucl_lists()


tnf_list = []
names = []

for fasta in os.listdir(bins_path):
    
    path = bins_path+fasta
    print(path)
    names.append(fasta)
    
    contigs = SeqIO.parse(path, 'fasta')
    di_list, tri_list, tetra_list = nucl_lists()

    contigs_tnf = []
    for record in contigs:  
        if len(record.seq) > 2500:
            for fragment in split_fragment(record.seq, opt_len(record.seq, 5000)):
                tnf = get_frequencies(fragment)
                tnf = rev_comp_freq(tetra_list, tnf)
                contigs_tnf.append(tnf)
    
    tnf_list.append(contigs_tnf)

./contamination/s_marcescens/contaminated_0.05
./contamination/s_marcescens/contaminated_0.4
./contamination/s_marcescens/contaminated_0.15
./contamination/s_marcescens/contaminated_0.35
./contamination/s_marcescens/contaminated_0.3
./contamination/s_marcescens/contaminated_0.25
./contamination/s_marcescens/contaminated_0
./contamination/s_marcescens/contaminated_0.1
./contamination/s_marcescens/contaminated_0.2


In [7]:
frames = []
for i in range(len(tnf_list)):
    df = pd.DataFrame(tnf_list[i])
    df['contamination'] = names[i].split('_')[1]
    #df['genome'] = names[i]
    frames.append(df)

z = pd.concat(frames,ignore_index=True)

z.to_csv("s_marcescens_counts.csv",header=True, index=False)


## Alternative z-scores