In [9]:
import SmithWaterman as sw
import Preprocess as fp
import DiGraph as dg
from datetime import datetime
import Function as ft

# Preassembly

In [2]:
# Timestamp the start of the whole experiment and bracket the preprocessing
# call with timestamps so its wall-clock cost can be read from the cell output.
print(f"start experiment at {datetime.now()}")

print(f"start distance computation at {datetime.now()}")
# `processed` is iterated and indexed per read by the alignment step below
processed = fp.preprocess('coverage_1.fasta')
print(f"distance computation ends at {datetime.now()}")

start experiment at 2020-07-17 00:31:38.201317
start distance computation at 2020-07-17 00:31:38.202944
distance computation ends at 2020-07-17 00:31:38.336347


# Alignment and Graph Construction

In [3]:
print(f"alignment/graph construction starts at {datetime.now()}")

# Directed graph that receives one edge per accepted overlap
G = dg.DiGraph()

# Ploidy level; reused later by graph reduction and haplotype formation
ploidy_level = 3

# Align each read against every candidate listed for it in `processed`
for seq in processed:
    for seq2 in processed[seq]:
        # smith_waterman returns the overlap score, a prefix/suffix flag and
        # two value pairs (i, max_i) / (j, max_j) -- presumably alignment
        # coordinates for seq and seq2 respectively; confirm in SmithWaterman
        scoring, pre_su, i, max_i, j, max_j = sw.smith_waterman(seq, seq2)

        # a zero score marks a contained read: no edge is created for it
        if scoring == 0:
            continue

        if not pre_su:
            # seq2 is not a prefix: the edge runs from seq to seq2
            G.add_edge(seq, seq2, scoring, i, max_i, j, max_j)
        else:
            # seq2 is a prefix: the edge runs from seq2 to seq, and the
            # per-read value pairs are swapped to match
            G.add_edge(seq2, seq, scoring, j, max_j, i, max_i)

print(f"alignment/graph construction ends at {datetime.now()}")

alignment/graph construction starts at 2020-07-17 00:32:00.796830
overlap score = 13
overlap score = 15
overlap score = 13
overlap score = 14
overlap score = 13
overlap score = 2928
overlap score = 2840
overlap score = 14
overlap score = 14
overlap score = 7393
alignment/graph construction ends at 2020-07-17 02:57:42.125909


In [4]:
# Sanity check: report the number of disconnected components in the graph
# (the message appears in this cell's output).
G.num_of_subgraphs()

The number of disconnected components in the graph is  1


# Graph Cleaning

In [5]:
print(f"start graph cleaning at {datetime.now()}")

# Order matters here: cycles are removed before the topological sort is
# computed, and the resulting node order drives the transitive reduction.
G.remove_cycle()
sorted_node = G.topological_sort()
G.transitive_reduction(sorted_node)

print(f"graph cleaning ends at {datetime.now()}")

start graph cleaning at 2020-07-17 02:57:42.357457
All the cycles are removed from the graph.
Transitive reduction is completed.
graph cleaning ends at 2020-07-17 02:57:42.362220


# Graph Reduction

In [6]:
print(f"start graph reduction at {datetime.now()}")

# Filter the graph using the node order from the cleaning step and the
# configured ploidy level
G.filter_graph(sorted_node, ploidy_level)

print(f"graph reduction ends at {datetime.now()}")

start graph reduction at 2020-07-17 02:57:42.401116
Graph is reduced to best overlap graph.
graph reduction ends at 2020-07-17 02:57:42.417462


# Haplotype Formation

In [7]:
print(f"start haplotype formation at {datetime.now()}")

# Build the haplotypes for the configured ploidy level
result = G.haplotype(ploidy_level)

# Show each entry of the result on its own line
for haplotype in result:
    print(haplotype)

print(f"haplotype formation ends at {datetime.now()}")

print(f"experiment ends at {datetime.now()}")

start haplotype formation at 2020-07-17 02:57:42.456595
TTAATAGTTTTAGTGTTTACGCTAGTTTTATTATTTGCTTTTTATTTGATTAATTTTTTATTAAGAATTAAGGATATAGGAAAAAATAAAATTAGAGCGTTTGAATGTGGTTTTGTAAGAGTTGGAAAAATTCAAAATTCTTTTAGAATTCATTTTTTTATTATGATATTGATATTTGTTATTTTTGATTTAGAAATTGTTATGTTTTTAGGTATTTTAGTATCAGATTTAAGTTCGTATATCAGGTTTTTAATAATATTCATCTTCATCTTGGGAGGATTTTACATAGAGTGATGATATGGTAAATTAGTTTGAGTAATTTAATTAATATTTCTATTTTTTTGATTGGATTTGTTTTTTTTATAGGTGGAATTAGTGTTTGGCTTATACCCACATTTAAATTAGGAATCTTTTTTTTAGAATGAGATTTTTTAAGGTTAAAATTTAATTTTTATTTTAATAGAATCTTATTTTCGTTTATTCTTTTTTTGGTAACGTTTAGAGTTTTAGTTTTTAGTACTTATTATTTAAATAGTGAGTTAAACTTTAATTATTATTATTTTGTATTGTTAATTTTCGTAGGTAGAATGTTTAGGCTAAATTTTAGAAACAGTATTTTTACAATGTTACTAAGATGAGATTTATTGGGTATTTCTAGGTTTTTTTTAGTTTTATTTTATAATAATTGAGATAGATGTAGGGGTGCAATAAATACAGCATTAACTAATCGTCTAGGTGATTATTTTATATTTGTCTTTTTTGGTTTATCGGTTTTTAGAGGTTATTATTTTTTAAGATTTAGAATATTTAGAAGTTATATATCTTTATTATTACTTTTAACAGCTTTTACTAAAAGAGCACAATTTCCATTTAGATCTTGGTTACCCAAAGCTATAAGAGCCCCCACACCGGTGAGGTCTTTGGTTCATAGTAGAACTTTAGTT

In [10]:
# Write every entry of `result` to a FASTA file, one record per entry.
# The context manager guarantees the file is flushed and closed even if
# ft.int_to_Roman or write() raises part-way through.
with open("res_p3_cov1.fasta", "w") as ofile:
    # Records are numbered from 1; the number is rendered by ft.int_to_Roman
    for seq_num, haplotype in enumerate(result, start=1):
        ofile.write(">NC_seq" + ft.int_to_Roman(seq_num) + "\n" + haplotype + "\n")