In [1]:
import SmithWaterman as sw
import Preprocess as fp
import DiGraph as dg
from datetime import datetime
import Function as ft

# Preassembly

In [2]:
# Timestamp marking the start of the whole run.
print("start experiment at", datetime.now())

# Preassembly: run the project's preprocessing step on the input reads.
# The result is consumed by the graph-construction loop further down, which
# iterates over `processed` and indexes it by each entry.
input_fasta = 'coverage_10.fasta'

print("start distance computation at", datetime.now())
processed = fp.preprocess(input_fasta)
print("distance computation ends at", datetime.now())

start experiment at 2020-07-17 00:10:20.586570
start distance computation at 2020-07-17 00:10:20.589993
distance computation ends at 2020-07-17 00:10:25.024661


# Graph Alignment and Graph Construction

In [3]:
print("graph alignment/construction starts at", datetime.now())

# Overlap graph that every later stage (cleaning, reduction, haplotypes) works on.
G = dg.DiGraph()

# Ploidy level; also passed to the graph-reduction and haplotype cells.
ploidy_level = 3

# Align every sequence against each of the candidates listed for it in `processed`.
for seq in processed:
    for seq2 in processed[seq]:
        # The aligner hands back the score, a prefix/suffix flag, and two
        # (value, max) pairs: i/max_i go with seq, j/max_j go with seq2.
        scoring, pre_su, i, max_i, j, max_j = sw.smith_waterman(seq, seq2)

        # A zero score is treated as a contained read (per the original
        # author's note), so no edge is recorded for this pair.
        if scoring == 0:
            continue

        if pre_su:
            # seq2 is the prefix: the edge runs seq2 -> seq, with each
            # sequence's own (value, max) pair kept next to it.
            G.add_edge(seq2, seq, scoring, j, max_j, i, max_i)
        else:
            # Otherwise the edge runs seq -> seq2.
            G.add_edge(seq, seq2, scoring, i, max_i, j, max_j)

print("graph alignment/construction ends at", datetime.now())

graph alignment/construction starts at 2020-07-17 00:10:25.038142
overlap score = 23
overlap score = 15
overlap score = 15
overlap score = 3898
overlap score = 7445
overlap score = 8384
overlap score = 22
overlap score = 22
overlap score = 9697
overlap score = 23
overlap score = 23
overlap score = 23
overlap score = 10303
overlap score = 23
overlap score = 23
overlap score = 23
overlap score = 10142
overlap score = 23
overlap score = 23
overlap score = 22
overlap score = 22
overlap score = 9418
overlap score = 2447
overlap score = 23
overlap score = 23
overlap score = 3041
overlap score = 9486
overlap score = 23
overlap score = 22
overlap score = 23
overlap score = 3675
overlap score = 23
overlap score = 23
overlap score = 23
overlap score = 7044
overlap score = 5328
overlap score = 15
overlap score = 22
overlap score = 4645
overlap score = 16
overlap score = 4053
overlap score = 4890
overlap score = 8472
overlap score = 23
overlap score = 15
overlap score = 23
overlap score = 4679
ove

overlap score = 9057
overlap score = 2669
overlap score = 23
overlap score = 23
overlap score = 3041
overlap score = 8971
overlap score = 23
overlap score = 22
overlap score = 23
overlap score = 3314
overlap score = 23
overlap score = 23
overlap score = 23
overlap score = 6683
overlap score = 5169
overlap score = 15
overlap score = 22
overlap score = 4284
overlap score = 16
overlap score = 3447
overlap score = 4529
overlap score = 8111
overlap score = 36
overlap score = 15
overlap score = 38
overlap score = 4679
overlap score = 23
overlap score = 6819
overlap score = 8422
overlap score = 23
overlap score = 5803
overlap score = 10620
overlap score = 4275
overlap score = 23
overlap score = 5219
overlap score = 9352
overlap score = 4160
overlap score = 7244
overlap score = 23
overlap score = 22
overlap score = 6087
overlap score = 10620
overlap score = 22
overlap score = 23
overlap score = 9945
overlap score = 3400
overlap score = 10620
overlap score = 23
overlap score = 8438
overlap scor

overlap score = 15
overlap score = 3972
overlap score = 15
overlap score = 15
overlap score = 22
overlap score = 7205
overlap score = 5345
overlap score = 7205
overlap score = 15
overlap score = 5052
overlap score = 1702
overlap score = 23
overlap score = 23
overlap score = 2521
overlap score = 9542
overlap score = 23
overlap score = 22
overlap score = 23
overlap score = 3885
overlap score = 23
overlap score = 23
overlap score = 23
overlap score = 7254
overlap score = 5279
overlap score = 15
overlap score = 22
overlap score = 4855
overlap score = 16
overlap score = 4018
overlap score = 4996
overlap score = 8682
overlap score = 23
overlap score = 15
overlap score = 23
overlap score = 4679
overlap score = 23
overlap score = 14
overlap score = 22
overlap score = 2222
overlap score = 1694
overlap score = 22
overlap score = 22
overlap score = 22
overlap score = 15
overlap score = 22
overlap score = 14
overlap score = 14
overlap score = 17
overlap score = 17
overlap score = 14
overlap score 

In [4]:
# Sanity check on the freshly built overlap graph: report how many
# disconnected components (subgraphs) it contains. The method prints the
# count itself, as the cell output shows.
G.num_of_subgraphs()

The number of disconnected components in the graph is  1


# Graph Cleaning

In [5]:
# Graph cleaning: three steps whose order matters because each one works on
# the graph left behind by the previous step.
print("start graph cleaning at", datetime.now())
# Step 1: drop cycles from the overlap graph.
# NOTE(review): presumably required so a topological order exists — confirm in DiGraph.
G.remove_cycle()

# Step 2: compute a topological ordering of the nodes. `sorted_node` is kept
# as a notebook-level name because the graph-reduction cell reuses it.
sorted_node = G.topological_sort()

# Step 3: transitive reduction, driven by the ordering computed above.
G.transitive_reduction(sorted_node)

print("graph cleaning ends at", datetime.now())

start graph cleaning at 2020-07-22 23:46:12.664161
All the cycles are removed from the graph.
Transitive reduction is completed.
graph cleaning ends at 2020-07-22 23:46:13.665205


# Graph Reduction

In [6]:
# Graph reduction: filter the cleaned graph using the node ordering from the
# graph-cleaning cell (`sorted_node`) and the ploidy level set in the
# graph-construction cell (`ploidy_level`). Both must already exist, so this
# cell only works after those cells have run.
print("start graph reduction at", datetime.now())

# Per the method's own log line, the result is the "best overlap graph".
G.filter_graph(sorted_node, ploidy_level)

print("graph reduction ends at", datetime.now())

start graph reduction at 2020-07-22 23:46:13.679304
Graph is reduced to best overlap graph.
graph reduction ends at 2020-07-22 23:46:13.685646


# Haplotype Formation

In [7]:
print("start haplotype formation at", datetime.now())

# Derive the haplotypes from the reduced graph for the configured ploidy level.
# `result` is reused by the cells that write and print the haplotypes.
result = G.haplotype(ploidy_level)

# Show each entry of the result on its own line.
for haplotype in result:
    print(haplotype)

print("haplotype formation ends at", datetime.now())

# Last timestamp of the pipeline; pairs with the "start experiment" line.
print("experiment ends at", datetime.now())

start haplotype formation at 2020-07-22 23:46:13.723881
['AGTATTTAGATATACAAATACTTTACCATTAAGGTCAGTAATTTCTATTTTTACTTTTATTGTTCTTTTAACTTGTTGTTTTGGAGGTTATTTTACTTACTCTTTTTGTCCTTGTGGAATGGTTGAATTTACTTTTGTTTATGCTGCTGTAGCGTGATTAAGTACTTTGTTAACTTTTATTTCAAGAGAAAAATTTTCAGTTTATATAAGAAAACCAGGAGACACATATTTGAAAACTCTTAGAATGCTATTAATTGAAATCGTTAGAGAATTTTCTCGTCCACTTGCTTTAACAGTGCGTTTAACAGTTAATATTACTGTTGGTCATTTAGTTAGAATAATGCTTTATCAAGGATTAGAATTAAGAATAGGTGATCAGTATATTTGATTATCAATTTTAGCCATTATAATAGAATGTTTTGTTTTCTTCATTCAAAGTTATATTTTCTCTCGTTTAATTTTTTTATATCTTAATGAGTAATAAAAAAAAAAAGATGTTAACTTAAGTTTTAAAGTGCCAAACTTTTAATTTGGAAATGGTGGACCACATCTTAGTTGATATAGCATAAGAAGTGCATTTGTTTTAAGCGCAAAAGATATCCGTCAACTAACGAGTTCATAAAGCAAGTCTTCTAAATTTGTTCTAGGTTAAATCCTGCTCGTTTTTGATTGTTTTTATTTCTTTATTTACCTTGTTTTTAACATTATTAAGAATTTTGACTAATAACGTTATTGTTTGATGAAGAATTTTTTTATTGATAACTGTAGTTTTTATTCTATTAAATAAAAGCAGCAAGAGATATACCAGAATTTTTAATTATTTTGTTATTCAAGAGTCTTTAGGTTTATTATTTCTTCTTTGTAGAGGAGGTCTATTACAATTTTTTATTATTTTATTGAAAATTGGTGTAGCACCGCTCCACTTTTGAATTTTTAATGTAA

In [8]:
# Write every haplotype in `result` to a FASTA file, one record per entry.
# Each record has a header line ">NC_seq<roman numeral>" (numbered from 1)
# followed by the sequence on its own line.
#
# The file is opened in a `with` block so the handle is flushed and closed
# even if a write fails part-way (for example, if an entry of `result` is
# not a string and the concatenation raises TypeError).
with open("res_p3_cov10.fasta", "w") as ofile:
    # enumerate(..., start=1) yields the 1-based record number directly.
    for seq_num, haplotype in enumerate(result, start=1):
        ofile.write(">NC_seq" + ft.int_to_Roman(seq_num) + "\n" + haplotype + "\n")

In [9]:
# Echo the haplotypes in `result`, one per line, for a quick visual check.
for haplotype in result:
    print(haplotype)

ATAAAAATTTACCTCGGCAATTTATCGCTTGTAAAATACTTGTTCCAGAATAATCGGCTAGACTTGTTAAAGCTTGTACTTTAATTGATGTTAATTATGAAATTATTATATTTTCTTTTAGATCTATGGTAGAATTTGGATTTATATTAGTGAATTTTCATAATTTTAAGATTTGTTGAACAAAGCAGATTAGTACCTGGTTAGACAAAAATTAAAAGAGCAGGAGTAAAGTTGTATTTAAACTGAAAAGATATTGGCAGACATTCTAAATTATCTTTGGAGGCTGAGTAGTAACTGAGAACCCTCATTAACTACTTAATTTTTTGACTCGTGTATGATCGTTTATTTTATTCTTAAGGATTATAATAAAAAATTTTTAATTTATTAAAATAGATATATACCCGGTTTATGATTTAAGAAACATTTGGCCTACAATATTTTATATTATGGATTTTAGTTTTAGTTAACTAAATGAAATTGTAAAAGACAGTAAAAAATTCTTAATGTATTTTTGAAGATTATCTAGAAGTGGTACAAATCATCCATCAATTGCCCAAAGGGGAGTAAGTTGTAGTAAAGTAGATTTAGGGGAACCTGAATCTAGTAATAAAACTATTTTTAAATATGTTTTGAAAACATGTTTTGAGGTAACTCGTAGTTTTTAAGAGTTAGTTTAATATAGAATTGTTGACTGTTAATCAAAAGGTGTACCTCTTAATATAAGAGTTTAGTTTAAGTTAAAACGTTAGATTGTAAATCTAAAGATTATTGCTCTTGATAATTTTAGTTTTACTTATAGTTATTTTAATGATGATTTTTATTGTTCAAAGAATCGCTTTTATTACTCTATATGAGCGTCATTTATTGGGAAGAAGACAAAATCGTCTAGGGCCCACCAAGGTTACATTTATGGGATTAGCACAAGCTTTATTGGATGGGGTTAAACTTTTAAAAAAAGAACAAATAACACCCTTAAATTCCTCTGAAGTATCATTTTTAC