In [117]:
def composition(text, k, sort=True):
    """Return every length-``k`` window of ``text``, duplicates included.

    Parameters
    ----------
    text : str
        Sequence the window slides over.
    k : int
        Window (k-mer) length.
    sort : bool, optional
        When true (the default) the k-mers are returned in lexicographic
        order; otherwise they keep their order of occurrence in ``text``.

    Returns
    -------
    list of str
        One k-mer per start position; empty when ``k`` exceeds ``len(text)``.
    """
    window_count = len(text) - k + 1
    kmers = [text[start:start + k] for start in range(window_count)]
    if sort:
        kmers.sort()
    return kmers

In [2]:
# Sample check: with the default sort=True the 3-mers come back in lexicographic order.
composition('TATGGGGTGC', 3)

['ATG', 'GGG', 'GGG', 'GGT', 'GTG', 'TAT', 'TGC', 'TGG']

In [3]:
# The first line of the file is parsed as k, the second is taken as the genome.
with open('data/dataset_197_3.txt', 'r') as f:
    data = f.read().splitlines()
k = int(data[0])
genome = data[1]

In [4]:
# Sorted k-mer composition of the genome (sort defaults to True).
result = composition(genome, k)

In [5]:
# Write one k-mer per line. The `with` block closes the file on exit, so no
# explicit close() call is needed.
with open('data/submission_197_3.txt', 'w') as f:
    f.write('\n'.join(result))

---

In [11]:
def path_to_genome(path):
    """Spell the string described by a path of overlapping k-mers.

    The result is the first k-mer followed by the last character of every
    subsequent k-mer. The overlap between consecutive k-mers is assumed, not
    checked.

    Parameters
    ----------
    path : list of str
        k-mers in path order.

    Returns
    -------
    str
        The reconstructed string; ``''`` when ``path`` is empty.
    """
    if not path:
        return ''
    # Join once instead of growing the string with repeated concatenation.
    return path[0] + ''.join(kmer[-1] for kmer in path[1:])

In [13]:
# Worked example: five overlapping 5-mers spell a 9-character genome.
path = ['ACCGA', 'CCGAA', 'CGAAG', 'GAAGC', 'AAGCT']
genome = 'ACCGAAGCT'
assert path_to_genome(path) == genome

In [14]:
# Read the path as a list of lines (presumably one k-mer per line, in path order).
# Rebinds `path`, replacing the sample list.
with open('data/dataset_198_3.txt', 'r') as f:
    path = f.read().splitlines() 

In [16]:
# Rebinds `genome` with the string spelled by the dataset path.
genome = path_to_genome(path)

In [17]:
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/submission_198_3.txt', 'w') as f:
    f.write(genome)

---

In [18]:
from itertools import combinations

In [24]:
def overlap_graph(patterns):
    """List the overlaps between distinct entries of ``patterns``.

    Entry ``a`` overlaps entry ``b`` when ``a`` without its first character
    equals ``b`` without its last character.

    Parameters
    ----------
    patterns : iterable of str
        The k-mers to compare; duplicates are treated as separate entries.

    Returns
    -------
    list of tuple of int
        ``(source_index, target_index)`` pairs, in the order the unordered
        pairs are visited (lower index first). An entry is never compared
        with itself, so self-overlaps are not reported.
    """
    kmers = list(patterns)
    links = []
    for first in range(len(kmers)):
        for second in range(first + 1, len(kmers)):
            left, right = kmers[first], kmers[second]
            if left[1:] == right[:-1]:
                links.append((first, second))
            if right[1:] == left[:-1]:
                links.append((second, first))
    return links

In [36]:
def display_graph(patterns, adjacencies):
    """Format index pairs as one ``source ->  target`` line per edge.

    Parameters
    ----------
    patterns : sequence of str
        The k-mers that the indices in ``adjacencies`` refer to.
    adjacencies : iterable of (int, int)
        ``(source_index, target_index)`` pairs.

    Returns
    -------
    str
        The edges joined by newlines; ``''`` when there are no edges.
    """
    lines = []
    for source, target in adjacencies:
        # NOTE(review): the arrow is followed by two spaces here, whereas
        # display_debruijn_graph uses one; kept as-is.
        lines.append(f'{patterns[source]} ->  {patterns[target]}')
    return '\n'.join(lines)

In [37]:
# Small worked example; `adjacencies` holds (source_index, target_index) pairs into `patterns`.
patterns = ['ATGCG', 'GCATG', 'CATGC', 'AGGCA', 'GGCAT', 'GGCAC']
adjacencies = overlap_graph(patterns)

In [39]:
# Show the sample overlap graph, one edge per line.
print(display_graph(patterns, adjacencies))

CATGC ->  ATGCG
GCATG ->  CATGC
GGCAT ->  GCATG
AGGCA ->  GGCAT
AGGCA ->  GGCAC


In [40]:
# Read the patterns as a list of lines (presumably one k-mer per line).
# Rebinds `patterns`, replacing the sample list.
with open('data/dataset_198_10.txt', 'r') as f:
    patterns = f.read().splitlines()

In [41]:
# Build the overlap graph for the dataset and render it as text for submission.
adjacencies = overlap_graph(patterns)
graph = display_graph(patterns, adjacencies)

In [42]:
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/submission_198_10.txt', 'w') as f:
    f.write(graph)

---

In [97]:
def is_universal(binary_string, k):
    """Tell whether ``binary_string`` is a (linear) k-universal string.

    Returns True when the string has exactly ``2 ** k`` windows of length
    ``k`` and no two of them are equal. For a string over ``{0, 1}`` that
    means every binary k-mer occurs exactly once.

    Parameters
    ----------
    binary_string : str
        Candidate string.
    k : int
        Window length.

    Returns
    -------
    bool
    """
    kmers = composition(binary_string, k)
    all_distinct = len(set(kmers)) == len(kmers)
    covers_everything = len(kmers) == 2 ** k
    return all_distinct and covers_everything

In [113]:
# 19 characters give 16 windows of length 4 (= 2**4); all must be distinct for this to pass.
sequence = '1100001111010010110'
assert is_universal(sequence, 4)

---

In [148]:
from collections import defaultdict

In [141]:
def build_debruijn_graph(text, k):
    """Build the De Bruijn graph of ``text`` for k-mers of length ``k``.

    Nodes are the (k-1)-mers of ``text``; each pair of consecutive (k-1)-mers
    contributes one edge, so repeated k-mers yield repeated edges.

    Parameters
    ----------
    text : str
        Sequence to build the graph from.
    k : int
        k-mer (edge) length.

    Returns
    -------
    collections.defaultdict
        Maps each node to the list of its successors, in order of occurrence.
    """
    # Unsorted (k-1)-mers, so that neighbours in the list are neighbours in text.
    nodes = composition(text, k - 1, False)
    graph = defaultdict(list)
    for source, target in zip(nodes, nodes[1:]):
        graph[source].append(target)
    return graph

In [8]:
def display_debruijn_graph(graph):
    """Render an adjacency mapping as ``node -> succ1, succ2`` lines.

    Parameters
    ----------
    graph : mapping of str to iterable of str
        Each node mapped to its successors.

    Returns
    -------
    str
        One line per node, in the mapping's iteration order, joined by
        newlines; ``''`` for an empty mapping.
    """
    rows = []
    for node, successors in graph.items():
        targets = ", ".join(successors)
        rows.append(f'{node} -> {targets}')
    return '\n'.join(rows)

In [145]:
# The first line of the file is parsed as k, the second is taken as the text.
with open('data/dataset_199_6.txt', 'r') as f:
    data = f.read().splitlines()
k = int(data[0])
text = data[1]

In [146]:
# Rebinds `graph`: from here on it is an adjacency mapping, not rendered text.
graph = build_debruijn_graph(text, k)

In [147]:
# Write the adjacency list, one "node -> successors" line per node.
with open('data/submission_199_6.txt', 'w') as f:
    f.write(display_debruijn_graph(graph))

In [153]:
# Example DNA string used to compare De Bruijn graphs for several k (rebinds `sequence`).
sequence = 'TAATGCCATGGGATGTT'

In [154]:
# k = 2: nodes are single nucleotides, so every node has several outgoing edges.
print(display_debruijn_graph(build_debruijn_graph(sequence, 2)))

T -> A, G, G, G, T
A -> A, T, T, T
G -> C, G, G, A, T
C -> C, A


In [155]:
# k = 3: nodes are 2-mers; only a few nodes still have more than one outgoing edge.
print(display_debruijn_graph(build_debruijn_graph(sequence, 3)))

TA -> AA
AA -> AT
AT -> TG, TG, TG
TG -> GC, GG, GT
GC -> CC
CC -> CA
CA -> AT
GG -> GG, GA
GA -> AT
GT -> TT


In [156]:
# k = 4: nodes are 3-mers; ATG is the only node that still branches.
print(display_debruijn_graph(build_debruijn_graph(sequence, 4)))

TAA -> AAT
AAT -> ATG
ATG -> TGC, TGG, TGT
TGC -> GCC
GCC -> CCA
CCA -> CAT
CAT -> ATG
TGG -> GGG
GGG -> GGA
GGA -> GAT
GAT -> ATG
TGT -> GTT


In [157]:
# Variant of `sequence` with the middle blocks CCATG and GGATG swapped.
shuffled_sequence = 'TAATGGGATGCCATGTT'

In [158]:
# Same edges as the k = 4 graph of `sequence`, only listed in a different order.
print(display_debruijn_graph(build_debruijn_graph(shuffled_sequence, 4)))

TAA -> AAT
AAT -> ATG
ATG -> TGG, TGC, TGT
TGG -> GGG
GGG -> GGA
GGA -> GAT
GAT -> ATG
TGC -> GCC
GCC -> CCA
CCA -> CAT
CAT -> ATG
TGT -> GTT


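+ The larger the k, the easier it is to reconstruct the text from the graph: longer nodes repeat less often, so there are fewer branches to choose between.
+ Shuffled sequences can have the same graph: `sequence` and `shuffled_sequence` above yield identical edges for k = 4, just listed in a different order.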
---

In [5]:
# Sample k-mers; 'CAGG' is listed twice, so its edge appears twice in the graph.
kmers = [
    'GAGG',
    'CAGG',
    'GGGG',
    'GGGA',
    'CAGG',
    'AGGG',
    'GGAG',
]

In [2]:
def prefix(kmer):
    """Return ``kmer`` without its last character."""
    return kmer[:-1]
def suffix(kmer):
    """Return ``kmer`` without its first character."""
    return kmer[1:]

In [3]:
from collections import defaultdict

def debruijn_graph_from_kmers(kmers):
    """Build a De Bruijn graph from a collection of k-mers.

    Every k-mer contributes one edge from its prefix to its suffix, so a
    k-mer that occurs several times yields that many parallel edges.

    Parameters
    ----------
    kmers : iterable of str
        The k-mers (edges) of the graph.

    Returns
    -------
    collections.defaultdict
        Maps each prefix node to the list of its successor nodes, in input
        order.
    """
    adjacency = defaultdict(list)
    for read in kmers:
        source = prefix(read)
        target = suffix(read)
        adjacency[source].append(target)
    return adjacency

In [11]:
# Sample run: the repeated 'CAGG' shows up as two CAG -> AGG edges.
graph = debruijn_graph_from_kmers(kmers)
print(display_debruijn_graph(graph))

GAG -> AGG
CAG -> AGG, AGG
GGG -> GGG, GGA
AGG -> GGG
GGA -> GAG


In [12]:
# Read the dataset as a list of lines (presumably one k-mer per line).
with open('data/dataset_200_8.txt', 'r') as f:
    data = f.read().splitlines()

In [15]:
# NOTE(review): this prints the whole graph for the dataset into the notebook
# output; consider writing it to a file or showing only the first few lines.
graph = debruijn_graph_from_kmers(data)
print(display_debruijn_graph(graph))

TTAGCACTCGGTAGAGCAA -> TAGCACTCGGTAGAGCAAC
ACACATAACTCTGGGTGGC -> CACATAACTCTGGGTGGCG
CCCACGACCACCTTGGCTG -> CCACGACCACCTTGGCTGC
GGACCGCTCACTTAATCGT -> GACCGCTCACTTAATCGTC
GGCTCTGGACCCGTGGATA -> GCTCTGGACCCGTGGATAC
GAAATATTTCGCCAAGCGC -> AAATATTTCGCCAAGCGCC
CTTCGCTAGTGAAACTCTC -> TTCGCTAGTGAAACTCTCA
GTGGTATAGACTCCCTGAT -> TGGTATAGACTCCCTGATG
CCCCCGTCCGTAGCGAATT -> CCCCGTCCGTAGCGAATTA
TTTGTCAGCAAGCATCACT -> TTGTCAGCAAGCATCACTG
ATTACGTGTGGCTCTGGAC -> TTACGTGTGGCTCTGGACC
CATACCAAAGGGGTCAATA -> ATACCAAAGGGGTCAATAT
GCGACTTAAACGCTTTCAC -> CGACTTAAACGCTTTCACT
TGTCGGTGGTATAGACTCC -> GTCGGTGGTATAGACTCCC
GACGTTGCACTCTCAATCG -> ACGTTGCACTCTCAATCGG
GGCGGTACCTCCTACCTCA -> GCGGTACCTCCTACCTCAT
CGTAGCGAATTATATCACT -> GTAGCGAATTATATCACTA
CCGCCTTAGTGAAGATGTA -> CGCCTTAGTGAAGATGTAG
GAAGATTGGTAAACTTACT -> AAGATTGGTAAACTTACTG
ACCGCATATGTCGAGGTGA -> CCGCATATGTCGAGGTGAG
AGATGTCGTACGACGCTGG -> GATGTCGTACGACGCTGGC
ACCACGTCGAAGTACCTAT -> CCACGTCGAAGTACCTATA
TAGTGTGTACATGCGATTT -> AGTGTGTACATGCGATTTC
CACCTGAGACA