In [1]:
%load_ext autoreload 
%autoreload 2
from Bio import SeqIO
import numpy as np
from pgmpy.models import MarkovChain
import warnings

In [5]:
def load_genome(filepath):
    """
    Read a FASTA file and return the sequence of its first record as a string.

    Only the first record produced by the parser is used; any further records
    in the file are ignored. Returns None when the file yields no records.
    """
    first_record = next(SeqIO.parse(filepath, "fasta"), None)
    if first_record is None:
        return None
    return str(first_record.seq)

def generate_kmers(sequence, k):
    """
    Return every length-k window of `sequence`, in left-to-right order.

    Consecutive windows overlap by k - 1 characters. The result is an empty
    list when the sequence is shorter than k.
    """
    window_count = len(sequence) - k + 1
    windows = []
    for start in range(window_count):
        windows.append(sequence[start:start + k])
    return windows

# Load the genome (update the path to the actual file)
genome = load_genome("escherichia_coli_reference.fasta")
print(f"Genome loaded: {len(genome)} bases")

k = 10  # Length of k-mers
kmers = generate_kmers(genome, k)
print(f"Generated {len(kmers)} k-mers of length {k}")

# Deduplicate once: np.unique returns the distinct k-mers in sorted order.
# A separate set(kmers) pass and the return_index/return_inverse outputs are
# not needed, because `index` and `reverse_index` are built as dicts below.
unique_kmers = np.unique(np.array(kmers))
n_unique = len(unique_kmers)
print(n_unique)

# k-mer -> integer ID (its position in the sorted unique array), and the
# inverse mapping ID -> k-mer.
index = {kmer: idx for idx, kmer in enumerate(unique_kmers)}
reverse_index = {idx: kmer for idx, kmer in enumerate(unique_kmers)}

def encode_kmers(kmers, index):
    """
    Map each k-mer in `kmers` to its value in `index`, preserving order.

    Raises KeyError if a k-mer is not present in `index`.
    """
    encoded = []
    for kmer in kmers:
        encoded.append(index[kmer])
    return encoded

# Translate every k-mer string into its integer ID from `index`, keeping genome order.
encoded_kmers = encode_kmers(kmers, index)

Genome loaded: 4641652 bases
Generated 4641643 k-mers of length 10
898115


In [11]:
# Display the encoded k-mer IDs.
# NOTE(review): this shows the whole list (one entry per k-mer window);
# consider a slice such as encoded_kmers[:20] to keep the saved output small.
encoded_kmers

[142996,
 563375,
 448705,
 895368,
 887489,
 857478,
 741170,
 284071,
 219924,
 866530,
 775169,
 422770,
 788969,
 476279,
 108675,
 424739,
 796848,
 506765,
 234312,
 25500,
 96968,
 377817,
 594806,
 577892,
 508242,
 239621,
 47949,
 182795,
 722571,
 211171,
 831965,
 643091,
 774475,
 420037,
 778156,
 434790,
 835525,
 657761,
 834365,
 652979,
 814628,
 576157,
 501393,
 213121,
 839611,
 673803,
 33,
 138,
 550,
 2174,
 8497,
 32896,
 124742,
 489375,
 165723,
 657193,
 832023,
 643317,
 775402,
 423790,
 793141,
 492446,
 177991,
 704951,
 130985,
 514509,
 263989,
 142570,
 561608,
 442106,
 866442,
 774833,
 421407,
 783715,
 456641,
 29129,
 110475,
 432017,
 824872,
 614877,
 660449,
 845199,
 694830,
 86119,
 334586,
 426295,
 803001,
 530837,
 326487,
 391619,
 647368,
 791748,
 487224,
 156492,
 619177,
 677399,
 14565,
 55776,
 213183,
 839856,
 674789,
 3974,
 15496,
 59276,
 227147,
 895091,
 886407,
 853262,
 724991,
 220834,
 870132,
 789181,
 477063,
 111923,


In [12]:
from collections import defaultdict

# Nested mapping: current k-mer ID -> {next k-mer ID -> count, later probability}.
transitions = defaultdict(lambda: defaultdict(int))

# Tally how often each k-mer ID is immediately followed by each other ID.
for current_kmer, next_kmer in zip(encoded_kmers, encoded_kmers[1:]):
    transitions[current_kmer][next_kmer] += 1

# Turn each row of counts into probabilities by dividing by the row total.
for kmer, successors in transitions.items():
    total = sum(successors.values())
    for next_kmer, count in successors.items():
        # Reassigning an existing key is safe while iterating the row.
        successors[next_kmer] = count / total

In [13]:
# Replace the nested defaultdicts with plain dicts (outer and inner levels),
# keeping the same keys, values and insertion order.
transitions = {
    kmer: dict(successors) for kmer, successors in transitions.items()
}

In [15]:
# Build a Markov chain with a single variable named "kmers".
# Presumably the second argument is the per-variable state count, i.e. one
# state per distinct k-mer ID (0 .. n_unique - 1) — confirm against pgmpy docs.
mc = MarkovChain(["kmers"], [n_unique])

In [16]:
# Register the {k-mer ID: {next k-mer ID: probability}} mapping for "kmers".
# NOTE(review): a k-mer ID that never occurs as a "current" state (e.g. one seen
# only in the final window) has no row in `transitions`; verify that pgmpy
# accepts a model that omits states, or add a row for it.
mc.add_transition_model("kmers", transitions)

In [8]:
# Draw a sample of size 50000 from the chain.
# NOTE(review): this cell's execution count (In [8]) is lower than those of the
# cells that build `mc` (In [15], In [16]), so the saved output may come from an
# earlier model; re-run the notebook top to bottom before trusting it.
# No seed is passed here, so results presumably vary between runs — confirm.
sample = mc.sample(size=50000)

In [9]:
# Display the sampled states (the notebook's rich display truncates the table).
sample

Unnamed: 0,kmers
0,8
1,32
2,2
3,8
4,34
...,...
49995,58
49996,42
49997,41
49998,39
