In [54]:
%load_ext autoreload
%autoreload 2
import random

import markovify
from Bio import SeqIO

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
def load_genome(filepath):
    """
    Load a genome sequence from a FASTA file.

    Only the first record in the file is used; any additional records in a
    multi-record FASTA are ignored.

    Args:
        filepath: Path (or handle) passed straight to ``SeqIO.parse``.

    Returns:
        The first record's sequence as a plain ``str``.

    Raises:
        ValueError: If the file contains no FASTA records (previously this
            case fell through and silently returned ``None``).
    """
    # Pull just the first record from the parser; the default of None lets us
    # distinguish "no records" from a real record without catching StopIteration.
    first_record = next(SeqIO.parse(filepath, "fasta"), None)
    if first_record is None:
        raise ValueError(f"No FASTA records found in {filepath!r}")
    return str(first_record.seq)

def generate_kmers(sequence, k):
    """
    Generate all overlapping k-mers from a given sequence.

    A window of width ``k`` is slid along ``sequence`` one position at a
    time, so consecutive k-mers overlap by ``k - 1`` characters.

    Args:
        sequence: Any sliceable sequence with a length (e.g. a ``str``).
        k: Window width; must be at least 1.

    Returns:
        A list of ``len(sequence) - k + 1`` slices, in order of position.
        The list is empty when ``k`` is larger than ``len(sequence)``.

    Raises:
        ValueError: If ``k`` is less than 1 (a zero or negative width would
            otherwise yield empty or wrapped-around slices).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    # range() is empty when k > len(sequence), which gives the empty list.
    return [sequence[start:start + k] for start in range(len(sequence) - k + 1)]

# Read the reference genome from disk (point this at your local FASTA file)
genome = load_genome(filepath="escherichia_coli_reference.fasta")
print(f"Genome loaded: {len(genome)} bases")

# Window width used when slicing the genome into overlapping k-mers
k = 6
kmers = generate_kmers(sequence=genome, k=k)

print(f"Number of {k}-mers: {len(kmers)}")

Genome loaded: 4641652 bases
Number of 6-mers: 4641647


In [58]:
# Build a Markov chain whose states are whole k-mers. The corpus is a list
# holding a single run (the full ordered k-mer list), and state_size=1 means
# each k-mer is predicted from the one k-mer immediately before it.
text_model = markovify.Chain([kmers], state_size=1)
# Compile the chain in place. NOTE(review): per the markovify docs this
# precomputes lookup structures to speed up generation — confirm for the
# installed version. Being the cell's last expression, the returned Chain
# object is echoed as the cell output.
text_model.compile(inplace=True)

<markovify.chain.Chain at 0x7fcb0c7945e0>

In [59]:
# Seed Python's global RNG so the random walk below is reproducible under
# Restart & Run All. NOTE(review): markovify is understood to draw from the
# stdlib `random` module — confirm for the installed version.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate one full walk through the chain; the result is a list of k-mers.
sample = text_model.walk()

In [60]:
# Number of k-mers emitted by the generated walk
len(sample)

15855787

In [61]:
# Preview only the first 100 generated k-mers rather than dumping the whole list
print(sample[:100])

['AGCTTT', 'GCTTTC', 'CTTTCG', 'TTTCGA', 'TTCGAT', 'TCGATC', 'CGATCA', 'GATCAC', 'ATCACC', 'TCACCA', 'CACCAT', 'ACCATG', 'CCATGC', 'CATGCC', 'ATGCCA', 'TGCCAA', 'GCCAAT', 'CCAATG', 'CAATGT', 'AATGTA', 'ATGTAG', 'TGTAGT', 'GTAGTC', 'TAGTCC', 'AGTCCA', 'GTCCAT', 'TCCATT', 'CCATTT', 'CATTTC', 'ATTTCA', 'TTTCAC', 'TTCACG', 'TCACGG', 'CACGGC', 'ACGGCA', 'CGGCAT', 'GGCATG', 'GCATGA', 'CATGAC', 'ATGACT', 'TGACTG', 'GACTGG', 'ACTGGA', 'CTGGAA', 'TGGAAG', 'GGAAGT', 'GAAGTG', 'AAGTGG', 'AGTGGA', 'GTGGAT', 'TGGATA', 'GGATAC', 'GATACG', 'ATACGG', 'TACGGC', 'ACGGCT', 'CGGCTT', 'GGCTTC', 'GCTTCG', 'CTTCGA', 'TTCGAT', 'TCGATT', 'CGATTT', 'GATTTT', 'ATTTTC', 'TTTTCT', 'TTTCTT', 'TTCTTC', 'TCTTCA', 'CTTCAG', 'TTCAGA', 'TCAGAT', 'CAGATA', 'AGATAT', 'GATATG', 'ATATGC', 'TATGCC', 'ATGCCT', 'TGCCTC', 'GCCTCG', 'CCTCGG', 'CTCGGC', 'TCGGCC', 'CGGCCT', 'GGCCTC', 'GCCTCC', 'CCTCCA', 'CTCCAG', 'TCCAGA', 'CCAGAC', 'CAGACG', 'AGACGC', 'GACGCA', 'ACGCAC', 'CGCACT', 'GCACTT', 'CACTTA', 'ACTTAG', 'CTTAGA', 'TTAGAG']