In [1]:
import mmh3
import pandas as pd
import numpy as np

In [2]:
def generate_kmers(sequence, k):
    """Generate k-mers from a given sequence."""
    for i in range(len(sequence) - k + 1):
        yield sequence[i:i + k]

def create_mash_sketch(sequence, k, sketch_size):
    """Create a Mash-like sketch for a given sequence, skipping canonical k-mer computation."""
    start_time = time.time()  # Start timing for the whole function
    hashes = set()
    
    kmer_gen_time = 0
    hashing_time = 0
    
    # Generate k-mers and hash them directly without canonicalization
    for kmer in generate_kmers(sequence, k):
        kmer_start_time = time.time()
        kmer_hash = mmh3.hash(kmer)  # Hash the k-mer directly
        hashing_time += time.time() - kmer_start_time
        
        hashes.add(kmer_hash)
    
    # Select the smallest hashes to create the sketch
    sketch = np.array(sorted(hashes)[:sketch_size])
    
    # total_time = time.time() - start_time
    # print(f"Total sketch creation time: {total_time:.4f} seconds")
    # print(f"Total k-mer generation time: {kmer_gen_time:.4f} seconds")
    # print(f"Total hashing time: {hashing_time:.4f} seconds")
    
    return sketch

In [3]:
data = pd.read_parquet('../../data/processed/genomes.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [4]:
data['Sequence'] = data['Sequence'].str.replace('[^ACTG]', '', regex=True)

In [5]:
import time

k = 22
sketch_size = 1000

start = time.time()
sketch_array = []
for genome in data["Sequence"]:
    sketch_array.append(create_mash_sketch(genome, k, sketch_size))
end = time.time()

print(end-start)

1095.9188392162323


In [6]:
mash_data = pd.DataFrame(sketch_array)
mash_data["Target"] = data["Lineage"].tolist()
mash_data["Test"] = data["Test"].tolist()
mash_data.to_parquet('../../data/features/mash.parquet', engine='pyarrow')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
