In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import lil_matrix
from itertools import product
from joblib import Parallel, delayed
from collections import Counter
from itertools import product

In [2]:
def get_kmers(sequence, k=3):
    """Return every overlapping length-k window of `sequence`, left to right.

    The window advances one position at a time. A sequence shorter than k
    produces an empty list.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

def generate_valid_kmers(k=3):
    """Build the set of all strings of length k over the alphabet 'A', 'C', 'T', 'G'."""
    alphabet = 'ACTG'
    valid = set()
    for letters in product(alphabet, repeat=k):
        valid.add(''.join(letters))
    return valid

def generate_kmer_features(df, k=3):
    """Count the valid k-mers of every sequence in a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Sequence' column holding the sequences to count.
    k : int, default 3
        Length of the k-mers.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``df['Sequence']``, in iteration order. The rows
        get a fresh default index; the index of ``df`` is not carried over.
        There is one column per valid k-mer (only 'A', 'C', 'T', 'G') that
        occurs in at least one sequence, ordered by first occurrence. Valid
        k-mers that never occur in the data do not get a column. Missing
        counts are filled with 0.
    """
    valid_kmers = generate_valid_kmers(k)

    # Drop invalid k-mers (e.g. those containing 'N') while counting, so the
    # DataFrame never has to allocate a column for each of them only to throw
    # it away afterwards.
    kmer_counts = [
        Counter(kmer for kmer in get_kmers(seq, k) if kmer in valid_kmers)
        for seq in df['Sequence']
    ]

    # A k-mer absent from a sequence comes out of the constructor as NaN;
    # semantically that is a count of zero.
    return pd.DataFrame(kmer_counts).fillna(0)

In [3]:
# k-mer length for this run: passed to generate_kmer_features and embedded in
# the name of the output parquet file.
k = 5

In [4]:
# Load the processed genome table. Later cells read its 'Sequence', 'Lineage'
# and 'Test' columns.
data = pd.read_parquet('../../data/processed/genomes.parquet', engine='pyarrow')  # 'fastparquet' is an alternative engine
# Bare expression: renders the (truncated) frame as the cell output.
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [5]:
# Build the k-mer count table from the 'Sequence' column of `data`
# (one feature row per sequence, using the k chosen above).
kmer_features = generate_kmer_features(data, k=k)

In [6]:
# Attach the lineage label (stored as 'Target') and the 'Test' column to the
# feature table.
# .tolist() makes the assignment positional instead of index-aligned:
# kmer_features has a fresh 0..n-1 index, while data's index may differ
# (NOTE(review): the frame display suggests gaps in it - confirm), in which
# case assigning the Series directly would misalign or drop labels.
kmer_features["Target"] = data["Lineage"].tolist()
kmer_features["Test"] = data["Test"].tolist()

In [7]:
# Persist the feature table (k-mer counts plus the 'Target' and 'Test' columns);
# the value of k is part of the file name.
kmer_features.to_parquet(f'../../data/features/{k}-mer_standard.parquet', engine='pyarrow')