In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from utils import replace_deg, remove_deg
from itertools import product

In [2]:
def get_kmers(sequence, k=3):
    """Generate k-mers for a given sequence."""
    return [sequence[i:i+k] for i in range(len(sequence) - k + 1)]

def generate_kmer_features(df, k=3, alphabet='ACGT'):
    """Generate k-mer feature counts for a DataFrame of sequences."""
    # Generate all possible k-mers from the given alphabet
    all_kmers = [''.join(p) for p in product(alphabet, repeat=k)]
    
    # Initialize a list to store the feature vectors for each sequence
    feature_vectors = []
    
    # Process each sequence in the DataFrame
    for seq in df['Sequence']:
        # Generate k-mer counts for the current sequence
        kmer_counts = Counter(get_kmers(seq, k))
        
        # Initialize a feature vector with zeros for all possible k-mers
        feature_vector = {kmer: 0 for kmer in all_kmers}
        
        # Update the feature vector with counts from the current sequence
        for kmer, count in kmer_counts.items():
            if kmer in feature_vector:
                feature_vector[kmer] = count
        
        # Convert the feature vector to a numpy array and append it to the list
        feature_vectors.append(np.array(list(feature_vector.values())))
    
    # Return the list of feature vectors
    return feature_vectors

In [3]:
k = 7

In [4]:
data = pd.read_parquet('../../data/processed/cov-19.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine
data

Unnamed: 0,Accession ID,Lineage,Sequence,Coverage,Train
0,EPI_ISL_15104785,BA.5.1,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,99.213260,2
1,EPI_ISL_3411570,AY.19,TGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTG...,99.939512,2
2,EPI_ISL_2433815,C.1,ATACCTTCCTAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGA...,95.359497,2
2,EPI_ISL_1715397,L.3,CTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGT...,99.775462,1
3,EPI_ISL_14795073,BA.4.6,ACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAA...,99.972625,0
...,...,...,...,...,...
46982,EPI_ISL_18407444,BA.4.1.8,TTGTAGATCTGTTCTCTAAACGNANNNNNNNNNNNNNNNNNNNNNN...,98.319865,1
46984,EPI_ISL_18407446,BA.4.1.8,TTGTAGATCTGTTCTCTAAACGAACNTGAAAANNNNNNNNNNNNNN...,99.107744,2
46985,EPI_ISL_18407448,BA.5.11,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,99.983160,2
46986,EPI_ISL_15426702,BA.4,TTGTAGATCTGTTCTCTAAACGAACNNNNNNNNNNNNNNNNNNNNN...,98.986805,2


In [5]:
# Apply degenerate nucleotide processing
deg = "Remove"

if deg == 'Replace':
    print("Replacing Deg...")
    data = replace_deg(data)
elif deg == 'Remove':
    print("Removing Deg...")
    data = remove_deg(data)

Removing Deg...


In [6]:
# Assuming 'sequences_df' is your input DataFrame
import time
start = time.time()
kmer_features = generate_kmer_features(data, k=k)
end = time.time()
print(end-start)

426.11202597618103


In [7]:
kmer_features["Target"] = data["Lineage"].tolist()
kmer_features["Test"] = data["Test"].tolist()

TypeError: list indices must be integers or slices, not str

In [None]:
# kmer_features.to_parquet(f'../../data/features/{k}-mer_standard.parquet', engine='pyarrow')