In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import lil_matrix
from itertools import product
from joblib import Parallel, delayed
from collections import Counter
from itertools import product

In [2]:
# Read the preprocessed genome table with the pyarrow engine
# ('fastparquet' can be used as the engine instead).
data = pd.read_parquet(
    '../../data/processed/genomes.parquet',
    engine='pyarrow',
)
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [3]:
def generate_kmers(k, alphabet='ACGT'):
    """Enumerate every word of length ``k`` over ``alphabet``.

    The words are returned as a list of strings in ``itertools.product``
    order, i.e. the last position cycles through ``alphabet`` fastest.
    ``k == 0`` yields the single empty word ``['']``.
    """
    letter_tuples = product(alphabet, repeat=k)
    return list(map(''.join, letter_tuples))

def find_spaced_words(sequence, pattern):
    """Count the spaced words of ``sequence`` selected by the mask ``pattern``.

    ``pattern`` is a string in which every ``'1'`` marks a position that is
    sampled and every other character a position that is skipped. The mask is
    slid over ``sequence`` one position at a time; at each placement the
    sampled characters are concatenated into a word.

    Returns a 1-D NumPy array with one count per k-mer from
    ``generate_kmers(k)`` (``k`` = number of ``'1'`` characters in the mask),
    in that function's order. Words containing any character outside that
    k-mer vocabulary are not counted. A sequence shorter than the mask gives
    an all-zero vector.
    """
    offsets = [pos for pos, flag in enumerate(pattern) if flag == '1']
    n_windows = len(sequence) - len(pattern) + 1

    # Tally every sampled word first; words outside the vocabulary are
    # simply never looked up when the feature vector is assembled below.
    tally = Counter(
        ''.join(sequence[start + off] for off in offsets)
        for start in range(n_windows)
    )

    vocabulary = generate_kmers(len(offsets))
    return np.array([tally[kmer] for kmer in vocabulary])

In [4]:
from dask.diagnostics import ProgressBar
import dask.array as da
from dask import delayed, compute
import numpy as np

@delayed
def delayed_find_spaced_words(sequence, pattern):
    """Lazy wrapper that defers ``find_spaced_words(sequence, pattern)``.

    NOTE: the decorator must be Dask's ``delayed`` (imported in this cell);
    the notebook's first cell also imports a ``delayed`` from joblib, which
    the later Dask import shadows.
    """
    feature_vector = find_spaced_words(sequence, pattern)
    return feature_vector

def calculate_feature_matrix_dask(sequences, pattern):
    """Build the spaced-word count matrix for ``sequences`` in parallel with Dask.

    One lazy ``delayed_find_spaced_words`` task is created per sequence, all
    tasks are executed together under a Dask ``ProgressBar``, and the
    resulting 1-D count vectors are stacked row-wise.

    Parameters
    ----------
    sequences : iterable
        Sequences to featurize; each item is passed to ``find_spaced_words``.
    pattern : str
        Spaced-word mask forwarded unchanged to ``find_spaced_words``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per sequence (input order preserved) and one
        column per k-mer count. An empty ``sequences`` yields an array of
        shape ``(0, n_features)``.
    """
    # Build the lazy task list; nothing is computed yet.
    delayed_results = [delayed_find_spaced_words(seq, pattern) for seq in sequences]

    if not delayed_results:
        # np.stack rejects an empty list, so size the empty matrix from the
        # vector that find_spaced_words produces for an empty sequence.
        template = find_spaced_words('', pattern)
        return np.empty((0, template.shape[0]), dtype=template.dtype)

    # Execute all tasks in parallel, reporting progress while they run.
    with ProgressBar():
        feature_vectors = compute(*delayed_results)

    # compute() has already returned concrete NumPy vectors, so stack them
    # directly rather than wrapping each one in a dask array and computing
    # a second time.
    return np.stack(feature_vectors)

# Spaced-word mask: '1' marks a sampled position, '0' a skipped one.
pattern = "1001010101"

# Raw genome strings, in the row order of `data`.
sequences = data["Sequence"]

feature_matrix_dask = calculate_feature_matrix_dask(sequences, pattern)
print("Feature Matrix with Dask:\n", feature_matrix_dask)

[########################################] | 100% Completed | 386.45 s
Feature Matrix with Dask:
 [[ 96  49  54 ...  55  36 104]
 [ 71  48  53 ...  55  35 106]
 [ 97  48  53 ...  56  37 108]
 ...
 [ 70  48  53 ...  56  36 104]
 [ 73  48  54 ...  54  34 104]
 [ 71  47  53 ...  55  37 104]]


In [5]:
# One row per genome (same order as `data`), one column per spaced-word count.
spaced_words = pd.DataFrame(data=feature_matrix_dask)

In [6]:
# Attach the lineage label and the `Test` flag (presumably a train/test split
# marker — confirm) to each feature row.
# .tolist() drops `data`'s index so the values are assigned by position:
# `data`'s index has gaps (see the displayed frame), whereas `spaced_words`
# was built without an explicit index, so the two would not line up by label.
spaced_words["Target"] = data["Lineage"].tolist()
spaced_words["Test"] = data["Test"].tolist()

In [8]:
# Cast every column label to str: the count columns were created without
# names, while "Target" and "Test" are strings. Presumably needed so the
# Parquet writer accepts the frame (it expects string column names) — confirm.
spaced_words.columns = spaced_words.columns.astype(str)

In [9]:
# Persist the feature table (count columns plus Target/Test) as Parquet.
# NOTE(review): the "5" in the file name presumably mirrors the number of
# '1' characters in `pattern` — keep the two in sync if the mask changes.
spaced_words.to_parquet('../../data/features/5-spaced.parquet', engine='pyarrow')