In [1]:
import pandas as pd
import copy
import numpy as np
import time

In [2]:
def precompute_kmer_dict(k):
    """Create zeroed return-time accumulators for every DNA k-mer of length ``k``.

    Args:
        k: Length of the k-mers to enumerate.

    Returns:
        A dict with one entry per string of length ``k`` over the alphabet
        'ACGT', in ``itertools.product`` order ('AA...A' first, 'TT...T' last).
        Each value is its own fresh dict ``{'sum': 0, 'count': 0,
        'last_seen': -1}``, where ``-1`` marks "not seen yet".
    """
    from itertools import product

    table = {}
    for letters in product('ACGT', repeat=k):
        # A new inner dict per k-mer so the accumulators never alias each other.
        table[''.join(letters)] = {'sum': 0, 'count': 0, 'last_seen': -1}
    return table

def compute_rtd_feature_vector(genome, k, kmer_stats):
    """Build the return-time-distribution (RTD) feature vector of one genome.

    The "return time" of a k-mer is the distance between the start positions
    of two consecutive (possibly overlapping) occurrences of it in ``genome``.
    For each k-mer the vector holds the mean and the population standard
    deviation (divisor ``n``, like ``np.std``) of its return times.

    Args:
        genome: Sequence string. Every window of length ``k`` must be a key of
            ``kmer_stats``; any other window raises ``KeyError``.
        k: k-mer length.
        kmer_stats: Mapping from k-mer to an accumulator dict with the keys
            ``'sum'``, ``'count'`` and ``'last_seen'`` (as produced by
            ``precompute_kmer_dict``). The accumulators are updated in place,
            and a ``'sum_sq'`` key is added once a return time is recorded, so
            pass a fresh copy for every genome.

    Returns:
        A 1-D float array of length ``2 * 4**k``. For the i-th k-mer in
        ``kmer_stats`` iteration order, element ``2*i`` is the mean return time
        and element ``2*i + 1`` its standard deviation. Both are ``-1`` when
        the k-mer occurs fewer than twice, because no return time exists then.
    """
    # Pass 1: slide a window over the genome and accumulate, per k-mer, the
    # number of return times, their sum and their sum of squares.
    for i in range(len(genome) - k + 1):
        stats = kmer_stats[genome[i:i + k]]
        last_seen = stats['last_seen']
        if last_seen != -1:  # k-mer seen before: record the return time
            distance = i - last_seen
            stats['sum'] += distance
            # The sum of squares is required for the variance; .get() keeps
            # accumulator dicts that lack this key usable.
            stats['sum_sq'] = stats.get('sum_sq', 0) + distance * distance
            stats['count'] += 1
        stats['last_seen'] = i  # Update last seen position

    # Pass 2: turn the accumulators into (mean, std) pairs.
    feature_vector = np.zeros(2 * 4**k)
    for idx, stats in enumerate(kmer_stats.values()):
        count = stats['count']
        if count > 0:
            total = stats['sum']
            # Var = E[d^2] - E[d]^2, written as (n*sum_sq - sum^2) / n^2 so the
            # numerator is exact integer arithmetic and cannot go negative
            # through rounding. A single return time yields a deviation of 0.
            variance = (count * stats.get('sum_sq', 0) - total * total) / (count * count)
            feature_vector[2*idx] = total / count
            # max() guards against accumulators that were pre-filled without
            # a matching 'sum_sq'.
            feature_vector[2*idx + 1] = np.sqrt(max(variance, 0.0))
        else:  # fewer than two occurrences: no return time to summarise
            feature_vector[2*idx] = -1  # Indicate absence with -1
            feature_vector[2*idx + 1] = -1

    return feature_vector

In [3]:
# Load the processed sequence table. The preview below shows the columns used
# later in this notebook: 'Lineage', 'Sequence' and 'Test'.
data = pd.read_parquet('../../data/processed/mock_data.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [4]:
# Remove every character other than the uppercase bases A, C, T and G (for
# example the ambiguous 'N' calls visible in the preview above).
# compute_rtd_feature_vector looks each window up in kmer_stats directly, so
# any other character would raise a KeyError. The pattern is case-sensitive,
# so lowercase letters are stripped as well.
data['Sequence'] = data['Sequence'].str.replace('[^ACTG]', '', regex=True)

In [5]:
# k-mer length for the RTD features: 4**6 = 4096 k-mers, hence 2 * 4096 = 8192
# values (mean and standard deviation) per genome.
k = 6
# Zeroed accumulator template with one entry per possible k-mer; it is copied
# for each genome rather than used directly.
kmer_stats = precompute_kmer_dict(k)

In [7]:
# One RTD feature vector per genome, in the row order of `data`.
# compute_rtd_feature_vector mutates the accumulators it receives, so every
# genome gets its own copy of the zeroed template. The inner dicts only hold
# ints, so copying each of them with dict() isolates the genomes just as a
# deep copy would, without the per-object overhead of copy.deepcopy on
# 4**k nested dicts for every genome.
rtd_arrays = [
    compute_rtd_feature_vector(
        genome,
        k,
        {k_mer: dict(stats) for k_mer, stats in kmer_stats.items()},
    )
    for genome in data["Sequence"]
]

In [8]:
# Stack the per-genome vectors into a table: one row per genome and one
# integer-labelled column (0, 1, 2, ...) per feature value.
rtd_data = pd.DataFrame(rtd_arrays)

In [10]:
# Attach the label and split-flag columns. .tolist() makes the assignment
# positional: the preview of `data` shows a non-contiguous index (e.g. 47317,
# 47322), so assigning the Series directly would align on index labels
# instead of on row order.
rtd_data["Target"] = data["Lineage"].tolist()
rtd_data["Test"] = data["Test"].tolist()

In [11]:
# Persist the feature table for downstream use.
# NOTE(review): the warning captured in this cell's output presumably comes
# from the mixed column labels (integer feature columns plus the string
# 'Target'/'Test' columns), which pyarrow converts to strings on write.
# Confirm, and consider giving the feature columns string names.
rtd_data.to_parquet('../../data/features/rtd.parquet', engine='pyarrow')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
