In [18]:
import numpy as np
import pandas as pd
from numba import njit
import numba

In [25]:
@njit
def calculate_bbc_feature_numba(sequence, k):
    # Map bases to integers to work with Numba: A->0, T->1, G->2, C->3
    base_map = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    
    # Convert sequence to an array of integers
    seq_array = np.zeros(len(sequence), dtype=np.int32)
    for i, base in enumerate(sequence):
        seq_array[i] = base_map[base]
    
    # Initialize the feature vector to zeros
    feature_vector = np.zeros((16,))
    
    # Calculate base frequencies
    base_freqs = np.zeros(4)
    for i in range(4):
        base_freqs[i] = np.sum(seq_array == i) / len(seq_array)
    
    # Calculate joint probabilities and average relevance
    for i in range(4):
        for j in range(4):
            joint_probs = np.zeros(k)
            for l in range(1, k + 1):
                count = 0
                for pos in range(len(seq_array) - l):
                    if seq_array[pos] == i and seq_array[pos + l] == j:
                        count += 1
                # Calculate the joint probability for the current gap l
                p_ij_l = count / (len(seq_array) - l)
                joint_probs[l - 1] = p_ij_l
            
            # Calculate the average relevance T_ij(k) for the current base pair (i, j)
            T_ij_k = 0
            for p_ij_l in joint_probs:
                if p_ij_l > 0:
                    T_ij_k += p_ij_l * np.log2(p_ij_l / (base_freqs[i] * base_freqs[j]))
            
            # Store the result in the feature vector
            feature_vector[i * 4 + j] = T_ij_k
    
    return feature_vector


In [6]:
data = pd.read_parquet('../../data/processed/genomes.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [7]:
data['Sequence'] = data['Sequence'].str.replace('[^ACTG]', '', regex=True)

In [33]:
import time
start = time.time()
bbc_arrays = []
for genome in data["Sequence"][:100]:
    bbc_arrays.append(calculate_bbc_feature_numba(genome, 6))
end = time.time()

print(end-start)

2.060697317123413


In [30]:
b2b_data = pd.DataFrame(bbc_arrays)
b2b_data["Target"] = data["Lineage"].tolist()
b2b_data["Test"] = data["Test"].tolist()
b2b_data.to_parquet('../../data/features/b2b.parquet', engine='pyarrow')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
