In [1]:
import numpy as np
import pandas as pd
import time
import random

In [2]:
data = pd.read_parquet('../../data/processed/genomes.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [3]:
data['Sequence'] = data['Sequence'].str.replace('[^ACTG]', '', regex=True)

In [4]:
import numpy as np
from numba import jit


@jit(nopython=True)
def update_adj_matrix(sequence_indices, n, alpha, max_distance, adj_matrix):
    for i in range(n):
        j_end = min(i + max_distance + 1, n)
        if i + 1 >= j_end:
            break

        for j in range(i + 1, j_end):
            weight = 1 / ((j - i) ** alpha)
            adj_matrix[sequence_indices[i], sequence_indices[j]] += weight

def calculate_adjacency_matrix_optimized(sequence, alpha=0.5, max_distance=1):
    nucleotide_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    sequence_indices = np.array([nucleotide_to_index[nuc] for nuc in sequence])

    adj_matrix = np.zeros((4, 4))
    n = len(sequence)

    update_adj_matrix(sequence_indices, n, alpha, max_distance, adj_matrix)

    return adj_matrix

def process_genomes_optimized(genomes):
    total_loop_time = 0
    flattened_matrices = []

    for seq in genomes:
        start_time = time.time()
        adj_matrix = calculate_adjacency_matrix_optimized(seq)
        total_loop_time += time.time() - start_time
        flattened_matrices.append(adj_matrix.flatten())

    # Convert the list of arrays into a single 2D array
    combined_array = np.vstack(flattened_matrices)

    return combined_array, total_loop_time

# Example usage
genomes = data['Sequence']
flattened_matrices, total_loop_time = process_genomes_optimized(genomes)

print("Total loop time:", total_loop_time, "seconds")

Total loop time: 43.75835466384888 seconds


In [5]:
graph_data = pd.DataFrame(flattened_matrices)
graph_data["Target"] = data["Lineage"].tolist()
graph_data["Test"] = data["Test"].tolist()

In [6]:
graph_data.to_parquet('../../data/features/graph.parquet', engine='pyarrow')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
