In [1]:
import pandas as pd
import copy
import numpy as np
import time

In [2]:
def precompute_kmer_dict(k):
    from itertools import product

    # Generate all possible k-mers
    alphabet = 'ACGT'
    all_k_mers = [''.join(p) for p in product(alphabet, repeat=k)]

    # Initialize dictionary with all possible k-mers
    kmer_stats = {k_mer: {'sum': 0, 'count': 0, 'last_seen': -1} for k_mer in all_k_mers}

    return kmer_stats

def compute_rtd_feature_vector(genome, k, kmer_stats):
    timings = {}  # Dictionary to store timings

    start_time = time.time()

    # Iterate through the genome
    for i in range(len(genome) - k + 1):
        k_mer = genome[i:i+k]
        stats = kmer_stats[k_mer]
        if stats['last_seen'] != -1:  # If k-mer has been seen before
            distance = i - stats['last_seen']
            stats['sum'] += distance
            stats['count'] += 1
        stats['last_seen'] = i  # Update last seen position

    timings['iteration'] = time.time() - start_time
    start_time = time.time()

    # Initialize the feature vector
    feature_vector = np.zeros(2 * 4**k)
    for idx, (k_mer, stats) in enumerate(kmer_stats.items()):
        if stats['count'] > 0:
            mean = stats['sum'] / stats['count']
            # For standard deviation, assume it's 0 if count is 1 as we don't have enough data
            std_dev = np.sqrt((stats['sum']**2 / stats['count'] - mean**2) / (stats['count'] if stats['count'] > 1 else 1))
            feature_vector[2*idx] = mean
            feature_vector[2*idx + 1] = std_dev
        else:  # If k-mer has never been seen
            feature_vector[2*idx] = -1  # Indicate absence with -1
            feature_vector[2*idx + 1] = -1

    timings['compute_feature_vector'] = time.time() - start_time

    # # Print timings
    # for key, value in timings.items():
    #     print(f"{key}: {value} seconds")

    return feature_vector

In [3]:
data = pd.read_parquet('../../data/processed/genomes.parquet', engine='pyarrow')  # You can use 'fastparquet' as the engine
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence,Test
0,EPI_ISL_16823464,XBB.1.5,2023-01-31,TAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATC...,0
1,EPI_ISL_3342425,AY.116,2021-07-26,GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGC...,0
2,EPI_ISL_1715410,B.1.525,2021-01-12,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...,1
3,EPI_ISL_515786,B.1.1.57,2020-07-29,TTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGT...,0
4,EPI_ISL_17385094,BQ.1.1,2023-02-06,TACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCA...,1
...,...,...,...,...,...
47317,EPI_ISL_15963061,CP.5,2022-11-01,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47322,EPI_ISL_15963067,BE.7,2022-11-05,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,1
47324,EPI_ISL_15963069,BE.7,2022-11-05,CTAAACGANCTTTAAAATCTGTGTGGCTGTCNCTCGGCTGCATNCT...,1
47344,EPI_ISL_18407436,BA.1,2022-07-26,TTGTAGATCTGTTCTCTAAACGAACNTGAAAATCTGTGTGGCTGTC...,1


In [4]:
data['Sequence'] = data['Sequence'].str.replace('[^ACTG]', '', regex=True)

In [5]:
k = 6
kmer_stats = precompute_kmer_dict(k)

In [6]:
# import pandas as pd
# import dask.dataframe as dd
# from dask.distributed import Client
# from dask import compute

# client = Client()

# def compute_rtd(row, k, kmer_stats):
#     rtd = compute_rtd_feature_vector(row["Sequence"], k, copy.deepcopy(kmer_stats))
#     return rtd

# # Convert your pandas DataFrame to a Dask DataFrameimport pandas as pd
# import dask.dataframe as dd
# from dask.distributed import Client
# from dask import compute

# client = Client()

# def compute_rtd(row, k, kmer_stats):
#     rtd = compute_rtd_feature_vector(row["Sequence"], k, copy.deepcopy(kmer_stats))
#     return rtd

# # Convert your pandas DataFrame to a Dask DataFrame
# dask_df = dd.from_pandas(data, npartitions=64)  # You can adjust npartitions based on your system's cores

# # Note: We're using a lambda to pass the 'comparison_sequences' variable as a constant argument
# meta = pd.Series([], dtype=float)
# results = dask_df.map_partitions(lambda df: df.apply(compute_rtd, axis=1, args=(k, kmer_stats,)), meta=meta)

# start = time.time()
# # Compute the result
# computed_results = results.compute(scheduler='processes')
# end = time.time()

# # Convert the computed result to a pandas DataFrame
# final_df = computed_results.to_frame()

# print(final_df)
# print(end - start)

# expanded_df = pd.DataFrame(computed_results.apply(pd.Series))
# expanded_df["Target"] = data["Lineage"].tolist()
# expanded_df["Test"] = data["Test"].tolist()
# expanded_df
# expanded_df.to_parquet('../../data/features/rtd.parquet', engine='pyarrow')
# dask_df = dd.from_pandas(data, npartitions=64)  # You can adjust npartitions based on your system's cores

# # Note: We're using a lambda to pass the 'comparison_sequences' variable as a constant argument
# meta = pd.Series([], dtype=float)
# results = dask_df.map_partitions(lambda df: df.apply(compute_rtd, axis=1, args=(k, kmer_stats,)), meta=meta)

# start = time.time()
# # Compute the result
# computed_results = results.compute(scheduler='processes')
# end = time.time()

# # Convert the computed result to a pandas DataFrame
# final_df = computed_results.to_frame()

# print(final_df)
# print(end - start)

# expanded_df = pd.DataFrame(computed_results.apply(pd.Series))
# expanded_df["Target"] = data["Lineage"].tolist()
# expanded_df["Test"] = data["Test"].tolist()
# expanded_df
# expanded_df.to_parquet('../../data/features/rtd.parquet', engine='pyarrow')

In [7]:
rtd_arrays = []

for genome in data["Sequence"]:
    rtd_arrays.append(compute_rtd_feature_vector(genome, k, copy.deepcopy(kmer_stats)))

In [8]:
rtd_data = pd.DataFrame(rtd_arrays)

In [9]:
rtd_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8182,8183,8184,8185,8186,8187,8188,8189,8190,8191
0,903.290323,888.601696,1792.600000,1731.816130,1377.095238,1343.907343,1593.733333,1539.692678,1116.346154,1094.667466,...,826.718750,813.698743,1572.312500,1522.385032,1442.722222,1402.073979,1105.272727,1079.860762,2293.000000,2050.921549
1,10176.000000,0.000000,1792.533333,1731.751724,1377.047619,1343.860872,1593.333333,1539.306241,1074.962963,1054.868425,...,853.193548,839.319558,1676.933333,1620.071514,1527.235294,1481.635867,1157.619048,1129.720513,2291.400000,2049.490465
2,965.896552,949.097069,1793.200000,1732.395785,1377.523810,1344.325586,1539.812500,1490.917042,1075.333333,1055.231872,...,826.812500,813.791017,1572.500000,1522.566578,1623.062500,1571.523508,1057.347826,1034.106574,2082.000000,1947.532670
3,10177.000000,0.000000,1905.571429,1836.254575,1367.047619,1334.101871,1578.866667,1525.330113,1108.230769,1086.709677,...,819.750000,806.839744,1662.466667,1606.095386,1514.470588,1469.252283,1204.650000,1174.147581,2807.500000,2431.366321
4,10168.000000,0.000000,1792.600000,1731.816130,1410.900000,1375.175214,1584.875000,1534.548620,940.037037,922.464702,...,826.096774,812.663411,1479.823529,1435.639699,1527.411765,1481.807068,1105.136364,1079.727534,2023.142857,1873.066322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22706,10167.000000,0.000000,1792.400000,1731.622912,1376.952381,1343.767929,1593.400000,1539.370647,1074.888889,1054.795736,...,801.515152,789.277560,1572.125000,1522.203486,1442.444444,1401.804028,1105.045455,1079.638715,2293.000000,2050.921549
22707,10168.000000,0.000000,1776.266667,1716.036631,1365.285714,1332.382428,1584.812500,1534.488105,1106.807692,1085.314235,...,826.500000,813.483438,1557.000000,1507.558768,1456.941176,1413.440556,1105.090909,1079.683124,2293.000000,2050.921549
22708,10168.000000,0.000000,1774.733333,1714.555290,1364.190476,1331.313585,1583.375000,1533.096251,1105.923077,1084.446798,...,825.937500,812.929797,1555.562500,1506.166914,1455.823529,1412.356279,1104.272727,1078.883754,2289.200000,2047.522726
22709,9904.000000,0.000000,1621.937500,1570.434232,1331.857143,1299.759483,1547.800000,1495.316862,1002.678571,984.610811,...,824.903226,811.489271,1882.230769,1808.388829,1472.294118,1428.335096,1061.428571,1035.848220,1666.000000,1542.416286


In [10]:
rtd_data["Target"] = data["Lineage"].tolist()
rtd_data["Test"] = data["Test"].tolist()

In [11]:
rtd_data.to_parquet('../../data/features/rtd.parquet', engine='pyarrow')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
