In [4]:
import gc
import random
import re
import time

import numpy as np
import pandas as pd
import pyfftw

In [5]:
# Load the processed mock genome table ('fastparquet' is an alternative engine)
data = pd.read_parquet(
    path='../../data/processed/mock_data.parquet',
    engine='pyarrow',
)
data

Unnamed: 0,Accession ID,Lineage,Sequence,Coverage,Train
0,EPI_ISL_15104785,BA.5.1,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,99.213260,2
1,EPI_ISL_3411570,AY.19,TGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTG...,99.939512,2
2,EPI_ISL_2433815,C.1,ATACCTTCCTAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGA...,95.359497,2
2,EPI_ISL_1715397,L.3,CTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGT...,99.775462,1
3,EPI_ISL_14795073,BA.4.6,ACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAA...,99.972625,0
...,...,...,...,...,...
691,EPI_ISL_5099144,AY.46,TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGAT...,100.000000,0
691,EPI_ISL_9695375,BA.1.21,ACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACT...,99.160820,1
692,EPI_ISL_602627,B.1.1.84,CTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCT...,100.000000,0
693,EPI_ISL_17231201,BQ.1.1.38,TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAG...,99.979875,0


In [6]:
def replace_deg(genomes, seed=None):
    """Resolve IUPAC degenerate nucleotide codes to randomly chosen concrete bases.

    Every degenerate character (W, S, M, K, R, Y, B, D, H, V, N) found in the
    'Sequence' column is replaced by one base picked with ``random.choice``
    from the set of bases that code stands for. Any other character is left
    untouched.

    Parameters
    ----------
    genomes : pandas.DataFrame
        Frame with a 'Sequence' column of strings. It is not modified.
    seed : int, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        output is reproducible. When omitted, the module-level ``random``
        generator is used (call ``random.seed`` beforehand for repeatability).

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``genomes`` except for the resolved 'Sequence'
        column.
    """
    # Degenerate code -> bases it may stand for
    degenerate_mapping = {
        'W': ['A', 'T'],
        'S': ['C', 'G'],
        'M': ['A', 'C'],
        'K': ['G', 'T'],
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'C', 'T', 'G']
    }

    rng = random if seed is None else random.Random(seed)

    # Matching only the degenerate characters means the Python-level callback
    # runs once per ambiguous base rather than once per character of the genome.
    degenerate_pattern = re.compile('[' + ''.join(degenerate_mapping) + ']')

    def pick_base(match):
        return rng.choice(degenerate_mapping[match.group()])

    def replace_sequence(sequence):
        return degenerate_pattern.sub(pick_base, sequence)

    # assign() builds a new frame, so the caller's DataFrame is left as it was.
    return genomes.assign(Sequence=genomes['Sequence'].apply(replace_sequence))

# Seed the global RNG first so the random resolution of degenerate bases gives
# the same sequences on every Restart & Run All.
random.seed(42)
data = replace_deg(data)

In [7]:
def anti_symmetric_padding(seq, target_length):
    """Extend ``seq`` to ``target_length`` with a reversed, sign-alternating copy.

    The padding is the reversed sequence (repeated as often as needed), with
    its elements multiplied by -1, +1, -1, ... in order, appended after the
    original values.

    Parameters
    ----------
    seq : array-like
        1-D numeric sequence.
    target_length : int
        Desired length of the result.

    Returns
    -------
    The unchanged ``seq`` object when it already has at least
    ``target_length`` elements; otherwise a new NumPy array of exactly
    ``target_length`` elements. An empty ``seq`` has nothing to mirror and is
    padded with zeros.
    """
    if len(seq) >= target_length:
        return seq

    if len(seq) == 0:
        # Nothing to reflect; the tiling below would otherwise divide by zero.
        return np.zeros(target_length, dtype=np.asarray(seq).dtype)

    # Calculate the number of elements needed to pad
    pad_size = target_length - len(seq)

    # Repeat the reversed sequence just often enough to cover the padding
    # (ceil division), then keep the first pad_size values. When the padding
    # is shorter than the sequence this is simply its reversed tail.
    repeats = -(-pad_size // len(seq))
    mirrored = np.tile(seq[::-1], repeats)[:pad_size]

    # Alternating signs starting with -1
    signs = (-1) ** np.arange(1, pad_size + 1)

    # Append the anti-symmetrically padded sequence
    return np.concatenate((seq, mirrored * signs))

In [16]:
def numeric_transform(sq, mapping):
    """Encode a nucleotide sequence as a float32 signal.

    Parameters
    ----------
    sq : str or sequence of single characters
        Nucleotide sequence. Only uppercase 'A', 'C', 'G' and 'T' are mapped;
        every other character yields 0.0.
    mapping : array-like of four numbers
        Values for A, C, G, T, in that order.

    Returns
    -------
    numpy.ndarray
        float32 array with one value per character of ``sq``.
    """
    # Map characters to indices: A->0, C->1, G->2, T->3, others->-1 (left at 0.0)
    char_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # An explicit integer dtype keeps the index array usable for fancy indexing
    # even when sq is empty (np.array([]) would default to float64).
    indices = np.fromiter(
        (char_to_index.get(char, -1) for char in sq),
        dtype=np.intp,
        count=len(sq),
    )

    # Accept plain lists/tuples as well as arrays for the mapping
    mapping = np.asarray(mapping, dtype=np.float32)

    numSeq = np.zeros(len(sq), dtype=np.float32)

    # Use boolean indexing to set values only for recognised bases
    valid_indices = indices >= 0
    numSeq[valid_indices] = mapping[indices[valid_indices]]

    return numSeq

def fourier_transform(ns):
    """Return the magnitude spectrum of the FFT of ``ns``.

    The transform is planned by ``pyfftw.builders.fft`` and run by calling the
    resulting FFTW object; ``np.abs`` then reduces the output to magnitudes.
    """
    # Plan the transform and execute it in one go
    spectrum = pyfftw.builders.fft(ns)()

    # Keep only the magnitudes
    return np.abs(spectrum)

def process_sequence(seq, mLen, mapping=None):
    """Turn one nucleotide string into a fixed-length FFT magnitude spectrum.

    The sequence is encoded with ``numeric_transform``, cut to ``mLen`` samples
    or extended to ``mLen`` with ``anti_symmetric_padding``, passed through
    ``fourier_transform`` and cast to float16.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.
    mLen : int
        Length every signal is cropped or padded to before the FFT.
    mapping : array-like of four numbers, optional
        Values for A, C, G, T, in that order. Defaults to the notebook's
        "Real" mapping (-1.5, 0.5, -0.5, 1.5), in line with the
        ``numMappingReal`` helper this function used to refer to.

    Returns
    -------
    numpy.ndarray
        float16 magnitude spectrum of length ``mLen``.
    """
    if mapping is None:
        mapping = np.array([-1.5, 0.5, -0.5, 1.5], dtype=np.float32)

    # Previously called the undefined `numMappingReal` and ignored `mapping`
    signal = numeric_transform(seq, mapping)

    if len(signal) > mLen:
        signal = signal[:mLen]
    elif len(signal) < mLen:
        signal = anti_symmetric_padding(signal, mLen)

    magnitude_spectrum = fourier_transform(signal)
    return magnitude_spectrum.astype(np.float16)

In [13]:
# Number of characters in every sequence
lengths = data["Sequence"].str.len()
# The median length is used as the common signal length; int() truncates a
# fractional median (possible with an even number of rows)
median_length = int(lengths.median())

In [22]:
# Numeric representation used to encode the nucleotides.
# One of: "Real", "PP", "JustA", "EIIP".
numeric = "Real"

# Values are listed in A, C, G, T order
NUMERIC_MAPPINGS = {
    "Real": [-1.5, 0.5, -0.5, 1.5],
    "PP": [-1, 1, -1, 1],
    "JustA": [1, 0, 0, 0],
    "EIIP": [0.1260, 0.1340, 0.0806, 0.1335],
}

# Fail loudly on a typo instead of leaving `mapping` undefined or stale
if numeric not in NUMERIC_MAPPINGS:
    raise ValueError(
        f"Unknown numeric representation {numeric!r}; "
        f"expected one of {sorted(NUMERIC_MAPPINGS)}"
    )

mapping = np.array(NUMERIC_MAPPINGS[numeric], dtype=np.float32)

In [14]:
# Give the frame a clean 0..n-1 index. drop=True discards the old index
# instead of inserting it as an extra 'index' column, which also keeps this
# cell safe to run more than once.
data = data.reset_index(drop=True)

In [17]:
# One float16 magnitude spectrum per genome, each median_length samples long.
# Allocating as float16 directly avoids a temporary float64 matrix.
dsp_data = np.zeros((len(data), median_length), dtype=np.float16)

# Iterate over the column values by position, so the loop does not depend on
# the frame's index labels; `mapping` selects the nucleotide encoding.
for i, sequence in enumerate(data["Sequence"]):
    dsp_data[i] = process_sequence(sequence, median_length, mapping)

In [18]:
# Wrap the spectrum matrix in a DataFrame: one row per genome, one column per
# spectrum sample
results = pd.DataFrame(dsp_data)

In [20]:
# Attach the label columns: the lineage becomes "Target", and the split
# indicator stored in data["Train"] is written out under the name "Test".
for column, source in (("Target", "Lineage"), ("Test", "Train")):
    results[column] = data[source].tolist()

In [13]:
# Column labels are a mix of integers and strings at this point; turn them all
# into strings before the frame is written out.
results.columns = results.columns.map(str)

In [14]:
# Persist the feature table with the pyarrow engine.
# NOTE(review): the file name says "real" regardless of which `numeric`
# mapping was selected above — confirm this is intended.
results.to_parquet('../../data/features/dsp_real.parquet', engine='pyarrow')