In [None]:
import pandas as pd
import numpy as np
import gc
import random

## **Functions**

In [None]:
def read_fasta(file_path):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    A line starting with ``>`` opens a new record; the text after ``>`` (with
    surrounding whitespace stripped) becomes the dictionary key. Every
    following line, up to the next header, is stripped and appended to that
    record's sequence.

    Parameters
    ----------
    file_path : str or path-like
        Path of the FASTA file; it is opened in text mode.

    Returns
    -------
    dict
        Maps each header to its concatenated sequence string. Records whose
        sequence is empty are omitted, and if a header occurs more than once
        the last non-empty record replaces the earlier one.

    Raises
    ------
    ValueError
        If sequence data appears before the first header line.
    """
    sequences = {}
    header = None
    # Collect the lines of the current record and join them once, instead of
    # growing a string line by line.
    fragments = []

    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the previous record (if it had data).
                if fragments:
                    sequences[header] = ''.join(fragments)
                    fragments = []
                header = line[1:]
            elif line:
                if header is None:
                    # Without this check the record would have no key to be
                    # stored under.
                    raise ValueError(
                        'Sequence data found before the first FASTA header '
                        '(line {line_number} of {path!r})'.format(
                            line_number=line_number, path=file_path
                        )
                    )
                fragments.append(line)

    # Store the final record, which has no following header to close it.
    if fragments:
        sequences[header] = ''.join(fragments)
    return sequences

In [None]:
def clear_variable(var_list):
    """Delete the named variables from this module's global namespace.

    Each entry of ``var_list`` is a variable name given as a string. Names
    that are not currently defined are skipped silently. A garbage-collection
    pass is triggered once all names have been processed.
    """
    namespace = globals()
    for name in var_list:
        if name in namespace:
            del namespace[name]
    gc.collect()

In [None]:
def generate_kmers(sequence, k=3):
    """Return every overlapping window of length ``k`` taken from ``sequence``.

    Windows start at each offset from left to right, so a sequence of length
    ``n`` yields ``n - k + 1`` k-mers; a sequence shorter than ``k`` yields an
    empty list.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for offset in range(window_count):
        kmers.append(sequence[offset:offset + k])
    return kmers

In [None]:
def sequence_to_int_mapping(sequence, k=3, unknown=None):
    """Encode a list of k-mers as a list of integers.

    The vocabulary is every length-``k`` string over ``A``, ``T``, ``C``,
    ``G``, enumerated in ``itertools.product`` order and numbered from 1
    (for ``k=3``: ``'AAA' -> 1``, ``'AAT' -> 2``, ..., ``'GGG' -> 64``).
    Vocabulary codes therefore never include 0.

    Parameters
    ----------
    sequence : iterable of str
        The k-mers to encode (despite the name, not a raw nucleotide string).
    k : int, optional
        Length of the k-mers in the vocabulary. Defaults to 3.
    unknown : int or None, optional
        Code to emit for k-mers that are not in the vocabulary, e.g. ones
        containing ``N`` or other ambiguity codes. With the default ``None``
        such k-mers raise ``KeyError``.

    Returns
    -------
    list of int
        One code per input k-mer, in input order.

    Raises
    ------
    KeyError
        If ``unknown`` is ``None`` and a k-mer is not in the vocabulary.
    """
    from itertools import product

    nucleotides = ['A', 'T', 'C', 'G']
    kmer_to_int = {
        ''.join(letters): code
        for code, letters in enumerate(product(nucleotides, repeat=k), start=1)
    }

    if unknown is not None:
        return [kmer_to_int.get(kmer, unknown) for kmer in sequence]

    try:
        return [kmer_to_int[kmer] for kmer in sequence]
    except KeyError as exc:
        # Same exception type as a plain dict lookup, but with a message that
        # names the offending k-mer and the way to tolerate it.
        raise KeyError(
            'k-mer {kmer!r} is not in the A/T/C/G vocabulary for k={k}; '
            'pass unknown=<int> to encode such k-mers instead'.format(
                kmer=exc.args[0], k=k
            )
        ) from None

In [None]:
def custom_pad_sequences(sequences, maxlen, padding='post', value=0):
    """Pad or truncate every sequence to exactly ``maxlen`` items.

    Parameters
    ----------
    sequences : iterable of sequences
        The sequences to normalise. Each one is copied into a new list, so
        the inputs are never modified.
    maxlen : int
        Target length of every returned sequence.
    padding : {'post', 'pre'}, optional
        Where to insert ``value`` for sequences shorter than ``maxlen``:
        ``'post'`` appends at the end, ``'pre'`` prepends at the beginning.
    value : object, optional
        The filler item. Defaults to 0.

    Returns
    -------
    list of list
        One list per input sequence. Sequences longer than ``maxlen`` keep
        their first ``maxlen`` items regardless of ``padding``.

    Raises
    ------
    ValueError
        If ``padding`` is neither ``'post'`` nor ``'pre'``.
    """
    # Reject unknown modes up front; otherwise short sequences would be
    # returned unpadded and the result would be ragged.
    if padding not in ('post', 'pre'):
        raise ValueError(
            "padding must be 'post' or 'pre', got {padding!r}".format(padding=padding)
        )

    padded_sequences = []

    for seq in sequences:
        # Copy to a list so tuples/arrays can be concatenated with the filler.
        seq = list(seq)
        missing = maxlen - len(seq)
        if missing > 0:
            filler = [value] * missing
            seq = seq + filler if padding == 'post' else filler + seq
        else:
            # Truncate the sequence if it's longer than maxlen
            seq = seq[:maxlen]

        padded_sequences.append(seq)

    return padded_sequences


## **Process**

In [None]:
# Input FASTA files, one per class. The position of a file in this list
# determines the class label written for its sequences (index + 1) and the
# class number in the chunk file names, so reordering the entries changes the
# labels.
# NOTE(review): the trailing number in each file name is presumably the number
# of sequences in that file — confirm.
file_list = [
    'Cov-Alpha-US-13207.fasta',
    'Cov-BA.2.12.1-usa-11331.fasta',
    'Cov-Delta-US-10117.fasta',
    'Cov-BQ.1.1-usa-9999.fasta',
    'Cov-BA.1.1-usa-6694.fasta',
    'Cov-Gama-US-4995.fasta',
    'Cov-BA.5.4-3631.fasta',
    'Cov-BA.4.6-2607.fasta'
]

In [None]:
# Initialize variables
dataset_len = 2500      # number of shuffled sequences taken from each file
proportion = 125        # number of sequences written to each chunk file
max_seq_length = 30000  # every encoded sequence is padded/truncated to this many k-mers

# Process each file in the list. The file's position in file_list (plus one)
# is its class label.
for i, file_path in enumerate(file_list):
    data = read_fasta(file_path)

    data_nuc = list(data.values())
    clear_variable(['data'])

    # Re-seed before every shuffle so each file is shuffled reproducibly,
    # independent of the order in which files are processed.
    random.seed(101)
    random.shuffle(data_nuc)

    # A file with too few sequences yields short or empty chunks further
    # down; make that visible instead of silently unbalancing the classes.
    if len(data_nuc) < dataset_len:
        print(
            'Warning: {file} has only {found} sequences (expected at least {wanted}); '
            'some chunks will be short or empty'.format(
                file=file_path, found=len(data_nuc), wanted=dataset_len
            )
        )

    chunk_num = 1
    for start in range(0, dataset_len, proportion):
        # Cap the slice at dataset_len so the last chunk cannot take more
        # than dataset_len sequences in total when proportion does not divide
        # dataset_len evenly.
        stop = min(start + proportion, dataset_len)
        samples_chunk = data_nuc[start:stop]
        kmers = [
            generate_kmers(sample) for sample in samples_chunk
        ]
        clear_variable(['samples_chunk'])

        kmers_int = [
            sequence_to_int_mapping(kmer) for kmer in kmers
        ]
        clear_variable(['kmers'])

        kmers_int_padded = custom_pad_sequences(kmers_int, maxlen=max_seq_length, padding='post', value=0)
        clear_variable(['kmers_int'])

        kmers_int_padded_pd = pd.DataFrame(kmers_int_padded)
        clear_variable(['kmers_int_padded'])

        kmers_int_padded_pd['class'] = i+1
        # The k-mer codes, the padding value and the class labels are all
        # small non-negative integers, so one byte per cell is enough.
        kmers_int_padded_pd = kmers_int_padded_pd.astype('uint8')

        # Build the file name once so the saved file and the log line cannot
        # drift apart.
        chunk_file = 'rnn_data_class_{index_1}_chunk_{index_2}.csv'.format(index_1=i+1, index_2=chunk_num)
        kmers_int_padded_pd.to_csv(chunk_file, index=False)
        print('File saved: {}'.format(chunk_file))
        chunk_num += 1
        clear_variable(['kmers_int_padded_pd'])

    # Clean up memory after processing each file
    clear_variable(['data_nuc'])


In [None]:
# Number of subsets to build. Subset n combines chunk n of every class, so
# this must not exceed the number of chunk files written per class
# (dataset_len / proportion = 2500 / 125 = 20).
num_subsets = 20

for i in range(num_subsets):
    # Load chunk i+1 of every class, then concatenate once. Growing a
    # DataFrame with pd.concat inside the loop re-copies the accumulated rows
    # on every iteration and starts from an empty frame.
    # The chunk files were written from uint8 data, so reading them back as
    # uint8 avoids inflating every cell to int64.
    chunks = [
        pd.read_csv(
            'rnn_data_class_{index_1}_chunk_{index_2}.csv'.format(index_1=j+1, index_2=i+1),
            dtype='uint8'
        )
        for j in range(len(file_list))
    ]
    subset_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
    clear_variable(['chunks'])

    # Shuffle the subset DataFrame.
    # NOTE: DataFrame.sample is driven by random_state, not by the `random`
    # module; the seed call is kept only so the global `random` state after
    # this cell is the same as before this change.
    random.seed(101)
    subset_df = subset_df.sample(frac=1, random_state=101).reset_index(drop=True)

    # Save the shuffled subset DataFrame
    subset_df.to_csv(
        'rnn_data_subset_{index}.csv'.format(index=i+1),
        index=False
    )
    print('File saved: rnn_data_subset_{index}.csv'.format(index=i+1))

    clear_variable(['subset_df'])