In [6]:
# Preprocessing the data: converting FASTA text files to CSV
import pandas as pd
import csv
import os

In [7]:
# Define a function to read FASTA file and return DNA sequences
def read_fasta(filename):
    """Parse a FASTA file into index-aligned lists of sequences and header fields.

    A line starting with ">" begins a new record; it is stripped and split on
    "|" to form that record's header fields. Every following line, up to the
    next header, is stripped and concatenated into the record's sequence.

    A record whose sequence is empty is skipped together with its header, so
    ``sequences[i]`` always belongs to ``additional_data[i]``. Sequence text
    that appears before the first header is kept and paired with an empty
    header list.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A tuple ``(sequences, additional_data)`` of two equal-length lists:
        the sequence strings, and for each sequence the list of
        "|"-separated header fields (the first field keeps its leading ">").
    """
    sequences = []
    additional_data = []  # Header fields, index-aligned with `sequences`
    header = []  # Header fields of the record currently being read
    chunks = []  # Stripped sequence lines of the record currently being read

    with open(filename, "r") as file:
        # Iterate the handle directly so the whole file is never held in memory.
        for line in file:
            if line.startswith(">"):
                # Close the previous record before starting a new one.
                sequence = "".join(chunks)
                if sequence:
                    sequences.append(sequence)
                    additional_data.append(header)
                header = line.strip().split("|")
                chunks = []
            else:
                chunks.append(line.strip())

    # Close the final record, which has no following header to trigger it.
    sequence = "".join(chunks)
    if sequence:
        sequences.append(sequence)
        additional_data.append(header)
    return sequences, additional_data

In [8]:
def fasta_to_csv(input_filename, output_filename, label, disease_name):
    """Write the sequences of one FASTA file to a CSV, one row per sequence.

    The CSV has the columns DNA, Region, Length, Disease and Label. Region is
    the input file's base name without its extension, Length is the sequence
    length, and Disease/Label repeat the given arguments on every row.

    Args:
        input_filename: Path of the FASTA file to read.
        output_filename: Path of the CSV file to create (overwritten if present).
        label: Value written to the Label column.
        disease_name: Value written to the Disease column.
    """
    sequences, additional_data = read_fasta(input_filename)
    file_stem = os.path.splitext(os.path.basename(input_filename))[0]

    with open(output_filename, "w", newline="") as out_handle:
        csv_out = csv.writer(out_handle)
        csv_out.writerow(["DNA", "Region", "Length", "Disease", "Label"])
        # The header fields are not written, but zipping with them limits the
        # rows to the shorter of the two lists.
        csv_out.writerows(
            [dna, file_stem, len(dna), disease_name, label]
            for dna, _header_fields in zip(sequences, additional_data)
        )


In [9]:
# Root folder whose immediate subfolders hold the FASTA files (passed to process_folders)
main_folder = "Viral_Data_Sequences"
# Root folder under which process_folder creates one CSV subfolder per input folder
output_folder = "Viral_Data_Csv"

# Map upper-cased folder names to the disease name and the integer class label.
# The lookups in this cell upper-case the folder name first, so matching is case-insensitive.
disease_mapping = {
    "HBV": {"disease": "HBV", "label": 0},
    "INFLUENZA": {"disease": "Influenza", "label": 1},
    "HCV": {"disease": "HCV", "label": 2},
    "DENGUE": {"disease": "Dengue", "label": 3}
}

# Function to process a folder containing FASTA files
def process_folder(folder_path, disease_name):
    """Convert every ``.fasta`` file in one disease folder to a CSV file.

    The CSVs are written to ``output_folder/<folder base name>/`` (created if
    missing), one per FASTA file, keeping the file's base name with a
    ``.csv`` extension.

    Args:
        folder_path: Path of the folder containing the FASTA files.
        disease_name: Disease name; its upper-cased form must be a key of
            ``disease_mapping``. It is written to the Disease column.

    Raises:
        KeyError: If ``disease_name.upper()`` is not in ``disease_mapping``.
    """
    folder_name = os.path.basename(folder_path)
    output_subfolder = os.path.join(output_folder, folder_name)
    os.makedirs(output_subfolder, exist_ok=True)
    label = disease_mapping[disease_name.upper()]["label"]
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".fasta"):
            input_file_path = os.path.join(folder_path, file_name)
            output_file_path = os.path.join(output_subfolder, f"{os.path.splitext(file_name)[0]}.csv")
            # Pass the disease name, not the folder name: the folder's spelling
            # or case may differ from the name recorded in disease_mapping.
            fasta_to_csv(input_file_path, output_file_path, label, disease_name)

# Function to scan the immediate subfolders of the root folder and process the recognised disease folders (not recursive)
def process_folders(root_folder):
    """Run process_folder on each recognised disease folder directly under ``root_folder``.

    Only immediate subdirectories are considered. A subdirectory is processed
    when its upper-cased name is a key of ``disease_mapping``; all other
    entries are ignored.

    Args:
        root_folder: Path of the folder that contains the disease folders.
    """
    for entry_name in os.listdir(root_folder):
        entry_path = os.path.join(root_folder, entry_name)
        if not os.path.isdir(entry_path):
            continue
        mapping_key = entry_name.upper()
        if mapping_key not in disease_mapping:
            continue
        process_folder(entry_path, disease_mapping[mapping_key]["disease"])

# Run the FASTA-to-CSV conversion for every recognised disease folder under main_folder
process_folders(main_folder)


In [27]:

# Input is the CSV tree written by the FASTA-to-CSV step; padded copies go to a separate tree.
# NOTE: this rebinds the module-level `output_folder` that process_folder reads, so running
# process_folder/process_folders after this cell would write into the padded folder instead.
input_folder = "Viral_Data_Csv"
output_folder = "Padded_Viral_Data_Csv"

# Function to pad DNA sequences and save them to a new CSV file
def pad_sequences_and_save(input_file_path, output_file_path, max_length):
    """Right-pad every DNA sequence in a CSV with 'N' and save the result.

    Args:
        input_file_path: Path of the CSV to read; it must contain the columns
            'DNA' and 'Length'.
        output_file_path: Path of the CSV to write (without the index).
        max_length: Target length; sequences already this long or longer are
            left unchanged.

    Raises:
        KeyError: If the 'DNA' or 'Length' column is missing.
    """
    # Read the CSV file
    df = pd.read_csv(input_file_path)

    # The CSVs written by fasta_to_csv store the sequence in the 'DNA' column
    # (there is no 'sequence' column), so pad that column with 'N' characters.
    df['DNA'] = df['DNA'].apply(lambda x: x.ljust(max_length, 'N'))

    # The stored length no longer matches the padded sequence, so drop it
    df = df.drop(columns=['Length'])

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_file_path, index=False)

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Walk input_folder/<disease>/<region>/*.csv and write a padded copy of each file to the
# same relative location under output_folder. CSV files that sit directly inside a
# disease folder are not visited by this walk.
for disease_folder in os.listdir(input_folder):
    disease_folder_path = os.path.join(input_folder, disease_folder)
    if os.path.isdir(disease_folder_path):
        output_disease_folder = os.path.join(output_folder, disease_folder)
        os.makedirs(output_disease_folder, exist_ok=True)

        # Iterate through each region subfolder in the disease folder
        for region_folder in os.listdir(disease_folder_path):
            region_folder_path = os.path.join(disease_folder_path, region_folder)
            if os.path.isdir(region_folder_path):
                output_subfolder = os.path.join(output_disease_folder, region_folder)
                os.makedirs(output_subfolder, exist_ok=True)

                # Iterate through each CSV file in the region subfolder
                for file_name in os.listdir(region_folder_path):
                    if file_name.endswith(".csv"):
                        input_file_path = os.path.join(region_folder_path, file_name)
                        output_file_path = os.path.join(output_subfolder, file_name)

                        # Each file is padded to its own longest sequence. The sequence
                        # column written by fasta_to_csv is named 'DNA', not 'sequence'.
                        df = pd.read_csv(input_file_path)
                        max_length = df['DNA'].str.len().max()

                        # Pad sequences and save to a new CSV file
                        pad_sequences_and_save(input_file_path, output_file_path, max_length)


In [3]:
# Read the three headerless split files (columns are addressed by position below)
train_df = pd.read_csv("fullset_train.csv", header=None)
validation_df = pd.read_csv("fullset_validation.csv", header=None)
test_df = pd.read_csv("fullset_test.csv", header=None)

# Add a 'length' column holding the string length of column 1 to every split
for split_frame in (train_df, validation_df, test_df):
    split_frame['length'] = split_frame[1].str.len()

# Stack the splits into one frame with a fresh index
fullset_df = pd.concat((train_df, validation_df, test_df), ignore_index=True)

# Persist the combined rows without a header row or index column
fullset_df.to_csv("fullset_combined.csv", header=False, index=False)

# Read the combined file back, this time assigning column names
fullset_df = pd.read_csv(
    "fullset_combined.csv",
    header=None,
    names=["sequence", "DNA", "Label", "length"],
)

# No row filter is applied here: every combined row is written, now with a header row
fullset_df.to_csv("non_viral_sequences.csv", index=False)


In [5]:
# Read the headerless training and test files (columns are addressed by position below)
train_df = pd.read_csv("training_data_anella.csv", header=None)
test_df = pd.read_csv("test_data_anella.csv", header=None)

# Add a 'length' column holding the string length of column 1 to both frames
for split_frame in (train_df, test_df):
    split_frame['length'] = split_frame[1].str.len()

# Stack the two frames into one with a fresh index
fullset_df = pd.concat((train_df, test_df), ignore_index=True)

# Persist the combined rows without a header row or index column
fullset_df.to_csv("anella_viral.csv", header=False, index=False)

# Read the combined file back, this time assigning column names
fullset_df = pd.read_csv(
    "anella_viral.csv",
    header=None,
    names=["sequence", "DNA", "Label", "length"],
)

# Overwrite the same file with the named-column version; no row filter is applied
fullset_df.to_csv("anella_viral.csv", index=False)


In [29]:
# Input is the CSV tree written by the FASTA-to-CSV step; padded copies go to a separate tree.
# NOTE: this rebinds the module-level `output_folder` that process_folder reads, so running
# process_folder/process_folders after this cell would write into the padded folder instead.
input_folder = "Viral_Data_Csv"
output_folder = "Padded_Viral_Data_Csv"

# Function to pad DNA sequences and save them to a new CSV file
def pad_sequences_and_save(input_file_path, output_file_path, max_length):
    """Right-pad the 'DNA' column of a CSV with 'N' and write the result.

    Args:
        input_file_path: Path of the CSV to read; it must contain the columns
            'DNA' and 'Length'.
        output_file_path: Path of the CSV to write (without the index).
        max_length: Target length; sequences already this long or longer are
            left unchanged.
    """
    frame = pd.read_csv(input_file_path)

    # str.ljust only appends 'N' to sequences shorter than max_length
    frame['DNA'] = frame['DNA'].map(lambda dna: dna.ljust(max_length, 'N'))

    # 'Length' is removed before the padded table is written out
    frame.drop(columns=['Length']).to_csv(output_file_path, index=False)

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Walk input_folder/<disease>/*.csv and write a padded copy of each file into
# output_folder/<disease>/, with the padded length appended to the file name.
for disease_folder in os.listdir(input_folder):
    disease_folder_path = os.path.join(input_folder, disease_folder)
    if not os.path.isdir(disease_folder_path):
        continue

    # Mirror the disease folder in the output tree
    output_disease_folder = os.path.join(output_folder, disease_folder)
    os.makedirs(output_disease_folder, exist_ok=True)

    for file_name in os.listdir(disease_folder_path):
        if not file_name.endswith(".csv"):
            continue
        input_file_path = os.path.join(disease_folder_path, file_name)

        # Each file is padded to the length of its own longest sequence
        df = pd.read_csv(input_file_path)
        max_length = df['DNA'].str.len().max()

        # Output name is "<input base name>_<max length>.csv"
        output_file_name = f"{os.path.splitext(file_name)[0]}_{max_length}.csv"
        output_file_path = os.path.join(output_disease_folder, output_file_name)

        pad_sequences_and_save(input_file_path, output_file_path, max_length)


In [30]:

##### INTEGER ENCODING

# Read the padded CSVs and write the integer-encoded copies to a separate tree
input_folder = "Padded_Viral_Data_Csv"
output_folder = "Integer_Encoded_Viral_Data_Csv"

# Nucleotide-to-integer codes; upper- and lower-case letters share a code,
# and 'N' (the padding character used by the padding step) maps to 0
nucleotide_mapping = {'N': 0, 'A': 1, 'T': 2, 'C': 3, 'G': 4, 'n': 0, 'a': 1, 't': 2, 'c': 3, 'g': 4}

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)


# Function to perform integer encoding on a DNA sequence
def integer_encode_sequence(sequence):
    """Translate a sequence into a list of integer codes, one per character.

    Codes come from the module-level ``nucleotide_mapping``; any character
    that is not a key of that mapping is encoded as 0.

    Args:
        sequence: Iterable of single characters (e.g. a DNA string).

    Returns:
        A list of ints with the same length as ``sequence``.
    """
    encoded = []
    for symbol in sequence:
        encoded.append(nucleotide_mapping.get(symbol, 0))
    return encoded

# Function to perform integer encoding on a CSV file and save it
def encode_and_save(input_file_path, output_file_path):
    """Integer-encode the 'DNA' column of a CSV and write the result.

    Each 'DNA' value is replaced by the list returned by
    ``integer_encode_sequence``; all other columns are written unchanged.

    Args:
        input_file_path: Path of the CSV to read; it must contain a 'DNA' column.
        output_file_path: Path of the CSV to write (without the index).
    """
    frame = pd.read_csv(input_file_path)
    frame['DNA'] = frame['DNA'].map(integer_encode_sequence)
    frame.to_csv(output_file_path, index=False)

# Iterate through each CSV file in the input folder
for disease_folder in os.listdir(input_folder):
    disease_folder_path = os.path.join(input_folder, disease_folder)
    if not os.path.isdir(disease_folder_path):
        continue

    # Mirror the disease folder in the output tree
    output_disease_folder = os.path.join(output_folder, disease_folder)
    os.makedirs(output_disease_folder, exist_ok=True)

    # Encode every CSV in the disease folder, keeping its file name
    for file_name in os.listdir(disease_folder_path):
        if not file_name.endswith(".csv"):
            continue
        input_file_path = os.path.join(disease_folder_path, file_name)
        output_file_path = os.path.join(output_disease_folder, file_name)
        encode_and_save(input_file_path, output_file_path)


In [31]:

##### ONE-HOT ENCODING

# Read the padded CSVs and write the one-hot-encoded copies to a separate tree
input_folder = "Padded_Viral_Data_Csv"
output_folder = "OneHot_Encoded_Viral_Data_Csv"

# Nucleotide-to-index mapping: the value is the position set to 1 in the
# 5-element one-hot vector; upper- and lower-case letters share an index
nucleotide_mapping = {'N': 0, 'A': 1, 'T': 2, 'C': 3, 'G': 4, 'n': 0, 'a': 1, 't': 2, 'c': 3, 'g': 4}

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Function to perform one-hot encoding on a DNA sequence
def one_hot_encode_sequence(sequence):
    """Translate a sequence into a list of 5-element one-hot vectors.

    For each character, the position given by the module-level
    ``nucleotide_mapping`` is set to 1. A character that is not a key of the
    mapping yields an all-zero vector.

    Args:
        sequence: Sequence of single characters (e.g. a DNA string).

    Returns:
        A list with one ``[int, int, int, int, int]`` list per character.
    """
    rows = []
    for nucleotide in sequence:
        row = [0, 0, 0, 0, 0]
        if nucleotide in nucleotide_mapping:
            row[nucleotide_mapping[nucleotide]] = 1
        rows.append(row)
    return rows

# Function to perform one-hot encoding on a CSV file and save it
def encode_and_save(input_file_path, output_file_path):
    """One-hot encode the 'DNA' column of a CSV and write the result.

    Each 'DNA' value is replaced by the nested list returned by
    ``one_hot_encode_sequence``; all other columns are written unchanged.
    This reuses the name of the integer-encoding helper defined earlier in
    the notebook, so whichever definition ran last is the one in effect.

    Args:
        input_file_path: Path of the CSV to read; it must contain a 'DNA' column.
        output_file_path: Path of the CSV to write (without the index).
    """
    frame = pd.read_csv(input_file_path)
    frame['DNA'] = frame['DNA'].map(one_hot_encode_sequence)
    frame.to_csv(output_file_path, index=False)

# Iterate through each CSV file in the input folder
for disease_folder in os.listdir(input_folder):
    disease_folder_path = os.path.join(input_folder, disease_folder)
    if not os.path.isdir(disease_folder_path):
        continue

    # Mirror the disease folder in the output tree
    output_disease_folder = os.path.join(output_folder, disease_folder)
    os.makedirs(output_disease_folder, exist_ok=True)

    # Encode every CSV in the disease folder, keeping its file name
    for file_name in os.listdir(disease_folder_path):
        if not file_name.endswith(".csv"):
            continue
        input_file_path = os.path.join(disease_folder_path, file_name)
        output_file_path = os.path.join(output_disease_folder, file_name)
        encode_and_save(input_file_path, output_file_path)


In [32]:
# from itertools import product
# import numpy as np

# # Define the input and output folders
# input_folder = "Padded_Viral_Data_Csv"
# output_folder = "Kmer_Encoded_Viral_Data_Csv"

# # Define the value of k for k-mer encoding
# k = 6

# # Function to generate all possible k-mers of length k
# def generate_kmers(k):
#     return [''.join(kmer) for kmer in product('ACGTN', repeat=k)]

# # Function to perform k-mer encoding on a DNA sequence
# def kmer_encoding(sequence, k):
#     kmers = generate_kmers(k)
#     encoding = np.zeros(len(kmers))
#     for i in range(len(sequence) - k + 1):
#         kmer = sequence[i:i+k]
#         if kmer in kmers:
#             index = kmers.index(kmer)
#             encoding[index] += 1
#     return encoding

# # Function to perform k-mer encoding on a CSV file and save it
# def encode_and_save(input_file_path, output_file_path):
#     # Read the CSV file
#     df = pd.read_csv(input_file_path)
    
#     # Perform k-mer encoding on the DNA sequences
#     encoded_sequences = []
#     for sequence in df['DNA']:
#         encoded_sequence = kmer_encoding(sequence, k)
#         encoded_sequences.append(encoded_sequence)
    
#     # Create a DataFrame with the encoded sequences
#     encoded_df = pd.DataFrame(encoded_sequences, columns=generate_kmers(k))
    
#     # Concatenate the encoded DataFrame with the original DataFrame
#     df_encoded = pd.concat([df, encoded_df], axis=1)
    
#     # Save the updated DataFrame to a new CSV file
#     df_encoded.to_csv(output_file_path, index=False)

# # Create the output folder if it doesn't exist
# os.makedirs(output_folder, exist_ok=True)

# # Iterate through each CSV file in the input folder
# for disease_folder in os.listdir(input_folder):
#     disease_folder_path = os.path.join(input_folder, disease_folder)
#     if os.path.isdir(disease_folder_path):
#         output_disease_folder = os.path.join(output_folder, disease_folder)
#         os.makedirs(output_disease_folder, exist_ok=True)
#         # Iterate through each CSV file in the disease folder
#         for file_name in os.listdir(disease_folder_path):
#             if file_name.endswith(".csv"):
#                 input_file_path = os.path.join(disease_folder_path, file_name)
#                 output_file_path = os.path.join(output_disease_folder, file_name)
#                 encode_and_save(input_file_path, output_file_path)
