In [1]:
import os
import pandas as pd

# Root input folder. The loop further down visits its immediate sub-directories
# (described there as one folder per disease) and reads the *.csv files in each.
# NOTE(review): the original comment called this data "padded"; nothing in this
# cell checks or relies on that - confirm against the data source.
input_folder = "Viral_Data_Csv"

# Module-level accumulator. concat_files_in_folder() rebinds this name through
# `global` as it appends data, so it has to exist before the first call.
# Re-running this assignment resets the accumulated rows.
concatenated_df = pd.DataFrame()

# Function to concatenate all CSV files in a folder and append to the main DataFrame
def concat_files_in_folder(folder_path):
    """Append every ``*.csv`` file directly inside ``folder_path`` to ``concatenated_df``.

    Files are read in sorted name order so the resulting row order does not
    depend on the arbitrary order in which ``os.listdir`` reports entries.
    All files of the folder are read first and concatenated in a single call,
    instead of re-copying the accumulated frame once per file.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Side effects:
        Rebinds the module-level ``concatenated_df`` to a new DataFrame that
        holds the previous rows followed by the rows of the files found, with
        a fresh 0..n-1 index. Left untouched when the folder has no CSV files.
    """
    global concatenated_df

    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".csv")
    )
    if not csv_names:
        return

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    # A brand-new accumulator (no rows, no columns) contributes nothing; leaving
    # it out avoids pandas' deprecated handling of empty frames in concat.
    has_content = len(concatenated_df.index) > 0 or len(concatenated_df.columns) > 0
    parts = [concatenated_df, *frames] if has_content else frames

    concatenated_df = pd.concat(parts, ignore_index=True)

# Iterate through each disease folder in the input folder.
# os.listdir() reports entries in arbitrary order, so the listing is sorted to
# make the order in which folders are visited reproducible across runs.
for disease_folder in sorted(os.listdir(input_folder)):
    disease_folder_path = os.path.join(input_folder, disease_folder)
    # Plain files sitting next to the disease folders are not part of the data set
    if not os.path.isdir(disease_folder_path):
        continue
    # Concatenate all CSV files in the disease folder and append to the main DataFrame
    concat_files_in_folder(disease_folder_path)

# Save the concatenated DataFrame as a single CSV file in the working directory
# (index=False: the positional index carries no information worth persisting)
output_file_path = "Combined_Viral_Data.csv"
concatenated_df.to_csv(output_file_path, index=False)


In [2]:
import os
import pandas as pd

# Define the input file and the folder that receives one CSV per region
input_file_path = "Combined_Viral_Data.csv"
output_folder = "Region_Wise_Data"

# Read the combined viral data into a DataFrame
df = pd.read_csv(input_file_path)

os.makedirs(output_folder, exist_ok=True)

# Rows without a Region cannot be routed to a file. Previously they were
# dropped silently and an empty "nan.csv" was written; report them instead.
missing_region_count = int(df['Region'].isna().sum())
if missing_region_count:
    print(f"Skipping {missing_region_count} row(s) with no Region value")

# groupby partitions the frame in a single pass (rather than re-filtering the
# whole frame once per region) and leaves out missing keys by default.
# sort=False keeps regions in order of first appearance.
for region, region_data in df.groupby('Region', sort=False):
    # The region is encoded in the file name, so the column is redundant
    region_data = region_data.drop(columns=['Region'])

    # Save the region-wise data to a CSV file named after the region
    region_file_path = os.path.join(output_folder, f"{region}.csv")
    region_data.to_csv(region_file_path, index=False)


In [2]:
import os
import pandas as pd

# Source folder: the per-region CSV files that the loop below reads.
input_folder = "Region_Wise_Data"
# Destination folder for the padded copies; created further down if missing.
output_folder = "Padded_Region_Wise_Data"

# Function to pad DNA sequences and save them to a new CSV file
def pad_sequences_and_save(input_file_path, output_file_path, max_length):
    """Right-pad every sequence in the ``DNA`` column with ``'N'`` and save the result.

    Args:
        input_file_path: CSV file to read. It must contain a ``DNA`` column of
            strings and a ``Length`` column.
        output_file_path: Base output path. The file actually written has
            ``_MaxLength_<max_length>`` inserted just before the extension,
            e.g. ``out/Asia.csv`` -> ``out/Asia_MaxLength_120.csv``.
        max_length: Target sequence length. Sequences that are already at
            least this long are written unchanged. Integral floats and NumPy
            integers are accepted.

    Raises:
        KeyError: if the ``DNA`` or ``Length`` column is absent.

    Missing ``DNA`` values are passed through as missing rather than padded.
    """
    # Read the CSV file
    df = pd.read_csv(input_file_path)

    # pandas reductions return NumPy scalars (a float when the column had
    # missing values); a plain int is needed for padding and for a clean
    # file name ("..._120.csv", not "..._120.0.csv").
    max_length = int(max_length)

    # Pad DNA sequences with 'N' characters (vectorised string method)
    df['DNA'] = df['DNA'].str.ljust(max_length, 'N')

    # Drop the 'Length' column: it describes the unpadded sequences
    df = df.drop(columns=['Length'])

    # Insert the max length before the extension only. A blanket
    # str.replace('.csv', ...) would also rewrite any '.csv' that happens to
    # occur in a directory name or in the middle of the file name.
    root, extension = os.path.splitext(output_file_path)
    output_file_path = f"{root}_MaxLength_{max_length}{extension}"
    df.to_csv(output_file_path, index=False)

# Make sure the destination folder is there before anything is written
os.makedirs(output_folder, exist_ok=True)

# Walk the region-wise folder and pad each CSV to the length of its own longest sequence
for file_name in os.listdir(input_folder):
    # Anything that is not a CSV file is left alone
    if not file_name.endswith(".csv"):
        continue

    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # The padding width is decided per file: the longest DNA string it contains
    df = pd.read_csv(input_file_path)
    sequence_lengths = df['DNA'].str.len()
    max_length = sequence_lengths.max()

    # The helper re-reads the file, pads it and writes the suffixed output file
    pad_sequences_and_save(input_file_path, output_file_path, max_length)


In [None]:
import os
import pandas as pd
import numpy as np

# Source folder (the padded per-region CSV files) and destination folder for
# the integer-encoded copies.
input_folder = "Padded_Region_Wise_Data"
output_folder = "Integer_Encoded_Region_Wise_Data"

# Nucleotide-to-integer mapping. Upper- and lower-case letters share a code,
# so the 10 keys map onto 5 distinct codes (0-4). 'N' - the character used
# for padding in the previous cell - is code 0.
nucleotide_mapping = {'N': 0, 'A': 1, 'T': 2, 'C': 3, 'G': 4, 'n': 0, 'a': 1, 't': 2, 'c': 3, 'g': 4}

# Create the output folder if it doesn't exist (no error when it already does)
os.makedirs(output_folder, exist_ok=True)

# Function to perform integer encoding on a DNA sequence
def integer_encode_sequence(sequence):
    """Translate a DNA string into a list of integer codes.

    Every character is looked up in the module-level ``nucleotide_mapping``;
    a character that has no entry there is encoded as 0.

    Args:
        sequence: Iterable of single characters, normally a ``str``.

    Returns:
        A list with one integer per character, in the original order.
    """
    codes = []
    for base in sequence:
        codes.append(nucleotide_mapping.get(base, 0))
    return codes

# Function to perform integer encoding on a CSV file and save it
# NOTE(review): a later cell defines another function with this same name
# (one-hot variant); whichever cell ran last is the one that gets called.
def encode_and_save(input_file_path, output_file_path):
    """Integer-encode the ``DNA`` column of one CSV file and write a new CSV.

    Args:
        input_file_path: CSV file to read; must have a ``DNA`` column.
        output_file_path: Path of the CSV file to write.

    All other columns are copied through unchanged and the row index is not
    written. Each encoded sequence is a Python list, so it ends up in the
    output cell as that list's text form (e.g. ``[1, 2, 3]``).
    """
    source = pd.read_csv(input_file_path)

    # Encode first, then swap the column in place of the original one
    encoded_dna = source['DNA'].apply(integer_encode_sequence)
    encoded = source.assign(DNA=encoded_dna)

    encoded.to_csv(output_file_path, index=False)

# Encode every CSV found in the input folder; output files keep the input file name
for file_name in os.listdir(input_folder):
    # Entries that are not CSV files are ignored
    if not file_name.endswith(".csv"):
        continue

    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)
    encode_and_save(input_file_path, output_file_path)


In [None]:
import os
import pandas as pd
import numpy as np

# Source folder (the padded per-region CSV files) and destination folder for
# the one-hot-encoded copies.
input_folder = "Padded_Region_Wise_Data"
output_folder = "OneHot_Encoded_Region_Wise_Data"

# Nucleotide-to-integer mapping; the integer is used as the column index of
# the one-hot vector. Upper- and lower-case letters share a code, so the
# dict has 10 keys but only 5 distinct codes (0-4).
nucleotide_mapping = {'N': 0, 'A': 1, 'T': 2, 'C': 3, 'G': 4, 'n': 0, 'a': 1, 't': 2, 'c': 3, 'g': 4}

# Create the output folder if it doesn't exist (no error when it already does)
os.makedirs(output_folder, exist_ok=True)

# Function to perform one-hot encoding on a DNA sequence
def one_hot_encode_sequence(sequence):
    """One-hot encode a DNA string.

    Each character is upper-cased and looked up in the module-level
    ``nucleotide_mapping``; the resulting code is the position of the single 1
    in that character's row. Characters without an entry are treated like
    ``'N'`` (code 0).

    The row width is the number of distinct codes (largest code + 1, i.e. 5
    for N/A/T/C/G). It is deliberately NOT ``len(nucleotide_mapping)``: the
    dict lists upper- and lower-case keys, so its length (10) would add five
    columns that can never be set.

    Args:
        sequence: DNA sequence as a ``str``.

    Returns:
        A list of ``len(sequence)`` rows, each a list of ints containing
        exactly one 1. An empty sequence gives an empty list.
    """
    num_classes = max(nucleotide_mapping.values()) + 1

    # One class index per character; unknown symbols fall back to 0 ('N')
    class_indices = np.fromiter(
        (nucleotide_mapping.get(base.upper(), 0) for base in sequence),
        dtype=int,
        count=len(sequence),
    )

    # Row i of the identity matrix is the one-hot vector for class i
    one_hot_encoding = np.eye(num_classes, dtype=int)[class_indices]
    return one_hot_encoding.tolist()  # Convert numpy array to list for DataFrame compatibility

# Function to perform one-hot encoding on a CSV file and save it
# NOTE(review): an earlier cell defines another function with this same name
# (integer variant); running this cell replaces it in the kernel namespace.
def encode_and_save(input_file_path, output_file_path):
    """One-hot encode the ``DNA`` column of one CSV file and write a new CSV.

    Args:
        input_file_path: CSV file to read; must have a ``DNA`` column.
        output_file_path: Path of the CSV file to write.

    All other columns are copied through unchanged and the row index is not
    written. Each encoded sequence is a nested Python list, so it ends up in
    the output cell as that list's text form (e.g. ``[[0, 1], [1, 0]]``).
    """
    source = pd.read_csv(input_file_path)

    # Encode first, then swap the column in place of the original one
    encoded_dna = source['DNA'].apply(one_hot_encode_sequence)
    encoded = source.assign(DNA=encoded_dna)

    encoded.to_csv(output_file_path, index=False)

# Encode every CSV found in the input folder; output files keep the input file name
for file_name in os.listdir(input_folder):
    # Entries that are not CSV files are ignored
    if not file_name.endswith(".csv"):
        continue

    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)
    encode_and_save(input_file_path, output_file_path)
