In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def compute_kmer_scores_to_csv(input_csv, output_csv, kmer_column='K-mer', label_column='status'):
    """
    Compute normalized k-mer frequency vectors for each sequence in a CSV file
    and save the resulting feature matrix and integer labels into a new CSV file.

    Each cell of ``kmer_column`` is split on whitespace into k-mer tokens. The
    vocabulary is the sorted set of all tokens seen in the file; every row is
    turned into a float32 vector of token counts divided by the row's total
    token count, so a non-empty row sums to 1. Rows that contain no tokens
    (blank or missing cells) are kept as all-zero vectors instead of producing
    NaN values, and a warning with their count is printed.

    The output CSV has one column per k-mer (in sorted order) followed by
    ``label_column`` holding the integer code assigned by ``LabelEncoder``.

    Parameters:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to save the output CSV file.
        kmer_column (str): Column name containing whitespace-separated k-mers.
        label_column (str): Column name containing labels.

    Raises:
        KeyError: If ``kmer_column`` or ``label_column`` is missing from the
            input CSV. Raised before any features are computed or written.
    """
    # Load the CSV file
    df = pd.read_csv(input_csv)

    # Fail fast on a wrong column name rather than after the feature work.
    missing = [col for col in (kmer_column, label_column) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Column(s) {missing} not found in {input_csv}; "
            f"available columns: {list(df.columns)}"
        )

    # Tokenize every sequence exactly once. Blank cells are read by pandas as
    # NaN (a float), which has no .split(); treat them as empty sequences.
    tokenized = [
        [] if pd.isna(seq) else str(seq).split()
        for seq in df[kmer_column]
    ]

    # Sorted vocabulary gives a consistent column ordering across runs
    kmer_vocab = sorted({kmer for tokens in tokenized for kmer in tokens})
    kmer_to_index = {kmer: idx for idx, kmer in enumerate(kmer_vocab)}
    vocab_size = len(kmer_vocab)
    print(f"Number of unique k-mers: {vocab_size}")

    # Count k-mer occurrences per sequence
    counts = np.zeros((len(tokenized), vocab_size), dtype=np.float32)
    for row, tokens in enumerate(tokenized):
        for kmer in tokens:
            counts[row, kmer_to_index[kmer]] += 1

    # Normalize each row to frequencies. Rows with a zero total are skipped
    # by the `where` mask so they stay all-zero instead of becoming NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    empty_rows = int((row_totals == 0).sum())
    if empty_rows:
        print(f"Warning: {empty_rows} sequence(s) contain no k-mers; "
              f"their feature rows are all zeros")
    np.divide(counts, row_totals, out=counts, where=row_totals > 0)

    # Convert to DataFrame
    feature_df = pd.DataFrame(counts, columns=kmer_vocab)

    # Encode the labels as integers
    label_encoder = LabelEncoder()
    feature_df[label_column] = label_encoder.fit_transform(df[label_column])
    # A class's code is its position in classes_; plain ints keep the printed
    # mapping readable regardless of the NumPy version's scalar repr.
    label_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}
    print(f"Label mapping: {label_mapping}")

    # Save the feature matrix and labels to a new CSV file
    feature_df.to_csv(output_csv, index=False)
    print(f"K-mer feature matrix and labels saved to {output_csv}")

# Example usage: convert one k-mer CSV into a frequency-feature CSV, relying on
# the default 'K-mer' and 'status' column names. The paths are hard-coded.
input_csv = '/home/user/torch_shrimp/dataset/Mixed/Cleansed-kmer/kmer_test5101.csv'
output_csv = '/home/user/torch_shrimp/until-tools/mod/k-mer/test5101.csv'
compute_kmer_scores_to_csv(input_csv=input_csv, output_csv=output_csv)


Number of unique k-mers: 64
Label mapping: {'AHPND': 0, 'WSSV': 1, 'healthy': 2}
K-mer feature matrix and labels saved to /home/user/torch_shrimp/until-tools/mod/k-mer/test5101.csv
