In [3]:
import numpy as np
from collections import Counter
from itertools import product
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [1]:
import numpy as np
from numba import jit

@jit(nopython=True)
def to_chaos_game(sequence, resolution, nucleotide_mapping):
    """
    Generate the chaos game representation of a DNA sequence.

    Starting from the centre point (0.5, 0.5), each recognised nucleotide moves
    the current point halfway towards that nucleotide's corner, and the pixel
    under the new point is set to 1. The image is therefore a binary occupancy
    mask: visiting the same pixel several times does not accumulate a count.

    Parameters:
    - sequence (str): DNA sequence. Only the uppercase characters 'A', 'C', 'G'
      and 'T' are used; every other character (including lowercase letters) is
      skipped without moving the point.
    - resolution (int): The resolution of the output image (e.g., 512 for a 512x512 image).
    - nucleotide_mapping: Indexable container whose entries 0, 1, 2, 3 unpack to
      the (x, y) corner for 'A', 'C', 'G', 'T' respectively. The caller in this
      file passes a (4, 2) float32 NumPy array.
      NOTE(review): corner coordinates are expected to lie in [0, 1]; values
      outside that range would produce pixel indices outside the image.

    Returns:
    - np.array: Flattened uint8 array of length resolution * resolution
      representing the chaos game image (1 = visited pixel, 0 = never visited).
    """
    
    image = np.zeros((resolution, resolution), dtype=np.uint8)

    # Start in the middle of the unit square.
    x, y = 0.5, 0.5
    # Maps a coordinate in [0, 1] onto a pixel index in [0, resolution - 1].
    scale = resolution - 1

    for char in sequence:
        # Translate the nucleotide into its row of nucleotide_mapping.
        if char == 'A':
            index = 0
        elif char == 'C':
            index = 1
        elif char == 'G':
            index = 2
        elif char == 'T':
            index = 3
        else:
            continue  # Skip unknown characters (the current point is left unchanged)

        # Move halfway from the current point towards the nucleotide's corner.
        corner_x, corner_y = nucleotide_mapping[index]
        x = (x + corner_x) / 2
        y = (y + corner_y) / 2

        # Truncate to pixel indices; y selects the row and x the column.
        ix, iy = int(x * scale), int(y * scale)
        image[iy, ix] = 1

    return image.flatten()

def chaos_game_representation(partition, resolution):
    """
    Build the chaos game feature matrix for every sequence in a partition.

    Parameters:
    - partition (DataFrame): DataFrame with a column 'Sequence' containing DNA sequences.
    - resolution (int): Side length of the square image generated for each sequence.

    Returns:
    - np.array: uint8 array of shape (len(partition), resolution * resolution);
      row i holds the flattened chaos game image of the i-th sequence.
    """
    # Corner coordinates, one row per nucleotide in the order A, C, G, T.
    corners = np.array([[0, 0], [0, 1], [1, 1], [1, 0]], dtype=np.float32)

    pixels_per_image = resolution * resolution
    feature_matrix = np.zeros((len(partition), pixels_per_image), dtype=np.uint8)

    # Each `row` is a view into feature_matrix, so slice assignment fills it in place.
    for row, dna in zip(feature_matrix, partition['Sequence']):
        row[:] = to_chaos_game(dna, resolution, corners)

    return feature_matrix

def remove_degenerate_nucleotides(genomes):
    """
    Strip every character other than uppercase A, C, T and G from the sequences.

    Parameters:
    - genomes (DataFrame): DataFrame with a string column 'Sequence'.

    Returns:
    - DataFrame: The same `genomes` object; its 'Sequence' column is overwritten
      in place with the cleaned sequences.
    """
    cleaned_sequences = genomes['Sequence'].str.replace('[^ACTG]', '', regex=True)
    genomes['Sequence'] = cleaned_sequences
    return genomes

In [4]:
# Read the training and testing tables; both paths are relative to the
# notebook's working directory.
training_sequences, testing_sequences = (
    pd.read_csv(csv_path)
    for csv_path in ('train_consensus.csv', 'test_consensus.csv')
)

In [5]:
# Preview the loaded training table (one row per sequence with its lineage labels).
training_sequences

Unnamed: 0,AccessionID,Sequence,Glue,NextClade,Genome Detective,Consensus
0,EPI_ISL_17449010,AGTAGTTAGTCTACGTGGACCGACAAGAACAGTTTCGAATCGGAAG...,1I_K.6,1I_K.6,1I_K.6,1I_K.6
1,EPI_ISL_17573827,ATGAACAACCAACGAAAAAAGACGGCTCGACCGTCTTTCAATATGC...,1I,1I,1I,1I
2,FJ882528.1,ACAAGAACAGTTTCGAATCGGAAGCTTGCTTAACGTAGTTCTAACA...,1I_K.7,1I_K.7,1I_K.7,1I_K.7
3,FJ850083.1,GACAAGAACAGTTTCGACTCGGAAGCTTGCTTAACGTAGTGCTAAC...,3III_C.2,3III_C.2,3III_C.2,3III_C.2
4,EPI_ISL_19038091,GACAAAGACAGATTCTTTGAGGGAGCTAAGCTCAACGTAGTTCTAA...,2III_D.2,2III_D.2,2III_D.2,2III_D.2
...,...,...,...,...,...,...
5311,EPI_ISL_773367,ACAAAGACAGTTTCTTTGAGGGAGCTAAGCTCAACGTAGTTCTAAC...,2V_A.1,2V_A.1.3,2V_A.1.3,2V_A.1.3
5312,EPI_ISL_19035034,GTTGTTAGTCTACGTGGACCGACAAGAACAGTTTCGAATCGGAAGC...,1I_K.1.1,1I_K.1.1,1I_K.1.1,1I_K.1.1
5313,EPI_ISL_19058597,TTAGAGATCAGATCTGCTCTGATGAATAACCAACGGAAAAAGGCGA...,2II_F.1.1,2II_F.1.1.2,2II_F.1.1.2,2II_F.1.1.2
5314,EPI_ISL_17452772,AGTTGTTAGTCTGTGTGGACCGACAAGAACAGTTTCGAATCGGAAG...,1III_A,1III_A,1III_A,1III_A


In [6]:
# Training features: one flattened 128x128 chaos game image per sequence,
# paired with the 'Consensus' label column.
X_train = chaos_game_representation(partition=training_sequences, resolution=128)
y_train_consensus = training_sequences.loc[:, 'Consensus']

In [7]:
# Testing features, built with the same 128x128 resolution as the training
# features so that the columns line up.
X_test = chaos_game_representation(partition=testing_sequences, resolution=128)
y_test_consensus = testing_sequences.loc[:, 'Consensus']

In [8]:
def custom_accuracy_score(y_true, y_pred):
    """
    Compute the fraction of positions at which the prediction equals the truth.

    Parameters:
    - y_true: Sized iterable of true labels.
    - y_pred: Sized iterable of predicted labels, compared position by position
      with `y_true` using `==`.

    Returns:
    - float: Number of matching positions divided by the number of samples.

    Raises:
    - ValueError: If the two inputs differ in length, or if they are empty
      (accuracy is undefined for zero samples).
    """
    n_samples = len(y_true)

    # Check for length mismatch
    if n_samples != len(y_pred):
        raise ValueError("y_true and y_pred must have the same length")

    # Fail with a clear message instead of a ZeroDivisionError further down.
    if n_samples == 0:
        raise ValueError("y_true and y_pred must not be empty")

    # Count matches between y_true and y_pred
    correct = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    return correct / n_samples

In [9]:
# Label sets, keyed by the name of the label column they come from. The
# training and testing dictionaries share the same keys.
label_columns = ('Consensus',)

y_train_labels = {column: training_sequences[column] for column in label_columns}

y_test_labels = {column: testing_sequences[column] for column in label_columns}

In [10]:
# Add the low_resource flag: when True, training is restricted to at most
# `samples_per_class` records per target class.
low_resource = True
samples_per_class = 4

# Number of repetitions
n_repeats = 1

# Number of most important features kept for the second model. The output
# column and file names below carry a literal "5000" suffix; update them too
# if this value is changed.
n_top_features = 5000

# Initialize a list to store results for accuracy
results = []

# Initialize DataFrames to store predictions with Accession_ID as the index
test_predictions = pd.DataFrame(index=testing_sequences['AccessionID'])
test_predictions_5000 = pd.DataFrame(index=testing_sequences['AccessionID'])

# Loop through each training label set
for train_label_name, y_train in y_train_labels.items():
    # Per test label set: one accuracy per repetition, for each of the two models
    model_accuracies = {test_label_name: [] for test_label_name in y_test_labels.keys()}
    model_accuracies_5000 = {test_label_name: [] for test_label_name in y_test_labels.keys()}

    # Repeat training/testing `n_repeats` times
    for i in range(n_repeats):
        # Split the data into training and validation sets (80:20 split).
        # The validation part is not used below, but the split decides which
        # rows are available for training in this repetition.
        X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
            X_train, y_train, test_size=0.2, random_state=i+10, stratify=y_train
        )

        # If low_resource is True, limit the training data to `samples_per_class`
        # records per unique target class
        if low_resource:
            # Draw row positions per class from the labels alone and then index
            # the feature matrix. This avoids building a very wide DataFrame and
            # avoids GroupBy.apply on a frame that contains the grouping column,
            # which is deprecated in recent pandas releases.
            split_labels = pd.Series(y_train_split.tolist())
            kept_positions = []
            # Groups are visited in sorted label order; rows within a class
            # keep the order in which they were sampled.
            for _, class_rows in split_labels.groupby(split_labels):
                n_keep = min(samples_per_class, len(class_rows))
                kept_positions.extend(class_rows.sample(n=n_keep, random_state=i).index)
            kept_positions = np.asarray(kept_positions, dtype=np.intp)

            # Separate back into features and targets
            X_train_split = X_train_split[kept_positions]
            y_train_split = split_labels.to_numpy()[kept_positions]

        # Train the first Random Forest classifier
        rf = RandomForestClassifier(random_state=i, criterion='entropy', class_weight='balanced')
        rf.fit(X_train_split, y_train_split)

        # Extract the most informative features based on feature importances
        feature_importances = rf.feature_importances_
        top_features_indices = np.argsort(feature_importances)[-n_top_features:]
        X_train_top5000 = X_train_split[:, top_features_indices]
        X_test_top5000 = X_test[:, top_features_indices]

        # Train the second Random Forest on the selected features
        rf_5000 = RandomForestClassifier(random_state=i, criterion='entropy', class_weight='balanced')
        rf_5000.fit(X_train_top5000, y_train_split)

        # Predictions do not depend on the test label set, so compute them once
        y_test_pred = rf.predict(X_test)
        y_test_pred_5000 = rf_5000.predict(X_test_top5000)

        # Score the models against all testing label sets
        for test_label_name, y_test in y_test_labels.items():
            # First Random Forest (all features)
            column_name = f"{test_label_name}_{i}"
            test_predictions[column_name] = y_test_pred
            accuracy = custom_accuracy_score(y_test, y_test_pred)
            model_accuracies[test_label_name].append(accuracy)

            # Second Random Forest (selected features)
            column_name_5000 = f"{test_label_name}_{i}_5000"
            test_predictions_5000[column_name_5000] = y_test_pred_5000
            accuracy_5000 = custom_accuracy_score(y_test, y_test_pred_5000)
            model_accuracies_5000[test_label_name].append(accuracy_5000)

    # After all repetitions, calculate avg and std for each test label set.
    # Full-feature rows are appended first, then the selected-feature rows.
    for model_name, accuracies_by_label in (
        ('Full_Features', model_accuracies),
        ('Top_5000_Features', model_accuracies_5000),
    ):
        for test_label_name, accuracies in accuracies_by_label.items():
            results.append({
                'Train_Label_Set': train_label_name,
                'Test_Label_Set': test_label_name,
                'Model': model_name,
                'Avg_Test_Accuracy': np.mean(accuracies),
                'Std_Test_Accuracy': np.std(accuracies)
            })

# Convert results to a dataframe
results_df = pd.DataFrame(results)

# Save the results and predictions to CSV files with `lr_` prefix if low_resource is True
prefix = "lr_" if low_resource else ""
results_df.to_csv(f'{prefix}experiment_results.csv', index=False)
test_predictions.to_csv(f'{prefix}test_predictions.csv', index=True)  # Save with Accession_ID as the index
test_predictions_5000.to_csv(f'{prefix}test_predictions_5000.csv', index=True)  # Save for top 5000 features

  limited_train_data = train_data.groupby('Target').apply(


In [11]:
# Inspect the feature indices selected in the last repetition of the experiment loop.
top_features_indices

array([ 7863,  4750,  3285, ...,  1760, 12601, 13121])

In [12]:
# Show the Random Forest fitted on the selected features in the last repetition.
rf_5000

In [13]:
import numpy as np
from joblib import dump, load

# Output locations for the selected feature indices and the fitted model.
array_file = "feature_set.npy"
model_file = "dengue_random_forest.joblib"

# Persist the feature indices so the same columns can be selected when the
# saved model is used for prediction.
np.save(array_file, top_features_indices)

# Save Random Forest model to disk. dump() returns the list of written file
# names, which is displayed as this cell's output.
dump(rf_5000, model_file)

['dengue_random_forest.joblib']

In [46]:
# Display the accuracy summary (one row per model and label-set pair).
# NOTE(review): with n_repeats = 1 the standard deviation of a single accuracy
# is 0, so a non-zero Std_Test_Accuracy shown below this cell would come from
# an earlier run with other settings - re-run the notebook top to bottom to
# refresh the output.
results_df

Unnamed: 0,Train_Label_Set,Test_Label_Set,Model,Avg_Test_Accuracy,Std_Test_Accuracy
0,Consensus,Consensus,Full_Features,0.969969,0.002211
1,Consensus,Consensus,Top_5000_Features,0.968715,0.003068
