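Test how well the Levenshtein distance to a reference sequence predicts fitness on the relevant datasets (AAV and Cas). For each task, the score is the Spearman rank correlation between the negated distance and the target value on the test split.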

In [1]:
# Import required modules
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Levenshtein import distance as levenshtein_distance
from scipy.stats import spearmanr

# General Functions

In [2]:
def evaluate_task(task_filename, reference_seq):
    """Score the Levenshtein-distance baseline on one task's test split.

    Parameters
    ----------
    task_filename : str
        Path to the task CSV. The columns ``set``, ``sequence`` and
        ``target`` are read from it.
    reference_seq : str
        Sequence that every test sequence is compared against.

    Returns
    -------
    float
        Spearman rho between the negated Levenshtein distance to
        ``reference_seq`` and ``target``, computed over the rows whose
        ``set`` value is ``"test"``.
    """
    # Keep only the held-out rows; nothing below mutates the frame
    all_rows = pd.read_csv(task_filename)
    test_rows = all_rows[all_rows["set"] == "test"]

    # Edit distance from the reference to every test sequence
    distances = np.array([
        levenshtein_distance(reference_seq, candidate)
        for candidate in test_rows["sequence"].values
    ])

    # Flip the sign so that "closer to the reference" ranks higher: the
    # working expectation is that a smaller distance goes with larger fitness.
    closeness = -distances
    correlation, _ = spearmanr(closeness, test_rows["target"].values)

    return correlation

def evaluate_tasks(refseq_fileloc, taskfolder, task_to_file_dict):
    """Run ``evaluate_task`` on every task that shares one reference sequence.

    Parameters
    ----------
    refseq_fileloc : str
        Path to a FASTA file; only its first record is used as the reference.
    taskfolder : str
        Directory that the task file names are joined onto.
    task_to_file_dict : dict
        Maps a task name to the file name of its CSV inside ``taskfolder``.

    Returns
    -------
    list
        One ``[taskname, rho]`` pair per entry of ``task_to_file_dict``,
        in the dictionary's iteration order.
    """
    # The reference is the first record of the FASTA file, as a plain string
    first_record = next(SeqIO.parse(refseq_fileloc, "fasta"))
    reference_seq = str(first_record.seq)

    # Score each task against that single reference
    return [
        [taskname,
         evaluate_task(os.path.join(taskfolder, taskfile), reference_seq)]
        for taskname, taskfile in task_to_file_dict.items()
    ]

# Levenshtein for AAV

In [3]:
def levenshtein_to_fitness_aav():
    """Evaluate the Levenshtein baseline on the AAV tasks.

    Returns
    -------
    list
        Whatever ``evaluate_tasks`` returns for the AAV reference FASTA,
        the AAV task folder, and the four AAV task files listed below.
    """
    # Locations of the AAV reference sequence and of the task CSVs
    reference_fasta = "../tasks/aav/P03135.fasta"
    csv_folder = "../tasks/aav/tasks"

    # Task label -> CSV file name inside csv_folder
    csv_by_task = dict(
        design="design_task_regression.csv",
        design_reversed="design_task_reversed_regression.csv",
        natural1="natural_task_1_regression.csv",
        natural2="natural_task_2_regression.csv",
    )

    return evaluate_tasks(reference_fasta, csv_folder, csv_by_task)

# Run the AAV evaluation; the returned [task, rho] pairs are the cell output.
levenshtein_to_fitness_aav()

[['design', 0.601041185010009],
 ['design_reversed', -0.06599785513223982],
 ['natural1', -0.06517495650637238],
 ['natural2', 0.5254415020200949]]

# Levenshtein for Cas

In [4]:
def levenshtein_to_fitness_cas():
    """Evaluate the Levenshtein baseline on the Cas tasks.

    Returns
    -------
    list
        Whatever ``evaluate_tasks`` returns for the Cas reference FASTA,
        the Cas task folder, and the two Cas task files listed below.
    """
    # Locations of the Cas reference sequence and of the task CSVs
    reference_fasta = "../tasks/cas/cas9_sequence.fasta"
    csv_folder = "../tasks/cas/tasks/"

    # Task label -> CSV file name inside csv_folder
    csv_by_task = dict(
        neg="pi_domain_log_negative_selection_regression.csv",
        pos="pi_domain_log_positive_selection_regression.csv",
    )

    return evaluate_tasks(reference_fasta, csv_folder, csv_by_task)

# Run the Cas evaluation; the returned [task, rho] pairs are the cell output.
levenshtein_to_fitness_cas()

[['neg', -0.010226623065258516], ['pos', 0.13659949947831676]]