In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import OrderedDict
# biopython SeqIO
from Bio import SeqIO
from sklearn.metrics import roc_auc_score
import scipy
ALPHABET_PROTEIN = '-ACDEFGHIKLMNPQRSTVWY'

In [2]:
# Protein amino acid alphabet + gap character "-"
alphabet = '-ACDEFGHIKLMNPQRSTVWY'
# map amino acids to index
aa_to_i = {aa:i for i, aa in enumerate(alphabet)}
# map index to amino acids
i_to_a = {i:aa for i, aa in enumerate(alphabet)}

In [3]:
def encode(seqs, alphabet=ALPHABET_PROTEIN):
    '''
    Go from letters to numbers
    '''
    aa_to_i = OrderedDict((aa, i) for i, aa in enumerate( alphabet ))
    X = np.asarray([[aa_to_i[x] for x in seq] 
                    for seq in seqs])
    return X, aa_to_i

def check_sequence(s, alphabet=ALPHABET_PROTEIN):
    for aa in s:
        if aa not in ALPHABET_PROTEIN:
            return False
    return True

def one_hot_encode(s):
    return np.eye(21)[s].flatten()

In [4]:
# Read in the datasets
df = pd.read_csv('/home/hunter/projects/recombination/ProteinGym/Tranception/proteingym/Detailed_performance_files/Substitutions/Spearman/all_models_substitutions_Spearman_DMS_level.csv')
df = df.rename(columns={df.columns[0]: 'Dataset'})
df = df.loc[df.EVmutation.sort_values(ascending=False).index, ["Dataset", "EVmutation", "EVE_ensemble","UniProt_ID", "Neff_L_category", "Taxon"]]
df = df.loc[df.Neff_L_category == 'high']
df.head(20)

Unnamed: 0,Dataset,EVmutation,EVE_ensemble,UniProt_ID,Neff_L_category,Taxon
13,BLAT_ECOLX_Firnberg_2014,0.708,0.729,BLAT_ECOLX,high,Prokaryote
15,BLAT_ECOLX_Stiffler_2015,0.707,0.727,BLAT_ECOLX,high,Prokaryote
14,BLAT_ECOLX_Jacquier_2013,0.689,0.723,BLAT_ECOLX,high,Prokaryote
23,CP2C9_HUMAN_Amorosi_activity_2021,0.59,0.635,CP2C9_HUMAN,high,Human
22,CP2C9_HUMAN_Amorosi_abundance_2021,0.572,0.61,CP2C9_HUMAN,high,Human
41,KKA2_KLEPN_Melnikov_2014,0.53,0.603,KKA2_KLEPN,high,Prokaryote
21,CCDB_ECOLI_Tripathi_2016,0.506,0.528,CCDB_ECOLI,high,Prokaryote
12,BLAT_ECOLX_Deng_2012,0.504,0.508,BLAT_ECOLX,high,Prokaryote
39,IF1_ECOLI_Kelsic_2016,0.499,0.537,IF1_ECOLI,high,Prokaryote
7,A4GRB6_PSEAI_Chen_2020,0.492,0.641,A4GRB6_PSEAI,high,Prokaryote


In [11]:
# Grab the first dataset
dataset = df.Dataset.iloc[0]
print(dataset)
# Get all of the MSA files for all of the datasets
msa_files = list(Path('/home/hunter/projects/recombination/ProteinGym/MSA_files').glob("*.a2m"))
# Get all of the DMS mutation files for all of the datasets
mutation_files = list(Path('/home/hunter/projects/recombination/ProteinGym/ProteinGym_substitutions').glob("*.csv"))
# Get all of the sequence weight files for all of the datasets
weight_files = list(Path('/home/hunter/projects/recombination/ProteinGym/substitutions_MSAs_all_positions').glob("*.npy"))
# Get all of the results files for all of the datasets
res_files = list(Path('/home/hunter/projects/recombination/ProteinGym/substitutions/').glob("*.csv"))


# Find the MSA file for this dataset
msa_matches = [f for f in msa_files if dataset.split("_")[0] in f.name]
assert len(msa_matches) == 1
# Find the DMS mutation file for this dataset
mut_matches = [f for f in mutation_files if dataset in f.name]
assert len(mut_matches) == 1
# Find the sequence weight file for this dataset
weight_matches = [f for f in weight_files if dataset.split("_")[0] in f.name]
assert len(weight_matches) == 1
# Find the results file for this dataset
res_matches = [f for f in res_files if dataset == f.stem]
assert len(res_matches) == 1

msa_fn = msa_matches[0]
mut_fn = mut_matches[0]
weight_fn = weight_matches[0]
res_fn = res_matches[0]
mut_df = pd.read_csv(mut_fn)
res_df = pd.read_csv(res_fn)

BLAT_ECOLX_Firnberg_2014


PosixPath('/home/hunter/projects/recombination/ProteinGym/substitutions/BLAT_ECOLX_Firnberg_2014.csv')

In [6]:
msa_fn

PosixPath('/home/hunter/projects/recombination/ProteinGym/MSA_files/BLAT_ECOLX_full_11-26-2021_b02.a2m')

In [13]:
def process_msa_sequence(msa_fn):
    # Process MSA sequences
    # one-hot encoded

    # read in fasta files
    msa_sequences = [str(x.seq) for x in SeqIO.parse(msa_fn, 'fasta')]
    # get wild-type sequence
    wt_seq = msa_sequences[0]
    # keep all the columns
    columns_to_keep = [i for i in range(len(wt_seq))]
    # upper case all letters
    msa_sequences = [[s[i].upper() for i in columns_to_keep] for s in msa_sequences]
    # replace "." with "-"
    msa_sequences = [[aa.replace(".", "-") for aa in s] for s in msa_sequences]
    msa_sequences = np.asarray(msa_sequences)
    # keep sequences that pass check_sequence
    msa_sequences = [s for s in msa_sequences if check_sequence(s)]
    msa_sequences = np.asarray(msa_sequences)

    seqs_enc, aa_to_i = encode(msa_sequences)
    i_to_a = {i:aa for i, aa in enumerate(ALPHABET_PROTEIN)}
    weights = np.load(weight_fn)
    assert weights.shape[0] == len(msa_sequences)
    oh_enc_seq = []
    for s in seqs_enc:
        oh_enc_seq.append(one_hot_encode(s))
    oh_enc_seq = np.asarray(oh_enc_seq)
    return oh_enc_seq

In [14]:
one_hot_encode(seqs_enc)

print(oh_enc_seq.shape)

NameError: name 'seqs_enc' is not defined

In [15]:
seqs_enc.shape

NameError: name 'seqs_enc' is not defined

In [16]:
np.eye(3)[[2, 0, 1]]

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [17]:
# Get the DMS sequences, which we use to evaluate the performance of our model
mutant_sequences_to_keep = mut_df.mutant.map(lambda x: np.all([int(y[1:-1])-1 in columns_to_keep for y in x.split(":")]))
sub_mut_df = mut_df.loc[mutant_sequences_to_keep]
mut_seqs = sub_mut_df.mutated_sequence.map(lambda x: [aa_to_i[x[i]] for i in columns_to_keep]).to_list()
mut_seqs = np.asarray(mut_seqs)
# Get ground truth DMS scores
y_dms = sub_mut_df.DMS_score.to_numpy()
# Get ground truth DMS class (0/1 where 1 is fit and 0 is unfit)
y_bin = sub_mut_df.DMS_score_bin.to_numpy()
res_df = res_df.loc[mutant_sequences_to_keep]
print(dataset)
# Print EVMutation spearman
print(scipy.stats.spearmanr(res_df.EVmutation, res_df.DMS_score).correlation)
# Print EVMutation AUROC
print(roc_auc_score(y_bin, res_df.EVmutation))

NameError: name 'columns_to_keep' is not defined

In [None]:
re