In [2]:
### replicate results of EVmutation with the PABP_YEAST dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from model import CouplingsModel
import tools
import scipy
from pathlib import Path
from collections import OrderedDict
# biopython SeqIO
from Bio import SeqIO
from sklearn.metrics import roc_auc_score
import scipy

In [4]:
### training code for model
!plmc/bin/plmc -o PABP_YEAST/model/PABP_YEAST.model_params -c PABP_YEAST/model/PABP_YEAST.txt -f PABP_YEAST -le 16.2 -lh 0.01 -m 200 -t 0.2 -g EVmutation/example/PABP_YEAST.a2m

Found focus PABP_YEAST as sequence 1
151496 valid sequences out of 152041 
82 sites out of 96
Region starts at 115
Effective number of samples (to 1 decimal place): 18615.5	(80% identical neighborhood = 1.000 samples)
iter	time	cond	fx	-loglk	||h||	||e||
1	1.0	668.45	2803680.4	2803612.2	72.1	1.0
2	1.4	1317.47	2530380.0	2526756.6	72.1	14.8
3	1.9	1387.57	2507890.2	2498735.3	72.1	23.7
4	2.4	658.49	2409702.7	2401575.0	72.1	22.3
5	2.9	637.79	2383657.9	2376482.3	72.0	21.0
6	3.3	407.33	2361117.0	2353416.7	72.0	21.7
7	3.8	478.18	2297750.2	2288016.6	72.0	24.4
8	4.3	549.61	2213983.4	2198656.6	71.9	30.7
9	5.1	707.63	2182939.6	2163878.2	71.9	34.3
10	5.6	477.00	2147279.0	2123692.2	71.9	38.1
11	6.1	424.40	2114991.4	2087527.3	71.9	41.1
12	6.5	646.70	2086447.8	2052652.6	71.9	45.6
13	7.0	331.04	2063943.8	2030772.3	71.9	45.2
14	7.5	275.17	2043127.9	2009362.5	71.9	45.6
15	7.9	294.50	2018508.1	1981338.2	71.9	47.9
16	8.4	455.42	1995078.0	1951677.0	71.8	51.7
17	8.8	301.11	1976134.6	1926882.9	71.8	55.1
18	9.

In [8]:
# load parameters from file to create a pairwise model
c = CouplingsModel("PABP_YEAST/model/PABP_YEAST.model_params")
# read the experimental mutational scanning dataset for PABP by Melamed et al., RNA, 2013
data = pd.read_csv(
    "PABP_YEAST/data/PABP_YEAST_Fields2013-singles.csv", sep=";", comment="#"
)

# predict mutations using our model
data_pred = tools.predict_mutation_table(
    c, data, "effect_prediction_epistatic"
)

In [10]:
# can also add predictions by the corresponding independent model
c0 = c.to_independent_model()

data_pred = tools.predict_mutation_table(
    c0, data_pred, "effect_prediction_independent"
)

In [12]:
singles = tools.single_mutant_matrix(
    c, output_column="effect_prediction_epistatic"
)

singles.head()

Unnamed: 0,mutant,pos,wt,subs,frequency,effect_prediction_epistatic
0,K123A,123,K,A,0.077201,0.801569
1,K123C,123,K,C,0.001461,-3.336549
2,K123D,123,K,D,0.118235,-0.308808
3,K123E,123,K,E,0.110503,-1.06541
4,K123F,123,K,F,0.007791,-3.006763


In [13]:
data

Unnamed: 0,mutant,linear,log
0,G126A,0.711743,-0.490571
1,G126C,0.449027,-1.155127
2,G126E,0.588928,-0.763836
3,G126D,0.229853,-2.121218
4,G126N,0.679435,-0.557593
...,...,...,...
1183,P200Q,1.379698,0.464353
1184,P200S,1.052320,0.073573
1185,P200R,1.133948,0.181355
1186,P200T,1.005247,0.007550


In [16]:
pred = data_pred['effect_prediction_epistatic']
pred_ind = data_pred['effect_prediction_independent']
label = data_pred['linear']
scipy.stats.spearmanr(pred, label)

SpearmanrResult(correlation=0.5926175055168056, pvalue=1.495790932177082e-113)

Running SVM on this set of data

In [4]:
ALPHABET_PROTEIN = '-ACDEFGHIKLMNPQRSTVWY'
# Protein amino acid alphabet + gap character "-"
alphabet = '-ACDEFGHIKLMNPQRSTVWY'
# map amino acids to index
aa_to_i = {aa:i for i, aa in enumerate(alphabet)}
# map index to amino acids
i_to_a = {i:aa for i, aa in enumerate(alphabet)}

In [25]:
def encode(seqs, alphabet=ALPHABET_PROTEIN):
    '''
    Go from letters to numbers
    '''
    aa_to_i = OrderedDict((aa, i) for i, aa in enumerate( alphabet ))
    X = np.asarray([[aa_to_i[x] for x in seq] 
                    for seq in seqs])
    return X, aa_to_i

def check_sequence(s, alphabet=ALPHABET_PROTEIN):
    for aa in s:
        if aa not in ALPHABET_PROTEIN:
            return False
    return True

def one_hot_encode(s):
    ''' one hot encode sequence'''
    return np.eye(21)[s].flatten()
def process_msa_sequence(msa_sequences):
    ''' takes in list of sequences and one-hot encodes the sequences'''
    # get wild-type sequence
    wt_seq = msa_sequences[0]
    # keep all the columns
    columns_to_keep = [i for i in range(len(wt_seq))]
    # upper case all letters
    msa_sequences = [[s[i].upper() for i in columns_to_keep] for s in msa_sequences]
    # replace "." with "-"
    msa_sequences = [[aa.replace(".", "-") for aa in s] for s in msa_sequences]
    msa_sequences = np.asarray(msa_sequences)
    # keep sequences that pass check_sequence
    msa_sequences = [s for s in msa_sequences if check_sequence(s)]
    msa_sequences = np.asarray(msa_sequences)

    seqs_enc, aa_to_i = encode(msa_sequences)
    i_to_a = {i:aa for i, aa in enumerate(ALPHABET_PROTEIN)}
    oh_enc_seq = []
    for s in seqs_enc:
        oh_enc_seq.append(one_hot_encode(s))
    oh_enc_seq = np.asarray(oh_enc_seq)
    return oh_enc_seq

In [26]:
# read in fasta files
yeast_seq = [str(x.seq) for x in SeqIO.parse('EVmutation/example/PABP_YEAST.a2m', 'fasta')]
wildtype = yeast_seq[0]
yeast_seq = process_msa_sequence(yeast_seq)

In [27]:
yeast_seq.shape

(151496, 2016)

In [19]:
from sklearn.svm import OneClassSVM

clf = OneClassSVM(kernel='poly', degree=3, nu=0.1)
clf.fit(yeast_seq[:10000])

In [20]:
clf.predict(yeast_seq[:20])

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,
        1, -1, -1])

In [28]:
data = pd.read_csv(
    "PABP_YEAST/data/PABP_YEAST_Fields2013-singles.csv", sep=";", comment="#"
)
mutant, label = data['mutant'].to_numpy(), data['linear'].to_numpy()
mutant_data = []
for m in mutant:
    original_aa, loc, mutant_aa = m[0], int(m[1:4]), m[4]
    assert wildtype[loc] == original_aa

IndexError: string index out of range

In [None]:
G126A	

In [29]:
wildtype

'qrdpslrkKGSGNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAPHLSRkerdsq'