In [1]:
### replicate results of EVmutation with the PABP_YEAST dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from model import CouplingsModel
import tools
import scipy
from pathlib import Path
from collections import OrderedDict
# biopython SeqIO
from Bio import SeqIO
from sklearn.metrics import roc_auc_score
import scipy
from collections import OrderedDict
from sklearn.svm import OneClassSVM


In [2]:
ALPHABET_PROTEIN = '-ACDEFGHIKLMNPQRSTVWY'

In [3]:
# helper functions
def encode(seqs, alphabet=ALPHABET_PROTEIN):
    '''
    Go from letters to numbers
    '''
    aa_to_i = OrderedDict((aa, i) for i, aa in enumerate( alphabet ))
    X = np.asarray([[aa_to_i[x] for x in seq] 
                    for seq in seqs])
    return X, aa_to_i
def one_hot_encode(s):
    return np.vstack([np.zeros(20), np.eye(20)])[s].flatten()

def check_sequence(s, alphabet=ALPHABET_PROTEIN):
    for aa in s:
        if aa not in ALPHABET_PROTEIN:
            return False
    return True
def process_msa_sequence(msa_sequences):
    ''' takes in list of sequences and one-hot encodes the sequences'''
    pos_upper = [x for x in range(len(msa_sequences[0])) if not msa_sequences[0][x].islower()]
    msa_sequences = np.asarray([np.asarray(list(s))[pos_upper] for s in msa_sequences if not 'x' in s])
    msa_sequences = np.asarray([s for s in msa_sequences if check_sequence(s) and len(s)==82])
    msa_sequences = np.asarray(msa_sequences)

    seqs_enc, aa_to_i = encode(msa_sequences)
    oh_enc_seq = []
    for s in seqs_enc:
        oh_enc_seq.append(one_hot_encode(s))
    oh_enc_seq = np.asarray(oh_enc_seq)
    return oh_enc_seq

def valid_weights_from_model(c):
    ### returns only valid weights
    _w = c.weights
    _w_valid = []
    for i in range(c.weights.shape[0]):
        if _w[i] == 0: 
            continue
        _w_valid.append(1/_w[i])
    return _w_valid

def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

Preprocessing sequence data

In [4]:
### get all available msa sequences
yeast_seq_str = []
fasta_sequences = SeqIO.parse(open("PABP_YEAST/data/PABP_YEAST.a2m"),'fasta')
for fasta in fasta_sequences:
    yeast_seq_str.append(str(fasta.seq))

processed = process_msa_sequence(yeast_seq_str)
c = CouplingsModel(f"PABP_YEAST/model/PABP_YEAST.model_params")
weights = valid_weights_from_model(c)
assert len(weights) == len(processed)
wildtype_processed, wildtype_weights = processed[0], weights[0]
processed, weights = np.asarray(processed[1:]), np.asarray(weights[1:])
processed, weights = unison_shuffled_copies(processed, weights)

In [52]:
# using the trained svm model on DMS data
wildtype = yeast_seq_str[0]
data = pd.read_csv(
    "PABP_YEAST/data/PABP_YEAST_Fields2013-singles.csv", sep=";", comment="#"
)
mutant, label = data['mutant'].to_numpy(), data['linear'].to_numpy()
for i in range(label.shape[0]):
    label[i] = 1 if label[i] > 0.7 else 0
mutant_data = []
for m in mutant:
    original_aa, loc, mutant_aa = m[0], int(m[1:4])-115, m[4]
    assert wildtype[loc] == original_aa
    mutant_data.append(wildtype[:loc]+mutant_aa+wildtype[loc+1:])
mutant_data = np.asarray(mutant_data)
mutant_data = process_msa_sequence(mutant_data)

mutant_data, label = np.asarray(mutant_data), np.asarray(label)
mutant_data, label = unison_shuffled_copies(mutant_data, label)
print(mutant_data.shape)

(1188, 1640)


Experiment 1: Train a OneClassSVM with polynomial degree 2 kernel on MSA sequences. 

In [None]:
### The classifier has some effect on MSA data. ~2700 out of 5000 total positive samples have been classified correctly from the MSA dataset

clf = OneClassSVM(kernel='poly', degree=2, nu=0.3)
train = processed[:50000]
clf.fit(train, sample_weight=weights[:50000])
pred = clf.predict(processed[50000:55000])
print(pred[np.where(pred==1)].shape)

In [43]:
### The classifier does shitty job for mutant sequences (DMS).
mutant_pred = clf.predict(mutant_data)
print(pred[np.where(mutant_pred==1)].shape)

NameError: name 'clf' is not defined

Experiment 2: Train a OneClassSVM with polynomial degree 2 kernel on DMS sequences.

In [44]:
### also does a shitty job. Probably because too many features and did not shuffle sequences.
import sklearn
clf2 = sklearn.svm.SVC(kernel='poly', degree=2)
split = mutant_data.shape[0]//5 * 4
mut_train = mutant_data[:split]
mut_test = mutant_data[split:]
clf2.fit(mut_train, label[:split])

NameError: name 'pred' is not defined

Experiement 3: Gaussian Kernel OneClassSVM

In [10]:
clf = OneClassSVM(kernel='rbf', nu=0.3)
train = processed[:50000]
clf.fit(train, sample_weight=weights[:50000])
pred = clf.predict(processed[50000:55000])
print(pred[np.where(pred==1)].shape)

(3552,)


In [12]:
### The classifier does shitty job for mutant sequences (DMS).
mutant_pred = clf.predict(mutant_data)
print(pred[np.where(mutant_pred==1)].shape)

(1188,)


Experiement 4: explicitly model pairwise features. the original d features become d^2 features. SVM still uses poly deg-2 kernel

In [6]:
d = processed.shape[1]
num_seq = processed.shape[0]
rich_msa = np.zeros((num_seq//15, d**2))

# scipy.sparse.csr_matrix((num_seq, d**2))

In [7]:
for seq in range(num_seq//15):
    for i in range(d):
        for j in range(i, d):
            if processed[seq][i] == 1 and processed[seq][j] == 1 or i == j:
                rich_msa[seq][i*d + j] = 1

In [8]:
rich_msa = scipy.sparse.csr_matrix(rich_msa, (num_seq//15, d**2))

In [9]:
test = np.zeros((mutant_data.shape[0], d**2))
for seq in range(mutant_data.shape[0]):
    for i in range(d):
        for j in range(i, d):
            if mutant_data[seq][i] == 1 and mutant_data[seq][j] == 1 or i == j:
                test[seq][i*d + j] = 1

test_sparse = scipy.sparse.csr_matrix(test, (mutant_data.shape[0], d**2))

In [54]:
### expand wildtype
wt = [0 for _ in range(d**2)]
for i in range(d):
    for j in range(i, d):
        if wildtype_processed[i] == 1 and wildtype_processed[j] == 1 or i == j:
            wt[i*d + j] = 1


In [56]:
for n in [0.1, 0.15, 0.2, 0.25, 0.3]:
    clf = OneClassSVM(kernel='linear', nu=n)
    clf.fit(rich_msa, sample_weight=weights[:rich_msa.shape[0]])
    pred = clf.predict(test_sparse)
    cnt = 0 
    for i in range(mutant_data.shape[0]):
        if (pred[i]== 1 and label[i] == 1) or (pred[i] == -1 and label[i] == 0):
            cnt += 1
    print(f'kernel: linear, nu: {n}, result: ', cnt)
    print('wt: ', clf.predict([wt]))

kernel: linear, nu: 0.1, result:  673
wt:  [1]
kernel: linear, nu: 0.15, result:  673
wt:  [1]
kernel: linear, nu: 0.2, result:  673
wt:  [1]
kernel: linear, nu: 0.25, result:  673
wt:  [1]
kernel: linear, nu: 0.3, result:  673
wt:  [1]


In [26]:
### prediction of wildtype
wt = [0 for _ in range(d**2)]
for i in range(d):
    for j in range(i, d):
        if wildtype_processed[i] == 1 and wildtype_processed[j] == 1 or i == j:
            wt[i*d + j] = 1
clf.predict([wt])

array([1])

Experiment 4.1: Hyperparameter tuning

In [None]:
for n in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]:
    for deg in [1, 2, 3]:
        clf = OneClassSVM(kernel='poly', degree = deg, nu=n)
        clf.fit(rich_msa, sample_weight=weights[:rich_msa.shape[0]])
        pred = clf.predict(test_sparse)
        cnt = 0 
        for i in range(mutant_data.shape[0]):
            if (pred[i]== 1 and label[i] == 1) or (pred[i] == -1 and label[i] == 0):
                cnt += 1
        print(f'kernel: poly, degree: {deg}, nu: {n}, result: ', cnt)
        print(pred[:20])
        print("wild type pred: ", clf.predict([wt]))
    
    clf = OneClassSVM(kernel='rbf', nu=n)
    clf.fit(rich_msa, sample_weight=weights[:rich_msa.shape[0]])
    pred = clf.predict(test_sparse)
    cnt = 0 
    for i in range(mutant_data.shape[0]):
        if (pred[i]== 1 and label[i] == 1) or (pred[i] == -1 and label[i] == 0):
            cnt += 1
    
    print(f'kernel: rbf, nu: {n}, result: ', cnt)
    print(pred[:20])
    print("wild type pred: ", clf.predict([wt]))

    clf = OneClassSVM(kernel='sigmoid', nu=n)
    clf.fit(rich_msa, sample_weight=weights[:rich_msa.shape[0]])
    pred = clf.predict(test_sparse)
    cnt = 0 
    for i in range(mutant_data.shape[0]):
        if (pred[i]== 1 and label[i] == 1) or (pred[i] == -1 and label[i] == 0):
            cnt += 1
    print(f'kernel: rbf, nu: {n}, result: ', cnt)
    print(pred[:20])
    print("wild type pred: ", clf.predict([wt]))


Results:
    Only (kernel = 'rbf', nu = 0.2) and (kernel = 'rbf', nu = 0.3) produced results in which the prediction of mutant sequences were not all 1's or all -1's. For nu = 0.2, 716 sequences were classified correctly. For nu = 0.3, 543 sequences were classified correctly. There are a total of 673 (pos) + 515 (neg) = 1188 sequences. 

Experiement 4.2: Train a OneClassSVM on DMS sequences (positives). Leave out some positives and negatives for test. Compare with training a two class SVM on positive and negative samples. 

In [57]:
for n in [0.1, 0.15, 0.2, 0.25, 0.3]:
    clf = OneClassSVM(kernel='linear', nu=n)
    clf.fit(test_sparse[:500])
    pred = clf.predict(test_sparse[500:])
    cnt = 0 
    for i in range(mutant_data.shape[0]):
        if (pred[i]== 1 and label[i] == 1) or (pred[i] == -1 and label[i] == 0):
            cnt += 1
    print(f'kernel: linear, nu: {n}, result: ', cnt)
    print('wt: ', clf.predict([wt]))

IndexError: index 688 is out of bounds for axis 0 with size 688