# HIV-1 Protease Cleavage

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

The data come from the UCI Machine Learning Repository:

https://archive.ics.uci.edu/ml/datasets/HIV-1+protease+cleavage

In [2]:
# Directory that holds the unpacked UCI archive. Set the HIV_PROTEASE_DATA_DIR
# environment variable to point somewhere else instead of editing this cell;
# when it is unset, the original author's local path is used as the fallback.
DATA_DIR = Path(os.environ.get(
    'HIV_PROTEASE_DATA_DIR',
    r'C:\Users\user\ML\Classification\HIV_protease\hiv+1+protease+cleavage',
))

# File to load from the archive. An earlier revision of this notebook used
# '746Data.txt' instead.
DATA_FILE = '1625Data.txt'

filename = str(DATA_DIR / DATA_FILE)

# The file is read without a header row, so the two columns are named here:
# the peptide string and its cleavage label.
data = pd.read_csv(filename, header=None)
data.columns = ['protease', 'label']

Encode each octamer (a peptide of 8 amino acids) as a vector of +1/-1 values: 20 entries per amino acid, 160 per octamer

In [3]:
def encode_acid(a):
    """Encode an amino-acid sequence as a flat list of +1/-1 values.

    Every residue of ``a`` contributes 20 consecutive entries, one per letter
    of the amino-acid alphabet below: +1 where the letter equals the residue
    and -1 everywhere else. A residue that is not in the alphabet therefore
    contributes twenty -1 entries.

    Parameters
    ----------
    a : str
        Sequence of one-letter amino-acid codes.

    Returns
    -------
    list of int
        ``20 * len(a)`` values, each either 1 or -1, ordered residue by
        residue.
    """
    alphabet = 'ARNDCQEGHILKMFPSTWYV'
    # Outer loop walks the residues, inner loop walks the alphabet, so the
    # 20-entry groups appear in sequence order.
    return [
        1 if letter == residue else -1
        for residue in a
        for letter in alphabet
    ]

# Add an 'encoding' column: the +1/-1 list returned by encode_acid for each
# peptide string in the 'protease' column.
data['encoding'] = data['protease'].apply(encode_acid)

Prep the data to train and test

In [4]:
# Stack the per-peptide encodings into a 2-D feature matrix (one row per
# sample) and take the labels as a flat array.
X = np.vstack(data['encoding'].to_numpy())
y = data['label'].to_numpy()

# test_size=0.25 is scikit-learn's default, spelled out here for the reader;
# the fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced -- consider stratify=y (this would change the counts printed below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Report the shapes of the four arrays, then the class balance of each split.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))
print(f'In the training set there are *{np.sum(y_train==1)}* samples that cleave, and *{np.sum(y_train==-1)}* which do not')
print(f'In the test set there are *{np.sum(y_test==1)}* samples that cleave, and *{np.sum(y_test==-1)}* which do not')

(1218, 160) (1218,) (407, 160) (407,)
In the training set there are *276* samples that cleave, and *942* which do not
In the test set there are *99* samples that cleave, and *308* which do not


Train a support vector machine (SVM) classifier and predict labels for the test set

In [5]:
# Fit a polynomial-kernel SVM on the training split. fit() returns the
# estimator itself, so construction and fitting are chained.
svc = SVC(kernel='poly', gamma=0.1).fit(X_train, y_train)

# Predicted labels for the held-out samples; the bare name on the last line
# displays the array as the cell output.
y_pred = svc.predict(X_test)
y_pred

array([-1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1,
       -1, -1,  1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,
       -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1, -1,
       -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,  1, -1,  1, -1,
       -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,
       -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
        1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1,
       -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,  1,
       -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1, -1,  1,  1, -1,  1, -1,
        1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1,  1,
       -1,  1, -1, -1, -1

In [6]:
# Fraction of test samples whose predicted label equals the true label.
# NOTE(review): in the split printed earlier, 308 of the 407 test samples are
# non-cleaving, so always predicting -1 would already score about 0.76;
# read the accuracy against that baseline.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.972972972972973