In [1]:
import os
import numpy as np
import pandas as pd

# Load Dataset

In [2]:
# Read the antiviral dataset CSV and discard every row that contains a missing value,
# then show the first two rows as a sanity check.
dset = pd.read_csv('./datasets/antiviral.csv').dropna()
dset.head(2)

Unnamed: 0,AVP ID,Sequence,Length,Virus,PubMed/Patent_ID,Label,is_Train
0,AVP_0609,DLGPPISLERLDVGTNLGNAIAKLEAKELLESSD,15,MV,20347875,1,True
1,AVP_0619,HRIDLGPPISLERLDVGTNLGNAIAKLEAKELLE,15,MV,20347875,1,True


# Load seq2vec Models

In [3]:
from src.seq2features import Transformer, GetModels
from src.runBuilder import RunBuilder

In [4]:
# Settings handed to GetModels.from_param_dict in the next cell.
# Every value is a list of the options to use for that setting.
param = {
    # These names reappear in the `Model` column of the model summary table.
    'alphabet': ['prot_vec', 'hydro', 'conf_simil'],
    'kGram': [3],
    'vecSize': [100],
    'window': [5],
}

In [5]:
# Ask GetModels for the model objects matching the `param` settings, using
# './model-creator/models/' as the model location (presumably where the model
# files live -- confirm in src.seq2features).
seq2vec_models = GetModels.from_param_dict('./model-creator/models/', param)
# Bare name as the last expression: display the resulting list of model objects.
seq2vec_models

[<src.seq2features.W2V_Model at 0x7f779c866a00>,
 <src.seq2features.W2V_Model at 0x7f779c68aca0>,
 <src.seq2features.W2V_Model at 0x7f779c69d310>]

In [6]:
# Tabulate the instance attributes of every loaded model (one row per model),
# leaving out the `location` column.
model_attrs = [vars(model) for model in seq2vec_models]
pd.DataFrame(model_attrs).drop(columns='location')

Unnamed: 0,Model,kGram,window,vecSize
0,prot_vec,3,5,100
1,hydro,3,5,100
2,conf_simil,3,5,100


# Transform the dataset

## ProtVec Model

In [7]:
# Featurise the sequences with the ProtVec model only.
#
# The ProtVec model is selected by its `Model` name instead of by list position,
# which matches the name-based filter used for the RA2Vec models and keeps this
# cell correct if the order of `seq2vec_models` ever changes. If no 'prot_vec'
# model was loaded, `next` raises StopIteration instead of silently using
# whichever model happens to be first.
protvec_model = next(model for model in seq2vec_models if model.Model == 'prot_vec')

transformer = Transformer()
transformer.set_modelList(ProtVec=protvec_model)
# Per the message printed by set_data, the features end up on the transformer
# as the attributes `xData` and `yData`.
transformer.set_data(data=dset.Sequence, target=dset.Label)

Done! Features are saved as model attributes; xData and yData


In [8]:
# Dimensions of the ProtVec feature matrix; the recorded run shows 100 columns,
# which equals the vecSize of the single model used.
transformer.xData.shape

(1056, 100)

In [9]:
# Preview the first two rows of the ProtVec features as a DataFrame.
pd.DataFrame(transformer.xData).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.805205,0.515933,-0.173827,1.232417,-0.178105,2.904041,-5.894862,-0.517172,0.786282,0.249633,...,-3.029598,5.765355,2.388032,0.327205,0.092085,-1.056149,2.624676,-1.318894,-3.062136,-5.975313
1,-1.015575,0.710593,0.192947,1.52236,0.501692,3.106572,-5.38993,-1.202611,1.012515,-0.034357,...,-3.057328,5.770335,3.088787,-0.084201,-0.565125,-0.584599,1.880929,-1.318765,-2.397743,-6.208384


## RA2Vec Model

In [10]:
# Featurise the sequences with every loaded model except ProtVec; those models
# are passed to the transformer as `RA2V_models`, together with the mapping CSV.
transformer = Transformer(transCodes_csv='./model-creator/data/mapping.csv')

ra2v_models = [model for model in seq2vec_models if model.Model != 'prot_vec']
transformer.set_modelList(RA2V_models=ra2v_models)

transformer.set_data(data=dset.Sequence, target=dset.Label)

Done! Features are saved as model attributes; xData and yData


In [11]:
# Dimensions of the RA2Vec feature matrix; the recorded run shows 200 columns,
# i.e. vecSize (100) for each of the two non-ProtVec models.
transformer.xData.shape

(1056, 200)

In [12]:
# Preview the first two rows of the RA2Vec features as a DataFrame.
pd.DataFrame(transformer.xData).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.546472,-4.321027,0.969843,0.477996,2.366844,4.241345,2.430762,-0.817556,2.277164,-0.8027,...,-3.240247,1.043945,-6.455848,2.997713,1.545122,-1.400428,1.14523,5.118212,-1.005992,-0.614887
1,0.645005,-4.445821,1.221185,0.481752,2.435462,4.081695,2.465331,-0.605753,2.214545,-0.861191,...,-3.336379,0.977783,-6.444966,3.287784,1.396907,-1.271383,1.205594,4.734825,-1.093482,-1.217669


## RA2Vec along with ProtVec 

In [13]:
# Featurise the sequences with the RA2Vec models and the ProtVec model together.
#
# Both groups are selected by the models' `Model` name, so the ProtVec choice no
# longer depends on that model being first in `seq2vec_models`. If no
# 'prot_vec' model was loaded, `next` raises StopIteration instead of silently
# passing some other model as ProtVec.
protvec_model = next(model for model in seq2vec_models if model.Model == 'prot_vec')
ra2v_models = [model for model in seq2vec_models if model.Model != 'prot_vec']

transformer = Transformer(transCodes_csv='./model-creator/data/mapping.csv')
transformer.set_modelList(
    RA2V_models=ra2v_models,
    ProtVec=protvec_model,
)
# Per the message printed by set_data, the features end up on the transformer
# as the attributes `xData` and `yData`.
transformer.set_data(data=dset.Sequence, target=dset.Label)

Done! Features are saved as model attributes; xData and yData


In [14]:
# Dimensions of the combined feature matrix; the recorded run shows 300 columns,
# i.e. vecSize (100) for each of the three models.
transformer.xData.shape

(1056, 300)

In [15]:
# Preview the first two rows of the combined features as a DataFrame.
# In the recorded output the leading columns shown equal the RA2Vec-only preview
# and the trailing columns shown equal the ProtVec-only preview, which suggests
# the ProtVec features are appended after the RA2Vec ones.
pd.DataFrame(transformer.xData).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.546472,-4.321027,0.969843,0.477996,2.366844,4.241345,2.430762,-0.817556,2.277164,-0.8027,...,-3.029598,5.765355,2.388032,0.327205,0.092085,-1.056149,2.624676,-1.318894,-3.062136,-5.975313
1,0.645005,-4.445821,1.221185,0.481752,2.435462,4.081695,2.465331,-0.605753,2.214545,-0.861191,...,-3.057328,5.770335,3.088787,-0.084201,-0.565125,-0.584599,1.880929,-1.318765,-2.397743,-6.208384
