In [None]:
from Bio import SeqIO
import pandas as pd
import gensim, logging
from gensim.test.utils import get_tmpfile

In [None]:
# Load the amino-acid mapping table; the first CSV column becomes the index.
# Later cells read its `one_letter_code` column plus one column per alphabet.
amino = pd.read_csv('./data/mapping.csv', index_col = 0)
amino.head()

In [None]:
# Path to the FASTA file that SentenceGenerator reads its sequences from.
uniprot_sprot = './data/uniprot_sprot.fasta'

In [None]:
def get_transDict(trans, amino):
    """Build a ``str.translate`` table for one reduced amino-acid alphabet.

    Parameters
    ----------
    trans : str
        Name of the column in ``amino`` that holds the replacement character
        for each residue. The special value ``'ProtVec'`` means "no
        translation".
    amino : pandas.DataFrame
        Mapping table with a ``one_letter_code`` column and a ``trans``
        column. Every cell used here must be a single character.

    Returns
    -------
    dict or None
        ``{ord(one_letter_code): ord(replacement)}`` with one entry per row
        of ``amino``, or ``None`` when ``trans == 'ProtVec'``.

    Raises
    ------
    KeyError
        If ``one_letter_code`` or ``trans`` is not a column of ``amino``.
    TypeError
        If a cell is not a single-character string (raised by ``ord``).
    """
    if trans == 'ProtVec':
        return None

    # Walk both columns in row order instead of looking rows up by the
    # integer label ``i``. The frame's index is whatever the CSV's first
    # column contains, so it is not guaranteed to be 0..n-1; label lookups
    # would raise KeyError (or depend on a deprecated positional fallback).
    return {
        ord(code): ord(target)
        for code, target in zip(amino['one_letter_code'], amino[trans])
    }

In [None]:
class SentenceGenerator(object):
    """Re-iterable stream of k-mer "sentences" read from a FASTA file.

    Each sequence in the file produces ``gms`` sentences, one per reading
    offset ``0 .. gms-1``. A sentence is the list of consecutive,
    non-overlapping substrings of length ``gms`` starting at that offset;
    a trailing fragment shorter than ``gms`` is dropped, so a sentence may be
    an empty list for short sequences.

    Parameters
    ----------
    filename : str
        Path of the FASTA file to read. It is reopened on every iteration, so
        the object can be iterated several times.
    gms : int, default 3
        Length of each k-mer ("word") and number of reading offsets.
    transDict : dict or None, default None
        Optional ``str.translate`` table applied to each sequence before it
        is split. When it is given, sequences containing any letter in
        ``_SKIP_LETTERS`` are skipped entirely.

    Attributes
    ----------
    skipCount : int
        Number of sequences skipped during the most recent (possibly still
        running) iteration; reset to 0 at the start of every pass.
    """

    # Letters that cause a whole sequence to be skipped when a translation
    # table is in use.
    _SKIP_LETTERS = ('B', 'J', 'O', 'U', 'X', 'Z')

    def __init__(self, filename, gms=3, transDict=None):
        self.filename = filename
        self.gms = gms
        self.transDict = transDict
        self.skipCount = 0

    def __iter__(self):
        """Yield one list of k-mers per (sequence, reading offset) pair."""
        self.skipCount = 0

        # The context manager closes the handle once the pass finishes (or
        # the generator is discarded) instead of leaking it.
        with open(self.filename) as handle:
            for entry in SeqIO.parse(handle, 'fasta'):
                seq = str(entry.seq)

                # Use the instance's own table, never a module-level
                # ``transDict`` that may or may not exist in the notebook.
                if self.transDict is not None:
                    if any(letter in seq for letter in self._SKIP_LETTERS):
                        self.skipCount += 1
                        continue
                    seq = seq.translate(self.transDict)

                # Last index at which a full-length k-mer can still start.
                last_start = len(seq) - self.gms
                for offset in range(self.gms):
                    yield [
                        seq[start:start + self.gms]
                        for start in range(offset, last_start + 1, self.gms)
                    ]

In [None]:
def train_word2vec_model(sentenceGenerator, model_name, size, window, kgrams, workers=11, min_count = 1, epochs = 5):
    """Train a skip-gram Word2Vec model on k-mer sentences and save it.

    Parameters
    ----------
    sentenceGenerator : iterable of list of str
        Re-iterable corpus passed to ``gensim.models.Word2Vec`` as
        ``sentences``.
    model_name : str
        Label embedded in the output file name.
    size : int
        Dimensionality of the embedding vectors.
    window : int
        Context window size.
    kgrams : int
        k-mer length; used only in the output file name.
    workers : int, default 11
        Number of training worker threads.
    min_count : int, default 1
        Minimum word frequency passed to Word2Vec.
    epochs : int, default 5
        Number of training passes over the corpus.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model. It is also saved to
        ``./data/models/RA2V_{model_name}_G{kgrams}_S{size}_W{window}.model``.
    """
    import os

    model_path = f'./data/models/RA2V_{model_name}_G{kgrams}_S{size}_W{window}.model'
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Create the output folder before training so that a missing directory
    # is not discovered only at save time, after the expensive run.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # gensim 4 renamed the constructor arguments ``size`` -> ``vector_size``
    # and ``iter`` -> ``epochs``; choose the spelling the installed release
    # understands so the same notebook runs on both major versions.
    if int(gensim.__version__.split('.')[0]) >= 4:
        version_kwargs = {'vector_size': size, 'epochs': epochs}
    else:
        version_kwargs = {'size': size, 'iter': epochs}

    model = gensim.models.Word2Vec(
        sentences=sentenceGenerator,
        min_count=min_count,
        window=window,
        workers=workers,
        sg=1,  # 1 selects the skip-gram architecture
        **version_kwargs)

    print('model trained successfully....')
    model.save(model_path)

    # Only call the memory-trimming hook when the installed gensim release
    # provides it on the model object.
    if hasattr(model, 'delete_temporary_training_data'):
        model.delete_temporary_training_data()

    return model

In [None]:
class RunBuilder:
    """Expands a hyper-parameter grid into a list of concrete runs."""

    @staticmethod
    def get_runs(params):
        """Return one ``Run`` namedtuple per combination of parameter values.

        ``params`` maps each parameter name to an iterable of candidate
        values. The names become the fields of ``Run``, and the result is the
        Cartesian product of the value lists, in ``itertools.product`` order.
        """
        from collections import namedtuple
        from itertools import product

        Run = namedtuple('Run', params.keys())
        return [Run(*combo) for combo in product(*params.values())]

In [None]:
# Hyper-parameter grid: every combination of these values becomes one run.
params = {
    'kGrams': [3, 4, 5],
    'vecSize': [25, 50, 75],
    'window': [5],
    'trans': ['Hydropathy', 'Conf_sim'],
}
runs = RunBuilder.get_runs(params)

In [None]:
# Train and save one Word2Vec model per hyper-parameter combination in `runs`.
for run in runs:
    
    print(f'kGrams:{run.kGrams} vecSize:{run.vecSize} window:{run.window} trans:{run.trans}')
    
    # Translation table for this run's alphabet column (None for 'ProtVec').
    transDict = get_transDict(run.trans, amino)

    # Fresh k-mer sentence stream over the FASTA file for this k and alphabet.
    sentenceGenerator = SentenceGenerator(
        filename = uniprot_sprot, 
        gms      = run.kGrams, 
        transDict=transDict)

    # `model` is rebound on every iteration, so only the last trained model
    # stays in memory; each one is written to disk by train_word2vec_model.
    model = train_word2vec_model(
        sentenceGenerator, 
        model_name = run.trans, 
        size       = run.vecSize, 
        window     = run.window ,
        kgrams     = run.kGrams,)