In [1]:
import numpy as np
import os
import pandas as pd
import scipy as sc
import warnings
import csv
from itertools import groupby
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import time
import random

In [2]:
# Path of the input report; it is handed to parse_hmmsearch_output_from_file,
# which opens and reads it. Swap in the commented-out path to use another file.
#file_name = 'sig_extract/a.fa.hmmpfam2'
file_name = 'Adomain_Substrate.fa.hmmpfam2'

In [3]:
# Log the wall-clock time at which the run begins.
print('the start at ', time.ctime())

the start at  Mon Sep 28 09:07:57 2020


In [4]:
# Hamming distance: number of aligned positions at which two strings differ
def hamming(str1, str2):
    """Count the positions where ``str1`` and ``str2`` hold different characters.

    The characters are paired with ``zip``, so the comparison stops at the end
    of the shorter string; a longer tail on either argument is not counted.

    Returns:
        int: number of mismatching pairs (0 when either string is empty).
    """
    return sum(1 for left, right in zip(str1, str2) if left != right)

# Hamming distance expressed as a fraction of the shorter string's length
def hamming_frac(str1, str2):
    """Return ``hamming(str1, str2)`` divided by the length of the shorter string.

    Raises:
        ZeroDivisionError: if either string is empty.
    """
    mismatches = hamming(str1, str2)
    compared_length = min(len(str1), len(str2))
    return mismatches / compared_length

#print(hamming('aaaa', 'baab'))
#print(hamming_frac('aaaa', 'baab'))

In [5]:
def remove_multiple_spaces(string, remove_lead_trail=True):
    """Collapse every run of consecutive space characters into a single space.

    Args:
        string: text to normalise.
        remove_lead_trail: when true, ``str.strip()`` is applied first, which
            removes all leading/trailing whitespace (not only spaces).

    Returns:
        str: the text with no two adjacent ``' '`` characters. Other
        whitespace characters (tabs, newlines) inside the text are untouched.
    """
    text = string.strip() if remove_lead_trail else string
    kept = []
    previous_was_space = False
    for char in text:
        is_space = char == ' '
        # Skip a space that directly follows another space.
        if not (is_space and previous_was_space):
            kept.append(char)
        previous_was_space = is_space
    return ''.join(kept)

def get_state(state, line, print_state_transitions=False):
    """Return the parser state that applies after reading ``line``.

    Transitions:
        * 'init' or 'parsing' -> 'recognise' when the line starts with
          '//' or '- - - -'.
        * 'recognise' -> 'parsing' when the line starts with
          'Alignments of top'.
    In every other case the incoming ``state`` is returned unchanged.

    Args:
        state: current state name.
        line: the text line being examined.
        print_state_transitions: when true, print a message each time the
            state actually changes.
    """
    if state in ('init', 'parsing'):
        triggers, target = ('//', '- - - -'), 'recognise'
    elif state == 'recognise':
        triggers, target = ('Alignments of top',), 'parsing'
    else:
        # Unknown state: nothing can trigger a transition.
        return state

    if not line.startswith(triggers):
        return state

    if print_state_transitions:
        print('Changed state from ',state,' to ', target)
    return target

def parse_hmmsearch_output(lines, hmmfiles):
    """Parse report lines into per-query, per-HMM alignment records.

    The lines are walked with the state machine in ``get_state``:

    * In state 'recognise', a line starting with 'Query sequence' sets the
      current query Id (the text after ': ') and opens an empty record for it.
    * In state 'parsing', a line starting with one of the names in
      ``hmmfiles`` is treated as an alignment header. The tokens following
      the words 'score', 'from' and 'to' on that line are stored as a float
      and two ints.
    * Subsequent lines starting with a space are counted relative to the
      header: lines at offset 1 (mod 4) are appended to 'top', and the third
      whitespace-separated token of lines at offset 3 (mod 4) is appended to
      'bottom'. (Presumably these are the HMM consensus row and the query
      sequence row of each alignment chunk -- confirm against the report
      format.)

    Args:
        lines: iterable of text lines (e.g. from ``file.readlines()``).
        hmmfiles: iterable of HMM names whose alignments should be recorded.

    Returns:
        dict: ``{Id: {hmm_name: {'score', 'from', 'to', 'top', 'bottom'}}}``.
    """
    detail_dict = {}
    line_align_idx = 0  # 1-based line number of the most recent header line
    Id = ''
    state = 'init'
    curr_hmm = ''

    for line_idx, line in enumerate(lines, start=1):
        state = get_state(state, line)

        if state == 'recognise':
            if line.startswith('Query sequence'):
                Id = line.split(': ')[1].strip('\n')
                detail_dict[Id] = {}

        elif state == 'parsing':
            if any(line.startswith(hmmfile) for hmmfile in hmmfiles):
                line_align_idx = line_idx
                curr_hmm = line.split(':')[0]
                split = line.split(' ')
                # Each value is the token right after its keyword; trailing
                # punctuation (',' after the score, ':' after 'to') is removed.
                score_idx = split.index('score') + 1
                from_idx = split.index('from') + 1
                to_idx = split.index('to') + 1
                detail_dict[Id][curr_hmm] = {
                    'score': float(split[score_idx].strip(',')),
                    'from': int(split[from_idx]),
                    'to': int(split[to_idx].strip(':')),
                    'top': '',
                    'bottom': '',
                }
            elif line.startswith(' '):
                offset = (line_idx - line_align_idx) % 4
                if offset == 1:
                    # Strip the '*->' / '<-*' markers around the row.
                    detail_dict[Id][curr_hmm]['top'] += remove_multiple_spaces(line).strip('*-><')
                elif offset == 3:
                    # Third token is the aligned residues (name, start, residues, end).
                    detail_dict[Id][curr_hmm]['bottom'] += remove_multiple_spaces(line).split()[2]

    return detail_dict

def parse_hmmsearch_output_from_file(filename, hmmfile):
    """Read ``filename`` as text and parse its lines with ``parse_hmmsearch_output``.

    Args:
        filename: path of the report file to read.
        hmmfile: forwarded unchanged as the ``hmmfiles`` argument of
            ``parse_hmmsearch_output``.

    Returns:
        Whatever ``parse_hmmsearch_output`` returns for the file's lines.
    """
    with open(filename, 'r') as handle:
        report_lines = list(handle)
    return parse_hmmsearch_output(report_lines, hmmfile)

In [6]:
# Parse the report, recording alignments only for the two HMM names listed here.
detail_dict = parse_hmmsearch_output_from_file(file_name, ['aa-activating-core.198-334', 'aroundLys517'])

In [7]:
def get_best_alignment(mydict):
    """Select, for every Id, the HMM alignment with the highest score.

    Args:
        mydict: ``{Id: {hmm_name: record}}`` where each record is a dict
            holding at least a numeric 'score'.

    Returns:
        dict: ``{Id: record_copy}`` where ``record_copy`` is a shallow copy of
        the best-scoring record with an extra 'hmm' key naming the winning
        HMM. On a tie the HMM encountered first wins. Ids that have no
        alignments at all are reported and left out of the result.
    """
    ret_dict = {}
    for Id, hits in mydict.items():
        if not hits:
            # Nothing to choose from; report and skip rather than fail on a
            # lookup of the empty placeholder name.
            print('Could not get any alignment for Id', Id)
            continue
        # max() returns the first maximal element, so ties keep the earliest hit.
        best_hmm = max(hits, key=lambda hmm: hits[hmm]['score'])
        best_record = hits[best_hmm].copy()
        best_record['hmm'] = best_hmm
        ret_dict[Id] = best_record

    return ret_dict

# Reduce the parsed records to a single HMM hit per query Id.
best_align_dict = get_best_alignment(detail_dict)

In [8]:
def get_hmm_alignment(mydict, hmm):
    """Collect, for every Id, a copy of its record for one specific HMM.

    Args:
        mydict: ``{Id: {hmm_name: record}}``.
        hmm: name of the HMM whose records are wanted.

    Returns:
        dict: ``{Id: record_copy}`` for every Id that has a record for
        ``hmm``. Ids without one are reported on stdout and omitted.
    """
    ret_dict = {}
    for key in mydict:
        try:
            ret_dict[key] = mydict[key][hmm].copy()
        except KeyError:
            # Only a missing HMM entry is expected here; any other error is
            # a real problem and is allowed to propagate.
            print('Could not get',hmm,' for Id',key)
    return ret_dict

# Keep only each Id's 'aa-activating-core.198-334' record; Ids lacking it are
# printed by get_hmm_alignment and left out.
a_align_dict = get_hmm_alignment(detail_dict, 'aa-activating-core.198-334')

In [9]:
def removetopindels(indict, print_change=False):
    """Drop alignment columns whose 'top' character is '.' and index the rest.

    For each Id the 'top' and 'bottom' strings are walked in parallel. A
    running index starts at the record's 'from' value and advances by one for
    every 'bottom' character that is not '-'. Columns where 'top' is '.' are
    removed from both strings. For every kept column the index is recorded in
    'idx_list'; when the 'bottom' character of a kept column is '-', the
    recorded value is the running index minus 0.5.

    The input is not modified: each record is copied before being changed.

    Args:
        indict: ``{Id: record}`` where each record has 'from', 'top' and
            'bottom' entries.
        print_change: when true, print a message for every Id whose 'top' or
            'bottom' string was altered.

    Returns:
        dict: ``{Id: new_record}`` with filtered 'top'/'bottom' strings and an
        added 'idx_list' whose length equals that of the new 'bottom'.
    """
    result = {}
    for Id, record in indict.items():
        top_chars = []
        bottom_chars = []
        idx_list = []
        idx = record['from']
        for a, b in zip(record['top'], record['bottom']):
            if a != '.':
                top_chars.append(a)
                bottom_chars.append(b)
                # A '-' in 'bottom' has no index of its own; mark it as lying
                # half a step before the next index.
                idx_list.append(idx - 0.5 if b == '-' else idx)
            if b != '-':
                idx += 1
        top_tmp = ''.join(top_chars)
        bot_tmp = ''.join(bottom_chars)

        if print_change and record['top'] != top_tmp:
            print('Id:',Id,' top changed from ',record['top'], 'to', top_tmp)
        if print_change and record['bottom'] != bot_tmp:
            print('Id:',Id,' bottom changed from ',record['bottom'], 'to', bot_tmp)

        assert len(bot_tmp) == len(idx_list)

        # Copy the record so the caller's nested dict is left untouched.
        new_record = record.copy()
        new_record['top'] = top_tmp
        new_record['bottom'] = bot_tmp
        new_record['idx_list'] = idx_list
        result[Id] = new_record
    return result

# Remove the columns where 'top' holds '.', and attach an 'idx_list' to each record.
a_align_dict_no_indel = removetopindels(a_align_dict)

In [10]:
def extractCharacters(Id, target, source, source_idx_list, pattern, idxs):
    """Pick characters of ``source`` at offsets relative to a pattern found in ``target``.

    ``pattern`` is located in ``target``; for every offset in ``idxs`` the
    character ``source[start + offset]`` and the matching entry of
    ``source_idx_list`` are collected, where ``start`` is the position of the
    first occurrence of ``pattern``.

    Args:
        Id: identifier used only in the diagnostic message.
        target: string searched for ``pattern``.
        source: string the characters are taken from.
        source_idx_list: one entry per character of ``source``.
        pattern: substring to locate in ``target``.
        idxs: offsets, relative to the pattern start, of the wanted characters.

    Returns:
        tuple: ``(characters, positions)`` -- the picked characters joined
        into a string, and the corresponding ``source_idx_list`` entries.

    Raises:
        AssertionError: if ``source`` and ``source_idx_list`` differ in length.
        ValueError: if ``pattern`` does not occur in ``target`` (a diagnostic
            line is printed first).
        IndexError: if an offset points past the end of ``source``.
    """
    assert len(source) == len(source_idx_list)
    try:
        start = target.index(pattern)
    except ValueError:
        print('Problem at Id ', Id, ' pattern ', pattern, ' target ', target)
        # Without a start position nothing can be extracted; re-raise instead
        # of continuing with an unbound variable.
        raise
    ret = ''.join(source[start + idx] for idx in idxs)
    pos = [source_idx_list[start + idx] for idx in idxs]
    return ret, pos

In [11]:
#extractCharacters('gig', 'rty', 'ggui', [345,346,349,350], 't', [2])

In [12]:
def extract_sig(Id, top, bottom, idx_list):
    """Build a signature string from ``bottom`` at fixed offsets inside five motifs.

    Each motif string is located in ``top`` (via ``extractCharacters``) and
    the characters of ``bottom`` at the listed offsets from the motif start
    are collected, together with their ``idx_list`` entries. The pieces are
    concatenated in motif order.

    Args:
        Id: identifier, forwarded for diagnostic messages.
        top: string in which the motifs are searched.
        bottom: string the signature characters are taken from.
        idx_list: one entry per character of ``bottom``.

    Returns:
        tuple: ``(signature, positions)``. If any motif cannot be processed
        (not found, offsets out of range, length mismatch, ...), the result is
        ``('', [])`` -- partial signatures are never returned.
    """
    # (motif searched in `top`, offsets from the motif start to extract)
    motifs = (
        ("KGVmveHrnvvnlvkwl", [12, 15, 16]),
        ("LqfssAysFDaSvweifgaLLnGgt", [3, 8, 9, 10, 11, 12, 13, 14, 17]),
        ("iTvlnltPsl", [4, 5]),
        ("LrrvlvGGEaL", [4, 5, 6, 7, 8]),
        ("liNaYGPTEtTVcaTi", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]),
    )
    try:
        sig = ''
        pos = []
        for pattern, offsets in motifs:
            piece, piece_pos = extractCharacters(Id, top, bottom, idx_list, pattern, offsets)
            sig += piece
            pos += piece_pos
        return sig, pos
    except Exception:
        # Best effort: an unprocessable alignment yields an empty signature.
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return '', []
def extract_sig_dict(mydict):
    """Run ``extract_sig`` on every record and collect the results by Id.

    Args:
        mydict: ``{Id: record}`` where each record has 'top', 'bottom' and
            'idx_list' entries.

    Returns:
        dict: ``{Id: {'sig': signature, 'pos': positions}}``.
    """
    ret_dict = {}
    for Id, record in mydict.items():
        signature, positions = extract_sig(Id, record['top'], record['bottom'], record['idx_list'])
        ret_dict[Id] = {'sig': signature, 'pos': positions}
    return ret_dict

In [13]:
# Compute the signature string and its position list for every Id.
sig_dict = extract_sig_dict(a_align_dict_no_indel)

In [14]:
# Display the full mapping of Id -> {'sig', 'pos'}.
sig_dict

{'O30408_1|P': {'sig': 'LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC',
  'pos': [142,
   145,
   146,
   162,
   166,
   167,
   168,
   169,
   170,
   171,
   172,
   175,
   210,
   211,
   237,
   238,
   239,
   240,
   241,
   261,
   262,
   263,
   264,
   265,
   266,
   267,
   268,
   269,
   270,
   271,
   272,
   273,
   274,
   275]},
 'O30408_2|F': {'sig': 'SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY',
  'pos': [141,
   144,
   145,
   161,
   165,
   166,
   167,
   168,
   169,
   170,
   171,
   174,
   209,
   210,
   234,
   235,
   236,
   237,
   238,
   258,
   259,
   260,
   261,
   262,
   263,
   264,
   265,
   266,
   267,
   268,
   269,
   270,
   271,
   272]},
 'O30408_3|F': {'sig': 'LVFAFDASVWDGTLITAGSVNGYGPTESTVCATL',
  'pos': [142,
   145,
   146,
   162,
   166,
   167,
   168,
   169,
   170,
   171,
   172,
   175,
   210,
   211,
   231,
   232,
   233,
   234,
   235,
   252,
   253,
   254,
   255,
   256,
   257,
   258,
   259,
   260,
   261,
   262,
   263,


In [15]:
# Log the wall-clock time at which the run finishes.
print('the end at ', time.ctime())

the end at  Mon Sep 28 09:07:57 2020
