# SeqMo-ID:

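This notebook searches a set of protein sequences (FASTA) for a user-defined sequence motif and reports, for each sequence, which motif positions of a chosen reference sequence are conserved.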

# Packages and settings

In [1]:
%matplotlib inline
import os
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(1, './src')

import reader

In [3]:
# Apply the seaborn grid theme first, then layer the matplotlib overrides on top of it.
sns.set(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': [10.0, 8.0],
    'axes.titlesize': 20,
    'axes.labelsize': 18,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'grid.linestyle': '-',
    'legend.fontsize': 14,
})
# Colours taken from the current axes property cycle.
colors = [entry['color'] for entry in plt.rcParams['axes.prop_cycle']]

# Get Data from NCBI

In [12]:
# TODO: add the bash (NCBI EDirect) commands used to download the sequences.

## Test data
`out.faa` is the FASTA file produced by the EDirect pipeline.

In [169]:
# Wrap the FASTA files in Reader objects from the project-local `reader` module (found via ./src).
# NOTE(review): the two constructor calls pass a different number of arguments (3 vs 2),
# and `hog1` is not used in the cells shown here — confirm against Reader's signature.
hog1 = reader.Reader('out.faa','test_hog1.faa','hog1')
ptp3 = reader.Reader('ptp3_out.faa','ptp3')
# get_data() is unpacked into three values; the cells below use `headers` and `seqs`.
tuples, headers, seqs = ptp3.get_data()

## Extract gene IDs and strain IDs from gene annotations

In [170]:
# Derive one gene ID and one strain ID per FASTA header.
# Gene ID: the first space-delimited token after the '>' marker.
geneID = [header.split('>')[1].split(' ')[0] for header in headers]
# Strain ID: the last space-delimited token before the first ']'.
strainID = [header.split(']')[0].split(' ')[-1] for header in headers]

# Identify conserved motifs

In [171]:
## USER INPUTS ##

# Motif to search for, as a compiled regular expression (here the literal 'SP').
mot2search = re.compile('SP')

# 0-based index into `seqs` of the sequence used as the reference for the conservation comparison.
ref_num = 0

In [199]:
# Define functions 
def searchmotif(motif, seq):
    """Find every non-overlapping occurrence of ``motif`` in ``seq``.

    Parameters
    ----------
    motif : compiled pattern or str
        A compiled regular expression (anything exposing ``finditer``) or a
        pattern string, which is compiled on the fly.
    seq : object
        The sequence to scan; it is converted with ``str()`` before matching.

    Returns
    -------
    list of (int, int, str)
        One ``(start, end, matched_text)`` tuple per hit, in order of
        appearance. Empty when the motif does not occur.
    """
    # Accept plain pattern strings as a convenience; compiled patterns pass through untouched.
    pattern = re.compile(motif) if isinstance(motif, str) else motif
    return [(hit.start(), hit.end(), hit.group()) for hit in pattern.finditer(str(seq))]

class Scoring:
    """Stateless helpers for comparing motif positions between sequences.

    Every method is a static method: none of them takes ``self``, so they can
    be called on the class (``Scoring.diffScore(hits)``) or on an instance.
    """

    @staticmethod
    def extract(list):
        """Return the first element of every item in ``list``."""
        # NOTE: the parameter name shadows the built-in ``list``; it is kept
        # so that existing keyword-argument callers continue to work.
        return [item[0] for item in list]

    @staticmethod
    def refPoint(search):
        """Return the first element (the start position) of every motif hit.

        ``search`` is a sequence of tuples whose first element is the start
        position, e.g. the output of ``searchmotif``.
        """
        return [item[0] for item in search]

    @staticmethod
    def diffScore(search):
        """Return the distance between each pair of consecutive motif hits.

        For hits starting at positions ``p0, p1, ..., pk`` the result is
        ``[p1 - p0, p2 - p1, ...]``; it is empty for fewer than two hits.
        """
        refPoints = Scoring.refPoint(search)
        return [later - earlier for earlier, later in zip(refPoints, refPoints[1:])]

    @staticmethod
    def anchorDict(refs, diffs):
        """Map each gap size to the pair of positions that bounds it.

        ``refs`` are positions and ``diffs`` the gaps between consecutive
        positions; the result is ``{gap: [position, next_position]}``.

        NOTE: the dictionary is keyed by gap size, so when two gaps are equal
        only the last pair with that gap is kept.
        """
        pairs = [[left, right] for left, right in zip(refs, refs[1:])]
        return dict(zip(diffs, pairs))

    @staticmethod
    def isConservedAt(testDict, anchorDict):
        """Return the anchor positions whose gap also occurs in ``testDict``.

        Both arguments are gap -> [position, next_position] mappings as built
        by ``Scoring.anchorDict``. For every gap present in both, the pair of
        positions stored in ``anchorDict`` is collected. The result is a list
        of unique positions, ordered as they appear in ``anchorDict``.
        """
        positions = []
        # Walk the anchor mapping (not a set intersection) so the output order
        # follows the anchor's own insertion order.
        for gap, pair in anchorDict.items():
            if gap in testDict:
                positions.extend(pair)
        # dict.fromkeys removes duplicates while preserving first-seen order.
        return list(dict.fromkeys(positions))

In [177]:
# For every sequence collect: motif start positions (ref), the gaps between
# consecutive motifs (diff), and the gap -> [start, next start] mapping (dicts).
diff, ref, dicts = [], [], []
for sequence in seqs:
    seq1Motif = searchmotif(mot2search, sequence)
    gaps = Scoring.diffScore(seq1Motif)
    positions = Scoring.refPoint(seq1Motif)
    diff.append(gaps)
    ref.append(positions)
    dicts.append(Scoring.anchorDict(positions, gaps))

# Create tables summarizing conserved motifs

In [200]:
# One row per sequence, identified by gene and strain.
detailed = {'Gene ID': geneID, 'Strain ID': strainID}
detailed_data_frame = pd.DataFrame.from_dict(detailed)

# Motif start positions of the user-selected reference sequence. Always index
# with ref_num (not a literal 0) so a different reference can be chosen.
ref_positions = ref[ref_num]

# conservation[i, j] is True when reference position j is reported as
# conserved in sequence i.
conservation = np.zeros((len(seqs), len(ref_positions)), dtype=bool)
new_motifs = []
for i in range(len(seqs)):
    conserved = Scoring.isConservedAt(dicts[i], dicts[ref_num])
    conserved_lookup = set(conserved)
    conservation[i] = [position in conserved_lookup for position in ref_positions]
    # Motifs found in this sequence minus the reference positions it shares.
    new_motifs.append(len(ref[i]) - len(conserved))

# One boolean column per reference motif position.
for j, position in enumerate(ref_positions):
    detailed_data_frame['Reference position ' + str(position)] = conservation[:, j]

detailed_data_frame['# of new motifs'] = new_motifs

detailed_data_frame.head()

Unnamed: 0,Gene ID,Strain ID,Reference position 52,Reference position 57,Reference position 66,Reference position 247,Reference position 271,Reference position 294,Reference position 296,Reference position 304,Reference position 310,# of new motifs
0,DAA07735.2,S288C,True,True,True,True,True,True,True,True,True,0
1,EEU06494.1,JAY291,True,True,True,True,True,True,True,True,True,0
2,EGA79204.1,Vin13,True,True,True,True,True,True,True,True,True,0
3,CAY79250.1,EC1118,True,True,True,True,True,True,True,True,True,0
4,GAA22908.1,7,True,True,True,True,True,True,True,True,True,0


[294, 296, 52, 57, 304, 310, 296, 304, 57, 66, 66, 247, 271, 294, 247, 271]

In [184]:
# Inspect the gap -> [start, next start] mapping built for the first sequence.
dicts[0]

{5: [52, 57],
 9: [57, 66],
 181: [66, 247],
 24: [247, 271],
 23: [271, 294],
 2: [294, 296],
 8: [296, 304],
 6: [304, 310]}