# Notebook to analyze the content of extracellular flexible loop 4 (EL4)

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the EL4 sequences from a FASTA-style file: lines starting with '>' are
# headers, every other non-blank line is treated as one sequence.
with open('data/EL4.fasta', 'r') as file:
    file_content = file.read()

# Drop blank lines up front: a trailing newline at the end of the file would
# otherwise leave an empty string in `lines`, and testing its first character
# would raise IndexError.
lines = [line for line in file_content.splitlines() if line.strip()]

# Shorten each header to characters 1-2 plus everything from position 13 onward.
# NOTE(review): the slice offsets assume a fixed-width prefix in the header
# lines of this particular file -- confirm against data/EL4.fasta.
headers = [h[1:3] + h[13:] for h in lines if h.startswith('>')]
sequences = [s for s in lines if not s.startswith('>')]

# zip() silently truncates to the shorter list, so a sequence wrapped over
# several lines (or a header without a sequence) would mispair records
# without any visible error. Fail loudly instead.
assert len(headers) == len(sequences), (
    f'{len(headers)} headers but {len(sequences)} sequence lines: '
    'expected exactly one sequence line per header'
)

df = pd.DataFrame(list(zip(headers, sequences)), columns=['header', 'sequence'])
df

Unnamed: 0,header,sequence
0,P. blakesleeanus,CQLGKE--------C
1,R. oryzae,C---------TGSNC
2,S. japonicus,CHYTTSDAINRPSGC
3,S. pombe,CDYQAGAAINYPPGC
4,S. octosporus,CEYQASGAVNYPSGC
...,...,...
221,S. brasiliensis,CEYNTNVPITFPTGC
222,C. thermophilum,CEYNRDVPIKFPTRC
223,C. globosum,CEYNPDVPITFPTGC
224,T. terrestris,CNYNRDVPITFPTGC


In [3]:
# Check that the loop is delimited by the two conserved cysteines for all manually selected sequences.
# NOTE(review): this comment originally said "ECL2", while the notebook title and
# the input file both refer to EL4 -- confirm which loop name is intended.
# Vectorised string test: unlike indexing x[0] / x[-1], it gives False rather
# than an IndexError for an empty sequence; na=False keeps the column strictly boolean.
df['C_both_sides'] = (
    df.sequence.str.startswith('C', na=False)
    & df.sequence.str.endswith('C', na=False)
)
# Show the rows that fail the check; an empty frame means every sequence passed.
df[~df.C_both_sides]

Unnamed: 0,header,sequence,C_both_sides


In [4]:
# Number of residues per sequence: gap characters ('-') are removed before
# counting, so the value includes the two flanking cysteines.
df['length'] = df['sequence'].map(lambda seq: len(seq.replace('-', '')))
df

Unnamed: 0,header,sequence,C_both_sides,length
0,P. blakesleeanus,CQLGKE--------C,True,7
1,R. oryzae,C---------TGSNC,True,6
2,S. japonicus,CHYTTSDAINRPSGC,True,15
3,S. pombe,CDYQAGAAINYPPGC,True,15
4,S. octosporus,CEYQASGAVNYPSGC,True,15
...,...,...,...,...
221,S. brasiliensis,CEYNTNVPITFPTGC,True,15
222,C. thermophilum,CEYNRDVPIKFPTRC,True,15
223,C. globosum,CEYNPDVPITFPTGC,True,15
224,T. terrestris,CNYNRDVPITFPTGC,True,15


In [5]:
# Per-sequence counts of the residues R and K, plus their combined total.
loop_sequences = df['sequence']
df['R_count'] = loop_sequences.str.count('R')
df['K_count'] = loop_sequences.str.count('K')
df['RK_count'] = df['R_count'] + df['K_count']
del loop_sequences  # keep the notebook namespace unchanged
df

Unnamed: 0,header,sequence,C_both_sides,length,R_count,K_count,RK_count
0,P. blakesleeanus,CQLGKE--------C,True,7,0,1,1
1,R. oryzae,C---------TGSNC,True,6,0,0,0
2,S. japonicus,CHYTTSDAINRPSGC,True,15,1,0,1
3,S. pombe,CDYQAGAAINYPPGC,True,15,0,0,0
4,S. octosporus,CEYQASGAVNYPSGC,True,15,0,0,0
...,...,...,...,...,...,...,...
221,S. brasiliensis,CEYNTNVPITFPTGC,True,15,0,0,0
222,C. thermophilum,CEYNRDVPIKFPTRC,True,15,2,1,3
223,C. globosum,CEYNPDVPITFPTGC,True,15,0,0,0
224,T. terrestris,CNYNRDVPITFPTGC,True,15,1,0,1


In [11]:
# Sequences that contain neither R nor K.
df.loc[df['RK_count'].eq(0)]

Unnamed: 0,header,sequence,C_both_sides,length,R_count,K_count,RK_count
1,R. oryzae,C---------TGSNC,True,6,0,0,0
3,S. pombe,CDYQAGAAINYPPGC,True,15,0,0,0
4,S. octosporus,CEYQASGAVNYPSGC,True,15,0,0,0
6,R. oryzae,CEYNADAPEDTPEGC,True,15,0,0,0
7,P. blakesleeanus,CEYNADAPPDIPAGC,True,15,0,0,0
18,U. maydis,CTTNSEYIVG-TGGC,True,14,0,0,0
69,R. sp.. NJR-2017. BBW,CDYNTNVPITYPTGC,True,15,0,0,0
70,R. sp.. NJR-2017. WRK4,CDYNTNVPITYPTGC,True,15,0,0,0
71,S. borealis,CTYNTNVPITYPTGC,True,15,0,0,0
80,E. necator,CNYNVNVPITFPTGC,True,15,0,0,0


In [12]:
# Number of sequences with a non-zero combined R/K count.
df['RK_count'].ne(0).sum()

197