In [3]:
import datetime
import itertools
import pandas as pd

%matplotlib inline

In [4]:
# Lookup table that collapses the 8-letter secondary-structure alphabet into
# the 3 letters H, E and C (letter -> reduced letter).
# NOTE(review): the letters look like DSSP state codes -- confirm with the data source.
STATE_DD_8T3 = {
    **dict.fromkeys('GHI', 'H'),  # G, H, I -> H
    **dict.fromkeys('BE', 'E'),   # B, E    -> E
    **dict.fromkeys('TSC', 'C'),  # T, S, C -> C
}

In [5]:
# One-letter codes treated as standard amino acids (20 letters, alphabetical).
STANDARD_AAS = list('ACDEFGHIKLMNPQRSTVWY')
# One-letter codes that mask_unk_aa replaces with '*'.
NON_STANDARD_AAS = ['B', 'J', 'O', 'U', 'X', 'Z']


def mask_unk_aa(seq):
    """Return ``seq`` with every letter in NON_STANDARD_AAS replaced by '*'.

    Characters that are not listed in NON_STANDARD_AAS are left unchanged.
    """
    masking_table = str.maketrans(dict.fromkeys(NON_STANDARD_AAS, '*'))
    return seq.translate(masking_table)

In [6]:
def convert_8t3_state(sst):
    """Translate every letter of ``sst`` through the STATE_DD_8T3 lookup table.

    Letters that are not keys of STATE_DD_8T3 are passed through unchanged.
    """
    eight_to_three = str.maketrans(STATE_DD_8T3)
    return sst.translate(eight_to_three)

In [8]:
%%time
adf = pd.read_csv('./raw_data/2022_08_03_ss.csv')

CPU times: user 2.01 s, sys: 155 ms, total: 2.16 s
Wall time: 2.16 s


In [9]:
# Show only the first record to inspect the column layout.
adf.iloc[:1]

Unnamed: 0,pdb_id,chain_code,seq,sst
0,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...


# Check amino acid characters

In [10]:
%time all_chars = set(itertools.chain(*adf.seq.values))

CPU times: user 1.29 s, sys: 5.8 ms, total: 1.3 s
Wall time: 1.3 s


In [11]:
# Characters present in the data that are not standard amino-acid codes.
all_chars.difference(STANDARD_AAS)

{'B', 'O', 'U', 'X', 'Z'}

In [12]:
# Standard amino-acid codes that never occur in the data.
set(STANDARD_AAS).difference(all_chars)

set()

Mask these non-standard AAs with *

In [13]:
%time adf['seq'] = adf.seq.apply(mask_unk_aa).to_frame()

CPU times: user 249 ms, sys: 12.9 ms, total: 262 ms
Wall time: 258 ms


In [14]:
# Make sure they are masked: after masking, any character outside the
# standard alphabet must be the '*' placeholder. A subset check (instead of
# equality with {'*'}) also passes for input that had nothing to mask.
leftover_chars = {residue for sequence in adf['seq'].values for residue in sequence} - set(STANDARD_AAS)
assert leftover_chars <= {'*'}, 'unmasked characters remain: %s' % sorted(leftover_chars - {'*'})

In [15]:
# The raw 'sst' column becomes 'sst8'; the 3-letter variant is added as 'sst3' below.
adf = adf.rename(columns={'sst': 'sst8'})

# Create Q3 (3-state) labels from the 8-state labels

In [16]:
%time adf['sst3'] = adf['sst8'].apply(convert_8t3_state)

CPU times: user 712 ms, sys: 35.4 ms, total: 747 ms
Wall time: 774 ms


In [17]:
%time adf['len'] = adf.seq.apply(lambda s: len(s))

CPU times: user 149 ms, sys: 19.8 ms, total: 168 ms
Wall time: 166 ms


In [18]:
# First record, transposed so each column is shown on its own line.
adf.iloc[:1].transpose()

Unnamed: 0,0
pdb_id,101M
chain_code,A
seq,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
sst8,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...
sst3,CCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCHHHHHHCCC...
len,154


In [19]:
# Order rows by sequence length, then by pdb_id and chain_code.
adf = adf.sort_values(by=['len', 'pdb_id', 'chain_code'])

In [20]:
%time adf['has_nonstd_aa'] = adf.seq.apply(lambda s: '*' in s)

CPU times: user 88.5 ms, sys: 840 µs, total: 89.3 ms
Wall time: 87.9 ms


In [21]:
# First five rows after sorting (shortest sequences first).
adf.iloc[:5]

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa
377,1A30,C,EDL,CBC,CEC,3,False
2336,1B05,B,KCK,CBC,CEC,3,False
2369,1B0H,B,KAK,CBC,CEC,3,False
2419,1B1H,B,KFK,CBC,CEC,3,False
2473,1B2H,B,KAK,CBC,CEC,3,False


In [22]:
%%time
adf.to_csv('../2022_08_03-ss.cleaned.csv', index=False)

CPU times: user 6.24 s, sys: 375 ms, total: 6.61 s
Wall time: 6.7 s
