In [1]:
import datetime
import itertools
import pandas as pd

%matplotlib inline

In [2]:
# 8-state -> 3-state secondary-structure label map, built per target class so
# the grouping is visible at a glance. Insertion order is G, H, I, B, E, T, S, C.
# NOTE(review): the letters look like DSSP-style codes (H = helix, E = strand,
# C = coil) -- confirm against transform_ss_txt_to_csv.py.
STATE_DD_8T3 = {
    state8: state3
    for state3, members in (('H', 'GHI'), ('E', 'BE'), ('C', 'TSC'))
    for state8 in members
}

In [3]:
# One-letter codes accepted as standard amino acids (20 entries, alphabetical).
STANDARD_AAS = list('ACDEFGHIKLMNPQRSTVWY')
# One-letter codes that get masked out of the sequences.
NON_STANDARD_AAS = ['B', 'J', 'O', 'U', 'X', 'Z']

def mask_unk_aa(seq):
    """Return `seq` with every character listed in NON_STANDARD_AAS replaced by '*'.

    All other characters are left untouched.
    """
    # Single-pass translation; the table is rebuilt on each call so the
    # function always reflects the current contents of NON_STANDARD_AAS.
    mask_table = str.maketrans(dict.fromkeys(NON_STANDARD_AAS, '*'))
    return seq.translate(mask_table)

In [4]:
def convert_8t3_state(sst, mapping=None):
    """Collapse an 8-state secondary-structure string to its 3-state form.

    Parameters
    ----------
    sst : str
        String of per-residue state codes.
    mapping : dict, optional
        Single-character source code -> replacement string. Defaults to the
        module-level STATE_DD_8T3 table.

    Returns
    -------
    str
        `sst` with every mapped character replaced; characters absent from
        the mapping pass through unchanged.
    """
    if mapping is None:
        mapping = STATE_DD_8T3
    # One translate() pass substitutes all characters simultaneously, so the
    # result cannot depend on dict iteration order the way chained
    # str.replace() calls can (e.g. {'A': 'B', 'B': 'C'} must map 'AB' -> 'BC').
    return sst.translate(str.maketrans(mapping))

### ss.csv.gz generation

The `ss.csv.gz` file is generated from the `ss.txt.gz` file using `transform_ss_txt_to_csv.py`.

In [7]:
%%time
adf = pd.read_csv('./raw_data/2023-01-22-ss.csv.gz')

CPU times: user 2.79 s, sys: 154 ms, total: 2.94 s
Wall time: 2.95 s


In [8]:
# Peek at the first record to see the column layout.
adf.iloc[:1]

Unnamed: 0,pdb_id,chain_code,seq,sst
0,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...


# Check amino acid characters

In [9]:
%time all_chars = set(itertools.chain(*adf.seq.values))

CPU times: user 1.1 s, sys: 5.71 ms, total: 1.1 s
Wall time: 1.1 s


In [10]:
# Characters present in the data that are not in STANDARD_AAS.
all_chars.difference(STANDARD_AAS)

{'B', 'O', 'U', 'X', 'Z'}

In [11]:
# Standard codes that never occur in the data.
set(STANDARD_AAS).difference(all_chars)

set()

Mask these non-standard amino acids (AAs) with `*`.

In [12]:
%time adf['seq'] = adf.seq.apply(mask_unk_aa).to_frame()

CPU times: user 244 ms, sys: 9.88 ms, total: 254 ms
Wall time: 253 ms


In [13]:
# make sure they are masked: after masking, '*' must be the only character
# left outside STANDARD_AAS
leftover_chars = set(itertools.chain.from_iterable(adf.seq.values)) - set(STANDARD_AAS)
assert leftover_chars == {'*'}, (
    "expected only '*' outside STANDARD_AAS, found: {}".format(sorted(leftover_chars))
)

In [14]:
# Rename the structure column to 'sst8'; rebinding replaces the in-place
# mutation.
adf = adf.rename(columns={'sst': 'sst8'})

# Create Q3 (3-state labels derived from the 8-state `sst8` column)

In [15]:
%time adf['sst3'] = adf['sst8'].apply(convert_8t3_state)

CPU times: user 759 ms, sys: 55.9 ms, total: 815 ms
Wall time: 814 ms


In [16]:
%time adf['len'] = adf.seq.apply(lambda s: len(s))

CPU times: user 143 ms, sys: 16.8 ms, total: 159 ms
Wall time: 159 ms


In [17]:
# Transposed single-row view: one line per column.
adf.iloc[:1].T

Unnamed: 0,0
pdb_id,101M
chain_code,A
seq,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
sst8,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...
sst3,CCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCHHHHHHCCC...
len,154


In [18]:
# Order rows by length, then pdb_id, then chain_code; rebinding replaces the
# in-place mutation.
adf = adf.sort_values(by=['len', 'pdb_id', 'chain_code'])

In [19]:
%time adf['has_nonstd_aa'] = adf.seq.apply(lambda s: '*' in s)

CPU times: user 91.2 ms, sys: 7.67 ms, total: 98.9 ms
Wall time: 96.1 ms


In [20]:
# First five rows of the frame.
adf.head(5)

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa
377,1A30,C,EDL,CBC,CEC,3,False
2336,1B05,B,KCK,CBC,CEC,3,False
2369,1B0H,B,KAK,CBC,CEC,3,False
2419,1B1H,B,KFK,CBC,CEC,3,False
2473,1B2H,B,KAK,CBC,CEC,3,False


In [21]:
%%time
adf.to_csv('../2023_01-22-ss.cleaned.csv', index=False)

CPU times: user 6.75 s, sys: 500 ms, total: 7.25 s
Wall time: 7.27 s
