In [2]:
import pandas as pd
import numpy as np

In [3]:
# One-letter codes of the 20 standard amino acids, in alphabetical order.
# The position of a letter in this array is the column it occupies in the
# one-hot encoding produced further down.
UNIQUE_LETTERS = np.array(list('ACDEFGHIKLMNPQRSTVWY'))

In [4]:
# Sanity check: number of symbols in the encoding alphabet.
UNIQUE_LETTERS.size

20

In [5]:
def set_length_limit(df, length_limit=40):
    """Return the sequences whose length is at least ``length_limit``.

    A ``length`` column holding ``len(sequence)`` is added to the returned
    frame. The frame passed in is not modified. Summary statistics
    (``describe()``) are printed before and after the filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sequence`` column whose values support ``len``.
    length_limit : int, default 40
        Minimum length (inclusive) a sequence needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``length >= length_limit``, including the
        new ``length`` column.
    """
    # assign() builds a new frame instead of writing into ``df``: the caller's
    # frame stays untouched and no SettingWithCopyWarning is raised when ``df``
    # is itself the result of an earlier filter.
    df_with_length = df.assign(length=df['sequence'].map(len))
    print(f'Before setting length limit of {length_limit}: ')
    print(df_with_length.describe())
    print()
    # The comparison is inclusive: sequences of exactly ``length_limit`` stay.
    df_kept = df_with_length.loc[df_with_length['length'] >= length_limit]
    print(f'After setting length limit of {length_limit}: ')
    print(df_kept.describe())
    return df_kept

In [6]:
# disordered
# Load the disordered sequences, discard rows without a sequence and
# remove exact duplicate rows.
df_disordered = (
    pd.read_csv("disordered_sequences.csv")
    .dropna(subset=['sequence'])
    .drop_duplicates()
)
len(df_disordered)

3776

In [7]:
# Keep only the disordered sequences with at least 40 characters
# (summary statistics are printed before and after the filter).
df_disordered = set_length_limit(df_disordered, length_limit=40)

Before setting length limit of 40: 
            length
count  3776.000000
mean     68.569121
std     116.190494
min       5.000000
25%      16.000000
50%      30.000000
75%      77.000000
max    2371.000000

After setting length limit of 40: 
            length
count  1534.000000
mean    140.189048
std     156.487541
min      40.000000
25%      61.000000
50%      93.000000
75%     167.000000
max    2371.000000


In [8]:
# ordered
# The export is split over two CSV files; stack them in the same order as
# before (df1 first, then df2), keeping each file's original row index.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
df1 = pd.read_csv("rcsb_pdb_sequence_9b20c2e6f4e2322c79be67683f6cf968_2501-3856.csv")
df2 = pd.read_csv("rcsb_pdb_sequence_9b20c2e6f4e2322c79be67683f6cf968_0001-2500.csv")
df_ordered = (
    pd.concat([df1, df2])
    .loc[:, ['Sequence']]                      # only the sequence column is used
    .rename(columns={'Sequence': 'sequence'})  # match the disordered frame's column name
    .dropna(subset=['sequence'])
    .drop_duplicates()
)
len(df_ordered)

3268

In [9]:
# Keep only the ordered sequences with at least 40 characters
# (summary statistics are printed before and after the filter).
df_ordered = set_length_limit(df_ordered, length_limit=40)

Before setting length limit of 40: 
            length
count  3268.000000
mean    417.743574
std     432.200299
min       4.000000
25%     187.000000
50%     315.000000
75%     505.000000
max    5037.000000

After setting length limit of 40: 
            length
count  3214.000000
mean    424.391724
std     432.734049
min      40.000000
25%     192.000000
50%     319.000000
75%     508.750000
max    5037.000000


In [10]:
def generate_sub_sequence(df, size=40, strides=10):
    """Cut every sequence in ``df`` into fixed-size sliding windows.

    For each value of ``df.sequence`` a window of ``size`` characters is
    taken at offsets 0, ``strides``, 2 * ``strides``, ... for as long as a
    full window fits. Sequences shorter than ``size`` contribute nothing.

    Returns a flat list with the windows of all sequences, in input order.
    """
    return [
        seq[offset: offset + size]
        for seq in df.sequence.values
        # number of full windows that fit in this sequence
        for offset in (strides * k for k in range((len(seq) - size) // strides + 1))
    ]

In [11]:
def label_binarizer(df, size=40, strides=10):
    """Multi-label binarize the sliding windows of every sequence in ``df``.

    Each value of ``df.sequence`` is cut into windows of ``size`` characters
    taken every ``strides`` characters, and each window is split into a list
    of single characters. The windows are passed to
    ``MultiLabelBinarizer.fit_transform``; the result is returned as a
    DataFrame with one row per window and one column per fitted class.

    NOTE(review): ``MultiLabelBinarizer`` is not imported in the visible
    cells -- presumably ``sklearn.preprocessing.MultiLabelBinarizer``;
    confirm it is in scope before calling this function.
    """
    # One row per window; the single cell of the row holds the list of characters.
    window_rows = [
        [list(seq[strides * k: strides * k + size])]
        for seq in df.sequence.values
        for k in range((len(seq) - size) // strides + 1)
    ]
    windows_df = pd.DataFrame(window_rows, columns=['sequence'])
    binarizer = MultiLabelBinarizer()
    encoded = binarizer.fit_transform(windows_df['sequence'])
    return pd.DataFrame(encoded, columns=binarizer.classes_, index=windows_df.index)

In [12]:
def one_hot_encoding(df, protein_type, size=40, strides=10):
    """One-hot encode fixed-size windows of the sequences in ``df``.

    The sequences are cut into windows with ``generate_sub_sequence`` and
    every character of a window is encoded against ``UNIQUE_LETTERS``.
    A character that is not in ``UNIQUE_LETTERS`` yields an all-zero row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sequence`` column.
    protein_type : str
        One of ``'ordered'`` (label 1) or ``'disordered'`` (label 0).
    size : int, default 40
        Window length.
    strides : int, default 10
        Offset between the starts of consecutive windows.

    Returns
    -------
    array_encoded : numpy.ndarray
        Float array of shape ``(num_windows, size, len(UNIQUE_LETTERS))``.
    labels : numpy.ndarray
        Integer array of length ``num_windows`` filled with the class label.

    Raises
    ------
    ValueError
        If ``protein_type`` is neither ``'ordered'`` nor ``'disordered'``.
    """
    # Validate first so a bad label fails immediately instead of after the
    # whole (potentially slow) encoding has already been computed.
    if protein_type == 'ordered':
        label_value = 1
    elif protein_type == 'disordered':
        label_value = 0
    else:
        raise ValueError(f"Invalid type: {protein_type}")

    lst_sequences = generate_sub_sequence(df, size=size, strides=strides)
    num_obs = len(lst_sequences)
    alphabet = np.asarray(UNIQUE_LETTERS)

    # zeros rather than empty: a position that is never written is then a
    # well-defined all-zero row instead of uninitialised memory.
    array_encoded = np.zeros((num_obs, size, len(alphabet)))
    for ix, sequence in enumerate(lst_sequences):
        letters = np.array(list(sequence), dtype=alphabet.dtype)
        # Broadcast (window_len, 1) against (alphabet_len,) to encode the
        # whole window in one comparison.
        array_encoded[ix, :len(letters)] = letters[:, None] == alphabet

    labels = np.full(num_obs, label_value, dtype=int)
    return array_encoded, labels

In [13]:
# Encode every window of the ordered sequences; these windows get label 1.
array_ordered, labels_ordered = one_hot_encoding(df_ordered, protein_type='ordered')

In [14]:
# (number of windows, window size, alphabet size)
np.shape(array_ordered)

(125289, 40, 20)

In [15]:
# Encode every window of the disordered sequences; these windows get label 0.
array_disordered, labels_disordered = one_hot_encoding(df_disordered, protein_type='disordered')

In [16]:
# (number of windows, window size, alphabet size)
np.shape(array_disordered)

(16239, 40, 20)

In [17]:
import pickle

In [18]:
# Persist the one-hot encoded ordered windows.
with open("array_ordered.pkl", mode="wb") as f_write:
    pickle.dump(array_ordered, f_write)

In [20]:
# Persist the labels that belong to the ordered windows.
with open("labels_ordered.pkl", mode="wb") as f_write:
    pickle.dump(labels_ordered, f_write)

In [19]:
# Persist the one-hot encoded disordered windows.
with open("array_disordered.pkl", mode="wb") as f_write:
    pickle.dump(array_disordered, f_write)

In [21]:
# Persist the labels that belong to the disordered windows.
with open("labels_disordered.pkl", mode="wb") as f_write:
    pickle.dump(labels_disordered, f_write)