In [None]:
"""
Text-based parser for ProteinNet Records.
"""

__author__ = "Mohammed AlQuraishi"
__copyright__ = "Copyright 2019, Harvard Medical School"
__license__ = "MIT"

#!/usr/bin/python

# imports
import sys
import re
import numpy as np

# Number of coordinate lines read_record() consumes after a [TERTIARY] tag.
NUM_DIMENSIONS = 3

# Symbol -> integer-code lookup tables used with letter_to_num().
# Codes are stored as strings because letter_to_num() splices them into text
# before converting to int. Within each table the code is simply the position
# of the symbol in the string it is built from.
_aa_dict = {symbol: str(code) for code, symbol in enumerate('ACDEFGHIKLMNPQRSTVWY')}
_dssp_dict = {symbol: str(code) for code, symbol in enumerate('LHBEGITS')}
_mask_dict = {symbol: str(code) for code, symbol in enumerate('-+')}


def letter_to_num(string, dict_):
    """Convert a string of symbols into a list of ints.

    Every character of ``string`` that is a key of ``dict_`` is replaced by
    the mapped value (a string holding an integer) followed by a space. The
    resulting text is split on whitespace and each token is passed to ``int``.

    Args:
        string: Text to encode, one symbol per character.
        dict_: Mapping from single characters to integer codes stored as
            strings (for example ``_aa_dict``).

    Returns:
        A list of ints, one per whitespace-separated token after substitution.

    Raises:
        ValueError: if a token left after substitution is not a valid integer,
            e.g. because ``string`` holds a character missing from ``dict_``.
    """
    # Escape every key: without this a key such as '-', '^' or ']' is read as
    # character-class syntax (e.g. 'a-b' becomes a range) and is not matched.
    patt = re.compile('[' + ''.join(re.escape(key) for key in dict_) + ']')
    num_string = patt.sub(lambda m: dict_[m.group(0)] + ' ', string)
    return [int(token) for token in num_string.split()]

def read_record(file_, num_evo_entries):
    """Read the next text record from ``file_`` and return it as a dict.

    Lines are consumed until a blank line (end of record) or end of file.
    Each recognised section tag stores one entry in the result:

    * ``[ID]``           -> ``'id'``: the following line as a string.
    * ``[PRIMARY]``      -> ``'primary'``: following line encoded with ``_aa_dict``.
    * ``[EVOLUTIONARY]`` -> ``'evolutionary'``: ``num_evo_entries`` lines, each
      parsed into a list of floats.
    * ``[SECONDARY]``    -> ``'secondary'``: following line encoded with ``_dssp_dict``.
    * ``[TERTIARY]``     -> ``'tertiary'``: ``NUM_DIMENSIONS`` lines, each parsed
      into a list of floats.
    * ``[MASK]``         -> ``'mask'``: following line encoded with ``_mask_dict``.

    Lines that are neither a known tag nor blank are ignored.

    Args:
        file_: Open text file object positioned at the start of a record.
        num_evo_entries: Number of lines to read after an ``[EVOLUTIONARY]`` tag.

    Returns:
        The record as a dict, or ``None`` when the end of the file is reached
        before any section was read.

    Raises:
        ValueError: if a numeric line cannot be parsed, or an encoded line
            contains a symbol that is not in the corresponding table.
    """
    record = {}

    while True:
        next_line = file_.readline()

        if next_line == '':
            # End of file. A final record that is not followed by a blank
            # line is still returned instead of being silently dropped; the
            # next call then sees an empty record and returns None.
            return record if record else None

        # Strip only the line terminator. Slicing with [:-1] would cut a real
        # character from a last line that has no trailing newline.
        tag = next_line.rstrip('\r\n')

        if tag == '':
            # A blank line terminates the current record.
            return record
        elif tag == '[ID]':
            record['id'] = file_.readline().rstrip('\r\n')
        elif tag == '[PRIMARY]':
            record['primary'] = letter_to_num(file_.readline().rstrip('\r\n'), _aa_dict)
        elif tag == '[EVOLUTIONARY]':
            record['evolutionary'] = [
                [float(step) for step in file_.readline().split()]
                for _ in range(num_evo_entries)
            ]
        elif tag == '[SECONDARY]':
            record['secondary'] = letter_to_num(file_.readline().rstrip('\r\n'), _dssp_dict)
        elif tag == '[TERTIARY]':
            record['tertiary'] = [
                [float(coord) for coord in file_.readline().split()]
                for _ in range(NUM_DIMENSIONS)
            ]
        elif tag == '[MASK]':
            record['mask'] = letter_to_num(file_.readline().rstrip('\r\n'), _mask_dict)
        
        
    


In [None]:
def dict_to_protein_shape(dict_, max_len=700):
    """Turn a parsed record into a zero-padded per-residue feature matrix.

    One row is produced per entry of ``dict_['primary']``. Each row has 50
    columns:

    * columns 0-20:  one-hot encoding of the primary code (21 slots);
    * columns 21-28: eight zeros, a placeholder for a secondary-structure
      label that is never filled in here;
    * columns 29-49: ``dict_['evolutionary'][j][i]`` for ``j`` in 0..20.

    All-zero rows are appended until the matrix has ``max_len`` rows.
    Sequences longer than ``max_len`` are NOT truncated: the result then has
    ``len(dict_['primary'])`` rows and the caller has to filter on the shape.

    Args:
        dict_: Record with a ``'primary'`` list of integer codes and an
            ``'evolutionary'`` list of at least 21 rows, each holding at least
            one value per residue.
        max_len: Number of rows to pad to (default 700).

    Returns:
        ``numpy.ndarray`` of dtype float64 and shape
        ``(max(len(primary), max_len), 50)``.

    Raises:
        KeyError: if a required key is missing from ``dict_``.
        IndexError: if a primary code is outside the one-hot range or the
            evolutionary profile is shorter than the sequence.
    """
    num_primary_classes = 21
    num_secondary_classes = 8
    num_pssm_rows = 21
    feature_width = num_primary_classes + num_secondary_classes + num_pssm_rows

    protein = []
    for i, aa_index in enumerate(dict_['primary']):
        # primary structure, one-hot encoded
        prim_lab = [0] * num_primary_classes
        prim_lab[aa_index] = 1

        # secondary label placeholder (left as zeros)
        sec_lab = [0] * num_secondary_classes

        # PSSM column belonging to residue i
        pssm = [dict_['evolutionary'][j][i] for j in range(num_pssm_rows)]

        protein.append(prim_lab + sec_lab + pssm)

    # Pad with all-zero rows; a non-positive range leaves long inputs as is.
    protein.extend([0] * feature_width for _ in range(max_len - len(protein)))

    # A fixed float dtype keeps the result independent of the input (an empty
    # sequence used to give an int array); the reshape keeps it 2-D even when
    # there are no rows at all.
    return np.array(protein, dtype=float).reshape(-1, feature_width)

In [None]:
import gzip
def save_parti_dataset(proteins, caspName , index):
    """Stack feature matrices and write them as one gzip-compressed ``.npy`` file.

    The output path is ``caspName + "-" + str(index) + ".npy.gz"``. The shape
    of the stacked array is printed after writing.

    Args:
        proteins: Sequence of arrays, each assignable to a ``(700, 50)`` slot.
        caspName: Path prefix of the output file.
        index: Partition number appended to the prefix.

    Raises:
        ValueError: if an entry of ``proteins`` cannot be broadcast to
            ``(700, 50)``.
    """
    dataset = np.zeros((len(proteins), 700, 50))
    for i, protein in enumerate(proteins):
        dataset[i] = protein

    # The context manager closes the gzip stream even if np.save raises, so
    # no open handle is leaked.
    with gzip.GzipFile(caspName + "-" + str(index) + '.npy.gz', "w") as f:
        np.save(f, dataset)
    print(dataset.shape)

In [None]:
# Driver: parse a ProteinNet text file record by record, convert every record
# into a padded (700, 50) feature matrix and write the matrices to disk in
# fixed-size partitions via save_parti_dataset().
# In the original command-line script the two values below were taken from
# sys.argv[1] and sys.argv[2].
input_path = "casp12/training_30"
num_evo_entries = 21

proteins = []

i = 0                 # number of proteins accepted so far
datIndex = 0          # index of the next partition file
transkaya_conter = 0  # proteins collected for the current partition

# `with` guarantees the input file is closed even if parsing raises.
with open(input_path, 'r') as input_file:
    while True:

        # A partition is written once it holds 5278 proteins (the original
        # comment says this matches the size of the "transkaya" dataset).
        if transkaya_conter == 5278:
            print("saving dataset")
            save_parti_dataset(proteins, input_path, datIndex)
            proteins = []
            transkaya_conter = 0
            datIndex += 1

        # reading file
        dict_ = read_record(input_file, num_evo_entries)
        if dict_ is None:
            break

        protein = dict_to_protein_shape(dict_)

        # 700 is the maximum length; longer proteins come back with more
        # rows and are skipped.
        if protein.shape[0] == 700:
            # Progress output. Reporting here, on an accepted protein, avoids
            # printing the same count again for every skipped record.
            if i % 1000 == 0:
                print(i)
            proteins.append(protein)
            transkaya_conter += 1
            i += 1

# NOTE(review): proteins collected after the last full partition stay in
# `proteins` and are not written to disk - confirm that this is intended.

In [None]:
import mmap

# Scan the raw file for the byte string b'SECONDARY' and print 'true' if it
# occurs anywhere, 'false' otherwise. Original note: the secondary label is
# not present in the files.
with open('casp12/training_100', 'rb', 0) as file:
    with mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as s:
        print('true' if s.find(b'SECONDARY') != -1 else 'false')

In [None]:
# Count the total number of windows over the four partition files, so the
# size of the on-disk array is known before it is created.
nr_windows = 0
for i in range(4):
    print(i)
    # The context manager closes each gzip handle as soon as the partition is
    # loaded; previously every handle stayed open.
    with gzip.GzipFile( 'casp12/training_30-' + str(i) + 'window19Middle' +'.npy.gz', "r") as f:
        dataset_part = np.load(f)
    print(dataset_part.shape[0], " shape of dataset")
    nr_windows += dataset_part.shape[0]
    print("all togheter ", nr_windows)

    # Release the partition before loading the next one.
    del dataset_part

In [None]:
# Path of the raw file that backs the merged, disk-resident array.
filename = "casp12/proteinNet-all-windows-19-middle-30-thining"

# NOTE(review): the original cell carried the bare number 19609184 and the
# remark "all 19 partitions in 1 dataset" here; neither matches the shape
# below or the four partitions loaded elsewhere in this notebook - confirm.

# mode='w+' creates the file, or overwrites an existing one, for read/write.
fp = np.memmap(
    filename,
    mode='w+',
    dtype='float64',
    shape=(3763142, 19, 50),
)

In [None]:
# Copy every partition into its slice of the memory-mapped array `fp`.
# `start`/`stop` delimit the rows of the partition being written.
start = 0
stop = 0
for i in range(4):

    # The context manager closes each gzip handle once the partition has been
    # loaded; previously the handles were left open.
    with gzip.GzipFile( 'casp12/training_30-' + str(i) + 'window19Middle' +'.npy.gz', "r") as f:
        dataset_part = np.load(f)
    print(dataset_part.shape)

    stop += dataset_part.shape[0]
    print(start, " ", stop)
    fp[start:stop] = dataset_part
    start = stop
    # Release the partition before loading the next one.
    del dataset_part

In [None]:
# Write pending changes of the memory-mapped array to its file, then drop the
# name so the array has to be reopened before further use.
fp.flush()
del fp

In [None]:
# Reopen the backing file read-only (mode='r'). The raw file stores no shape
# or dtype, so both are repeated here exactly as used when it was created.
fp = np.memmap(filename, dtype='float64', mode='r', shape=(3763142, 19, 50))

In [None]:
# Sanity check: show the shape of the reopened array.
print(fp.shape)

In [None]:
# Sanity check: show the first entry along axis 0.
print(fp[0,])

In [None]:
# Export the merged array as a gzip-compressed .npy file. The context manager
# closes the stream even if np.save raises part-way through.
with gzip.GzipFile( 'casp12/proteinNet-all-windows-19-middle-30-thining' +'.npy.gz', "w") as g:
    np.save(g, fp)

In [None]:
import gzip
# Read the exported file back to check that it loads. The array is read in
# full from the stream, so the gzip handle can be closed right after loading.
with gzip.GzipFile( 'casp12/proteinNet-all-windows-19-middle-30-thining' +'.npy.gz', "r") as f:
    dataset_part = np.load(f)