# Convert patterns into 1D arrays

Import modules

In [1]:
import pandas as pd
import numpy as np
import pickle

Load data files

In [4]:
# Input pickles holding the patterned train / test data.
train_in = '/homes/2472402/data/patterned_train_data.pkl'
test_in = '/homes/2472402/data/patterned_test_data.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
# The context managers close each file handle as soon as its load finishes.
with open(train_in, 'rb') as f:
    train_pats = pickle.load(f)
# test_pats is read by the "Repeat for test" cell further down, so it has to be loaded here.
with open(test_in, 'rb') as f:
    test_pats = pickle.load(f)

Define functions to transform patterns (which are multi-row dataframes) into single-row dataframes

In [5]:
# function to collapse rows in df into a single row dataframe. 
# this is a inner function, it should not be applied on patterns containing dssp information
def _linearize(df):
    ndf = df.T.unstack().to_frame().T
    ndf.columns = ndf.columns.swaplevel()
    return ndf

def linearize(labelled_pat):
    """Flatten a dssp-labelled pattern into a single-row dataframe.

    Wrapper around ``_linearize`` for patterns whose last column holds the
    dssp information. The label found at the centre row of the window becomes
    the prediction target (H, E, or - per the original author's note) and is
    stored in a trailing 'dssp' column of the returned single-row dataframe.
    """
    labels = labelled_pat.iloc[:, -1].tolist()  # dssp column as a plain list
    target = labels[len(labels) // 2]  # label at the centre of the window
    # drop the dssp column and renumber rows from 0 so that column names
    # are the same across all patterns
    features = labelled_pat.iloc[:, :-1].reset_index(drop=True)
    row = _linearize(features)  # unlabelled pattern as a single-row df
    row['dssp'] = target  # append the prediction target as the last column
    return row

Test this function with a generic multi-row df

In [5]:
# small 3x5 example frame: rows labelled 1-3, columns labelled A-E
df = pd.DataFrame(
    [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
    index=[1, 2, 3],
    columns=['A', 'B', 'C', 'D', 'E'],
)
df

Unnamed: 0,A,B,C,D,E
1,1,2,3,4,5
2,6,7,8,9,10
3,11,12,13,14,15


In [6]:
# Flatten the 3x5 example frame; the result should be one row of 15 values
# in row-major order, with (column label, row label) as column headers.
df1 = _linearize(df)
df1

Unnamed: 0_level_0,A,B,C,D,E,A,B,C,D,E,A,B,C,D,E
Unnamed: 0_level_1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15


Test inner function (_linearize) on a single pattern that has had its dssp column removed

In [7]:
# Drop the last (dssp) column by hand, then flatten with the inner helper.
# The row index is not reset here, so the second header level shows the pattern's own row labels.
unlabelled_pat = train_pats[1].iloc[:,:-1]
_linearize(unlabelled_pat)

Unnamed: 0_level_0,A,R,N,D,C,Q,E,G,H,I,...,L,K,M,F,P,S,T,W,Y,V
Unnamed: 0_level_1,1,1,1,1,1,1,1,1,1,1,...,17,17,17,17,17,17,17,17,17,17
0,,,,,,,,,,,...,0.047426,0.993307,0.047426,0.017986,0.017986,0.268941,0.047426,0.731059,0.731059,0.017986


Test wrapper function on a single pattern which has not had its dssp column removed

In [8]:
# Same pattern with its dssp column still attached: the wrapper resets the row
# index and appends the centre-of-window label as a trailing 'dssp' column.
labelled_pat = train_pats[1]
linearize(labelled_pat)

Unnamed: 0_level_0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,dssp
Unnamed: 0_level_1,0,0,0,0,0,0,0,0,0,0,...,16,16,16,16,16,16,16,16,16,Unnamed: 21_level_1
0,,,,,,,,,,,...,0.993307,0.047426,0.017986,0.017986,0.268941,0.047426,0.731059,0.731059,0.017986,-


Run the linearize function on all patterns in train and test datasets, and combine them into a single dataframe each

In [6]:
# Flatten every training pattern into its own single-row dataframe.
train_pat_list = [linearize(pat) for pat in train_pats]
# Stack the single-row frames into one dataframe (one row per pattern).
# Later cells (the shape check and the final pickle dump) read train_df,
# so it has to be built here for the notebook to run top to bottom.
train_df = pd.concat(train_pat_list, ignore_index=True)

KeyboardInterrupt: 

In [None]:
# save as input for layer 2 network
# Save the list of single-row frames as input for the layer 2 network.
# The context manager guarantees the file is flushed and closed after the dump.
with open('/homes/2472402/data/l2.input.pkl', 'wb') as f:
    pickle.dump(train_pat_list, f, protocol=pickle.HIGHEST_PROTOCOL)

Do some checks on training dataframe to make sure data is intact

In [10]:
# Row count should equal len(train_pats): one linearized row per input pattern.
train_df.shape

(218421, 341)

In [11]:
# Number of input patterns; compare against the row count of train_df.
len(train_pats)

218421

Repeat for test

In [12]:
# Flatten each test pattern into one row, then stack the rows into a single frame.
test_pat_list = list(map(linearize, test_pats))
test_df = pd.concat(objs=test_pat_list, ignore_index=True)

In [13]:
# Row count should equal len(test_pats): one linearized row per input pattern.
test_df.shape

(22734, 341)

In [14]:
# Number of input patterns; compare against the row count of test_df.
len(test_pats)

22734

Save training and testing data as pickle objects

In [15]:
# output locations for the linearized training and testing dataframes
train_out = '/homes/2472402/data/linearized_train_data.pkl'
test_out = '/homes/2472402/data/linearized_test_data.pkl'

# write each dataframe to its pickle file (training first, then testing)
for frame, out_path in ((train_df, train_out), (test_df, test_out)):
    with open(out_path, 'wb') as f:
        pickle.dump(frame, f, protocol=pickle.HIGHEST_PROTOCOL)