# Python Processing Step for Multilabel FAIMS Data

### Packages

In [2]:
#Load all my packages
import pandas as pd
import numpy as np
import sklearn
import sklearn.ensemble
import skmultilearn
from scipy import sparse
import matplotlib.pyplot as plt
import torch
import tensorflow as tf
from skmultilearn.model_selection import  iterative_train_test_split

### Functions

In [3]:
#Defining the function for adding the pyteomic pieces
from pyteomics import mass
from pyteomics import parser
from pyteomics import electrochem


def addfeatures(featurestable, seqlabel = 'Sequence', charge_pH = 2.5):
    """Append pyteomics-derived columns to a peptide table.

    Each sequence in the ``seqlabel`` column is parsed with
    ``parser.parse(..., show_unmodified_termini=True)``. Three values are then
    computed per row: the mass of the parsed sequence, the charge of the
    parsed sequence at ``charge_pH``, and the pI of the raw sequence string.

    Parameters
    ----------
    featurestable : pandas.DataFrame
        Table with one peptide per row. It is modified in place.
    seqlabel : str
        Name of the column holding the peptide sequence.
    charge_pH : float
        pH handed to ``electrochem.charge``. Defaults to 2.5, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``featurestable`` object, with the columns 'pyMass', 'pI'
        and 'pyCharge' added (or overwritten).

    Raises
    ------
    KeyError
        If ``seqlabel`` is not a column of ``featurestable``.
    """
    Mass = list()
    pI = list()
    Charge = list()

    # Walk the column values in row order instead of looking rows up by the
    # labels 0..n-1; the label lookup fails on any frame whose index is not
    # the default RangeIndex (e.g. after filtering or sorting).
    for sequence in featurestable[seqlabel].tolist():
        ps = parser.parse(sequence, show_unmodified_termini=True)

        Mass.append(mass.calculate_mass(parsed_sequence=ps))
        Charge.append(electrochem.charge(ps, charge_pH))
        # pI is computed from the raw string, not the parsed sequence.
        pI.append(electrochem.pI(sequence))

    # List assignment is positional, so the new columns line up with the rows
    # in the order they were visited above.
    featurestable['pyMass'] = Mass
    featurestable['pI'] = pI
    featurestable['pyCharge'] = Charge

    return(featurestable)



#WANT TO TRY ONE-HOT WITH A LIST THAT I THEN CONVERT INTO A FRAME AFTERWARD
#WOULD ALSO ALLOW FOR THE USE OF THE KERAS PADDING FUNCTION SO THAT I CAN PAD THEM ALL WITH ZEROS AT THE SAME TIME
import numpy as np
from numpy import array
from numpy import argmax         #finds the index of the maximum value in a vector
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences

def simpleOneHot(data_frame, sequenceTag = 'ModSequence', alphabet = 'ACDEFGHIKLMNPQRSTVWY'):
    """One-hot encode every peptide sequence into a fixed-width, zero-padded row.

    Each character of a sequence is mapped to its position in ``alphabet`` and
    expanded with ``to_categorical`` into a vector of ``len(alphabet)`` entries.
    The per-residue vectors are flattened, then all rows are padded with
    trailing zeros by ``pad_sequences`` to a common width of
    ``max(data_frame['Length']) * len(alphabet)``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the ``sequenceTag`` column and a 'Length' column.
        NOTE(review): 'Length' is assumed to be at least the number of
        characters in each ``sequenceTag`` entry; longer entries would be
        truncated by ``pad_sequences`` - confirm against the input data.
    sequenceTag : str
        Name of the column holding the sequences to encode.
    alphabet : str
        Every symbol that may occur in a sequence; its order defines the
        one-hot column order.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per sequence and
        ``max(Length) * len(alphabet)`` columns.

    Raises
    ------
    KeyError
        If a sequence contains a character that is not in ``alphabet``.
    """
    # The one-hot width must be the alphabet size. It was previously fixed at
    # 22, which disagreed with the padded width whenever the alphabet was not
    # exactly 22 symbols long (including the 20-letter default).
    n_symbols = len(alphabet)

    # Width of the padded row: longest peptide times one-hot width.
    VEC_LENGTH = max(data_frame['Length']) * n_symbols

    # Map character keys to integer values (their position in the alphabet).
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))

    hotlist = list()

    # Iterate over the column values in row order rather than by the labels
    # 0..n-1, so frames without a default RangeIndex are handled too.
    for pep in data_frame[sequenceTag].tolist():
        integer_encode = [char_to_int[char] for char in pep]
        encoded = to_categorical(integer_encode, num_classes=n_symbols)
        hotlist.append(encoded.flatten())

    # Pad every flattened encoding with trailing zeros to the common width.
    padded = pad_sequences(hotlist, padding= 'post', maxlen=VEC_LENGTH)

    hotarray = np.array(padded)
    return(hotarray)

Using TensorFlow backend.


### Processing

In [5]:
#Read the final labelling-scheme table (generated by the R preprocessing) and append the pyteomics features
data_df = addfeatures(
    pd.read_csv(
        "D:/Projects/FAIMS_MachineLearning/2020/March/50percentMaxPlusThreshold.csv",
        low_memory=False,
    )
)

#One-hot encode the sequences using the 22-symbol alphabet (20 residues plus 'a' and 'm')
data_hotarray = simpleOneHot(data_frame=data_df, alphabet='ACDEFGHIKLMNPQRSTVWYam')

#X (features): the four numeric columns followed by the one-hot block
#y (labels): every column from 'X20' through 'X95'
feature_subset = ['Charge', 'Length', 'pyMass', 'pI']
X = np.hstack((data_df[feature_subset].to_numpy(), data_hotarray))
y = data_df.loc[:, 'X20':'X95'].to_numpy()

In [6]:
#Prepend each row's position in data_df as a key column, so rows of X can be matched back to data_df after the iterative train/test split
keys = np.arange(data_df.shape[0]).reshape(-1, 1)
X_keys = np.hstack((keys, X))

In [7]:
#Split the 50%+ threshold data into train and test keeping label distribution proportional
#The split is stochastic. Seed NumPy's global RNG first so the exported train/test files can be regenerated.
#NOTE(review): this relies on scikit-multilearn's iterative stratification drawing from np.random - confirm for the installed version.
np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(X_keys, y, test_size=0.30)
X_train.shape

(85991, 1105)

In [8]:
#Pulling out training data to allow for other thresholds to be applied
#Column 0 of X_train holds the row positions that were prepended as keys. They come back as floats,
#so cast them to int and select by position instead of relying on float labels matching the index.
train_rows = X_train[:, 0].astype(int)
fulltraining = pd.concat(
    (
        pd.DataFrame(y_train),
        data_df.iloc[train_rows][['Sequence', 'Charge', 'SeqCharge']].reset_index(drop=True),
        pd.DataFrame(X_train),
    ),
    axis=1,
)
fulltraining.to_csv("50percentplusTraining.csv")

In [9]:
#Pulling out test data to test those other thresholds
#Column 0 of X_test holds the row positions that were prepended as keys. They come back as floats,
#so cast them to int and select by position instead of relying on float labels matching the index.
test_rows = X_test[:, 0].astype(int)
fulltesting = pd.concat(
    (
        pd.DataFrame(y_test),
        data_df.iloc[test_rows][['Sequence', 'Charge', 'SeqCharge']].reset_index(drop=True),
        pd.DataFrame(X_test),
    ),
    axis=1,
)
fulltesting.to_csv("50percentplusTesting.csv")