In [2]:
#Imports
import re

import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split
#The package is spelled "sklearn"; "sklear.preprocessing" raises ModuleNotFoundError
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Read the raw training set from disk into a dataframe
set_raw = pd.read_csv(
    '../Data/training_set.csv',
    low_memory=False,
)

### Encoding and Writing into CSV

In [7]:
#NOTE: toList, extract_onehot, getChirality and apply_chirality are defined in
#the Functions section further down; that cell has to be run before this one.
#Raw columns as plain Python lists
seq_raw = toList(set_raw,'Sequence')
chirality_raw = toList(set_raw,'Chirality')
#Recognition sequences: one-hot encode every sequence
seq_encoded = extract_onehot(seq_raw)
#Chiralities: parse the integer pair out of each string, then one-hot encode it
extracted_chirality = getChirality(chirality_raw)
chirality_encoded = apply_chirality(extracted_chirality)

In [8]:
#Row-index -> value dictionaries, one per column
seq_dict = toDictionary(seq_encoded)
chirality_dict = toDictionary(chirality_encoded)
label_dict = toDictionary(set_raw['Label'])
#Assemble the encoded dataframe from the three dictionaries
encoded_dataset = pd.DataFrame(
    {
        'Sequence': seq_dict,
        'Chirality': chirality_dict,
        'Label': label_dict,
    }
)
#Persist the encoded dataset next to the raw data
encoded_dataset.to_csv('../Data/encoded_dataset.csv')

### Binarized Dataset

In [98]:
#Encoded sequences and chiralities as strings
#The flattened encodings are computed here so that this cell does not depend on
#variables that are only assigned in a later cell (hidden-state dependency).
seq_enc_flat = Flatten(seq_encoded)
chirality_enc_flat = Flatten(chirality_encoded)
#One string of digits per sample
string_seq_enc = array_to_string(seq_enc_flat)
string_chirality_enc = array_to_string(chirality_enc_flat)
string_dataset = pd.DataFrame({'Sequence':toDictionary(string_seq_enc),'Chirality':toDictionary(string_chirality_enc),'Label':label_dict})
#Show a random sample of rows as the cell output
string_dataset.sample(10)

Unnamed: 0,Sequence,Chirality,Label
936,010001000100010000010100010000010100010001000100,10001000000000,N
795,010001000001000100010001000100010001000101000100,1000000001000000,N
258,000100010001000101000100010001000001000100010001,10000000100000000,N
621,010000010001000100010100010000010001000100010100,1000000000001000,N
339,000100010100000101000001000101000001010000010001,1000000001000000,N
805,010001000001000100010100010000010001000101000100,10000100000000,N
808,010001000001000100010100010000010001000101000100,100000000100000,N
270,000100010001010000010001000100010100000100010001,10000000100000000,N
565,000101000100010000010100010000010100010001000001,10000100000000,N
269,000100010001010000010001000100010100000100010001,100000000000100,N


### Flat Dataset

In [11]:
#Flatten every per-sample encoding
seq_enc_flat = Flatten(seq_encoded)
chirality_enc_flat = Flatten(chirality_encoded)
#Row-index -> flat encoding dictionaries
seq_dict_flat = toDictionary(seq_enc_flat)
chirality_dict_flat = toDictionary(chirality_enc_flat)
#Assemble the flat dataframe; labels are taken straight from the raw frame
encoded_dataset_flat = pd.DataFrame(
    {
        'Sequence': seq_dict_flat,
        'Chirality': chirality_dict_flat,
        'Label': set_raw['Label'],
    }
)

Unnamed: 0,Sequence,Chirality,Label
914,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",N
637,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",N
884,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ...",N
229,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",N
365,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",N


### Functions

In [92]:
#One hot encoding of a single sequence
def onehote(sequence):
    """One-hot encode a sequence over the alphabet A, C, G, T.

    Each character is mapped to a column index (A=0, C=1, G=2, T=3) and the
    matching row of a 4x4 integer identity matrix is selected.

    Parameters:
        sequence: iterable of the characters "A", "C", "G", "T".

    Returns:
        Integer numpy array with one row of length 4 per character.

    Raises:
        KeyError: if a character is not one of A, C, G, T.
    """
    base_to_column = dict(zip("ACGT", range(4)))
    columns = []
    for base in sequence:
        columns.append(base_to_column[base])
    identity = np.eye(4, dtype=int)
    return identity[columns]

#Dataframe column -> plain list
def toList(dataframe,column_name):
    """Return the values of one dataframe column as a list.

    Parameters:
        dataframe: object indexable by column_name whose result has .tolist().
        column_name: key of the column to extract.

    Returns:
        The result of calling .tolist() on the selected column.
    """
    column = dataframe[column_name]
    values = column.tolist()
    return values

#One-hot encode every sequence in a list
def extract_onehot(sequence_list):
    """Apply onehote to each sequence and collect the results in a numpy array.

    Parameters:
        sequence_list: iterable of sequences accepted by onehote.

    Returns:
        numpy array built from the per-sequence encodings, in input order.
    """
    #NOTE(review): presumably all sequences share one length so the result
    #stacks into a single regular array - confirm against the data
    encoded = list(map(onehote, sequence_list))
    return np.array(encoded)

#Sequence -> dictionary keyed by position
def toDictionary(list):
    """Build a dictionary mapping 0..len(list)-1 to list[i].

    Parameters:
        list: object supporting len() and lookup with integer keys.

    Returns:
        dict whose keys are the integers 0..len(list)-1, in order.
    """
    size = len(list)
    result = {}
    for position in range(size):
        result[position] = list[position]
    return result

#Parse the two integers out of a chirality string
def toInt(chirality_string):
    """Split a chirality string on "(", ")" and ";" and return two integers.

    The second and third pieces of the split are converted with int(), so a
    string shaped like "(6;5)" yields [6, 5].

    Parameters:
        chirality_string: string to split.

    Returns:
        List of two ints.
    """
    pieces = re.split(r"[();]",chirality_string)
    #pieces[0] is whatever precedes the first delimiter, so the numbers sit at 1 and 2
    return [int(pieces[position]) for position in (1, 2)]

#Parse every chirality string in a list
def getChirality(chirality_list):
    """Apply toInt to each chirality string.

    Parameters:
        chirality_list: iterable of strings accepted by toInt.

    Returns:
        List with one toInt result per input string, in input order.
    """
    return list(map(toInt, chirality_list))

#Onehot encode the chirality number
def onehot_chirality(number, num_classes=12):
    """One-hot encode a chirality index (or a sequence of indices).

    Rows of a num_classes x num_classes integer identity matrix are selected
    with `number`, so a single integer gives one vector of length num_classes
    and a list of integers gives one row per integer.

    Parameters:
        number: integer index, or list/array of integer indices.
        num_classes: length of each one-hot vector. Defaults to 12, the size
            that was previously hard-coded; raise it for larger indices.

    Returns:
        Integer numpy array of one-hot rows.

    Raises:
        IndexError: if an index is out of range for num_classes.
    """
    return np.eye(num_classes,dtype=int)[number]
#Onehot encode each entry of a chirality
def extract_chirality(chirality):
    """Apply onehot_chirality to every element and stack the results.

    Parameters:
        chirality: iterable of values accepted by onehot_chirality.

    Returns:
        numpy array built from the per-element encodings, in input order.
    """
    encoded = []
    for entry in chirality:
        encoded.append(onehot_chirality(entry))
    return np.array(encoded)
#Onehot encode every chirality in a list
def apply_chirality(chirality_list):
    """Apply onehot_chirality to every item of chirality_list.

    Parameters:
        chirality_list: iterable of values accepted by onehot_chirality.

    Returns:
        numpy array built from the per-item encodings, in input order.
    """
    return np.array(list(map(onehot_chirality, chirality_list)))
#Flatten each element of a collection
def Flatten(list):
    """Call .flatten() on every element and return the results as a list.

    Parameters:
        list: object supporting len() and integer indexing whose elements
            provide a .flatten() method.

    Returns:
        Python list with one flattened element per input element.
    """
    flattened = []
    for position in range(len(list)):
        flattened.append(list[position].flatten())
    return flattened
#Int array to string
def int_to_string(int_array):
    """Concatenate the integers of an array into one string of digits.

    The digits are joined directly instead of going through np.array2string,
    which wraps its output at 75 characters and abbreviates arrays with more
    than 1000 elements using "...". Both would corrupt longer encodings.

    Parameters:
        int_array: array-like of integers (e.g. a flattened one-hot encoding).
            Multi-dimensional input is read in row-major order.

    Returns:
        String with one decimal representation per element and no separators,
        brackets or whitespace; "" for empty input.
    """
    flat = np.asarray(int_array).ravel()
    return ''.join(str(int(value)) for value in flat)
def array_to_string(int_array):
    """Convert every element of int_array to a string with int_to_string.

    Parameters:
        int_array: iterable of arrays accepted by int_to_string.

    Returns:
        numpy array holding one string per input element, in input order.
    """
    strings = []
    for row in int_array:
        strings.append(int_to_string(row))
    return np.array(strings)
#Binary strings -> integers
def binarize(string_array):
    """Parse each string as a base-2 number.

    Parameters:
        string_array: iterable of strings made of binary digits.

    Returns:
        numpy array with one parsed integer per input string.

    Raises:
        ValueError: if a string is not a valid base-2 literal.
    """
    values = []
    for bits in string_array:
        values.append(int(bits, base=2))
    return np.array(values)

### Model Definition