In [116]:
#Imports
import re
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split

In [58]:
# Read the raw training set into a DataFrame.
# The path is relative to the notebook's working directory.
set_raw = pd.read_csv(
    '../Data/training_set.csv',
    low_memory=False,
)

### Encoding and Writing into CSV

In [128]:
# NOTE: toList, extract_onehot, getChirality and apply_chirality are defined in
# the "Functions" cell further down in this notebook; that cell has to be
# executed before this one.

# Recognition sequences: pull the 'Sequence' column out as a list, then
# one-hot encode every sequence.
seq_raw = toList(set_raw, 'Sequence')
seq_encoded = extract_onehot(seq_raw)

# Chiralities: pull the 'Chirality' column out as a list of strings, parse each
# string into its two integers, then one-hot encode those integers.
chirality_raw = toList(set_raw, 'Chirality')
extracted_chirality = getChirality(chirality_raw)
chirality_encoded = apply_chirality(extracted_chirality)

In [110]:
# Turn each encoded collection into a {row position: value} dictionary so that
# every DataFrame cell holds the complete encoded value for that row.
seq_dict = toDictionary(seq_encoded)
chirality_dict = toDictionary(chirality_encoded)
label_dict = toDictionary(set_raw['Label'])

# Assemble the encoded dataset and write it next to the raw data.
encoded_dataset = pd.DataFrame(
    {
        'Sequence': seq_dict,
        'Chirality': chirality_dict,
        'Label': label_dict,
    }
)
encoded_dataset.to_csv('../Data/encoded_dataset.csv')

### Flat Dataset

In [None]:
# Collapse every per-row encoding into a single 1-D vector.
seq_enc_flat = Flatten(seq_encoded)
chirality_enc_flat = Flatten(chirality_encoded)

# Key the flattened vectors by row position.
seq_dict_flat = toDictionary(seq_enc_flat)
chirality_dict_flat = toDictionary(chirality_enc_flat)

# Assemble the flat dataset; the labels are taken straight from the raw frame.
encoded_dataset_flat = pd.DataFrame(
    {
        'Sequence': seq_dict_flat,
        'Chirality': chirality_dict_flat,
        'Label': set_raw['Label'],
    }
)

### Functions

In [142]:
#One hot encoding function
def onehote(sequence):
    """One-hot encode a nucleotide sequence.

    Each character of ``sequence`` must be one of ``A``, ``C``, ``G`` or ``T``
    (upper case); any other character raises ``KeyError``.

    Returns an integer array with one row per character and four columns,
    where column 0/1/2/3 marks A/C/G/T respectively.
    """
    base_to_column = {"A": 0, "C": 1, "G": 2, "T": 3}
    columns = []
    for base in sequence:
        columns.append(base_to_column[base])
    # Row k of the 4x4 identity matrix is the one-hot vector for column k.
    identity = np.eye(4, dtype=int)
    return identity[columns]

#Converts the dataframe column to a list
def toList(dataframe, column_name):
    """Return the values of ``dataframe[column_name]`` as a plain Python list."""
    column = dataframe[column_name]
    return column.tolist()

#Extracts the one hot encoded data from the sequence list
def extract_onehot(sequence_list):
    """One-hot encode every sequence in ``sequence_list``.

    Applies ``onehote`` to each element and packs the results into a single
    numpy array.
    """
    encoded = list(map(onehote, sequence_list))
    return np.array(encoded)

#Convert array to dictionary with indexes as keys
def toDictionary(list):
    """Map each position ``0 .. len(list) - 1`` to ``list[position]``.

    Elements are looked up with ``list[position]``, so the argument must
    support ``len()`` and integer subscripting.
    """
    by_position = {}
    for position in range(len(list)):
        by_position[position] = list[position]
    return by_position

#Get integers from chirality string
def toInt(chirality_string):
    """Parse the two integers out of a chirality string.

    The string is split on ``(``, ``)`` and ``;``; the second and third
    pieces are converted to ``int``. For example ``"(6;5)"`` splits into
    ``['', '6', '5', '']`` and yields ``[6, 5]``.
    """
    pieces = re.split(r"[();]", chirality_string)
    first = int(pieces[1])
    second = int(pieces[2])
    return [first, second]

#Get the chirality from the list of chirality strings
def getChirality(chirality_list):
    """Parse every chirality string in ``chirality_list`` with ``toInt``.

    Returns a list holding one ``toInt`` result per input string, in order.
    """
    return list(map(toInt, chirality_list))

#Onehot encode the chirality number
def onehot_chirality(number):
    """One-hot encode ``number`` against 12 classes.

    ``number`` is used to index the rows of a 12x12 integer identity matrix,
    so an integer gives one length-12 vector and a list of integers gives one
    row per entry.
    """
    identity = np.eye(12, dtype=int)
    return identity[number]
#Onehot encode the chirality
def extract_chirality(chirality):
    """Apply ``onehot_chirality`` to each element of ``chirality``.

    The per-element results are packed into a single numpy array.
    """
    encoded = list(map(onehot_chirality, chirality))
    return np.array(encoded)
#Apply onehot on the chirality list
def apply_chirality(chirality_list):
    """Apply ``onehot_chirality`` to each entry of ``chirality_list``.

    The per-entry results are packed into a single numpy array.
    """
    encoded = []
    for entry in chirality_list:
        encoded.append(onehot_chirality(entry))
    return np.array(encoded)
#Flatten
def Flatten(list):
    """Call ``.flatten()`` on every element of ``list``.

    Elements are looked up with ``list[position]`` for each position in
    ``range(len(list))``; the results are returned as a Python list.
    """
    flattened = []
    for position in range(len(list)):
        flattened.append(list[position].flatten())
    return flattened

### Model Definition

In [113]:
def mymodel():
    """Build the unfitted logistic-regression classifier.

    Returns a scikit-learn ``LogisticRegression`` configured with the
    ``lbfgs`` solver and at most 1000 iterations.

    The previous body also created an ``Adam`` optimizer and called
    ``model.compile(...)``. ``Adam`` is not imported anywhere in this notebook
    and scikit-learn estimators have no ``compile`` method, so the function
    could not run. The solver is chosen through the constructor arguments.
    """
    return lm.LogisticRegression(solver='lbfgs', max_iter=1000)

In [None]:
#Build the feature matrix
# The 'Sequence' and 'Chirality' columns of encoded_dataset hold one whole
# array per cell, which scikit-learn cannot turn into a 2-D numeric matrix.
# Use the flattened dataset instead and place each row's sequence vector and
# chirality vector side by side.
# Assumes every row has the same flattened length; np.stack raises otherwise.
features = np.hstack([
    np.stack(encoded_dataset_flat['Sequence'].tolist()),
    np.stack(encoded_dataset_flat['Chirality'].tolist()),
])
labels = encoded_dataset_flat['Label']
#Train test split
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
#Train the model
model = lm.LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, Y_train)