### Imports

In [2]:
#Imports
import re
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split
from vectorization import hash_vectorizer
from vectorization import vectorizer
from vectorization import term_freq_vectorizer

### Functions

In [3]:
#Functions

# One hot encoding ##############################

def onehote(sequence):
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
    seq2 = [mapping[i] for i in sequence]
    return np.eye(4,dtype=int)[seq2]
#One hot encoding on a list
def seq_encoding(list):
    return np.array([onehote(i) for i in list])
#Dataframe column tolist().
def toList(dataframe,column_name):
    return dataframe[column_name].tolist()

#################################################

# One hot encoding on chirality #################

#Get integers from chirality string
def toInt(chirality_string):
    buff = re.split(r"[();]",chirality_string)
    return [int(buff[1]),int(buff[2])]

#Get the chirality from the list of chirality strings
def getChirality(chirality_list):
    return np.array([toInt(i) for i in chirality_list])

#Onehot encode the chirality number
def onehot_chirality(number):
    return np.eye(12,dtype=int)[number]
#Onehot encode the chirality
def extract_chirality(chirality):
    return np.array([onehot_chirality(i) for i in chirality])
#Apply onehot on the chirality list
def apply_chirality(chirality_list):
    return np.array([onehot_chirality(i) for i in chirality_list])

#################################################

# Misc ##########################################

#Flatten the array
def Flatten(list):
    return np.array([list[i].flatten() for i in range(len(list))])

#Make a tuple with index and value from a list
def make_tuple(list):
    return [(i+1,list[i]) for i in range(len(list))]
#Make a tuple from two lists
def make_tuple2(list1,list2):
    if(len(list1) != len(list2)):
        print("Error: Lists are not the same length")
        return None
    return [(list1[i],list2[i]) for i in range(len(list1))]
#################################################

### Data Import

In [4]:
# Load the data
set_raw = pd.read_csv('../Data/training_set.csv', low_memory=False)

# Preprocess the data
# Chirality
chirality = getChirality(set_raw['Chirality'])
chirality = pd.DataFrame(chirality, columns=['m','n'])

In [5]:
# Chirality
chirality = getChirality(set_raw['Chirality'])
chirality = pd.DataFrame(chirality, columns=['m','n'])
# Change chiralities to parameter k = m-2n.
chirality['k'] = chirality['m'] - 2*chirality['n']
# Remove the chirality columns
chirality.drop(['m','n'], axis=1, inplace=True)


In [17]:
chirality.head(12)

Unnamed: 0,k
0,6
1,4
2,9
3,-2
4,-3
5,-9
6,1
7,-2
8,7
9,-8


### Count Vectorizer

In [6]:
# Transform
# Sequences
seq = vectorizer.transform(set_raw['Sequence'])
seq = pd.DataFrame(seq.toarray(), columns=vectorizer.get_feature_names_out())
# Merge the dataframes
set = pd.concat([seq,chirality,set_raw['Label']],axis=1)
set.sample(5)

Unnamed: 0,c,cc,ccc,cct,ct,ctc,ctt,t,tc,tcc,tct,tt,ttc,ttt,k,Label
269,2,0,0,0,2,0,2,10,2,0,2,7,2,4,-9,N
0,5,1,0,1,4,3,0,7,4,1,3,2,1,1,6,N
678,6,2,0,2,3,0,3,6,3,2,0,3,3,0,1,N
632,4,0,0,0,3,0,3,8,3,0,2,5,3,2,7,N
501,8,5,4,1,3,2,0,4,3,1,2,0,0,0,-8,N


### Hash Vectorizer
#### DO NOT RUN THIS CODE SNIPPET UNLESS YOU HAVE 16GB OF RAM

In [7]:
# Transform
# Sequences
seq_hash = hash_vectorizer.transform(set_raw['Sequence'])
seq_hash = pd.DataFrame(seq_hash.toarray())
# Remove columns with all zeros
seq_hash = seq_hash.loc[:, (seq_hash != 0).any(axis=0)]

In [8]:
# Merge the dataframes
set_hash = pd.concat([seq_hash,chirality,set_raw['Label']],axis=1)
set_hash.sample(5)

Unnamed: 0,468425,532659,533825,539482,552695,694262,723874,772663,801063,811211,862625,992385,1023064,1038787,k,Label
79,-0.081923,-0.409616,0.0,0.0,0.0,-0.163846,0.163846,-0.327693,-0.24577,0.0,-0.573462,0.409616,0.0,-0.327693,-2,N
561,-0.180334,-0.541002,0.090167,-0.270501,-0.090167,-0.360668,0.180334,0.0,0.0,0.180334,-0.541002,0.180334,0.180334,-0.180334,-8,N
498,-0.081923,-0.327693,0.0,0.0,0.0,-0.409616,0.081923,-0.163846,-0.163846,0.0,-0.655386,0.24577,0.327693,-0.24577,1,N
155,-0.05726,-0.05726,0.0,0.0,0.0,-0.572598,0.0,0.0,0.0,0.0,-0.629858,0.0,0.515339,-0.05726,11,N
927,-0.081379,-0.325515,0.081379,-0.244137,-0.081379,-0.488273,0.081379,0.0,0.0,0.162758,-0.651031,0.081379,0.325515,-0.081379,-2,N


### Tfidf Vectorizer

In [9]:
# Transform
# Sequences
seq_tfidf = term_freq_vectorizer.transform(set_raw['Sequence'])
seq_tfidf = pd.DataFrame(seq_tfidf.toarray(), columns=term_freq_vectorizer.get_feature_names_out())
# Merge the dataframes
set_tfidf = pd.concat([seq_tfidf,chirality,set_raw['Label']],axis=1)
set_tfidf.head()

Unnamed: 0,c,cc,ccc,cct,ct,ctc,ctt,t,tc,tcc,tct,tt,ttc,ttt,k,Label
0,0.371174,0.085089,0.0,0.09028,0.31172,0.371297,0.0,0.519644,0.302763,0.088509,0.418401,0.195654,0.099841,0.137737,6,N
1,0.371174,0.085089,0.0,0.09028,0.31172,0.371297,0.0,0.519644,0.302763,0.088509,0.418401,0.195654,0.099841,0.137737,4,N
2,0.371174,0.085089,0.0,0.09028,0.31172,0.371297,0.0,0.519644,0.302763,0.088509,0.418401,0.195654,0.099841,0.137737,9,N
3,0.371174,0.085089,0.0,0.09028,0.31172,0.371297,0.0,0.519644,0.302763,0.088509,0.418401,0.195654,0.099841,0.137737,-2,N
4,0.371174,0.085089,0.0,0.09028,0.31172,0.371297,0.0,0.519644,0.302763,0.088509,0.418401,0.195654,0.099841,0.137737,-3,N


### Model

In [10]:
def mymodel():
    model = lm.LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    return model

#### Count Vectorizer

In [11]:
# Train test split
X_train, X_test, Y_train, Y_test = train_test_split(set.drop('Label',axis=1),
                                                    set['Label'],
                                                    test_size=0.2,
                                                    random_state=42)
# Model training
model = mymodel()
model.fit(X_train,Y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

#### Hash Vectorizer

In [12]:
# Train test split
X_train_hash, X_test_hash, Y_train_hash, Y_test_hash = train_test_split(set_hash.drop('Label',axis=1),
                                                                                        set_hash['Label'],
                                                                                        test_size=0.2,
                                                                                        random_state=42)
# Model training
model_hash = mymodel()
model_hash.fit(X_train_hash,Y_train_hash)



LogisticRegression(max_iter=1000, multi_class='multinomial')

#### Tfidf Vectorizer

In [13]:
# Train test split
X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf = train_test_split(set_tfidf.drop('Label',axis=1),
                                                                                        set_tfidf['Label'],
                                                                                        test_size=0.2,
                                                                                        random_state=42)
# Model training
model_tfidf = mymodel()
model_tfidf.fit(X_train_tfidf,Y_train_tfidf)

LogisticRegression(max_iter=1000, multi_class='multinomial')