### Imports

In [1]:
#Imports
import re
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split
from vectorization import hash_vectorizer
from vectorization import vectorizer
from vectorization import term_freq_vectorizer

### Functions

In [2]:
#Functions

# One hot encoding ##############################

def onehote(sequence):
    """One-hot encode a nucleotide sequence.

    Every character of ``sequence`` is looked up in a fixed
    A/C/G/T -> 0/1/2/3 table and replaced by the matching row of a
    4x4 integer identity matrix.

    Parameters
    ----------
    sequence : iterable of str
        Characters to encode; only uppercase "A", "C", "G" and "T" are known.

    Returns
    -------
    numpy.ndarray
        Integer array with one length-4 one-hot row per input character.

    Raises
    ------
    KeyError
        If a character is not one of "A", "C", "G", "T".
    """
    base_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    identity = np.eye(4, dtype=int)
    # Translate every base to its row number, then pick those identity rows
    row_numbers = list(map(base_index.__getitem__, sequence))
    return identity[row_numbers]
#One hot encoding on a list
def seq_encoding(list):
    """Apply ``onehote`` to every sequence in ``list`` and stack the results.

    Parameters
    ----------
    list : iterable
        Sequences to encode, each one passed unchanged to ``onehote``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the per-sequence encodings, in input order.
    """
    encoded = []
    for sequence in list:
        encoded.append(onehote(sequence))
    return np.array(encoded)
#Dataframe column tolist().
def toList(dataframe,column_name):
    """Return the values of one DataFrame column as a plain Python list.

    Parameters
    ----------
    dataframe : object indexable by ``column_name``
        Typically a pandas DataFrame.
    column_name : hashable
        Label of the column to extract.

    Returns
    -------
    list
        Result of calling ``.tolist()`` on the selected column.
    """
    column = dataframe[column_name]
    return column.tolist()

#################################################

# One hot encoding on chirality #################

#Get integers from chirality string
def toInt(chirality_string):
    """Extract two integers from a chirality string.

    The string is split on "(", ")" and ";", and the second and third
    pieces are converted to ``int`` (e.g. "(6;5)" -> [6, 5]).

    Parameters
    ----------
    chirality_string : str
        Text to parse.

    Returns
    -------
    list of int
        ``[first, second]`` in the order they appear in the string.

    Raises
    ------
    IndexError
        If the split produces fewer than three pieces.
    ValueError
        If a selected piece is not a valid integer literal.
    """
    pieces = re.split(r"[();]",chirality_string)
    # Convert one piece at a time, left to right
    first = int(pieces[1])
    second = int(pieces[2])
    return [first, second]

#Get the chirality from the list of chirality strings
def getChirality(chirality_list):
    """Parse every chirality string in ``chirality_list`` with ``toInt``.

    Parameters
    ----------
    chirality_list : iterable of str
        Chirality strings, each handed to ``toInt``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the two-integer lists returned by ``toInt``,
        one row per input string.
    """
    parsed = []
    for chirality_string in chirality_list:
        parsed.append(toInt(chirality_string))
    return np.array(parsed)

#Onehot encode the chirality number
def onehot_chirality(number, num_classes=12):
    """One-hot encode a chirality number.

    Selects row(s) ``number`` of a ``num_classes`` x ``num_classes``
    integer identity matrix.

    Parameters
    ----------
    number : int or array-like of int
        Index (or indices) to encode. An array of indices yields one
        one-hot row per index.
    num_classes : int, optional
        Length of each one-hot vector. Defaults to 12, the size that was
        previously hard-coded, so existing single-argument calls are
        unaffected.

    Returns
    -------
    numpy.ndarray
        Integer one-hot vector(s) of length ``num_classes``.

    Raises
    ------
    IndexError
        If ``number`` is outside the valid index range for ``num_classes``.

    Notes
    -----
    Standard NumPy indexing applies, so a negative ``number`` counts from
    the end instead of raising.
    """
    return np.eye(num_classes, dtype=int)[number]
#Onehot encode the chirality
def extract_chirality(chirality):
    """One-hot encode each element of ``chirality`` with ``onehot_chirality``.

    Parameters
    ----------
    chirality : iterable
        Items to encode; each is passed unchanged to ``onehot_chirality``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-item encodings, in input order.
    """
    return np.array(list(map(onehot_chirality, chirality)))
#Apply onehot on the chirality list
def apply_chirality(chirality_list):
    """Run ``onehot_chirality`` over ``chirality_list`` and stack the results.

    Parameters
    ----------
    chirality_list : iterable
        Items to encode; each is passed unchanged to ``onehot_chirality``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-item encodings, in input order.
    """
    encoded = []
    for entry in chirality_list:
        encoded.append(onehot_chirality(entry))
    return np.array(encoded)

#################################################

# Misc ##########################################

#Flatten the array
def Flatten(list):
    """Flatten every element of ``list`` and stack the results.

    Elements are fetched by position (``list[0]`` .. ``list[len(list)-1]``)
    and each must provide a ``.flatten()`` method.

    Parameters
    ----------
    list : sequence
        Indexable collection of array-like objects.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the flattened elements, in positional order.
    """
    flattened = []
    for position in range(len(list)):
        flattened.append(list[position].flatten())
    return np.array(flattened)

#Make a list of (1-based index, value) tuples from a list
def make_tuple(list):
    """Pair every element of ``list`` with its 1-based position.

    Parameters
    ----------
    list : sequence
        Indexable collection supporting ``len``.

    Returns
    -------
    list of tuple
        ``[(1, list[0]), (2, list[1]), ...]``.
    """
    numbered = []
    for offset in range(len(list)):
        numbered.append((offset + 1, list[offset]))
    return numbered
#Make a list of (item1, item2) tuples from two lists of equal length
def make_tuple2(list1,list2):
    """Pair up the elements of two equally long sequences.

    Parameters
    ----------
    list1, list2 : sequence
        Indexable collections supporting ``len``.

    Returns
    -------
    list of tuple or None
        ``[(list1[0], list2[0]), ...]`` when the lengths match. On a length
        mismatch an error message is printed and ``None`` is returned.
    """
    size = len(list1)
    if size == len(list2):
        return [(list1[k], list2[k]) for k in range(size)]
    # Length mismatch: report it and signal failure with None
    print("Error: Lists are not the same length")
    return None
#################################################

### Data Import

In [8]:
# Load the raw training set
set_raw = pd.read_csv('../Data/training_set.csv', low_memory=False)

# Preprocess the data
# Chirality: parse the 'Chirality' strings into two integer columns, m and n
chirality = pd.DataFrame(getChirality(set_raw['Chirality']), columns=['m','n'])

In [12]:
# Chirality
# Parse the (m, n) pairs, derive the single parameter k = m - 2n,
# and keep only k (the m and n columns are dropped again).
chirality = (
    pd.DataFrame(getChirality(set_raw['Chirality']), columns=['m','n'])
    .assign(k=lambda frame: frame['m'] - 2*frame['n'])
    .drop(columns=['m','n'])
)


Unnamed: 0,k
0,6
1,4
2,9
3,-2
4,-3
5,-9
6,1
8,7
9,-8
10,-5


### Count Vectorizer

In [15]:
# Transform
# Sequences: vectorize, densify, and label the columns with the feature names
# NOTE(review): `vectorizer` is only transformed here, never fitted — presumably
# it arrives already fitted from the vectorization module; confirm.
seq = pd.DataFrame(
    vectorizer.transform(set_raw['Sequence']).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# Merge column-wise: sequence features, chirality, then the label
set = pd.concat([seq, chirality, set_raw['Label']], axis='columns')
set.sample(5)

Unnamed: 0,c,cc,ccc,cct,ct,ctc,ctt,t,tc,tcc,tct,tt,ttc,ttt,m,n,Label
275,2,0,0,0,2,0,2,10,2,0,2,7,2,4,11,0,N
30,5,0,0,0,5,3,1,7,5,0,5,1,1,0,7,3,N
914,10,7,4,2,2,2,0,2,2,2,0,0,0,0,11,1,N
947,10,7,4,2,2,2,0,2,2,2,0,0,0,0,11,0,N
322,2,0,0,0,2,0,2,10,2,0,2,7,2,4,9,7,N


### Hash Vectorizer
#### DO NOT RUN THIS CODE SNIPPET UNLESS YOU HAVE 16GB OF RAM

In [9]:
# Transform
# Sequences
# Keep the hashed features sparse until the all-zero columns have been removed.
# Converting the full matrix to a dense array first (column labels above one
# million show up in the output) is what made this cell so memory hungry.
# NOTE(review): assumes hash_vectorizer.transform returns a SciPy sparse matrix
# supporting .nonzero() and column indexing (it already exposed .toarray()) —
# confirm against the vectorization module.
seq_hash_sparse = hash_vectorizer.transform(set_raw['Sequence'])
# Remove columns with all zeros: keep only column indices that hold at least
# one non-zero entry, in ascending order
nonzero_columns = np.unique(seq_hash_sparse.nonzero()[1])
# Densify just the surviving columns; labels stay the original hash indices
seq_hash = pd.DataFrame(seq_hash_sparse[:, nonzero_columns].toarray(),
                        columns=nonzero_columns)

Unnamed: 0,468425,532659,533825,539482,552695,694262,723874,772663,801063,811211,862625,992385,1023064,1038787
0,-0.086711,-0.606977,0.086711,-0.173422,0.0,-0.086711,0.086711,-0.260133,-0.260133,0.086711,-0.433555,0.346844,0.0,-0.346844
1,-0.086711,-0.606977,0.086711,-0.173422,0.0,-0.086711,0.086711,-0.260133,-0.260133,0.086711,-0.433555,0.346844,0.0,-0.346844
2,-0.086711,-0.606977,0.086711,-0.173422,0.0,-0.086711,0.086711,-0.260133,-0.260133,0.086711,-0.433555,0.346844,0.0,-0.346844
3,-0.086711,-0.606977,0.086711,-0.173422,0.0,-0.086711,0.086711,-0.260133,-0.260133,0.086711,-0.433555,0.346844,0.0,-0.346844
4,-0.086711,-0.606977,0.086711,-0.173422,0.0,-0.086711,0.086711,-0.260133,-0.260133,0.086711,-0.433555,0.346844,0.0,-0.346844


In [11]:
# Merge column-wise: hashed sequence features, chirality, then the label
set_hash = pd.concat(
    [seq_hash, chirality, set_raw['Label']],
    axis='columns',
)
set_hash.sample(5)

Unnamed: 0,468425,532659,533825,539482,552695,694262,723874,772663,801063,811211,862625,992385,1023064,1038787,m,n,Label
694,-0.083624,-0.334497,0.167248,-0.167248,-0.167248,-0.418121,0.083624,0.0,0.0,0.0,-0.668994,0.167248,0.334497,-0.167248,9,7,N
159,-0.062378,-0.062378,0.0,0.0,0.0,-0.561405,0.062378,-0.062378,0.0,0.0,-0.686161,0.062378,0.436648,-0.062378,8,5,N
346,0.0,-0.655386,0.163846,-0.24577,-0.163846,0.0,0.0,-0.163846,-0.327693,0.0,-0.327693,0.327693,0.0,-0.327693,9,7,N
522,-0.264135,-0.528271,0.17609,-0.17609,-0.17609,-0.264135,0.264135,0.0,0.0,0.0,-0.528271,0.264135,0.0,-0.264135,7,3,N
537,-0.177471,-0.532414,0.088736,-0.088736,-0.088736,-0.177471,0.177471,-0.177471,-0.177471,0.0,-0.532414,0.354943,0.0,-0.354943,8,8,N


### Tfidf Vectorizer

In [13]:
# Transform
# Sequences: vectorize, densify, and label the columns with the feature names
# NOTE(review): `term_freq_vectorizer` is only transformed here, never fitted —
# presumably it arrives already fitted from the vectorization module; confirm.
seq_tfidf = pd.DataFrame(
    term_freq_vectorizer.transform(set_raw['Sequence']).toarray(),
    columns=term_freq_vectorizer.get_feature_names_out(),
)
# Merge column-wise: sequence features, chirality, then the label
set_tfidf = pd.concat([seq_tfidf, chirality, set_raw['Label']], axis='columns')
set_tfidf.head()

Unnamed: 0,c,cc,ccc,cct,ct,ctc,ctt,t,tc,tcc,tct,tt,ttc,ttt,m,n,Label
0,0.37026,0.084994,0.0,0.090234,0.311101,0.372473,0.0,0.518365,0.302075,0.088445,0.420487,0.195719,0.099896,0.138395,10,2,N
1,0.37026,0.084994,0.0,0.090234,0.311101,0.372473,0.0,0.518365,0.302075,0.088445,0.420487,0.195719,0.099896,0.138395,10,3,N
2,0.37026,0.084994,0.0,0.090234,0.311101,0.372473,0.0,0.518365,0.302075,0.088445,0.420487,0.195719,0.099896,0.138395,11,1,N
3,0.37026,0.084994,0.0,0.090234,0.311101,0.372473,0.0,0.518365,0.302075,0.088445,0.420487,0.195719,0.099896,0.138395,8,5,N
4,0.37026,0.084994,0.0,0.090234,0.311101,0.372473,0.0,0.518365,0.302075,0.088445,0.420487,0.195719,0.099896,0.138395,9,6,N


### Model

In [21]:
def mymodel():
    """Build a fresh, unfitted logistic-regression classifier.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        Configured with ``multi_class='multinomial'``, the ``lbfgs`` solver
        and an iteration cap of 1000.
    """
    return lm.LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
    )

#### Count Vectorizer

In [22]:
# Train test split: every column except 'Label' is a feature; hold out 20%
# of the rows with a fixed seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    set.drop(columns='Label'),
    set['Label'],
    test_size=0.2,
    random_state=42,
)
# Model training on the count-vectorizer features
model = mymodel()
model.fit(X_train, Y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

#### Hash Vectorizer

In [26]:
# Train test split
# The hashed feature columns carry integer labels (default column labels of the
# dense hash matrix) while the chirality column(s) carry string labels. Recent
# scikit-learn releases refuse to fit on a DataFrame whose column names are not
# all strings, so cast every label to str on a copy. `drop` returns a new frame,
# so set_hash itself keeps its original labels.
features_hash = set_hash.drop('Label',axis=1)
features_hash.columns = features_hash.columns.astype(str)
X_train_hash, X_test_hash, Y_train_hash, Y_test_hash = train_test_split(features_hash,
                                                                                        set_hash['Label'],
                                                                                        test_size=0.2,
                                                                                        random_state=42)
# Model training
model_hash = mymodel()
model_hash.fit(X_train_hash,Y_train_hash)



LogisticRegression(max_iter=1000, multi_class='multinomial')

#### Tfidf Vectorizer

In [29]:
# Train test split: every column except 'Label' is a feature; hold out 20%
# of the rows with a fixed seed so the split is repeatable
X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf = train_test_split(
    set_tfidf.drop(columns='Label'),
    set_tfidf['Label'],
    test_size=0.2,
    random_state=42,
)
# Model training on the tf-idf features
model_tfidf = mymodel()
model_tfidf.fit(X_train_tfidf, Y_train_tfidf)

LogisticRegression(max_iter=1000, multi_class='multinomial')