### Feature extraction
Presented here are three ways to extract features from sequence datasets in a format supported by machine learning algorithms.

In [22]:
#Imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import re
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder

In [28]:
# Load the labelled sequence table from the CSV export.
# NOTE(review): this is an absolute, machine-specific Windows path; the raw
# string keeps the backslashes literal. Consider a configurable data directory.
csv_path = r'C:\Users\User\Desktop\DNA_CNT\Data\seq_TC_12mer_2strun.csv'
# NOTE(review): `set` shadows the Python builtin of the same name. The name is
# kept unchanged because cells outside this view may still refer to it.
set = pd.read_csv(csv_path)
# Give the two columns fixed names, whatever header the file carries.
set.columns = ['Sequence', 'Class']
# The sequence column on its own; this is what the vectorizers are fitted on.
seq = set.loc[:, 'Sequence']

In [None]:
# Data preprocessing
# Removing the non-nucleotide characters: everything except the uppercase
# letters A, C, G and T is deleted. That includes whitespace, so no separate
# strip() is needed, and lowercase letters, because the pattern is
# case-sensitive. The vectorized .str accessor replaces the per-element
# apply(re.sub) and leaves missing values as NaN instead of raising.
seq = seq.str.replace('[^ACGT]', '', regex=True)
# Removing the empty sequences: the previous strip() call never dropped
# anything, so sequences that became empty after cleaning were still passed on.
# Missing values compare as False here and are dropped as well.
# The original index labels are preserved, so the labels belonging to the
# surviving rows can still be looked up by index (e.g. via seq.index).
seq = seq[seq.str.len() > 0]

### Count Vectorizer
Outputs the number of times each n-gram occurs in a sequence.

In [9]:
# Creating the vectorizer: counts every character n-gram of length 1 to 3.
# CountVectorizer lowercases its input by default, which is why the feature
# names in the output are lowercase.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1,3))
# Learn the n-gram vocabulary and build the sparse count matrix in one call
# (equivalent to fit() followed by transform() on the same data).
count_matrix = vectorizer.fit_transform(seq)
# Creating the dataframe: one row per sequence, one column per n-gram.
# It gets its own name, like seq_hash and seq_tfidf in the cells below, so the
# later cells that reassign seq_vector no longer discard the count features.
seq_count = pd.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# Backward-compatible alias: this cell used to leave the dataframe in seq_vector.
seq_vector = seq_count
seq_count.head()

Unnamed: 0,c,cc,ccc,cct,ct,ctc,ctt,t,tc,tcc,tct,tt,ttc,ttt
0,5,1,0,1,4,2,1,7,4,1,3,2,1,1
1,5,0,0,0,5,3,1,7,5,0,5,1,1,0
2,5,0,0,0,5,3,1,7,5,0,5,1,1,0
3,5,1,0,1,4,2,1,7,4,1,3,2,1,1
4,5,1,0,1,4,3,1,7,4,1,3,2,0,1


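### Hashing Vectorizer
Hashes each n-gram to one of a fixed number of columns, so no vocabulary is stored and the columns are identified by hash index rather than by n-gram. With the default settings the values are signed and L2-normalised per sequence, which is why they are not plain counts.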

In [20]:
# Creating the vectorizer: hashes every character n-gram of length 1 to 3 into
# a fixed-width feature space (2**20 columns by default).
hash_vectorizer = HashingVectorizer(analyzer='char', ngram_range=(1,3))
# HashingVectorizer is stateless, so fitting learns nothing; fit_transform
# simply validates the parameters and returns the sparse hashed matrix.
seq_vector = hash_vectorizer.fit_transform(seq)
# Remove columns with all zeros BEFORE densifying. Converting the full matrix
# with toarray() allocates 2**20 floats (about 8 MB) per sequence, although only
# a handful of columns are ever used. nonzero() ignores explicitly stored
# zeros, so this keeps exactly the columns that hold at least one non-zero value.
used_columns = np.unique(seq_vector.nonzero()[1]).astype(np.int64)
# Creating the dataframe: columns are labelled with their hash index, because a
# hashing vectorizer cannot map a column back to an n-gram name.
seq_hash = pd.DataFrame(seq_vector[:, used_columns].toarray(), columns=used_columns)
seq_hash.head()

Unnamed: 0,468425,532659,533825,539482,552695,694262,723874,772663,801063,811211,862625,992385,1023064,1038787
0,-0.088045,-0.616316,0.088045,-0.17609,-0.088045,-0.088045,0.088045,-0.17609,-0.264135,0.088045,-0.440225,0.35218,0.0,-0.35218
1,0.0,-0.551677,0.078811,-0.078811,-0.078811,0.0,0.0,-0.236433,-0.394055,0.0,-0.394055,0.394055,0.0,-0.394055
2,0.0,-0.551677,0.078811,-0.078811,-0.078811,0.0,0.0,-0.236433,-0.394055,0.0,-0.394055,0.394055,0.0,-0.394055
3,-0.088045,-0.616316,0.088045,-0.17609,-0.088045,-0.088045,0.088045,-0.17609,-0.264135,0.088045,-0.440225,0.35218,0.0,-0.35218
4,-0.086711,-0.606977,0.0,-0.173422,-0.086711,-0.086711,0.086711,-0.260133,-0.260133,0.086711,-0.433555,0.346844,0.0,-0.346844


### Tfidf Vectorizer
Computes the term frequency–inverse document frequency (TF-IDF) of each n-gram, treating every sequence as a document.

In [23]:
# TF-IDF model over character n-grams of length 1 to 3.
term_freq_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,3))
# Learn the vocabulary and IDF weights, then weight the same sequences,
# in a single call; the result is a sparse matrix.
seq_vector = term_freq_vectorizer.fit_transform(seq)
# Column labels are the n-grams of the learned vocabulary.
tfidf_columns = term_freq_vectorizer.get_feature_names_out()
# Dense table: one row per sequence, one column per n-gram.
seq_tfidf = pd.DataFrame(seq_vector.toarray(), columns=tfidf_columns)
seq_tfidf.head()

Unnamed: 0,c,cc,ccc,cct,ct,ctc,ctt,t,tc,tcc,tct,tt,ttc,ttt
0,0.382863,0.087887,0.0,0.093305,0.321689,0.256767,0.114807,0.536008,0.312356,0.091456,0.434799,0.20238,0.103297,0.143106
1,0.317338,0.0,0.0,0.0,0.333293,0.319235,0.095159,0.444273,0.323623,0.0,0.600642,0.083872,0.085618,0.0
2,0.317338,0.0,0.0,0.0,0.333293,0.319235,0.095159,0.444273,0.323623,0.0,0.600642,0.083872,0.085618,0.0
3,0.382863,0.087887,0.0,0.093305,0.321689,0.256767,0.114807,0.536008,0.312356,0.091456,0.434799,0.20238,0.103297,0.143106
4,0.369826,0.084894,0.0,0.090128,0.310736,0.372037,0.110898,0.517757,0.301721,0.088342,0.419994,0.195489,0.0,0.138233
