In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [4]:
def kmer_count(sequence, k=3, step=1):
    """Return the relative frequency of each k-mer observed in ``sequence``.

    A window of width ``k`` starts at index 0 and advances ``step`` positions
    at a time; only windows that fit entirely inside ``sequence`` are used.

    Parameters
    ----------
    sequence : str
        Sequence to scan (anything sliceable whose slices are hashable works).
    k : int, default 3
        Window width.
    step : int, default 1
        Distance between the starts of consecutive windows.

    Returns
    -------
    dict
        Maps every observed k-mer to its occurrence count divided by the
        number of windows examined. Empty when no window fits.
    """
    tallies = {}
    window_starts = range(0, len(sequence) - k + 1, step)
    for start in window_starts:
        word = sequence[start:start + k]
        tallies[word] = tallies.get(word, 0) + 1

    # One window per start position, so this is the normalisation constant.
    n_windows = len(window_starts)
    return {word: hits / n_windows for word, hits in tallies.items()}

* Core

In [2]:
# Load the gene table from CSV; the path is relative to the working directory.
dataset = pd.read_csv("./Output/Arabidopsis_thaliana_GHLH_and_CYP_gene.csv")
# Preview the first rows.
dataset.head()

Unnamed: 0,id,sequence,length,class
0,AT1G51140.1,AAGTTTCTCTCACGTTCTCTTTTTTAATTTTAATTTCTCGCCGGAA...,2297,0
1,AT1G73830.1,ACTTTCTATTTTCACCAATTTTCAAAAAAAAAATAAAAATTGAAAC...,1473,0
2,AT1G09530.1,AGTTACAGACGATTTGGTCCCCTCTCTTCTCTCTCTGCGTCCGTCT...,2958,0
3,AT1G49770.1,ATGACTAATGCTCAAGAGTTGGGGCAAGAGGGTTTTATGTGGGGCA...,2205,0
4,AT1G68810.1,AAACTTTTGTCTCTTTTTAACTCTCTTAACTTTCGTTTCTTCTCCT...,1998,0


In [26]:
# Width of the sliding window used to build the k-mer features.
k = 7
sequences   = dataset['sequence']
# Iterate over the values instead of `sequences[i]`: an integer key on a
# Series is a label lookup, so index-based access only works while the index
# is exactly 0..n-1 and breaks (KeyError / wrong row) after any filtering,
# sorting or re-indexing of `dataset`.
kmers_count = [kmer_count(sequence, k=k, step=1) for sequence in sequences]

In [27]:
# Convert the list of per-sequence k-mer frequency dicts into a dense
# (sparse=False) feature matrix.
v = DictVectorizer(sparse=False)
feature_values = v.fit_transform(kmers_count)
# Column labels are taken from the fitted vectorizer so they line up with
# the columns of `feature_values`.
feature_names = v.get_feature_names_out()
X = pd.DataFrame(feature_values, columns=feature_names)
X.head()

Unnamed: 0,AAAAAAA,AAAAAAC,AAAAAAG,AAAAAAT,AAAAACA,AAAAACC,AAAAACG,AAAAACT,AAAAAGA,AAAAAGC,...,TTTTTCG,TTTTTCT,TTTTTGA,TTTTTGC,TTTTTGG,TTTTTGT,TTTTTTA,TTTTTTC,TTTTTTG,TTTTTTT
0,0.001309,0.0,0.000436,0.000436,0.0,0.0,0.0,0.0,0.000436,0.0,...,0.0,0.000873,0.000436,0.0,0.000873,0.000436,0.000436,0.000436,0.000436,0.000436
1,0.003408,0.0,0.0,0.002045,0.0,0.0,0.0,0.0,0.000682,0.0,...,0.0,0.001363,0.0,0.000682,0.000682,0.000682,0.000682,0.000682,0.002045,0.001363
2,0.0,0.0,0.0,0.000339,0.000339,0.0,0.0,0.0,0.0,0.000339,...,0.0,0.001016,0.000339,0.0,0.0,0.001694,0.000678,0.0,0.000339,0.001016
3,0.003183,0.000455,0.00091,0.000455,0.000455,0.000455,0.0,0.000455,0.000455,0.0,...,0.000455,0.00091,0.0,0.0,0.000455,0.0,0.00091,0.001364,0.0,0.001819
4,0.0,0.0,0.000502,0.000502,0.001004,0.0,0.0,0.000502,0.0,0.001004,...,0.0,0.001004,0.0,0.0,0.0,0.000502,0.001506,0.001004,0.000502,0.003012


In [28]:
# Target vector: the `class` column of the loaded table.
y = dataset['class']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

* Feature Mask

In [30]:
mask = "AAAACAT-AAACGAT-AAAGCTT-AAATGGT-AACAATT-AACAGTG-AACGAGA-AACGCCG-AACTTAC-AAGACCA-AAGATGT-AAGATTG-AAGCTAC-AAGGCAA-AAGTAGA-AAGTTTC-AATAAAT-AATAATG-AATAGAC-AATCGAG-AATGAAC-ACAACCA-ACACGAC-ACACGTG-ACACTCG-ACAGCAA-ACCAACA-ACCACAG-ACCCCCT-ACCCGGA-ACCCTGA-ACCGCCT-ACCGGCA-ACCGTGG-ACCTTTT-ACGAAGA-ACGAGAT-ACGATAT-ACGATGC-ACGCAAC-ACGCCTA-ACGGACC-ACGGCAC-ACGGCTA-ACGGGTA-ACGTATT-ACGTCTG-ACGTGTT-ACTATCC-ACTCACA-ACTCCCC-ACTCTAT-ACTTCGC-ACTTTGT-AGAAGAA-AGAATAT-AGAATCC-AGACATC-AGAGAAA-AGAGAGG-AGAGATT-AGAGCAA-AGAGCAC-AGAGCCA-AGAGGGC-AGATGTG-AGATGTT-AGATTCG-AGATTTC-AGCAACA-AGCAAGA-AGCACAC-AGCAGAT-AGCATCG-AGCCGCG-AGCCTTA-AGCGTGC-AGCTTAA-AGCTTAG-AGGACCC-AGGATAA-AGGATAT-AGGCGAT-AGGGGTC-AGGTATT-AGGTGAG-AGTAACT-AGTACTC-AGTACTT-AGTAGGA-AGTAGTA-AGTCTCC-AGTGAAT-AGTGATT-AGTGGGC-AGTGTAG-AGTTCAA-AGTTTTG-ATAACGG-ATAACTT-ATAAGCG-ATACACG-ATACGGG-ATACTTC-ATAGCAT-ATAGGAT-ATATGGA-ATCAATT-ATCAGAT-ATCAGTT-ATCATCA-ATCATGG-ATCCAGA-ATCCCGC-ATCCTGA-ATCCTGC-ATCTAGA-ATCTCTC-ATCTGAC-ATGACGA-ATGATGA-ATGCCTC-ATGGAAA-ATGGACC-ATGGAGT-ATGGCCC-ATGGCGC-ATGGGAA-ATGGGCC-ATGGGCT-ATGGTAC-ATGTTGC-ATTACGA-ATTAGCC-ATTCAGA-ATTCATT-ATTCTGA-ATTGATT-ATTGCCT-ATTGGGA-ATTGTTC-ATTTGTC-ATTTTTG-CAAAAGG-CAAAGAA-CAACAAC-CAACAGT-CAACGAG-CAAGACT-CAAGGTA-CAATCTG-CAATGAA-CAATGCA-CAATTAT-CAATTGA-CAATTTC-CACAAGA-CACACGA-CACAGGG-CACATCA-CACATCT-CACCCCC-CACCTTG-CACGACA-CACGTGG-CAGAACG-CAGAAGC-CAGACAG-CAGACTG-CAGATAG-CAGCAGA-CAGCCCC-CAGGAGA-CAGGATA-CAGGTTG-CAGTAAC-CAGTTTC-CATACCG-CATACCT-CATATGG-CATCAGC-CATCGGA-CATCTGT-CATGACC-CATGACT-CATGTAC-CATTCTG-CATTTGG-CCAAAGG-CCAAGTG-CCACAAC-CCACCCC-CCACGCA-CCAGGAA-CCATCGC-CCATCGG-CCCAAGA-CCCAGCG-CCCATCA-CCCCACC-CCCCCAA-CCCGCGG-CCCTAAT-CCCTCGG-CCCTGAG-CCGCCTT-CCGCGCG-CCGGCAA-CCGGCGG-CCGTAAT-CCGTCTC-CCGTTAC-CCGTTCG-CCGTTGA-CCGTTTA-CCGTTTG-CCTAATT-CCTAGGA-CCTATTA-CCTTACG-CCTTTTA-CCTTTTT-CGAAAGT-CGAAGGC-CGAAGTC-CGAATCT-CGACTGT-CGAGATC-CGATGCT-CGATGTG-CGATTGG-CGCCCGA-CGCCTAT-CGCGATC-CGCTGCG-CGGAGGC-CGGCAAA-CGGGGCA-CGGGTAG-CGGTCCA-CGTATTC-CGTCCAT-CGTGACA-CGTGTCC-CGTTGGC-CGTTGGG-CTAAATT-CTAACTC-CTAAGCA-
CTAAGGT-CTAATTC-CTAATTT-CTACAGA-CTACAGC-CTACCAG-CTACGCA-CTAGGTG-CTAGTTT-CTATAAG-CTCAACC-CTCAGGT-CTCCCGC-CTCGAAG-CTGAATC-CTGATGA-CTGATGT-CTGCAGA-CTGCTGC-CTGTATT-CTTCACC-CTTCGGG-CTTCGTC-CTTGGGC-CTTTATA-CTTTATT-CTTTCTG-CTTTGAG-CTTTTAA-CTTTTGC-CTTTTTC-CTTTTTT-GAAACCT-GAACAAG-GAACGAG-GAAGAAT-GAATAAG-GAATCGA-GAATCGG-GAATTCA-GACGACA-GACGCAC-GAGACAC-GAGAGGT-GAGCTTT-GAGGCGT-GAGGTGG-GAGTGCA-GAGTTTA-GATACGC-GATCCTG-GATCTGA-GATCTTC-GATGAAC-GATGGTA-GATGTAA-GATTATG-GCACGCA-GCACTTT-GCAGATT-GCAGTTT-GCATTAC-GCATTCG-GCCAGCA-GCCCCTC-GCCCGGG-GCCGGTC-GCCGTTG-GCCTCCA-GCCTCCT-GCCTTTC-GCGAGCG-GCGCCAG-GCGTCTC-GCTAACC-GCTAATT-GCTAGAA-GCTAGCC-GCTATCC-GCTGCCT-GCTGCTG-GCTGGTT-GCTGTGA-GCTTGAA-GGAACCT-GGAAGCT-GGAAGGT-GGACACC-GGACATA-GGACCGG-GGAGAGA-GGAGTGC-GGATAAG-GGATCCT-GGATGTC-GGATTAG-GGATTCA-GGCAAGC-GGCCAGT-GGCCTTC-GGCTTAA-GGGACCG-GGGTGGT-GGGTTAT-GGGTTTT-GGTAGGT-GGTCCGA-GGTGATC-GGTTATC-GGTTGTC-GTAAAAA-GTAAGTG-GTACTCC-GTATTGT-GTATTTC-GTCAAAC-GTCCAGC-GTCCAGG-GTCCCGG-GTCCGAT-GTCCTGG-GTCTACC-GTGACAG-GTGCGTT-GTGGCAC-GTGGGCG-GTGGGCT-GTGTGTT-GTGTTTC-GTTAAGT-GTTAGTT-GTTCGGA-GTTGATC-GTTGCAC-GTTTCAG-GTTTCTT-GTTTGAC-GTTTGGA-GTTTTCT-GTTTTGT-TAAAGAC-TAAAGTC-TAAATGG-TAACAAG-TAACTCG-TAAGCAT-TAAGGAT-TAAGGGA-TAAGGTA-TAAGTAG-TAATAAC-TAATTCA-TACAGGT-TACCAGG-TACGGGA-TACGGTA-TAGAAGC-TAGCCAA-TAGCTGT-TAGGGTT-TAGTCGC-TAGTTGC-TATATGC-TATATTC-TATGTAG-TATTGAT-TCAAAAC-TCAACGA-TCAACGG-TCAAGAG-TCACGAT-TCAGGTT-TCAGTGG-TCAGTTA-TCATGAC-TCATGCT-TCCAATG-TCCCTAC-TCCCTAT-TCCTCTA-TCCTGGT-TCCTTTA-TCGAATC-TCGATTA-TCGTTTT-TCTAATC-TCTCTAG-TCTGATG-TCTGCAG-TCTGCGC-TCTGGCA-TCTGGTA-TCTGTCG-TGAACAA-TGAATCA-TGAATCT-TGACGTT-TGACTGC-TGAGACA-TGAGAGG-TGATTCG-TGCAGAA-TGCAGCA-TGCATTC-TGCGCGT-TGGAAAA-TGGACCG-TGGAGCT-TGGCCCA-TGGGAAG-TGGGGAA-TGGGGAC-TGGGTCG-TGGTAAG-TGTAATC-TGTTCAG-TGTTCCC-TGTTTAA-TTAAAGG-TTAAGCC-TTAATCC-TTAATTC-TTACAGG-TTAGGAA-TTAGTAG-TTATCAG-TTCATGT-TTCCCAC-TTGAATC-TTGCGGC-TTTAAGT-TTTCATA-TTTCTGT-TTTCTTG"
# Break the '-'-delimited string into a list of k-mer names.
# Note: `mask` is rebound here from a str to a list.
mask = mask.split("-")
# Number of selected k-mers.
len(mask)

483

In [31]:
# Keep only the feature columns whose names are listed in `mask`
# (every name in `mask` must be an existing column of X).
X_reducted = X[mask]
X_reducted

Unnamed: 0,AAAACAT,AAACGAT,AAAGCTT,AAATGGT,AACAATT,AACAGTG,AACGAGA,AACGCCG,AACTTAC,AAGACCA,...,TTAGTAG,TTATCAG,TTCATGT,TTCCCAC,TTGAATC,TTGCGGC,TTTAAGT,TTTCATA,TTTCTGT,TTTCTTG
0,0.000000,0.0,0.000000,0.000000,0.000000,0.000436,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000436,0.000436,0.000000,0.000873,0.000000,0.000000,0.000000,0.000000,0.000436
1,0.000000,0.0,0.000000,0.000000,0.000682,0.000682,0.000682,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000682,0.000000,0.000000,0.000000,0.000000,0.000682,0.000000,0.000000
2,0.000000,0.0,0.000339,0.000339,0.000678,0.000339,0.000339,0.000000,0.000339,0.000339,...,0.0,0.000000,0.000339,0.000339,0.001016,0.000000,0.000339,0.000000,0.000339,0.001016
3,0.000000,0.0,0.000455,0.000455,0.000910,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000455,0.000000,0.000000,0.000455,0.000000,0.000910,0.000455,0.000000,0.000910
4,0.001506,0.0,0.000502,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.001004,0.000502,0.000502,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0.000350,0.0,0.000000,0.000350,0.000350,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000350,0.000350,0.000000,0.000350,0.000000,0.000000,0.000350,0.000000,0.000000
376,0.000000,0.0,0.000000,0.000000,0.000448,0.000000,0.000000,0.000448,0.000448,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000897
377,0.000000,0.0,0.001114,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001114
378,0.000489,0.0,0.000000,0.000245,0.000000,0.000000,0.000000,0.000000,0.000000,0.000245,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000489,0.000000,0.000000,0.000245,0.000245


In [32]:
# Split data: 80% train / 20% test, with `stratify=y` preserving the class
# proportions in both parts. A fixed `random_state` makes the split identical
# on every run; 42 is the same seed used for the cross-validation folds.
X_train, X_test, y_train, y_test = train_test_split(
    X_reducted, y, train_size=0.8, stratify=y, random_state=42
)

print("Shapes of train/test splits:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

Shapes of train/test splits:
X_train: (304, 483)
X_test: (76, 483)
y_train: (304,)
y_test: (76,)


* Model

In [33]:
# Gaussian process classifier with an RBF kernel (length_scale=1).
# `1**2` evaluates to the integer 1 in Python, so the kernel is the RBF
# multiplied by a constant factor of 1.
# NOTE(review): scikit-learn presumably wraps that scalar in a ConstantKernel — confirm.
model = GaussianProcessClassifier(kernel=1**2 * RBF(length_scale=1))

In [None]:
def train(model, X, y):
    """Print the 10-fold stratified cross-validated accuracy of ``model``.

    The folds are shuffled with a fixed seed (42). The mean and standard
    deviation of the per-fold accuracy scores are printed; nothing is
    returned.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X : array-like
        Feature matrix.
    y : array-like
        Class labels, also used to stratify the folds.
    """
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(model, X, y, scoring='accuracy', cv=folds)

    mean_accuracy = np.mean(fold_accuracy)
    std_accuracy = np.std(fold_accuracy)
    report = "[ALL_FEATURE] -- Cross-validated accuracy: {:.4f} ± {:.4f}"
    print(report.format(mean_accuracy, std_accuracy))