This file started 28.05.2019

In [1]:
from Bio import SeqIO

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

import os

from timeit import default_timer as timer

## TfidfVectorizer (+CountVectorizer) on all genes from all organisms

Создать большой список всех генов.

По ним сделать fit для TfidfVectorizer.

Потом сделать transform для всех генов.

На выходе должен получиться кортеж, который содержит:

(organism_name, gene, tfidf)

Можно сразу брать n-grams (3,5)

ВАЖНО: выяснить, сколько слов в словаре и сколько из них встречаются только один раз (в одном гене).

In [2]:
# Directory holding one nucleotide-gene FASTA file per organism.
path_gene_nuc = '../../sf_data/prodigal_output/gene_nuc/'

# List the directory contents, one file name per line. A plain loop is used
# instead of a list comprehension so the cell does not also echo the list of
# `None` values that print() returns.
for file_name in os.listdir(path_gene_nuc):
    print(file_name)

Bacillus_amyloliquefaciens_GCF_000196735.1_ASM19673v1_genomic.fasta
Bacillus_atrophaeus_GCF_000742675.1_ASM74267v1_genomic.fasta
Bacillus_halotolerans_GCF_001517105.1_ASM151710v1_genomic.fasta
Bacillus_licheniformis_GCF_000011645.1_ASM1164v1_genomic.fasta
Bacillus_mojavensis_GCF_000245335.1_ASM24533v1_genomic.fasta
Bacillus_paralicheniformis_GCF_000408885.1_ASM40888v1_genomic.fasta
Bacillus_siamensis_GCF_000262045.1_KCTC_13613_01_genomic.fasta
Bacillus_sonorensis_GCF_002202015.1_ASM220201v1_genomic.fasta
Bacillus_subtilis_GCF_000009045.1_ASM904v1_genomic.fasta
Bacillus_tequilensis_GCF_000507145.1_KCTC_13622_01_genomic.fasta
Bacillus_vallismortis_GCF_000245315.1_ASM24531v1_genomic.fasta
Bacillus_velezensis_GCF_002117165.1_ASM211716v1_genomic.fasta


[None, None, None, None, None, None, None, None, None, None, None, None]

In [3]:
# Organism name = the file name up to (not including) its second underscore,
# e.g. 'Bacillus_subtilis_GCF_...' -> 'Bacillus_subtilis'.
organisms_names = []
for file_name in os.listdir(path_gene_nuc):
    first_underscore = file_name.index('_')
    second_underscore = file_name.index('_', first_underscore + 1)
    organisms_names.append(file_name[:second_underscore])
organisms_names

['Bacillus_amyloliquefaciens',
 'Bacillus_atrophaeus',
 'Bacillus_halotolerans',
 'Bacillus_licheniformis',
 'Bacillus_mojavensis',
 'Bacillus_paralicheniformis',
 'Bacillus_siamensis',
 'Bacillus_sonorensis',
 'Bacillus_subtilis',
 'Bacillus_tequilensis',
 'Bacillus_vallismortis',
 'Bacillus_velezensis']

In [4]:
# Full paths to the FASTA files, in os.listdir() order. os.path.join keeps the
# result correct even if `path_gene_nuc` is written without a trailing slash.
# NOTE(review): `organisms_names` comes from a separate os.listdir() call and is
# later zipped with the parsers built from these paths, so both lists rely on
# listdir returning the same order each time.
full_path_gene_nuc = [os.path.join(path_gene_nuc, file_name)
                      for file_name in os.listdir(path_gene_nuc)]

In [5]:
# Read every gene of every organism as a plain string, removing the symbol 'N'
# (upper-case only) from each nucleotide sequence.
# `organisms` holds one SeqIO record iterator per FASTA file; the comprehension
# below consumes them, so they must be re-created before being iterated again.
organisms = [SeqIO.parse(gene_nuc_file, 'fasta') for gene_nuc_file in full_path_gene_nuc]

all_genes_list = [
    str(gene_record.seq).replace('N', '')
    for organism in organisms
    for gene_record in organism
]

### Custom ngram-range

In [None]:
def vectorize(vectorizer, documents=None):
    """Fit `vectorizer` on a corpus, transform the same corpus, and time both steps.

    Parameters
    ----------
    vectorizer : object with `fit(docs)` and `transform(docs)` methods
        For example a CountVectorizer or TfidfVectorizer instance.
        `fit` is called on this object directly.
    documents : iterable of str, optional
        Corpus to vectorize. When omitted, the notebook-level
        `all_genes_list` is used.

    Returns
    -------
    The value returned by `vectorizer.transform(documents)`.
    """
    if documents is None:
        documents = all_genes_list

    start = timer()
    vectorizer.fit(documents)
    end = timer()
    print(f'vectorizer.fit worked {end - start:.2f} sec')

    start = timer()
    # Transform with the vectorizer passed in (the one fitted above),
    # not with a notebook-level global.
    vectorized = vectorizer.transform(documents)
    end = timer()
    print(f'{end - start} sec')

    return vectorized

def tfidf_vectorize(ngram_range=(3,3)):
    """Build a character-level TfidfVectorizer and pass it to `vectorize`.

    Parameters
    ----------
    ngram_range : tuple (min_n, max_n)
        Forwarded to TfidfVectorizer as `ngram_range`; defaults to (3, 3).

    Returns
    -------
    tuple
        `(tfidf_vectorizer, result)`, where `result` is whatever
        `vectorize(tfidf_vectorizer)` returns. The vectorizer is returned so
        it stays reachable after the call instead of being discarded.
    """
    tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                       ngram_range=ngram_range)
    return tfidf_vectorizer, vectorize(tfidf_vectorizer)
    
def count_vectorizer(ngram_range=(3,3)):
    """Build a character-level CountVectorizer and pass it to `vectorize`.

    Parameters
    ----------
    ngram_range : tuple (min_n, max_n)
        Forwarded to CountVectorizer as `ngram_range`; defaults to (3, 3).

    Returns
    -------
    tuple
        `(bow_vectorizer, result)`, where `result` is whatever
        `vectorize(bow_vectorizer)` returns.
    """
    # The local name differs from the function name so it does not shadow it.
    bow_vectorizer = CountVectorizer(analyzer='char',
                                     ngram_range=ngram_range)
    return bow_vectorizer, vectorize(bow_vectorizer)
    
    

### (3-5) ngrams

#### Tfidf Vectorization

In [None]:
# TF-IDF vectorizer over character n-grams of length 3 to 5.
tfidf_vectorizer_3_5gram = TfidfVectorizer(analyzer='char',
                                           ngram_range=(3,5))

In [None]:
# Fit the TF-IDF vectorizer on all genes and report the elapsed wall-clock time.
start = timer()
tfidf_vectorizer_3_5gram.fit(all_genes_list)
end = timer()

elapsed_sec = end - start
print(f'{elapsed_sec} sec')

In [None]:
# Turn every gene into its TF-IDF vector and report the elapsed wall-clock time.
start = timer()
all_genes_tfidf_3_5gram = tfidf_vectorizer_3_5gram.transform(all_genes_list)
end = timer()

elapsed_sec = end - start
print(f'{elapsed_sec} sec')

#### Count Vectorization

In [6]:
# Count (bag-of-words) vectorizer over character n-grams of length 3 to 5.
count_vectorizer_3_5gram = CountVectorizer(analyzer='char',
                                           ngram_range=(3,5))

In [7]:
# Fit the count vectorizer on all genes and report the elapsed wall-clock time.
start = timer()
count_vectorizer_3_5gram.fit(all_genes_list)
end = timer()

elapsed_sec = end - start
print(f'{elapsed_sec} sec')

147.66302881700085 sec


In [8]:
# Turn every gene into its n-gram count vector ("bow" = bag-of-words)
# and report the elapsed wall-clock time.
start = timer()
all_genes_bow_3_5gram = count_vectorizer_3_5gram.transform(all_genes_list)
end = timer()

elapsed_sec = end - start
print(f'{elapsed_sec} sec')

165.24323143699985 sec


In [42]:
# Dense copy of the bag-of-words matrix, made once so later cells can index
# its columns without calling .toarray() again.
all_genes_bow_3_5gram_array = all_genes_bow_3_5gram.toarray()

### (3,6)-grams

#### Explore vectorized data

In [None]:
# Print a heading, an empty line, and then the TF-IDF matrix as a dense array.
print('TfidfVectorized data:\n', all_genes_tfidf_3_5gram.toarray(), sep='\n')

In [None]:
# Number of genes across all organisms, followed by the number of rows in the
# vectorized matrix. `.shape[0]` gives the row count directly, without
# building a dense copy of the whole matrix just to measure its length.
print(len(all_genes_list), all_genes_tfidf_3_5gram.shape[0])

In [None]:
# First five feature names (n-grams) known to the TF-IDF vectorizer.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() — confirm against the installed version.
tfidf_vectorizer_3_5gram.get_feature_names()[:5]

In [None]:
# First five feature names (n-grams) known to the count vectorizer.
count_vectorizer_3_5gram.get_feature_names()[:5]

In [None]:
# Number of features (n-grams of length 3 to 5) known to each vectorizer.
n_tfidf_features = len(tfidf_vectorizer_3_5gram.get_feature_names())
print(f'tfidf: {n_tfidf_features} features')

n_count_features = len(count_vectorizer_3_5gram.get_feature_names())
print(f'count: {n_count_features} features')

In [None]:
# Every gene's vector has the same length (n-grams absent from a gene are 0,
# the others hold a larger value). Only the first five rows are densified, and
# only once, instead of converting the entire matrix on each iteration.
first_rows = all_genes_tfidf_3_5gram[:5].toarray()
[len(row) for row in first_rows]

In [9]:
# Print a heading and then the bag-of-words count matrix as a dense array.
# The heading previously ended in the invalid escape '\:n'; it now ends in
# ':' followed by a newline, matching the TF-IDF heading.
print('CountVectorized data:\n')
print(all_genes_bow_3_5gram.toarray())

CountVectorized data\:n
[[66 22  6 ...  3  1  2]
 [57 19  6 ...  0  1  1]
 [ 7  2  0 ...  0  0  1]
 ...
 [ 5  1  0 ...  0  0  1]
 [56 17  5 ...  0  1  1]
 [ 7  4  2 ...  0  0  0]]


#### Find n-grams that appear in only one gene

In [19]:
# Row-wise sum of the bag-of-words matrix: one total per gene
# (the sum of all n-gram counts in that gene).
sum_for_each_gene = all_genes_bow_3_5gram.sum(axis=1)
sum_for_each_gene

matrix([[4014],
        [3402],
        [ 639],
        ...,
        [ 639],
        [3402],
        [ 324]])

In [10]:
# Column-wise sum of the bag-of-words matrix: one total per feature (n-gram),
# i.e. how many times that n-gram occurs over all genes.
sum_for_each_ngram = all_genes_bow_3_5gram.sum(axis=0)
sum_for_each_ngram

matrix([[1901833,  773450,  293550, ...,   81289,   88432,  116538]],
       dtype=int64)

In [11]:
# Smallest total count over all n-grams, the column index of the first n-gram
# that attains it, and that n-gram's name.
min_val_for_ngrams = sum_for_each_ngram.min()
ngram_totals = np.asarray(sum_for_each_ngram).ravel()
indx_min_for_ngrams = np.flatnonzero(ngram_totals == sum_for_each_ngram.min())[0]
rarest_ngram = count_vectorizer_3_5gram.get_feature_names()[indx_min_for_ngrams]

print(f'min value for ngrams is {min_val_for_ngrams}\n',
      f'ngram index = {indx_min_for_ngrams}\n',
      f'ngram = {rarest_ngram}')

min value for ngrams is 3265
 ngram index = 487
 ngram = cctag


In [12]:
# Per-gene counts of the n-gram with the smallest total count. The column is
# selected with `indx_min_for_ngrams` (computed in the previous cell) rather
# than a hard-coded 487, and only that single column is converted to dense form.
min_value_ngram_appearences = all_genes_bow_3_5gram[:, indx_min_for_ngrams].toarray().ravel()
print(min_value_ngram_appearences)

[0 0 0 ... 0 0 0]


In [14]:
# Length of the column: one entry per row (gene) of the bag-of-words matrix.
len(min_value_ngram_appearences)

49807

In [13]:
# Total occurrences of this n-gram over all genes.
print(min_value_ngram_appearences.sum())

3265


In [18]:
# Print every per-gene count of this n-gram that is greater than 3. A plain
# loop is used so the cell does not also echo the list of `None` values that a
# print()-comprehension leaves behind.
for ngram_num_in_gene in min_value_ngram_appearences:
    if ngram_num_in_gene > 3:
        print(ngram_num_in_gene)

4
4
4


[None, None, None]

The feature 'cctag' occurs at most 4 times (i.e. fewer than 5) in any single gene.

In [29]:
# Very slow (the run was interrupted manually): .toarray() is re-run on every iteration.
#for (feature_num, feature_name) in zip(range(all_genes_bow_3_5gram.shape[1]),
#                                       count_vectorizer_3_5gram.get_feature_names()):
#    print(f'{feature_name} appeares {all_genes_bow_3_5gram.toarray()[:, feature_num]} in each gene')

aaa appeares [66 57  7 ...  5 56  7] in each gene
aaaa appeares [22 19  2 ...  1 17  4] in each gene
aaaaa appeares [6 6 0 ... 0 5 2] in each gene
aaaac appeares [3 5 1 ... 0 5 0] in each gene
aaaag appeares [9 5 0 ... 0 4 2] in each gene
aaaat appeares [4 3 1 ... 1 3 0] in each gene
aaac appeares [10 13  1 ...  0 13  1] in each gene
aaaca appeares [2 2 0 ... 0 1 1] in each gene
aaacc appeares [4 4 0 ... 0 5 0] in each gene
aaacg appeares [1 4 0 ... 0 4 0] in each gene
aaact appeares [3 3 1 ... 0 3 0] in each gene
aaag appeares [22 13  0 ...  1 14  2] in each gene
aaaga appeares [10  2  0 ...  0  3  1] in each gene
aaagc appeares [11  6  0 ...  1  6  0] in each gene
aaagg appeares [0 1 0 ... 0 1 0] in each gene
aaagt appeares [1 4 0 ... 0 4 1] in each gene
aaat appeares [12 12  4 ...  3 12  0] in each gene
aaata appeares [3 2 0 ... 0 2 0] in each gene
aaatc appeares [4 7 1 ... 1 7 0] in each gene
aaatg appeares [2 0 2 ... 1 0 0] in each gene
aaatt appeares [3 3 1 ... 1 3 0] in each gen

KeyboardInterrupt: 

In [31]:
# Very slow (the run was interrupted manually): .toarray() is re-run on every iteration.
#for (feature_num, feature_name) in zip(range(all_genes_bow_3_5gram.shape[1]),
#                                       count_vectorizer_3_5gram.get_feature_names()):
#    print(f'{feature_name} is in ',
#          f'{np.count_nonzero(all_genes_bow_3_5gram.toarray()[:, feature_num])} genes')

aaa is in  49676 genes
aaaa is in  48876 genes
aaaaa is in  44930 genes
aaaac is in  41857 genes
aaaag is in  44747 genes
aaaat is in  41718 genes
aaac is in  47649 genes
aaaca is in  39702 genes
aaacc is in  30117 genes
aaacg is in  37426 genes
aaact is in  25779 genes
aaag is in  48615 genes
aaaga is in  42587 genes
aaagc is in  40115 genes
aaagg is in  37473 genes
aaagt is in  29927 genes
aaat is in  48175 genes
aaata is in  32397 genes
aaatc is in  36961 genes
aaatg is in  37784 genes
aaatt is in  34290 genes
aac is in  49415 genes
aaca is in  46363 genes
aacaa is in  35106 genes
aacac is in  21186 genes
aacag is in  34588 genes
aacat is in  29218 genes
aacc is in  40434 genes
aacca is in  20162 genes
aaccc is in  14167 genes
aaccg is in  27823 genes
aacct is in  19203 genes
aacg is in  44338 genes
aacga is in  29380 genes
aacgc is in  24958 genes
aacgg is in  32778 genes
aacgt is in  20880 genes
aact is in  39230 genes
aacta is in  12679 genes
aactc is in  14621 genes
aactg is in 

attta is in  34044 genes
atttc is in  31823 genes
atttg is in  33617 genes
atttt is in  38453 genes
caa is in  49551 genes
caaa is in  48026 genes
caaaa is in  42346 genes
caaac is in  30517 genes
caaag is in  33813 genes
caaat is in  32309 genes
caac is in  37141 genes
caaca is in  22458 genes
caacc is in  16458 genes
caacg is in  18190 genes
caact is in  14309 genes
caag is in  43375 genes
caaga is in  27191 genes
caagc is in  28896 genes
caagg is in  23047 genes
caagt is in  19316 genes
caat is in  44490 genes
caata is in  24495 genes
caatc is in  27546 genes
caatg is in  27161 genes
caatt is in  28157 genes
cac is in  47452 genes
caca is in  38480 genes
cacaa is in  22681 genes
cacac is in  13381 genes
cacag is in  21541 genes
cacat is in  19442 genes
cacc is in  33575 genes
cacca is in  14675 genes
caccc is in  10868 genes
caccg is in  19041 genes
cacct is in  13756 genes
cacg is in  34764 genes
cacga is in  17784 genes
cacgc is in  15615 genes
cacgg is in  18534 genes
cacgt is in

cttgg is in  21826 genes
cttgt is in  23916 genes
cttt is in  45208 genes
cttta is in  29410 genes
ctttc is in  26611 genes
ctttg is in  26238 genes
ctttt is in  32961 genes
gaa is in  49713 genes
gaaa is in  49023 genes
gaaaa is in  45788 genes
gaaac is in  35197 genes
gaaag is in  37509 genes
gaaat is in  38462 genes
gaac is in  44297 genes
gaaca is in  33009 genes
gaacc is in  20694 genes
gaacg is in  27629 genes
gaact is in  24104 genes
gaag is in  47117 genes
gaaga is in  39182 genes
gaagc is in  36629 genes
gaagg is in  32999 genes
gaagt is in  28105 genes
gaat is in  46390 genes
gaata is in  28070 genes
gaatc is in  29725 genes
gaatg is in  30131 genes
gaatt is in  33663 genes
gac is in  48694 genes
gaca is in  43827 genes
gacaa is in  30224 genes
gacac is in  17208 genes
gacag is in  28846 genes
gacat is in  27390 genes
gacc is in  36112 genes
gacca is in  17843 genes
gaccc is in  11378 genes
gaccg is in  22103 genes
gacct is in  15064 genes
gacg is in  41608 genes
gacga is in 

KeyboardInterrupt: 

In [55]:
# Names of the n-grams that have a non-zero count in exactly one gene.
# `genes_per_feature[j]` is the number of genes (rows) in which feature j occurs.
genes_per_feature = np.count_nonzero(all_genes_bow_3_5gram_array, axis=0)

features_appered_once = [
    feature_name
    for feature_name, n_genes in zip(count_vectorizer_3_5gram.get_feature_names(),
                                     genes_per_feature)
    if n_genes == 1
]

print(features_appered_once)

[]


### Output tuples

In [None]:
# Type of one gene's dense TF-IDF vector. Only the first row is converted to
# dense form instead of the whole matrix.
type(all_genes_tfidf_3_5gram[0].toarray()[0])

In [None]:
# Re-create the FASTA record iterators, then collect one
# (organism index, organism name, gene id) record per gene, file by file.
organisms = [SeqIO.parse(gene_nuc_file, 'fasta') for gene_nuc_file in full_path_gene_nuc]

tmp_list_of_tuples = []
for id_org, (organism, org_name) in enumerate(zip(organisms, organisms_names)):
    tmp_list_of_tuples.extend(
        (id_org, org_name, gene_record.id) for gene_record in organism
    )

In [None]:
# First (organism index, organism name, gene id) record.
tmp_list_of_tuples[0]

In [None]:
# Last (organism index, organism name, gene id) record.
tmp_list_of_tuples[-1]

In [None]:
# Pair each (organism index, organism name, gene id) record with the matching
# dense row of the TF-IDF matrix, giving 4-element tuples.
lst_org_gene_tfidf = [
    (id_org, org_name, gene_id, gene_tfidf)
    for (id_org, org_name, gene_id), gene_tfidf
    in zip(tmp_list_of_tuples, all_genes_tfidf_3_5gram.toarray())
]

In [None]:
# First three (organism index, organism name, gene id, TF-IDF vector) tuples.
lst_org_gene_tfidf[:3]

In [None]:
# Last (organism index, organism name, gene id, TF-IDF vector) tuple.
lst_org_gene_tfidf[-1]

In [None]:
# Pair each (organism index, organism name, gene id) record with the matching
# dense row of the bag-of-words matrix, giving 4-element tuples.
lst_org_gene_bow = [
    (id_org, org_name, gene_id, gene_bow)
    for (id_org, org_name, gene_id), gene_bow
    in zip(tmp_list_of_tuples, all_genes_bow_3_5gram.toarray())
]

In [None]:
# First three (organism index, organism name, gene id, count vector) tuples.
lst_org_gene_bow[:3]

In [None]:
# Last (organism index, organism name, gene id, count vector) tuple.
lst_org_gene_bow[-1]

#### Explore ngrams in vectorizer

In [None]:
# Feature (n-gram) names of both vectorizers, fetched once for the cells below.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() — confirm against the installed version.
names_tfidf_3_5grams = tfidf_vectorizer_3_5gram.get_feature_names()
names_bow_3_5grams = count_vectorizer_3_5gram.get_feature_names()

In [None]:
# Index of the gene to inspect.
gene_num = 15

# Print each n-gram next to its TF-IDF value for the chosen gene, tab-separated.
print(f'ngram tfidf values for gene number {gene_num}:\n')
gene_tfidf_values = lst_org_gene_tfidf[gene_num][3]  # element 3 is the gene's vector
for ngram, tfidf_value in zip(names_tfidf_3_5grams, gene_tfidf_values):
    print(f'{ngram}\t{tfidf_value}')

In [None]:
# Print each n-gram next to its count for gene `gene_num`, tab-separated.
print(f'ngram bow values for gene number {gene_num}:\n')
gene_bow_values = lst_org_gene_bow[gene_num][3]  # element 3 is the gene's vector
for ngram, bow_value in zip(names_bow_3_5grams, gene_bow_values):
    print(f'{ngram}\t{bow_value}')

### Find n-grams that occur in only one gene / in only one organism

In [None]:
#all_genes_bow_3_5gram.toarray().sum(axis=1)