Started 30.05.2019

This is the first part of the word2vec pipeline.

<b>ngram extractor from gene sequence</b>

In [1]:
import numpy as np
import os
from timeit import default_timer as timer

from Bio import SeqIO
from nltk.util import ngrams
from collections import OrderedDict

### Read seq

In [2]:
path_gene_nuc = '../../sf_data/prodigal_output/gene_nuc/'

In [3]:
# Derive an organism name from every file name in the gene directory:
# keep everything before the second underscore (e.g. 'Genus_species').
organisms_names = []
for file_name in os.listdir(path_gene_nuc):
    first_sep = file_name.index('_')
    second_sep = file_name.index('_', first_sep + 1)
    organisms_names.append(file_name[:second_sep])
organisms_names

['Bacillus_amyloliquefaciens',
 'Bacillus_atrophaeus',
 'Bacillus_halotolerans',
 'Bacillus_licheniformis',
 'Bacillus_mojavensis',
 'Bacillus_paralicheniformis',
 'Bacillus_siamensis',
 'Bacillus_sonorensis',
 'Bacillus_subtilis',
 'Bacillus_tequilensis',
 'Bacillus_vallismortis',
 'Bacillus_velezensis']

In [4]:
# Build the path of every file in the gene directory.
# os.path.join is used instead of string concatenation so the result stays
# valid even if path_gene_nuc is ever written without a trailing separator.
full_path_gene_nuc = [os.path.join(path_gene_nuc, f) for f in os.listdir(path_gene_nuc)]

In [5]:
# Parse every FASTA file and collect all gene sequences as plain strings,
# removing every 'N' character from each nucleotide sequence.
organisms = [SeqIO.parse(fasta_path, 'fasta') for fasta_path in full_path_gene_nuc]

all_genes_list = [
    str(record.seq).replace('N', '')
    for records in organisms
    for record in records
]

### Extract ngrams from one gene (preserving order)

In [10]:
# just example
gene_x = all_genes_list[7]
print(f'{gene_x}\n\nlen={len(gene_x)}')

ATGACGTCAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGGCAGAACAAAGGGCAGCGAAACCGCGAGGTTAAGCCAATCCCACAAATCTGTTCTCAGTTCGGATCGCAGTCTGCAACTCGACTGCGTGAAGCTGGAATCGCTAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGA

len=255


In [44]:
ngram_range = list(range(3,4))
ngram_range

[3]

In [45]:
# Slide over the gene and collect every n-gram in positional order
# (for each start position, one n-gram per size in ngram_range).
# Only full-length windows are kept: a slice that would run past the end of
# the gene is skipped, so no truncated fragments (shorter than n) are produced.
ngrams_with_order = [
    gene_x[i:i+n]
    for i in range(len(gene_x))
    for n in ngram_range
    if i + n <= len(gene_x)
]

In [46]:
ngrams_with_order = [ngram for ngram in ngrams_with_order if len(ngram) >= min(ngram_range)]

In [47]:
print(ngrams_with_order)

['ATG', 'TGA', 'GAC', 'ACG', 'CGT', 'GTC', 'TCA', 'CAA', 'AAA', 'AAT', 'ATC', 'TCA', 'CAT', 'ATC', 'TCA', 'CAT', 'ATG', 'TGC', 'GCC', 'CCC', 'CCC', 'CCT', 'CTT', 'TTA', 'TAT', 'ATG', 'TGA', 'GAC', 'ACC', 'CCT', 'CTG', 'TGG', 'GGG', 'GGC', 'GCT', 'CTA', 'TAC', 'ACA', 'CAC', 'ACA', 'CAC', 'ACG', 'CGT', 'GTG', 'TGC', 'GCT', 'CTA', 'TAC', 'ACA', 'CAA', 'AAT', 'ATG', 'TGG', 'GGG', 'GGC', 'GCA', 'CAG', 'AGA', 'GAA', 'AAC', 'ACA', 'CAA', 'AAA', 'AAG', 'AGG', 'GGG', 'GGC', 'GCA', 'CAG', 'AGC', 'GCG', 'CGA', 'GAA', 'AAA', 'AAC', 'ACC', 'CCG', 'CGC', 'GCG', 'CGA', 'GAG', 'AGG', 'GGT', 'GTT', 'TTA', 'TAA', 'AAG', 'AGC', 'GCC', 'CCA', 'CAA', 'AAT', 'ATC', 'TCC', 'CCC', 'CCA', 'CAC', 'ACA', 'CAA', 'AAA', 'AAT', 'ATC', 'TCT', 'CTG', 'TGT', 'GTT', 'TTC', 'TCT', 'CTC', 'TCA', 'CAG', 'AGT', 'GTT', 'TTC', 'TCG', 'CGG', 'GGA', 'GAT', 'ATC', 'TCG', 'CGC', 'GCA', 'CAG', 'AGT', 'GTC', 'TCT', 'CTG', 'TGC', 'GCA', 'CAA', 'AAC', 'ACT', 'CTC', 'TCG', 'CGA', 'GAC', 'ACT', 'CTG', 'TGC', 'GCG', 'CGT', 'GTG', 'TGA'

In [48]:
# Remove duplicates while preserving first-occurrence order.
# OrderedDict.fromkeys does this in a single linear pass, whereas
# sorted(set(...), key=list.index) rescans the list for every unique item.
ngrams_with_order_unique = list(OrderedDict.fromkeys(ngrams_with_order))

In [49]:
print(ngrams_with_order_unique)

['ATG', 'TGA', 'GAC', 'ACG', 'CGT', 'GTC', 'TCA', 'CAA', 'AAA', 'AAT', 'ATC', 'CAT', 'TGC', 'GCC', 'CCC', 'CCT', 'CTT', 'TTA', 'TAT', 'ACC', 'CTG', 'TGG', 'GGG', 'GGC', 'GCT', 'CTA', 'TAC', 'ACA', 'CAC', 'GTG', 'GCA', 'CAG', 'AGA', 'GAA', 'AAC', 'AAG', 'AGG', 'AGC', 'GCG', 'CGA', 'CCG', 'CGC', 'GAG', 'GGT', 'GTT', 'TAA', 'CCA', 'TCC', 'TCT', 'TGT', 'TTC', 'CTC', 'AGT', 'TCG', 'CGG', 'GGA', 'GAT', 'ACT', 'TAG', 'GTA', 'ATA', 'TTG', 'TTT']


In [50]:
print(f'ngram_range = {ngram_range}\n',
      f'len(ngrams_with_order) = {len(ngrams_with_order)}\n',
      f'len(ngrams_with_order_unique) = {len(ngrams_with_order_unique)}')

ngram_range = [3]
 len(ngrams_with_order) = 253
 len(ngrams_with_order_unique) = 63


### Extract ngrams from list of genes (saving order for each gene)

Time Statistics for Simple algorithm

For all genes (49807):

3-grams = 150 sec = 2.5 min

(3,4)-grams = 320 sec = 5.3 min (but the kernel died here once)

(3,8)-grams test:

* 5000 genes -> 29.7 min

* 10000 genes -> 61.3 min
  
* 15000 genes -> 92.6 min
  
* 17250 genes -> 105.5 min

After that the kernel died (possibly because it ran out of memory)

In [6]:
len(all_genes_list)

49807

In [13]:
ngram_range = list(range(3,5))
ngram_range

[3, 4]

In [14]:
# Extract the order-preserving unique n-grams of every gene and time the run.
start = timer()

gene_ngrams = []

for (id_gene, gene) in enumerate(all_genes_list):
    # Progress marker every 3000 genes.
    if(id_gene % 3000) == 0:
        print(f'time: {timer()}\tid_gene={id_gene}')

    # Collect n-grams in positional order, keeping only full-length windows.
    # Filtering here (instead of by len >= min(ngram_range) afterwards) also
    # rejects tail slices that are shorter than the requested n but still
    # at least as long as the smallest n in the range.
    gene_len = len(gene)
    ngrams_with_order = [gene[indx:indx+n]
                         for indx in range(gene_len) for n in ngram_range
                         if indx + n <= gene_len]

    # Linear-time de-duplication that keeps first-occurrence order
    # (replaces the quadratic sorted(set(...), key=list.index)).
    ngrams_with_order_unique = list(OrderedDict.fromkeys(ngrams_with_order))

    gene_ngrams.append((gene, ngrams_with_order_unique))

end = timer()
print(f'\nend time: {end}')
print(f'\ncommon time: {end - start:.2} sec')

time: 23594.54707745	id_gene=0
time: 23613.185459801	id_gene=3000
time: 23631.813330557	id_gene=6000
time: 23651.295328262	id_gene=9000
time: 23670.666137944	id_gene=12000
time: 23689.690551298	id_gene=15000
time: 23709.409350258	id_gene=18000
time: 23728.016730018	id_gene=21000
time: 23748.603795443	id_gene=24000
time: 23769.300753399	id_gene=27000
time: 23788.622285335	id_gene=30000
time: 23806.838323196	id_gene=33000
time: 23827.149895554	id_gene=36000
time: 23847.232104294	id_gene=39000
time: 23869.455597786	id_gene=42000
time: 23888.380526415	id_gene=45000
time: 23908.191213454	id_gene=48000

end time: 23920.580942985

common time: 3.3e+02 sec


In [17]:
print(gene_ngrams[7])

('ATGACGTCAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGGCAGAACAAAGGGCAGCGAAACCGCGAGGTTAAGCCAATCCCACAAATCTGTTCTCAGTTCGGATCGCAGTCTGCAACTCGACTGCGTGAAGCTGGAATCGCTAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGA', ['ATG', 'ATGA', 'TGA', 'TGAC', 'GAC', 'GACG', 'ACG', 'ACGT', 'CGT', 'CGTC', 'GTC', 'GTCA', 'TCA', 'TCAA', 'CAA', 'CAAA', 'AAA', 'AAAT', 'AAT', 'AATC', 'ATC', 'ATCA', 'TCAT', 'CAT', 'CATC', 'CATG', 'ATGC', 'TGC', 'TGCC', 'GCC', 'GCCC', 'CCC', 'CCCC', 'CCCT', 'CCT', 'CCTT', 'CTT', 'CTTA', 'TTA', 'TTAT', 'TAT', 'TATG', 'GACC', 'ACC', 'ACCT', 'CCTG', 'CTG', 'CTGG', 'TGG', 'TGGG', 'GGG', 'GGGC', 'GGC', 'GGCT', 'GCT', 'GCTA', 'CTA', 'CTAC', 'TAC', 'TACA', 'ACA', 'ACAC', 'CAC', 'CACA', 'CACG', 'CGTG', 'GTG', 'GTGC', 'TGCT', 'ACAA', 'CAAT', 'AATG', 'ATGG', 'GGCA', 'GCA', 'GCAG', 'CAG', 'CAGA', 'AGA', 'AGAA', 'GAA', 'GAAC', 'AAC', 'AACA', 'AAAG', 'AAG', 'AAGG', 'AGG', 'AGGG', 'CAGC', 'AGC', 'AGCG', 'GCG', 'GCGA', 'CGA', 'CGAA', 'GAAA', 'AA

In [18]:
len(gene_ngrams)

49807

### Using nltk.util.ngrams()

In [None]:
#http://www.nltk.org/_modules/nltk/util.html

In [62]:
gene_x_3grams = list(ngrams(gene_x, 3))
gene_x_3grams[:5]

[('A', 'T', 'G'),
 ('T', 'G', 'A'),
 ('G', 'A', 'C'),
 ('A', 'C', 'G'),
 ('C', 'G', 'T')]

In [63]:
# De-duplicate the 3-gram tuples while keeping first-occurrence order.
# OrderedDict.fromkeys is linear; sorting a set by list.index is quadratic.
gene_x_3grams_unique = list(OrderedDict.fromkeys(gene_x_3grams))
gene_x_3grams_unique[:5]

[('A', 'T', 'G'),
 ('T', 'G', 'A'),
 ('G', 'A', 'C'),
 ('A', 'C', 'G'),
 ('C', 'G', 'T')]

In [43]:
len(gene_x_3grams), len(gene_x_3grams_unique)

(253, 63)

In [61]:
gene_x_3grams_unique_lst = [''.join(ngram) for ngram in gene_x_3grams_unique]
print(gene_x_3grams_unique_lst)

['ATG', 'TGA', 'GAC', 'ACG', 'CGT', 'GTC', 'TCA', 'CAA', 'AAA', 'AAT', 'ATC', 'CAT', 'TGC', 'GCC', 'CCC', 'CCT', 'CTT', 'TTA', 'TAT', 'ACC', 'CTG', 'TGG', 'GGG', 'GGC', 'GCT', 'CTA', 'TAC', 'ACA', 'CAC', 'GTG', 'GCA', 'CAG', 'AGA', 'GAA', 'AAC', 'AAG', 'AGG', 'AGC', 'GCG', 'CGA', 'CCG', 'CGC', 'GAG', 'GGT', 'GTT', 'TAA', 'CCA', 'TCC', 'TCT', 'TGT', 'TTC', 'CTC', 'AGT', 'TCG', 'CGG', 'GGA', 'GAT', 'ACT', 'TAG', 'GTA', 'ATA', 'TTG', 'TTT']


Using nltk.util.ngrams() is faster than the simple extraction algorithm:

3-grams -> 53 sec < 1 min

(3,4)-grams -> 270 sec = 4.5 min

(3,5)-grams ~ 16 min

(3,6)-grams ~ 42 min

(3,8)-grams ~ 2 h

In [6]:
def get_ngrams_for_gene(id_gene, gene_seq, n_min=3, n_max=3):
    """Extract the unique n-grams of one gene, preserving first-occurrence order.

    Parameters
    ----------
    id_gene : int
        Index of the gene; used only for progress reporting (a timestamp line
        is printed whenever ``id_gene`` is a multiple of 3000).
    gene_seq : str
        Gene sequence to split into n-grams. The n-gram items are joined with
        ``''.join``, so the elements of the sequence must be strings.
    n_min, n_max : int
        Inclusive range of n-gram sizes to extract.

    Returns
    -------
    tuple
        ``(gene_seq, gene_ngrams)`` where ``gene_ngrams`` is a list holding one
        list per n in ``n_min..n_max``; each inner list contains the distinct
        n-grams of that size as strings, in order of first appearance.
    """
    if(id_gene % 3000) == 0:
        print(f'time: {timer()}\tid_gene={id_gene}')

    gene_ngrams = []

    for n in range(n_min, n_max+1):
        # OrderedDict.fromkeys de-duplicates the n-gram tuples in one linear
        # pass and keeps first-occurrence order; this replaces the quadratic
        # sorted(set(lst), key=lst.index) and avoids materialising the full
        # list of tuples first.
        unique_ngrams = OrderedDict.fromkeys(ngrams(gene_seq, n))
        gene_ngrams.append([''.join(ngram) for ngram in unique_ngrams])

    return (gene_seq, gene_ngrams)

In [19]:
# Run the nltk-based extractor over every gene and report the elapsed time.
ngram_range_min = 3
ngram_range_max = 4

start = timer()
print(f'ngram_range = ({ngram_range_min}, {ngram_range_max})\n')

nltk_gene_ngrams = []
for id_gene, gene in enumerate(all_genes_list):
    nltk_gene_ngrams.append(
        get_ngrams_for_gene(id_gene, gene, n_min=ngram_range_min, n_max=ngram_range_max)
    )

end = timer()
print(f'\nend time: {end}')
print(f'\ncommon time: {end - start:.2} sec')

ngram_range = (3, 4)

time: 24006.613189151	id_gene=0
time: 24022.129358984	id_gene=3000
time: 24038.056454398	id_gene=6000
time: 24054.63016484	id_gene=9000
time: 24070.315974468	id_gene=12000
time: 24086.111833888	id_gene=15000
time: 24102.331006637	id_gene=18000
time: 24117.754916375	id_gene=21000
time: 24136.004660633	id_gene=24000
time: 24152.586541377	id_gene=27000
time: 24168.053311949	id_gene=30000
time: 24182.75913325	id_gene=33000
time: 24198.325440516	id_gene=36000
time: 24214.066148711	id_gene=39000
time: 24229.430968752	id_gene=42000
time: 24244.581115966	id_gene=45000
time: 24308.284221694	id_gene=48000

end time: 24318.329087161

common time: 3.1e+02 sec


In [20]:
print(nltk_gene_ngrams[7])

('ATGACGTCAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGGCAGAACAAAGGGCAGCGAAACCGCGAGGTTAAGCCAATCCCACAAATCTGTTCTCAGTTCGGATCGCAGTCTGCAACTCGACTGCGTGAAGCTGGAATCGCTAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGA', [['ATG', 'TGA', 'GAC', 'ACG', 'CGT', 'GTC', 'TCA', 'CAA', 'AAA', 'AAT', 'ATC', 'CAT', 'TGC', 'GCC', 'CCC', 'CCT', 'CTT', 'TTA', 'TAT', 'ACC', 'CTG', 'TGG', 'GGG', 'GGC', 'GCT', 'CTA', 'TAC', 'ACA', 'CAC', 'GTG', 'GCA', 'CAG', 'AGA', 'GAA', 'AAC', 'AAG', 'AGG', 'AGC', 'GCG', 'CGA', 'CCG', 'CGC', 'GAG', 'GGT', 'GTT', 'TAA', 'CCA', 'TCC', 'TCT', 'TGT', 'TTC', 'CTC', 'AGT', 'TCG', 'CGG', 'GGA', 'GAT', 'ACT', 'TAG', 'GTA', 'ATA', 'TTG', 'TTT'], ['ATGA', 'TGAC', 'GACG', 'ACGT', 'CGTC', 'GTCA', 'TCAA', 'CAAA', 'AAAT', 'AATC', 'ATCA', 'TCAT', 'CATC', 'CATG', 'ATGC', 'TGCC', 'GCCC', 'CCCC', 'CCCT', 'CCTT', 'CTTA', 'TTAT', 'TATG', 'GACC', 'ACCT', 'CCTG', 'CTGG', 'TGGG', 'GGGC', 'GGCT', 'GCTA', 'CTAC', 'TACA', 'ACAC', 'CACA', 'CACG', 'CGTG',

In [21]:
len(nltk_gene_ngrams)

49807