Started 30.05.2019

It is first part of word2vec pipeline

<b>ngram extractor from gene sequence</b>

In [12]:
import numpy as np
import os
import csv
from timeit import default_timer as timer

from Bio import SeqIO
from nltk.util import ngrams
from collections import OrderedDict

### Read seq

In [2]:
path_gene_nuc = '../../sf_data/prodigal_output/gene_nuc/'

In [3]:
organisms_names = [f[:f.index('_', f.index('_') + 1)] for f in os.listdir(path_gene_nuc)]
organisms_names

['Bacillus_amyloliquefaciens',
 'Bacillus_atrophaeus',
 'Bacillus_halotolerans',
 'Bacillus_licheniformis',
 'Bacillus_mojavensis',
 'Bacillus_paralicheniformis',
 'Bacillus_siamensis',
 'Bacillus_sonorensis',
 'Bacillus_subtilis',
 'Bacillus_tequilensis',
 'Bacillus_vallismortis',
 'Bacillus_velezensis']

In [4]:
full_path_gene_nuc = [path_gene_nuc + f for f in os.listdir(path_gene_nuc)]

In [9]:
organisms = [SeqIO.parse(gene_nuc_file, 'fasta') for gene_nuc_file in full_path_gene_nuc]
org_gene_lst = []

for (id_org, organism), org_name in zip(enumerate(organisms), organisms_names):
    for gene_record in organism:
        org_gene_lst.append(('_'.join((str(id_org), org_name)), gene_record.id,
                             str(gene_record.seq).replace('N', '')))

In [11]:
org_gene_lst[7]

('0_Bacillus_amyloliquefaciens',
 'NC_014551.1_8',
 'ATGACGTCAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGGCAGAACAAAGGGCAGCGAAACCGCGAGGTTAAGCCAATCCCACAAATCTGTTCTCAGTTCGGATCGCAGTCTGCAACTCGACTGCGTGAAGCTGGAATCGCTAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGA')

In [10]:
org_gene_lst[-1]

('11_Bacillus_velezensis',
 'NZ_CP011937.1_3760',
 'TTGTGGATAAGTTGTGAAAAAGACAGGAAACACAGCCTATCCACATGTGGACAGACTGTGTATGGCTATGAAAAAGTTATTCACACTTTCCCCGATTCATCCGAGGACACT')

In [5]:
# Read fasta data with deleting symbol N in nuc seq
organisms = [SeqIO.parse(gene_nuc_file, 'fasta') for gene_nuc_file in full_path_gene_nuc]
all_genes_list = []

for (id_org, organism) in enumerate(organisms):
    for gene_record in organism:
        all_genes_list.append(str(gene_record.seq).replace('N', ''))

### Функция n-gram extractor (save order (3,8))
Based on my Simple algorithm

Input: [(org_id + org_name, gene_id, gene_seq)]

Output: [(org_id + org_name, gene_id, gene_seq, [ngram_seq])]

In [53]:
def ngram_extractor_csv(file_name, org_g_lst, n_min=3, n_max=3):
    ngram_range = list(range(n_min, n_max+1))
    
    with open(file_name, mode='w') as ngrams_file:
        ngrams_writer = csv.writer(ngrams_file, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_MINIMAL)
        
        start = timer()
        
        for (g_num, (org, g_id, g_seq)) in enumerate(org_g_lst):
            if(g_num) % 50 == 0:
                print(f'time: {timer()-start:.2} sec\tid_gene={g_num}')
            
            ngrams_with_order = []
            
            [ngrams_with_order.append(g_seq[indx:indx+n])
             for indx in range(len(g_seq))
             for n in ngram_range]
        
            ngrams_with_order = [ngram for ngram in ngrams_with_order
                                 if len(ngram) >= min(ngram_range)]
        
            ngrams_with_order_unique = sorted(set(ngrams_with_order), key=ngrams_with_order.index)

            ngrams_writer.writerow((org, g_id, g_seq, ngrams_with_order_unique))

In [54]:
ngram_extractor_csv('gene_ngrams_1.csv', org_gene_lst[:500], n_min=3, n_max=8)

time: 4.3e-06 sec	id_gene=0
time: 4.3e+01 sec	id_gene=50
time: 9.7e+01 sec	id_gene=100
time: 1.3e+02 sec	id_gene=150
time: 1.9e+02 sec	id_gene=200
time: 2.2e+02 sec	id_gene=250
time: 2.6e+02 sec	id_gene=300
time: 4.2e+02 sec	id_gene=350
time: 4.7e+02 sec	id_gene=400
time: 5.1e+02 sec	id_gene=450


In [None]:
#ngram_extractor_csv('gene_ngrams_0_4999.csv', org_gene_lst[:5000], n_min=3, n_max=8)

### Using nltk.util.ngrams()

In [None]:
#http://www.nltk.org/_modules/nltk/util.html

In [62]:
gene_x_3grams = list(ngrams(gene_x, 3))
gene_x_3grams[:5]

[('A', 'T', 'G'),
 ('T', 'G', 'A'),
 ('G', 'A', 'C'),
 ('A', 'C', 'G'),
 ('C', 'G', 'T')]

In [63]:
gene_x_3grams_unique = sorted(set(gene_x_3grams), key=gene_x_3grams.index)
gene_x_3grams_unique[:5]

[('A', 'T', 'G'),
 ('T', 'G', 'A'),
 ('G', 'A', 'C'),
 ('A', 'C', 'G'),
 ('C', 'G', 'T')]

In [43]:
len(gene_x_3grams), len(gene_x_3grams_unique)

(253, 63)

In [61]:
gene_x_3grams_unique_lst = [''.join(ngram) for ngram in gene_x_3grams_unique]
print(gene_x_3grams_unique_lst)

['ATG', 'TGA', 'GAC', 'ACG', 'CGT', 'GTC', 'TCA', 'CAA', 'AAA', 'AAT', 'ATC', 'CAT', 'TGC', 'GCC', 'CCC', 'CCT', 'CTT', 'TTA', 'TAT', 'ACC', 'CTG', 'TGG', 'GGG', 'GGC', 'GCT', 'CTA', 'TAC', 'ACA', 'CAC', 'GTG', 'GCA', 'CAG', 'AGA', 'GAA', 'AAC', 'AAG', 'AGG', 'AGC', 'GCG', 'CGA', 'CCG', 'CGC', 'GAG', 'GGT', 'GTT', 'TAA', 'CCA', 'TCC', 'TCT', 'TGT', 'TTC', 'CTC', 'AGT', 'TCG', 'CGG', 'GGA', 'GAT', 'ACT', 'TAG', 'GTA', 'ATA', 'TTG', 'TTT']


using nltk.util.ngrams() is faster than Simple extract algorithm

3-grams -> 53 sec < 1 min

(3,4)-grams -> 270 sec = 4.5 min

(3,5)-grams ~ 16 min

(3,6)-grams ~ 42 min

(3,8)-grams ~ 2 h

In [6]:
def get_ngrams_for_gene(id_gene, gene_seq, n_min=3, n_max=3):
    if(id_gene % 3000) == 0:
        print(f'time: {timer()}\tid_gene={id_gene}')
    
    gene_ngrams = []
    
    for n in range(n_min, n_max+1):
        ngrams_lst = list(ngrams(gene_seq, n))
        ngrams_set = sorted(set(ngrams_lst), key=ngrams_lst.index)
        ngrams_set = [''.join(ngram) for ngram in ngrams_set]
        
        gene_ngrams.append(ngrams_set)
    
    return (gene_seq, gene_ngrams)

In [19]:
ngram_range_min = 3
ngram_range_max = 4

start = timer()
print(f'ngram_range = ({ngram_range_min}, {ngram_range_max})\n')

nltk_gene_ngrams = [get_ngrams_for_gene(id_gene, gene, n_min=ngram_range_min, n_max=ngram_range_max)
                    for (id_gene, gene) in enumerate(all_genes_list)]

end = timer()
print(f'\nend time: {end}')
print(f'\ncommon time: {end - start:.2} sec')

ngram_range = (3, 4)

time: 24006.613189151	id_gene=0
time: 24022.129358984	id_gene=3000
time: 24038.056454398	id_gene=6000
time: 24054.63016484	id_gene=9000
time: 24070.315974468	id_gene=12000
time: 24086.111833888	id_gene=15000
time: 24102.331006637	id_gene=18000
time: 24117.754916375	id_gene=21000
time: 24136.004660633	id_gene=24000
time: 24152.586541377	id_gene=27000
time: 24168.053311949	id_gene=30000
time: 24182.75913325	id_gene=33000
time: 24198.325440516	id_gene=36000
time: 24214.066148711	id_gene=39000
time: 24229.430968752	id_gene=42000
time: 24244.581115966	id_gene=45000
time: 24308.284221694	id_gene=48000

end time: 24318.329087161

common time: 3.1e+02 sec


In [20]:
print(nltk_gene_ngrams[7])

('ATGACGTCAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGGCAGAACAAAGGGCAGCGAAACCGCGAGGTTAAGCCAATCCCACAAATCTGTTCTCAGTTCGGATCGCAGTCTGCAACTCGACTGCGTGAAGCTGGAATCGCTAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGA', [['ATG', 'TGA', 'GAC', 'ACG', 'CGT', 'GTC', 'TCA', 'CAA', 'AAA', 'AAT', 'ATC', 'CAT', 'TGC', 'GCC', 'CCC', 'CCT', 'CTT', 'TTA', 'TAT', 'ACC', 'CTG', 'TGG', 'GGG', 'GGC', 'GCT', 'CTA', 'TAC', 'ACA', 'CAC', 'GTG', 'GCA', 'CAG', 'AGA', 'GAA', 'AAC', 'AAG', 'AGG', 'AGC', 'GCG', 'CGA', 'CCG', 'CGC', 'GAG', 'GGT', 'GTT', 'TAA', 'CCA', 'TCC', 'TCT', 'TGT', 'TTC', 'CTC', 'AGT', 'TCG', 'CGG', 'GGA', 'GAT', 'ACT', 'TAG', 'GTA', 'ATA', 'TTG', 'TTT'], ['ATGA', 'TGAC', 'GACG', 'ACGT', 'CGTC', 'GTCA', 'TCAA', 'CAAA', 'AAAT', 'AATC', 'ATCA', 'TCAT', 'CATC', 'CATG', 'ATGC', 'TGCC', 'GCCC', 'CCCC', 'CCCT', 'CCTT', 'CTTA', 'TTAT', 'TATG', 'GACC', 'ACCT', 'CCTG', 'CTGG', 'TGGG', 'GGGC', 'GGCT', 'GCTA', 'CTAC', 'TACA', 'ACAC', 'CACA', 'CACG', 'CGTG',

In [21]:
len(nltk_gene_ngrams)

49807