Started 30.05.2019

It is first part of word2vec pipeline

<b>ngram extractor from gene sequence</b>

In [1]:
import numpy as np
import os
import csv
from timeit import default_timer as timer

from Bio import SeqIO
from nltk.util import ngrams
from collections import OrderedDict

### Read seq

In [2]:
path_gene_nuc = '../../sf_data/prodigal_output/gene_nuc/'

In [3]:
organisms_names = [f[:f.index('_', f.index('_') + 1)] for f in os.listdir(path_gene_nuc)]
organisms_names

['Bacillus_amyloliquefaciens',
 'Bacillus_atrophaeus',
 'Bacillus_halotolerans',
 'Bacillus_licheniformis',
 'Bacillus_mojavensis',
 'Bacillus_paralicheniformis',
 'Bacillus_siamensis',
 'Bacillus_sonorensis',
 'Bacillus_subtilis',
 'Bacillus_tequilensis',
 'Bacillus_vallismortis',
 'Bacillus_velezensis']

In [4]:
full_path_gene_nuc = [path_gene_nuc + f for f in os.listdir(path_gene_nuc)]

In [5]:
organisms = [SeqIO.parse(gene_nuc_file, 'fasta') for gene_nuc_file in full_path_gene_nuc]
org_gene_lst = []

for (id_org, organism), org_name in zip(enumerate(organisms), organisms_names):
    for gene_record in organism:
        org_gene_lst.append(('_'.join((str(id_org), org_name)), gene_record.id,
                             str(gene_record.seq).replace('N', '')))

In [6]:
org_gene_lst[7]

('0_Bacillus_amyloliquefaciens',
 'NC_014551.1_8',
 'ATGACGTCAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGGCAGAACAAAGGGCAGCGAAACCGCGAGGTTAAGCCAATCCCACAAATCTGTTCTCAGTTCGGATCGCAGTCTGCAACTCGACTGCGTGAAGCTGGAATCGCTAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGA')

In [7]:
org_gene_lst[-1]

('11_Bacillus_velezensis',
 'NZ_CP011937.1_3760',
 'TTGTGGATAAGTTGTGAAAAAGACAGGAAACACAGCCTATCCACATGTGGACAGACTGTGTATGGCTATGAAAAAGTTATTCACACTTTCCCCGATTCATCCGAGGACACT')

In [52]:
len(org_gene_lst)

49807

### func: n-gram extractor (save order (3,8))
Based on my Simple algorithm

Input: [(org_id + org_name, gene_id, gene_seq)]

Output: [(org_id + org_name, gene_id, gene_seq, [ngram_seq])]

In [60]:
def ngram_extractor_csv(file_name, org_g_lst, n_min=3, n_max=3):
    ngram_range = list(range(n_min, n_max+1))
    
    with open(file_name, mode='w') as ngrams_file:
        ngrams_writer = csv.writer(ngrams_file, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_MINIMAL)
        
        start = timer()
        
        for (g_num, (org, g_id, g_seq)) in enumerate(org_g_lst):
            if(g_num) % 100 == 0:
                print(f'time: {timer()-start:.2} sec\tid_gene={g_num}')
            
            ngrams_with_order = []
            
            [ngrams_with_order.append(g_seq[indx:indx+n])\
             for indx in range(len(g_seq) - (n_min-1))
             for n in ngram_range
             if (indx+n) <= len(g_seq)]

            ngrams_writer.writerow((org, g_id, g_seq, ngrams_with_order))

In [61]:
# rather quick
# 310 sec for all genes!

start = timer()

for n in range(5):
    gene_number = 10000
    min_bound = n*gene_number
    max_bound = min((n+1)*gene_number-1, len(org_gene_lst)-1)
    
    csv_name = 'data/gene_ngrams_' + str(min_bound) + '_' + str(max_bound) + '.csv'
    
    print(f'{min_bound}-{max_bound} genes:\n')
    ngram_extractor_csv(csv_name, org_gene_lst[min_bound:max_bound+1], n_min=3, n_max=8)
    
    print(f'\n=====\ntotal time: {timer()-start:.2} sec\t({min_bound}-{max_bound} genes)\n\n')

0-9999 genes:

time: 2.7e-06 sec	id_gene=0
time: 0.62 sec	id_gene=100
time: 1.2 sec	id_gene=200
time: 1.8 sec	id_gene=300
time: 2.6 sec	id_gene=400
time: 3.1 sec	id_gene=500
time: 3.6 sec	id_gene=600
time: 4.3 sec	id_gene=700
time: 5.0 sec	id_gene=800
time: 5.6 sec	id_gene=900
time: 6.0 sec	id_gene=1000
time: 6.6 sec	id_gene=1100
time: 7.2 sec	id_gene=1200
time: 7.8 sec	id_gene=1300
time: 8.3 sec	id_gene=1400
time: 8.9 sec	id_gene=1500
time: 9.5 sec	id_gene=1600
time: 1e+01 sec	id_gene=1700
time: 1.1e+01 sec	id_gene=1800
time: 1.2e+01 sec	id_gene=1900
time: 1.3e+01 sec	id_gene=2000
time: 1.3e+01 sec	id_gene=2100
time: 1.4e+01 sec	id_gene=2200
time: 1.5e+01 sec	id_gene=2300
time: 1.5e+01 sec	id_gene=2400
time: 1.6e+01 sec	id_gene=2500
time: 1.6e+01 sec	id_gene=2600
time: 1.7e+01 sec	id_gene=2700
time: 1.8e+01 sec	id_gene=2800
time: 1.8e+01 sec	id_gene=2900
time: 1.9e+01 sec	id_gene=3000
time: 1.9e+01 sec	id_gene=3100
time: 2e+01 sec	id_gene=3200
time: 2.1e+01 sec	id_gene=3300
time: 2.1e

time: 4.3e+01 sec	id_gene=6800
time: 4.4e+01 sec	id_gene=6900
time: 4.5e+01 sec	id_gene=7000
time: 4.5e+01 sec	id_gene=7100
time: 4.6e+01 sec	id_gene=7200
time: 4.7e+01 sec	id_gene=7300
time: 4.7e+01 sec	id_gene=7400
time: 4.8e+01 sec	id_gene=7500
time: 4.8e+01 sec	id_gene=7600
time: 4.9e+01 sec	id_gene=7700
time: 5e+01 sec	id_gene=7800
time: 5e+01 sec	id_gene=7900
time: 5.1e+01 sec	id_gene=8000
time: 5.1e+01 sec	id_gene=8100
time: 5.2e+01 sec	id_gene=8200
time: 5.3e+01 sec	id_gene=8300
time: 5.3e+01 sec	id_gene=8400
time: 5.4e+01 sec	id_gene=8500
time: 5.4e+01 sec	id_gene=8600
time: 5.6e+01 sec	id_gene=8700
time: 5.6e+01 sec	id_gene=8800
time: 5.7e+01 sec	id_gene=8900
time: 5.7e+01 sec	id_gene=9000
time: 5.8e+01 sec	id_gene=9100
time: 5.9e+01 sec	id_gene=9200
time: 5.9e+01 sec	id_gene=9300
time: 6e+01 sec	id_gene=9400
time: 6.1e+01 sec	id_gene=9500
time: 6.1e+01 sec	id_gene=9600
time: 6.2e+01 sec	id_gene=9700
time: 6.2e+01 sec	id_gene=9800
time: 6.3e+01 sec	id_gene=9900

=====
total t