This is the first part of the word2vec pipeline.

<b>n-gram extractor for gene sequences</b>

In [1]:
import numpy as np
import os
import csv
from timeit import default_timer as timer

from Bio import SeqIO
from nltk.util import ngrams
from collections import OrderedDict

### Read seq

In [2]:
# Directory containing the gene nucleotide FASTA files (the path suggests
# Prodigal output). The trailing slash matters: file names are appended to
# this string by plain concatenation further down.
path_gene_nuc = '../../sf_data/prodigal_output/gene_nuc/'

In [3]:
# Organism name = everything before the second underscore of the file name
# (e.g. 'Bacillus_subtilis'). A name with fewer than two underscores
# raises ValueError from str.index.
organisms_names = []
for file_name in os.listdir(path_gene_nuc):
    first_sep = file_name.index('_')
    second_sep = file_name.index('_', first_sep + 1)
    organisms_names.append(file_name[:second_sep])
organisms_names

['Bacillus_amyloliquefaciens',
 'Bacillus_atrophaeus',
 'Bacillus_halotolerans',
 'Bacillus_licheniformis',
 'Bacillus_mojavensis',
 'Bacillus_paralicheniformis',
 'Bacillus_siamensis',
 'Bacillus_sonorensis',
 'Bacillus_subtilis',
 'Bacillus_tequilensis',
 'Bacillus_vallismortis',
 'Bacillus_velezensis']

In [4]:
# Full path of every entry in the gene directory (directory prefix + file name).
full_path_gene_nuc = [f'{path_gene_nuc}{file_name}' for file_name in os.listdir(path_gene_nuc)]

In [5]:
# One FASTA record iterator per input file.
organisms = [SeqIO.parse(gene_nuc_file, 'fasta') for gene_nuc_file in full_path_gene_nuc]
org_gene_lst = []

# Flatten every gene of every organism into
# ('<index>_<organism name>', gene id, sequence) tuples.
# Files and names are paired by position, so both lists must share one order.
for id_org, (organism, org_name) in enumerate(zip(organisms, organisms_names)):
    org_label = f'{id_org}_{org_name}'
    for gene_record in organism:
        # Drop every 'N' character from the sequence before storing it.
        gene_seq = str(gene_record.seq).replace('N', '')
        org_gene_lst.append((org_label, gene_record.id, gene_seq))

In [6]:
# Spot-check: show the record at index 7 (organism label, gene id, sequence).
org_gene_lst[7]

('0_Bacillus_amyloliquefaciens',
 'NC_014551.1_8',
 'ATGACGTCAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGGCAGAACAAAGGGCAGCGAAACCGCGAGGTTAAGCCAATCCCACAAATCTGTTCTCAGTTCGGATCGCAGTCTGCAACTCGACTGCGTGAAGCTGGAATCGCTAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGA')

In [7]:
# Spot-check: show the last record (belongs to the last organism read).
org_gene_lst[-1]

('11_Bacillus_velezensis',
 'NZ_CP011937.1_3760',
 'TTGTGGATAAGTTGTGAAAAAGACAGGAAACACAGCCTATCCACATGTGGACAGACTGTGTATGGCTATGAAAAAGTTATTCACACTTTCCCCGATTCATCCGAGGACACT')

In [52]:
# Total number of genes collected across all organisms.
len(org_gene_lst)

49807

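### func: n-gram extractor (order-preserving, n from 3 to 8)
Based on my simple algorithm: slide over the sequence and, at each start position, emit the n-grams of every requested length.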

Input: [(org_id + org_name, gene_id, gene_seq)]

Output: a CSV file with one row per gene: (org_id + org_name, gene_id, gene_seq, [ngram_seq])

In [60]:
def ngram_extractor_csv(file_name, org_g_lst, n_min=3, n_max=3):
    """Write the ordered n-grams of every gene to a CSV file.

    For each gene the n-grams are listed by start position; at each start
    position the lengths ``n_min..n_max`` are emitted in increasing order.
    N-grams that would run past the end of the sequence are skipped, so a
    sequence shorter than ``n_min`` yields an empty list.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to create (an existing file is overwritten).
    org_g_lst : iterable of (org, g_id, g_seq)
        Organism label, gene id and gene sequence string for each gene.
    n_min, n_max : int
        Inclusive range of n-gram lengths to extract.

    Returns
    -------
    None
        One row per gene is written: ``org, g_id, g_seq, ngrams``. The
        n-gram list is stored as its ``str()`` representation because the
        csv writer stringifies non-string fields.

    Progress (elapsed seconds and gene index) is printed every 100 genes.
    """
    ngram_range = range(n_min, n_max + 1)

    # newline='' is required by the csv module; without it, platforms that
    # translate '\n' turn the writer's '\r\n' into '\r\r\n' (blank rows).
    with open(file_name, mode='w', newline='') as ngrams_file:
        ngrams_writer = csv.writer(ngrams_file, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_MINIMAL)

        start = timer()

        for g_num, (org, g_id, g_seq) in enumerate(org_g_lst):
            if g_num % 100 == 0:
                # Fixed-point format: plain '.2' would render 310 s as '3.1e+02'.
                print(f'time: {timer()-start:.2f} sec\tid_gene={g_num}')

            seq_len = len(g_seq)
            # No n-gram can start in the last (n_min - 1) positions; longer
            # n-grams are filtered by the end-of-sequence check.
            ngrams_with_order = [
                g_seq[indx:indx + n]
                for indx in range(seq_len - (n_min - 1))
                for n in ngram_range
                if indx + n <= seq_len
            ]

            ngrams_writer.writerow((org, g_id, g_seq, ngrams_with_order))

In [None]:
# rather quick
# 310 sec for all genes!

start = timer()

gene_number = 10000  # genes written to each output CSV file

# Derive the number of chunks from the data instead of hard-coding 5:
# larger inputs are no longer truncated at 50000 genes, and smaller inputs
# no longer produce empty files with a max bound below the min bound.
chunk_count = (len(org_gene_lst) + gene_number - 1) // gene_number

# open(..., mode='w') does not create missing directories.
os.makedirs('data', exist_ok=True)

for n in range(chunk_count):
    min_bound = n*gene_number
    # Inclusive index of the last gene in this chunk, clamped to the list end.
    max_bound = min((n+1)*gene_number-1, len(org_gene_lst)-1)

    csv_name = 'data/gene_ngrams_' + str(min_bound) + '_' + str(max_bound) + '.csv'

    print(f'{min_bound}-{max_bound} genes:\n')
    ngram_extractor_csv(csv_name, org_gene_lst[min_bound:max_bound+1], n_min=3, n_max=8)

    # Fixed-point format: plain '.2' would render 310 s as '3.1e+02'.
    print(f'\n=====\ntotal time: {timer()-start:.2f} sec\t({min_bound}-{max_bound} genes)\n\n')