# Bacterial Ensemble Data Processing

This notebook creates a language model dataset from an ensemble of bacterial genomes. This dataset will be used for unsupervised learning, so it will simply be the text of the genome.

#### Data Source
All genomes are downloaded from [NCBI](https://www.ncbi.nlm.nih.gov/genome/browse#!/overview/)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [13]:
from fastai import *
from fastai.text import *
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
import networkx as nx

In [3]:
sys.path.append("..")
from utils import *

In [6]:
path = Path('data/')

Genome files used:

In [8]:
os.listdir(path/'fasta')

['NZ_CP008758.1.fasta',
 'NC_006351.1.fasta',
 'NC_017831.1.fasta',
 'NZ_CP009929.1.fasta',
 'NC_006349.2.fasta',
 'NC_002516.2.fasta',
 'NZ_CP013450.1.fasta',
 'NC_017832.1.fasta',
 'NZ_CP008782.1.fasta',
 'NC_006350.1.fasta',
 'NZ_CP069336.1.fasta',
 'NZ_CP009727.1.fasta',
 'NC_006348.1.fasta',
 'NC_000913.3.fasta',
 'NZ_CM001156.1.fasta',
 'NZ_CP009728.1.fasta',
 'NZ_CP013451.1.fasta',
 'NZ_CP050021.1.fasta',
 'NZ_CP013413.1.fasta',
 'NZ_CP008759.1.fasta',
 'NZ_CM001157.1.fasta',
 'NZ_CP013411.1.fasta',
 'NZ_CP068710.1.fasta',
 'NZ_CP068709.1.fasta',
 'NC_002695.2.fasta',
 'NZ_CP013452.1.fasta',
 'NZ_CP008781.1.fasta',
 'NZ_CP009930.1.fasta',
 'NZ_CP050020.1.fasta']

In [25]:
valid_pct = 0.1
dfs_trn = []
dfs_val = []
for file in os.listdir(path/'fasta'):
    for record in SeqIO.parse(path/'fasta'/file, "fasta"):
        source = record.description
    
    data = process_fasta(path/'fasta'/file, 2000, 900)
    
    df = pd.DataFrame(data, columns=['Sequence'])
    df['Source'] = source
    cut = int((1-valid_pct) * len(df)) + 1
    train_df, valid_df = df[:cut], df[cut:]
    dfs_trn.append(train_df)
    dfs_val.append(valid_df)

df_trn = pd.concat(dfs_trn)
df_trn['set'] = 'train'
df_val = pd.concat(dfs_val)
df_val['set'] = 'valid'

data_df = pd.concat(dfs_trn+dfs_val)
data_df.reset_index(inplace=True, drop=True)
data_df.to_csv(path/'bacterial_data.csv', index=False)

In [26]:
data_df.head()

Unnamed: 0,Sequence,Source
0,CGTTCGATCATCGGTTCCCGCATTCATACCACCCCGACACGACGGC...,NZ_CP008758.1 Burkholderia pseudomallei strain...
1,CGTCGTGTTCGACGGCGCGCGCCATCACAGCATGGCGCGCGATCGC...,NZ_CP008758.1 Burkholderia pseudomallei strain...
2,CAGGCGATCGATTTCACGACGGTGATCTATCGCGTGCCGACGCAGC...,NZ_CP008758.1 Burkholderia pseudomallei strain...
3,AATGCGTGGCGAACGGCTCACAGGCGCAGGCGTGCGATGCGCTCGA...,NZ_CP008758.1 Burkholderia pseudomallei strain...
4,AGCGCGGCCGGAAACGACGGGTATTCGCGCGCCGGCCGGGCGAGCT...,NZ_CP008758.1 Burkholderia pseudomallei strain...


In [27]:
data_df.shape

(113675, 2)