# Bacterial Ensemble Data Processing

This notebook creates a language model dataset from an ensemble of bacterial genomes. This dataset will be used for unsupervised learning, so it will simply be the text of the genome.

#### Data Source
All genomes are downloaded from [NCBI](https://www.ncbi.nlm.nih.gov/genome/browse#!/overview/)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.text import *
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
import networkx as nx

In [3]:
sys.path.append("../../..")
from utils import *

In [6]:
path = Path('/mnt/wd_4tb/shared_disk_wd4tb/mattscicluna/data/genomic_ulmfit/bacterial_genomes/')

Genome files used:

In [9]:
os.listdir(path/'genome_fastas')

['GCF_000752395.1_Bacillus_andreraoultii_genomic.fna']

In [27]:
valid_pct = 0.1
dfs_trn = []
dfs_val = []
for file in os.listdir(path/'genome_fastas'):
    source = '_'.join(file.split('.')[1].split('_')[1:-1])
    #source = file.split('.')[0]
    
    data = process_fasta(path/'genome_fastas'/file, 2000, 900)
    
    df = pd.DataFrame(data, columns=['Sequence'])
    df['Source'] = source
    cut = int((1-valid_pct) * len(df)) + 1
    train_df, valid_df = df[:cut], df[cut:]
    dfs_trn.append(train_df)
    dfs_val.append(valid_df)

df_trn = pd.concat(dfs_trn)
df_trn['set'] = 'train'
df_val = pd.concat(dfs_val)
df_val['set'] = 'valid'

data_df = pd.concat(dfs_trn+dfs_val)
data_df.reset_index(inplace=True, drop=True)
data_df.to_csv(path/'bacterial_data.csv', index=False)

In [28]:
data_df.head()

Unnamed: 0,Sequence,Source
0,AGACGCTCTATCCAATTGAGCTACGGGCGCATATAAATGGTGCCGA...,Bacillus_andreraoultii
1,TATAGGAATTGTATTTACGGGATTTCCGCATAAATTTTACACATTT...,Bacillus_andreraoultii
2,AAGTCAATGATTATCTTCCAACGAAAGTCCGGGTTTTATCGTCTAT...,Bacillus_andreraoultii
3,CATGAGCTAGCGAAATCGCACTTTCGAGTAGAACGTGAACAGACGT...,Bacillus_andreraoultii
4,TAAATGGTTTAATTAACTATAACATACTTGACCTTGCGAAAAAAAC...,Bacillus_andreraoultii


In [41]:
# small test to see if our datasets are the same
assert "AGACGCTCTATCCAATTGAGCTACGGGCGCATATAAATGGTGCCGA" == data_df.iloc[0]['Sequence'][:46]
assert "TATAGGAATTGTATTTACGGGATTTCCGCATAAATTTTACACATTT" == data_df.iloc[1]['Sequence'][:46]
assert "AAGTCAATGATTATCTTCCAACGAAAGTCCGGGTTTTATCGTCTAT" == data_df.iloc[2]['Sequence'][:46]
assert "CATGAGCTAGCGAAATCGCACTTTCGAGTAGAACGTGAACAGACGT" == data_df.iloc[3]['Sequence'][:46]
assert "TAAATGGTTTAATTAACTATAACATACTTGACCTTGCGAAAAAAAC" == data_df.iloc[4]['Sequence'][:46]