# Bacterial Ensemble Data Processing

This notebook creates a language model dataset from an ensemble of bacterial genomes. The dataset will be used for unsupervised learning, so it consists simply of the raw text of the genomes.

#### Data Source
All genomes are downloaded from [NCBI](https://www.ncbi.nlm.nih.gov/genome/browse#!/overview/)

In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to local helper code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai import *
from fastai.text import *
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
import networkx as nx

In [3]:
# Add the directory three levels up to the import search path so that the
# project's `utils` module can be imported from this notebook's location.
sys.path.append("../../..")
# NOTE(review): the star import hides where names come from; `process_fasta`,
# used further down, is presumably defined in `utils` — confirm.
from utils import *

In [4]:
# Root data directory: the `genome_fastas` folder is read from here and the
# combined `bacterial_data.csv` is written here.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = Path('/mnt/wd_4tb/shared_disk_wd4tb/mattscicluna/data/genomic_ulmfit/bacterial genomes/')

Genome files used:

In [5]:
# List the genome FASTA files that feed the dataset. os.listdir returns
# entries in arbitrary order, so sort for a stable, readable listing.
sorted(os.listdir(path/'genome_fastas'))

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/wd_4tb/shared_disk_wd4tb/mattscicluna/data/genomic_ulmfit/bacterial genomes/genome_fastas'

In [None]:
# Build the language-model dataset: turn every genome FASTA file into a frame
# of sequences, split each genome's rows into train/validation, label the two
# splits, and save everything as a single CSV.
valid_pct = 0.1  # fraction of each genome's rows held out for validation
dfs_trn = []
dfs_val = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the saved CSV reproducible across runs and machines.
for file in sorted(os.listdir(path/'genome_fastas')):
    # Everything before the first '.' in the file name labels the genome.
    source = file.split('.')[0]

    # NOTE(review): process_fasta is not defined in this notebook (presumably
    # from `utils`); 2000 and 900 look like a chunk length and a stride —
    # TODO confirm against its definition.
    data = process_fasta(path/'genome_fastas'/file, 2000, 900)

    df = pd.DataFrame(data, columns=['Sequence'])
    df['Source'] = source
    # Contiguous split per genome: the first `cut` rows go to training, the
    # remainder to validation. The +1 guarantees at least one training row.
    cut = int((1-valid_pct) * len(df)) + 1
    train_df, valid_df = df[:cut], df[cut:]
    dfs_trn.append(train_df)
    dfs_val.append(valid_df)

df_trn = pd.concat(dfs_trn)
df_trn['set'] = 'train'
df_val = pd.concat(dfs_val)
df_val['set'] = 'valid'

# Combine the *labelled* frames. Concatenating the raw dfs_trn + dfs_val lists
# would silently drop the 'set' column assigned above, leaving the saved CSV
# with no way to tell training rows from validation rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data_df = pd.concat([df_trn, df_val], ignore_index=True)
data_df.to_csv(path/'bacterial_data.csv', index=False)

In [None]:
# Preview the first rows of the combined dataset as a sanity check.
data_df.head()

In [None]:
# (rows, columns) of the combined dataset.
data_df.shape