In [8]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from Bio.Blast import NCBIWWW, NCBIXML

# Raise pandas' display limits to 500 rows / 500 columns so that wide or long
# frames are rendered in full instead of being truncated in cell output.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 500)

In [20]:
# `folder_path` is where the input CSVs are read from; `processed_path` is where
# the generated FASTA files are written. Both keep their trailing slash because
# file names are appended to them with plain string concatenation.
folder_path, processed_path = "../data/", "../data_processed/"

In [3]:
# Load the train and test feature tables from `folder_path`.
train_values, test_values = (
    pd.read_csv(folder_path + csv_name)
    for csv_name in ("train_values.csv", "test_values.csv")
)

In [4]:
# Preview the first five rows of the training table.
train_values.head(n=5)

Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,growth_strain_ccdb_survival,growth_strain_dh10b,growth_strain_dh5alpha,growth_strain_neb_stable,growth_strain_other,growth_strain_stbl3,growth_strain_top10,growth_strain_xl1_blue,growth_temp_30,growth_temp_37,growth_temp_other,selectable_markers_blasticidin,selectable_markers_his3,selectable_markers_hygromycin,selectable_markers_leu2,selectable_markers_neomycin,selectable_markers_other,selectable_markers_puromycin,selectable_markers_trp1,selectable_markers_ura3,selectable_markers_zeocin,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## 1. generate fasta files

In [41]:
# Number of groups each dataset is split into; one FASTA file is written per group.
num_groups = 100

In [42]:
# Report how many sequences each dataset holds and roughly how many will land
# in each of the `num_groups` groups.
num_seqs_train = len(train_values)
num_seqs_test = len(test_values)
print(num_seqs_train, "in training data")
print(num_seqs_test, "in testing data")

# Floor division yields the same whole-number estimate as int(a / b) for these
# non-negative counts, without a round-trip through float.
print(num_seqs_train // num_groups, "per group in training")
print(num_seqs_test // num_groups, "per group in testing")

63017 in training data
18816 in testing data
630 per group in trianing
188 per group in testing


In [43]:
# Assign every row to one of `num_groups` buckets by cutting the frame's index
# values into equal-width bins. labels=False makes pd.cut return integer bin
# codes instead of Interval labels, so the codes can be matched with `== i` later.
group_id_train, group_id_test = (
    pd.cut(frame.index.values, bins=num_groups, labels=False)
    for frame in (train_values, test_values)
)

# Store the bucket codes on the frames themselves (adds/overwrites "group_id").
train_values["group_id"], test_values["group_id"] = group_id_train, group_id_test

### 1.1 train data

In [44]:
# Write one FASTA file per training group: train_seqs_group_<k>.fasta in
# `processed_path`, with each record as ">" + sequence_id, newline, sequence.
for group_idx in range(num_groups):
    # Read-only slice of the rows belonging to this group (no copy needed).
    group_rows = train_values[train_values.group_id == group_idx]
    fasta_path = processed_path + f"train_seqs_group_{group_idx}.fasta"
    with open(fasta_path, "w") as fasta_file:
        # Iterate the two needed columns directly instead of iterrows(); this
        # avoids building a Series per row and no longer reuses the outer
        # loop's counter name for the row label.
        for seq_id, seq in zip(group_rows["sequence_id"], group_rows["sequence"]):
            fasta_file.write(">" + seq_id + "\n" + seq + "\n")


### 1.2 test data

In [45]:
# Write one FASTA file per test group: test_seqs_group_<k>.fasta in
# `processed_path`, with each record as ">" + sequence_id, newline, sequence.
for group_idx in range(num_groups):
    # Read-only slice of the rows belonging to this group (no copy needed).
    group_rows = test_values[test_values.group_id == group_idx]
    fasta_path = processed_path + f"test_seqs_group_{group_idx}.fasta"
    with open(fasta_path, "w") as fasta_file:
        # Iterate the two needed columns directly instead of iterrows(); this
        # avoids building a Series per row and no longer reuses the outer
        # loop's counter name for the row label.
        for seq_id, seq in zip(group_rows["sequence_id"], group_rows["sequence"]):
            fasta_file.write(">" + seq_id + "\n" + seq + "\n")

## 2. BLAST

In [46]:
from Bio.Blast import NCBIWWW
from Bio import SeqIO 

In [47]:
# Read the FASTA text for test group 10 and submit it to NCBI BLAST
# (program "blastn", database "nt"). The file is opened in a `with` block so
# the handle is closed as soon as its contents have been read.
with open(processed_path + "test_seqs_group_10.fasta") as fasta_file:
    sequence_data = fasta_file.read()

# NOTE(review): qblast sends the whole file to NCBI's web service; the recorded
# run of this cell ended in KeyboardInterrupt, so it may be too slow for a full
# group — consider submitting fewer sequences per request.
result_handle = NCBIWWW.qblast("blastn", "nt", sequence_data)

KeyboardInterrupt: 