# Convert GenBank files to FASTA format

Strategy:
- Make GEM for all genomes

In [1]:
from Bio import Entrez, SeqIO
import pandas as pd
import os

### Test parsing of GenBank files

In [2]:
# One (gene_id, database_ref, translation) tuple per translated feature.
MAG_data = []

# Running count of records (scaffolds) found in the bin.
scaffolds = 0

for seq_record in SeqIO.parse("input/genbank/CH1-bin.0.gbk", "genbank"):
    scaffolds += 1

    # Walk every annotated feature on this scaffold.
    for feature in seq_record.features:
        qualifiers = feature.qualifiers

        # Features without a protein translation are skipped.
        if "translation" not in qualifiers:
            continue

        # The database cross-reference is optional; keep None when absent.
        db_ref = qualifiers["db_xref"][0] if "db_xref" in qualifiers else None

        gene_id = qualifiers["gene"][0]
        translation = qualifiers["translation"][0]

        MAG_data.append((gene_id, db_ref, translation))

In [3]:
# Number of records (scaffolds) counted while parsing the GenBank file above.
scaffolds

321

In [4]:
# Tabulate the parsed tuples; column order mirrors the tuple layout.
MAG_data_df = pd.DataFrame(
    data=MAG_data,
    columns=["gene_id", "database_ref", "translation"],
)
MAG_data_df

Unnamed: 0,gene_id,database_ref,translation
0,CH1-bin.0_NODE_133_length_76246_cov_6.126393_1,,YVYSQMIAGRSAPRYGEAKNSWLTGTAAWAFVNISQAILGIQPDFD...
1,CH1-bin.0_NODE_133_length_76246_cov_6.126393_2,ko:K05350,MEFPKQFVWGAASSSYQTEGNRSGRGDSIWDEFCTRPGAIRNNETG...
2,CH1-bin.0_NODE_133_length_76246_cov_6.126393_3,,MDISTHKRRKLLPAAAAVLFSLSVFSATAFATSNIPPQGQEINAGV...
3,CH1-bin.0_NODE_133_length_76246_cov_6.126393_4,ko:K02030,MKKALSALLLVCAIGFSMGGCSGVSSSSSSPTDPSWNTVRTKDTLV...
4,CH1-bin.0_NODE_133_length_76246_cov_6.126393_5,,MESVWGTVSNSVNEDYFKGAPMKMSTLLATANQKLSAFIRRRSFSE...
...,...,...,...
3228,CH1-bin.0_k141_4018_length_2502_cov_7.0000_1,,MKEAKMKSLFCKLKNKLGAAAV
3229,CH1-bin.0_k141_4018_length_2502_cov_7.0000_2,,MPGITLKNGLISYYGNPAGYTEKEKAVVDSIFQNDELTAWLKSRSL...
3230,CH1-bin.0_k141_4018_length_2502_cov_7.0000_3,,MKKNQMPQIGLPADACERSGFTDKDTLELHAGQNALVFMKDKMTAL...
3231,CH1-bin.0_k141_4018_length_2502_cov_7.0000_4,ko:K06223,MSWIGGKKSLRELIVTLFPLYYERYIEVFGGGGWVLFHKLPGNDFE...


In [142]:
# Number of distinct protein sequences in the table.
len(set(MAG_data_df["translation"].values))

3233

In [143]:
# Number of distinct gene identifiers in the table.
len(set(MAG_data_df["gene_id"].values))

3233

### Make FASTA files from GenBank files

In [144]:
# Convert every GenBank file (*.gbk) in the input directory into a protein
# FASTA file (*.faa) with short sequential gene IDs ("gene0", "gene1", ...),
# and record the old-ID -> new-ID mapping per MAG in `gene_ids`.
directory_in_str = "input/genbank/"
directory = os.fsencode(directory_in_str)
output_dir = "output/MAGs_fasta/"

# Create the output directory up front so opening the first FASTA file does
# not fail when the notebook is run on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Keys are (mag_name, "old_id") / (mag_name, "new_id"); values are parallel lists.
gene_ids = {}

# For each MAG
for file in os.listdir(directory):
    filename = os.fsdecode(file)

    if not filename.endswith(".gbk"):
        continue

    print(filename)

    # File name without the ".gbk" extension.
    mag_name = filename[:-4]

    gene_id_new = 0
    old_ids = gene_ids[(mag_name, "old_id")] = []
    new_ids = gene_ids[(mag_name, "new_id")] = []

    # The context manager closes the file even if parsing raises part-way through.
    with open(output_dir + mag_name + ".faa", "w") as ofile:

        # For each scaffold
        for seq_record in SeqIO.parse(directory_in_str + filename, "genbank"):

            # For each gene sequence in the scaffold
            for feature in seq_record.features:

                # Only features that carry a protein translation are exported.
                if "translation" in feature.qualifiers:
                    translation = feature.qualifiers["translation"][0]
                    gene_id_old = feature.qualifiers["gene"][0]

                    new_id = "gene" + str(gene_id_new)
                    ofile.write(">" + new_id + "\n" + translation + "\n")

                    old_ids.append(gene_id_old)
                    new_ids.append(new_id)

                    gene_id_new = gene_id_new + 1


CH15-bin.17.gbk
CH9-bin.4.gbk
CH7-bin.14.gbk
CH1-bin.8.gbk
CH13-bin.20.gbk
CH13-bin.21.gbk
CH1-bin.9.gbk
CH7-bin.15.gbk
CH9-bin.5.gbk
CH15-bin.16.gbk
CH15-bin.8.gbk
CH7-bin.17.gbk
CH13-bin.22.gbk
CH7-bin.16.gbk
CH9-bin.6.gbk
CH15-bin.15.gbk
CH7-bin.9.gbk
CH15-bin.11.gbk
CH7-bin.12.gbk
CH9-bin.2.gbk
CH7-bin.13.gbk
CH15-bin.10.gbk
CH7-bin.8.gbk
CH15-bin.12.gbk
CH7-bin.11.gbk
CH9-bin.1.gbk
CH13-bin.19.gbk
CH13-bin.25.gbk
CH13-bin.24.gbk
CH13-bin.8.gbk
CH9-bin.0.gbk
CH15-bin.13.gbk
CH8-bin.16.gbk
CH3-bin.0.gbk
CH8-bin.4.gbk
CH8-bin.5.gbk
CH3-bin.1.gbk
CH8-bin.17.gbk
CH8-bin.29.gbk
CH8-bin.7.gbk
CH8-bin.6.gbk
CH3-bin.2.gbk
CH8-bin.14.gbk
CH8-bin.2.gbk
CH8-bin.11.gbk
CH8-bin.0.gbk
CH8-bin.12.gbk
CH14-bin.2.gbk
CH8-bin.22.gbk
CH8-bin.20.gbk
CH14-bin.1.gbk
CH8-bin.21.gbk
CH8-bin.25.gbk
CH14-bin.4.gbk
CH8-bin.8.gbk
CH8-bin.9.gbk
CH8-bin.27.gbk
CH7-bin.6.gbk
CH15-bin.22.gbk
CH15-bin.2.gbk
CH7-bin.21.gbk
CH1-bin.1.gbk
CH13-bin.14.gbk
CH1-bin.0.gbk
CH13-bin.4.gbk
CH7-bin.20.gbk
CH15-bin.3.gbk
CH15

In [145]:
# Wrap each ID list in a Series before building the frame, since the lists
# can differ in length from one MAG to the next.
id_columns = {column: pd.Series(ids) for column, ids in gene_ids.items()}
gene_ids_df = pd.DataFrame.from_dict(id_columns)

gene_ids_df.head(10)

Unnamed: 0_level_0,CH15-bin.17,CH15-bin.17,CH9-bin.4,CH9-bin.4,CH7-bin.14,CH7-bin.14,CH1-bin.8,CH1-bin.8,CH13-bin.20,CH13-bin.20,...,CH1-bin.5,CH1-bin.5,CH13-bin.1,CH13-bin.1,CH7-bin.25,CH7-bin.25,CH15-bin.6,CH15-bin.6,CH7-bin.2,CH7-bin.2
Unnamed: 0_level_1,old_id,new_id,old_id,new_id,old_id,new_id,old_id,new_id,old_id,new_id,...,old_id,new_id,old_id,new_id,old_id,new_id,old_id,new_id,old_id,new_id
0,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_1,gene0,CH9-bin.4_NODE_5_length_306115_cov_17.353493_1,gene0,CH7-bin.14_NODE_40_length_181369_cov_8.935609_1,gene0,CH1-bin.8_NODE_4_length_584485_cov_43.940039_1,gene0,CH13-bin.20_NODE_283_length_49011_cov_7.985906_1,gene0,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_1,gene0,CH13-bin.1_NODE_1_length_928262_cov_19.438597_1,gene0,CH7-bin.25_NODE_53_length_154058_cov_7.769940_1,gene0,CH15-bin.6_NODE_6_length_825317_cov_75.898953_1,gene0,CH7-bin.2_NODE_4_length_505470_cov_55.993093_1,gene0
1,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_2,gene1,CH9-bin.4_NODE_5_length_306115_cov_17.353493_2,gene1,CH7-bin.14_NODE_40_length_181369_cov_8.935609_2,gene1,CH1-bin.8_NODE_4_length_584485_cov_43.940039_2,gene1,CH13-bin.20_NODE_283_length_49011_cov_7.985906_2,gene1,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_2,gene1,CH13-bin.1_NODE_1_length_928262_cov_19.438597_2,gene1,CH7-bin.25_NODE_53_length_154058_cov_7.769940_2,gene1,CH15-bin.6_NODE_6_length_825317_cov_75.898953_2,gene1,CH7-bin.2_NODE_4_length_505470_cov_55.993093_2,gene1
2,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_3,gene2,CH9-bin.4_NODE_5_length_306115_cov_17.353493_3,gene2,CH7-bin.14_NODE_40_length_181369_cov_8.935609_3,gene2,CH1-bin.8_NODE_4_length_584485_cov_43.940039_3,gene2,CH13-bin.20_NODE_283_length_49011_cov_7.985906_3,gene2,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_3,gene2,CH13-bin.1_NODE_1_length_928262_cov_19.438597_3,gene2,CH7-bin.25_NODE_53_length_154058_cov_7.769940_3,gene2,CH15-bin.6_NODE_6_length_825317_cov_75.898953_3,gene2,CH7-bin.2_NODE_4_length_505470_cov_55.993093_3,gene2
3,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_4,gene3,CH9-bin.4_NODE_5_length_306115_cov_17.353493_4,gene3,CH7-bin.14_NODE_40_length_181369_cov_8.935609_4,gene3,CH1-bin.8_NODE_4_length_584485_cov_43.940039_4,gene3,CH13-bin.20_NODE_283_length_49011_cov_7.985906_4,gene3,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_4,gene3,CH13-bin.1_NODE_1_length_928262_cov_19.438597_4,gene3,CH7-bin.25_NODE_53_length_154058_cov_7.769940_4,gene3,CH15-bin.6_NODE_6_length_825317_cov_75.898953_4,gene3,CH7-bin.2_NODE_4_length_505470_cov_55.993093_4,gene3
4,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_5,gene4,CH9-bin.4_NODE_5_length_306115_cov_17.353493_5,gene4,CH7-bin.14_NODE_40_length_181369_cov_8.935609_5,gene4,CH1-bin.8_NODE_4_length_584485_cov_43.940039_5,gene4,CH13-bin.20_NODE_283_length_49011_cov_7.985906_5,gene4,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_5,gene4,CH13-bin.1_NODE_1_length_928262_cov_19.438597_5,gene4,CH7-bin.25_NODE_53_length_154058_cov_7.769940_5,gene4,CH15-bin.6_NODE_6_length_825317_cov_75.898953_5,gene4,CH7-bin.2_NODE_4_length_505470_cov_55.993093_5,gene4
5,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_6,gene5,CH9-bin.4_NODE_5_length_306115_cov_17.353493_6,gene5,CH7-bin.14_NODE_40_length_181369_cov_8.935609_6,gene5,CH1-bin.8_NODE_4_length_584485_cov_43.940039_6,gene5,CH13-bin.20_NODE_283_length_49011_cov_7.985906_6,gene5,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_6,gene5,CH13-bin.1_NODE_1_length_928262_cov_19.438597_6,gene5,CH7-bin.25_NODE_53_length_154058_cov_7.769940_6,gene5,CH15-bin.6_NODE_6_length_825317_cov_75.898953_6,gene5,CH7-bin.2_NODE_4_length_505470_cov_55.993093_6,gene5
6,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_7,gene6,CH9-bin.4_NODE_5_length_306115_cov_17.353493_7,gene6,CH7-bin.14_NODE_40_length_181369_cov_8.935609_7,gene6,CH1-bin.8_NODE_4_length_584485_cov_43.940039_7,gene6,CH13-bin.20_NODE_283_length_49011_cov_7.985906_7,gene6,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_7,gene6,CH13-bin.1_NODE_1_length_928262_cov_19.438597_7,gene6,CH7-bin.25_NODE_53_length_154058_cov_7.769940_7,gene6,CH15-bin.6_NODE_6_length_825317_cov_75.898953_7,gene6,CH7-bin.2_NODE_4_length_505470_cov_55.993093_7,gene6
7,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_8,gene7,CH9-bin.4_NODE_5_length_306115_cov_17.353493_8,gene7,CH7-bin.14_NODE_40_length_181369_cov_8.935609_8,gene7,CH1-bin.8_NODE_4_length_584485_cov_43.940039_8,gene7,CH13-bin.20_NODE_283_length_49011_cov_7.985906_8,gene7,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_8,gene7,CH13-bin.1_NODE_1_length_928262_cov_19.438597_8,gene7,CH7-bin.25_NODE_53_length_154058_cov_7.769940_8,gene7,CH15-bin.6_NODE_6_length_825317_cov_75.898953_8,gene7,CH7-bin.2_NODE_4_length_505470_cov_55.993093_8,gene7
8,CH15-bin.17_NODE_1_length_1041084_cov_76.159582_9,gene8,CH9-bin.4_NODE_5_length_306115_cov_17.353493_9,gene8,CH7-bin.14_NODE_40_length_181369_cov_8.935609_9,gene8,CH1-bin.8_NODE_4_length_584485_cov_43.940039_9,gene8,CH13-bin.20_NODE_283_length_49011_cov_7.985906_9,gene8,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_9,gene8,CH13-bin.1_NODE_1_length_928262_cov_19.438597_9,gene8,CH7-bin.25_NODE_53_length_154058_cov_7.769940_9,gene8,CH15-bin.6_NODE_6_length_825317_cov_75.898953_9,gene8,CH7-bin.2_NODE_4_length_505470_cov_55.993093_9,gene8
9,CH15-bin.17_NODE_1_length_1041084_cov_76.15958...,gene9,CH9-bin.4_NODE_5_length_306115_cov_17.353493_10,gene9,CH7-bin.14_NODE_40_length_181369_cov_8.935609_10,gene9,CH1-bin.8_NODE_4_length_584485_cov_43.940039_10,gene9,CH13-bin.20_NODE_283_length_49011_cov_7.985906_10,gene9,...,CH1-bin.5_NODE_124_length_82203_cov_6.596983_10,gene9,CH13-bin.1_NODE_1_length_928262_cov_19.438597_10,gene9,CH7-bin.25_NODE_53_length_154058_cov_7.769940_10,gene9,CH15-bin.6_NODE_6_length_825317_cov_75.898953_10,gene9,CH7-bin.2_NODE_4_length_505470_cov_55.993093_10,gene9
