# BGC class analysis

In [86]:
import json
import os
from Bio import SeqIO
from pathlib import Path
import pandas as pd
import yaml

In [87]:
# Parse the notebook settings from config.yaml (resolved against the
# current working directory).
with open("config.yaml", "r") as config_file:
    notebook_configuration = yaml.safe_load(config_file)

# Echo the parsed settings as the cell output.
notebook_configuration

{'bgcflow_dir': '/datadrive/bgcflow'}

In [88]:
# BGCflow directory, taken from the "bgcflow_dir" entry of the configuration.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])

# Project whose processed outputs are analysed in this notebook.
project_name = "mq_strepto"
processed_dir = bgcflow_dir.joinpath("data", "processed", project_name)

In [102]:
# Load the antiSMASH 7.0.0 region table; the first CSV column becomes the index.
df_bgcs = pd.read_csv(
    processed_dir.joinpath("tables", "df_regions_antismash_7.0.0.csv"),
    index_col=0,
    low_memory=False,
)

In [None]:
def _protocluster_categories(gbk_path):
    """Collect the category of every protocluster feature in a GenBank file.

    Parameters
    ----------
    gbk_path : path-like
        GenBank file to parse with ``SeqIO.parse(..., "genbank")``.

    Returns
    -------
    list of str
        One entry per "category" qualifier value found on features whose
        type is "protocluster", in file order. Empty when there are none.
    """
    categories = []
    for record in SeqIO.parse(gbk_path, "genbank"):
        for feat in record.features:
            if feat.type == "protocluster":
                # Biopython stores each qualifier as a list of strings, so
                # extend() keeps the result flat; .get() tolerates a
                # protocluster that carries no "category" qualifier.
                categories.extend(feat.qualifiers.get("category", []))
    return categories


antismash_dir = bgcflow_dir / "data" / "interim" / "antismash" / "7.0.0"

# Gather the results in index order and attach them in a single assignment:
# writing a list into one cell with .loc is fragile (pandas may try to
# broadcast the list or raise on a length mismatch) and is slow row by row.
proto_categories = []
for bgc_id, genome_id in df_bgcs["genome_id"].items():
    gbk_path = antismash_dir / genome_id / f"{bgc_id}.gbk"
    if gbk_path.is_file():
        proto_categories.append(_protocluster_categories(gbk_path))
    else:
        # Report regions whose GenBank file is missing; they get an empty list.
        print(genome_id, bgc_id)
        proto_categories.append([])

# dtype=object keeps every row's value as a Python list.
df_bgcs["proto_category"] = pd.Series(
    proto_categories, index=df_bgcs.index, dtype=object
)

In [107]:
# Save the table, now including "proto_category", next to the input table.
# NOTE(review): list values are written to CSV as their text form, so they
# will presumably need parsing when the file is read back - confirm downstream.
df_bgcs.to_csv(
    processed_dir.joinpath("tables", "df_regions_antismash_cat_7.0.0.csv")
)

In [108]:
# Show the region table, including the new "proto_category" column, as the cell output.
df_bgcs

Unnamed: 0_level_0,genome_id,region,accession,start_pos,end_pos,contig_edge,product,region_length,most_similar_known_cluster_id,most_similar_known_cluster_description,most_similar_known_cluster_type,similarity,source,gbk_path,proto_category
bgc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
NC_003155.5.region001,GCF_000009765.2,1.10,NC_003155.5,76963,97013,False,['terpene'],20050,BGC0000683,avermitilol,Terpene,1.000000,bgcflow,data/interim/antismash/7.0.0/GCF_000009765.2/N...,[terpene]
NC_003155.5.region002,GCF_000009765.2,1.20,NC_003155.5,292967,312632,False,['lassopeptide'],19665,BGC0001539,cattlecin,RiPP,0.750000,bgcflow,data/interim/antismash/7.0.0/GCF_000009765.2/N...,[RiPP]
NC_003155.5.region003,GCF_000009765.2,1.30,NC_003155.5,479882,586556,False,['T1PKS'],106674,BGC0000059,filipin,Polyketide,1.000000,bgcflow,data/interim/antismash/7.0.0/GCF_000009765.2/N...,[PKS]
NC_003155.5.region004,GCF_000009765.2,1.40,NC_003155.5,734502,778127,False,['NRPS'],43625,BGC0001574,diisonitrile antibiotic SF2768,NRP,0.666667,bgcflow,data/interim/antismash/7.0.0/GCF_000009765.2/N...,[NRPS]
NC_003155.5.region005,GCF_000009765.2,1.50,NC_003155.5,958642,1026513,False,"['PKS-like', 'NRPS-like', 'T1PKS', 'NRPS']",67871,BGC0001202,landepoxcin,NRP+Polyketide,0.111111,bgcflow,data/interim/antismash/7.0.0/GCF_000009765.2/N...,"[PKS, NRPS, PKS, NRPS, NRPS]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CP109072.region022,NBC_01808,1.22,CP109072,7727300,7823761,False,"['other', 'T3PKS', 'NRPS']",96461,BGC0002314,corbomycin,NRP,0.851852,bgcflow,data/interim/antismash/7.0.0/NBC_01808/CP10907...,"[other, PKS, NRPS]"
CP109072.region023,NBC_01808,1.23,CP109072,7851222,7892626,False,['terpene'],41404,BGC0000663,hopene,Terpene,0.692308,bgcflow,data/interim/antismash/7.0.0/NBC_01808/CP10907...,[terpene]
CP109072.region024,NBC_01808,1.24,CP109072,7913630,7961223,False,['NRPS'],47593,BGC0001975,atratumycin,NRP,0.052632,bgcflow,data/interim/antismash/7.0.0/NBC_01808/CP10907...,[NRPS]
CP109072.region025,NBC_01808,1.25,CP109072,8000859,8014473,False,['NI-siderophore'],13614,BGC0002466,peucechelin,NRP,0.200000,bgcflow,data/interim/antismash/7.0.0/NBC_01808/CP10907...,[other]
