In [None]:
import pandas as pd
from pathlib import Path
import yaml
import shutil

## Description
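This notebook cleans up the dataset: it removes obsolete duplicate rows and empty rows from the sample table, then writes the sample tables and project configuration files for the trans-AT PKS projects.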

In [None]:
# Input locations, relative to the folder this notebook is run from.
data_dir = Path("../data/external/")
G1034_dir = data_dir / "G1034_20230801"

# Semicolon-delimited sample table that the following cells filter.
samples_table_path = data_dir / "further_filtered_trans_bgcs_selecting_odd_ones_for_removing_v7.csv"
samples_all = pd.read_csv(samples_table_path, sep=";")

In [None]:
## Remove obsolete duplicates
# Only bgc_ids that occur exactly twice are treated as duplicate pairs; within
# such a pair every row flagged "no" for dataset membership is discarded.
occurrences = samples_all["bgc_id"].value_counts()
paired_ids = occurrences.index[occurrences == 2]
in_pair = samples_all["bgc_id"].isin(paired_ids)
rejected = samples_all["validated-member-of-dataset"] == "no"
samples_all = samples_all.drop(index=samples_all.index[in_pair & rejected])

## Remove empty rows
# A row is empty when every one of its columns is missing.
samples_all = samples_all.dropna(how="all")

## Prepare a taxonomy file for all projects

In [None]:
# Restrict the GTDB-Tk summary to the genomes referenced by the sample table
# and store the subset under ../config, where the project configs point to it.
gtdb_tax = (
    pd.read_csv(G1034_dir / "tables/gtdbtk.bac120.summary.tsv", sep="\t")
    .loc[lambda tax: tax["user_genome"].isin(samples_all["genome_id"])]
)

outfile = Path("../config/gtdbtk.bac120.summary.tsv")
outfile.parent.mkdir(parents=True, exist_ok=True)
gtdb_tax.to_csv(outfile, sep="\t", index=False)

## Create project containing all unfiltered trans-AT PKS

In [None]:
# Clean up the curated samples and create a project containing all unfiltered
# trans-AT PKS.
# `errors="ignore"` keeps this cell re-runnable: `samples_all` is reassigned in
# place, so on a second execution "product_x" is already gone and a plain
# `drop` would raise a KeyError. `rename` already ignores a missing
# "product_y" by default.
samples_all = (
    samples_all
    .drop(columns=["product_x"], errors="ignore")
    .rename(columns={"product_y": "product"})
)
project_name = 'Trans_AT_network_raw'
outdir = Path(f"../config/{project_name}")
outdir.mkdir(exist_ok=True, parents=True)
# Sample table referenced by the raw project's project_config.yaml.
samples_all.to_csv(outdir / "samples.csv", index=False)
samples_all

In [None]:
# Workflow rules known to the project config; only the ones listed in
# `enabled_rules` are switched on for this project.
rule_names = ('bigslice', 'bigscape', 'query-bigslice',
              'clinker', 'interproscan', 'mmseqs2')
enabled_rules = {'bigscape'}

project_config = {
    'name': project_name,
    'pep_version': '2.1.0',
    'description': f'A selection of BGCs producing {project_name}',
    'gtdb-tax': '../gtdbtk.bac120.summary.tsv',
    'sample_table': 'samples.csv',
    'rules': {rule: rule in enabled_rules for rule in rule_names},
}

# Serialise the configuration as block-style YAML inside the project folder.
config_file = outdir / 'project_config.yaml'
config_file.write_text(yaml.dump(project_config, default_flow_style=False))

# Echo the entry in the form of a YAML list item for the new project.
print(f"  - pep: config/{project_name}/project_config.yaml")

## Create selected trans-AT PKS sub-projects

In [None]:
## Create one curated sub-project for every remaining project folder in ../config
# Index by bgc_id so rows can be looked up and updated per BGC below.
samples_all = samples_all.set_index("bgc_id", drop=False)
samples_validated = samples_all[samples_all["validated-member-of-dataset"] == "yes"]

# Folders under ../config that this cell must not treat as sub-projects.
skipped_dirs = {'Trans_AT_network_all',
                '.ipynb_checkpoints',
                'Trans_AT_network_raw',
                'cycloheximide'}

# Sorted so that the order in which rows are appended to `samples_all` does not
# depend on the directory listing order of the file system.
for item in sorted(Path("../config/").glob("*")):
    # `name`, not `stem`: `stem` would shorten a folder called "foo.bar" to
    # "foo", and the outputs below would then be written to a different folder.
    if not item.is_dir() or item.name in skipped_dirs:
        continue
    project_name = item.name
    print(project_name)

    # The project's own table decides which BGCs belong to the sub-project.
    df = pd.read_csv(item / 'samples.csv').set_index("bgc_id", drop=False)
    is_mibig = df.bgc_id.str.startswith("BGC")
    df_mibig = df[is_mibig]
    df_non_mibig = df[~is_mibig]

    # Rows whose id does not start with "BGC" are taken from the validated
    # samples; fail with a readable message when one of them is absent there.
    missing_ids = df_non_mibig.index.difference(samples_validated.index)
    if len(missing_ids) > 0:
        raise KeyError(
            f"{project_name}: bgc_ids not found in the validated samples: {missing_ids.to_list()}"
        )
    df_samples = samples_validated.loc[df_non_mibig.index, :]
    df_final = pd.concat([df_samples, df_mibig]).set_index("bgc_id", drop=False)
    print(df_final.bgc_id.to_list())

    for bgc_id in df_final.index:
        # Source file: the path recorded in the project's own table.
        original_gbk = df.loc[bgc_id, "gbk_path"]
        original_gbk_path = Path("..") / original_gbk
        # Destination: same file, relocated from data/interim to the external data folder.
        target_gbk = df_final.loc[bgc_id, "gbk_path"].replace("data/interim", "data/external/G1034_20230801")
        target_gbk_path = Path("..") / target_gbk
        assert original_gbk_path.stem == target_gbk_path.stem
        if bgc_id.startswith("BGC"):
            # Ids starting with "BGC" are stored under data/external/MIBIG and
            # are marked as validated members without a known genome.
            target_gbk_path = Path(f"../data/external/MIBIG/{bgc_id}.gbk")
            df_final.loc[bgc_id, "genome_id"] = "unknown"
            df_final.loc[bgc_id, "validated-member-of-dataset"] = "yes"
            # add to raw and curated dataset
            for c in df_final.columns:
                samples_all.loc[bgc_id, c] = df_final.loc[bgc_id, c]
        target_gbk_path.parent.mkdir(exist_ok=True, parents=True)
        shutil.copy(original_gbk_path, target_gbk_path)
        # Store the path without the leading "../" again.
        df_final.loc[bgc_id, "gbk_path"] = str(target_gbk_path).replace("../", "")
        df_final.loc[bgc_id, "project_name"] = project_name
        # Mirror the new location and project membership in the global table.
        samples_all.loc[bgc_id, "gbk_path"] = df_final.loc[bgc_id, "gbk_path"]
        samples_all.loc[bgc_id, "project_name"] = df_final.loc[bgc_id, "project_name"]
    print(df_final.bgc_id.to_list())

    outdir = Path(f"../config/{project_name}")
    outdir.mkdir(exist_ok=True, parents=True)
    df_final.to_csv(outdir / "samples_curated.csv", index=False)

    project_config = {'name': project_name,
                      'pep_version': '2.1.0',
                      'description': f'A selection of BGCs producing {project_name}',
                      'gtdb-tax': '../gtdbtk.bac120.summary.tsv',
                      'sample_table': 'samples_curated.csv',
                      'rules': {'bigslice': False,
                                'bigscape': False,
                                'query-bigslice': False,
                                'clinker': True,
                                'interproscan': False,
                                'mmseqs2': True}
                      }

    # Write data to a YAML file
    with open(outdir / 'project_config.yaml', 'w') as yaml_file:
        yaml.dump(project_config, yaml_file, default_flow_style=False)

    print(f"  - pep: config/{project_name}/project_config.yaml")

In [None]:
project_name = 'Trans_AT_network_raw'
outdir = Path(f"../config/{project_name}")
outdir.mkdir(exist_ok=True, parents=True)

# One prefix is used for both the test and the rewrite. Testing for
# "data/interim" but replacing "data/interim/" would let a path such as
# "data/interim_x/..." pass the test unchanged and be copied onto itself.
interim_prefix = "data/interim/"
external_prefix = "data/external/G1034_20230801/"

# Move every GenBank file that is still referenced under data/interim into the
# external data folder and record the new location in the table.
for i in samples_all.index:
    gbk_path = samples_all.loc[i, "gbk_path"]
    if gbk_path.startswith(interim_prefix):
        input_file = gbk_path
        # count=1: rewrite the leading prefix only, never a later occurrence.
        output_file = gbk_path.replace(interim_prefix, external_prefix, 1)
        print(input_file, output_file)
        output_file_path = Path("..") / output_file
        output_file_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(Path("..") / input_file, output_file_path)
        samples_all.loc[i, "gbk_path"] = output_file

# NOTE(review): the project_config.yaml written for Trans_AT_network_raw earlier
# in this notebook names 'samples.csv' as its sample table, not this file —
# confirm which table the raw project is meant to use.
samples_all.to_csv(outdir / "samples_curated.csv", index=False)

## Create project containing all curated trans-AT PKS

In [None]:
# The curated project holds only the rows flagged as validated members.
is_validated = samples_all["validated-member-of-dataset"].eq("yes")
samples_validated = samples_all.loc[is_validated]

project_name = 'Trans_AT_network_all'
outdir = Path("../config") / project_name
outdir.mkdir(parents=True, exist_ok=True)

# Written without the index; bgc_id is still present as a regular column.
samples_validated.to_csv(outdir / "samples_curated.csv", index=False)

In [None]:
# Start with every workflow rule disabled, then enable the one this project runs.
rules = dict.fromkeys(
    ['bigslice', 'bigscape', 'query-bigslice', 'clinker', 'interproscan', 'mmseqs2'],
    False,
)
rules['bigscape'] = True

project_config = {
    'name': project_name,
    'pep_version': '2.1.0',
    'description': f'A selection of BGCs producing {project_name}',
    'gtdb-tax': '../gtdbtk.bac120.summary.tsv',
    'sample_table': 'samples_curated.csv',
    'rules': rules,
}

# Store the configuration as block-style YAML in the project folder.
config_path = outdir / 'project_config.yaml'
with config_path.open('w') as yaml_file:
    yaml.dump(project_config, yaml_file, default_flow_style=False)

# Echo the entry in the form of a YAML list item for the new project.
print(f"  - pep: config/{project_name}/project_config.yaml")