In [115]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys, PandasTools
import pubchempy as pcp
import glob
import pickle
import urllib.request, json
import re
from sklearn.decomposition import FastICA, PCA, IncrementalPCA

# PubChem

Note that all steps involving the PubChem data are rather lengthy as they involve huge amounts of data. Hence, we refrain from providing the intermediate datasets and refer the user to the original website to download the data and pre-process it according to our script.

PubChem Download: https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/

#### Extract unique SMILES from PubChem

1. First, we downloaded all compounds in sdf format in batches from PubChem and unpacked each of the batches.
2. After unpacking, we reduce each file to its canonical SMILES column only, which lowers the workload of all subsequent steps.
3. We keep only the unique SMILES within each batch, and then iteratively concatenate all batches while only keeping unique SMILES.
4. Finally, we store the list of all unique SMILES as unique_smiles.txt

In [38]:
# PATH_TO_UNZIPPED_SDF = ''  # set path to downloaded and unpacked PubChem sdf files
# PATH_TO_EXTRACTED_SMILES = ''  # set path to folder where the extracted smiles will be stored

In [34]:
# # convert sdf to SMILES files (not unique!)
# for f in [glob.glob(PATH_TO_UNZIPPED_SDF)]:
#     print(f)
#     try:
#         df = PandasTools.LoadSDF(f, molColName=None)
#         df.to_csv(PATH_TO_EXTRACTED_SMILES+'\\'+f.split('.')[0]+'.txt', columns=['PUBCHEM_OPENEYE_CAN_SMILES'], header=False, index=False)
#     except:
#         print('ERROR in', f)

In [35]:
# # merge files; keep only unique smiles
# files_smiles = glob.glob(PATH_TO_EXTRACTED_SMILES+'\\*')
# df = pd.read_csv(files_smiles[0], header=None)
# df = df.drop_duplicates(subset=0, keep='first')
# df = df.set_index(0)

In [36]:
# for f in files_smiles[1:]:
#     try:
#         df2 = pd.read_csv(f, header=None)
#         df2 = df2.drop_duplicates(subset=0, keep='first')
#         df2 = df2.set_index(0)
#         df = df.combine_first(df2)
#     except:
#         print('Stopped at', f)
#         break

In [23]:
#df.to_csv(PATH_TO_EXTRACTED_SMILES+'\\unique_smiles.txt', header=False, index=True)

#### Calculate incremental PCA

To locate other datasets in the PubChem PCA space, we first need to fit a PCA model on the PubChem data. The model is fitted incrementally, one chunk of SMILES at a time.

In [40]:
# Folder holding the intermediate PCA artefacts (fitted model, min/max table, histogram).
# Written with a forward slash so the path resolves on Windows, Linux and macOS alike.
PATH_TO_PCA_MODELS = 'Data/PCA'  # set path to folder where all intermediate results for PCA calculations should be stored

In [10]:
# # a balanced batch-size is 5*n_features, that's just below 1000 here.
# chunksize = 10**3
# done = 0
# pca_pubchem = IncrementalPCA(n_components=10)
# for chunk in pd.read_csv(PATH_TO_EXTRACTED_SMILES+'\\unique_smiles.txt', header=None, chunksize=chunksize):
#     res, smiles = [], []
#     for i in range(len(chunk)):
#         try:
#             mol = Chem.MolFromSmiles(chunk[0][done+i])
#             maccs = MACCSkeys.GenMACCSKeys(mol)
#             res.append(maccs)
#         except: 
#             continue
#     pca_pubchem.partial_fit(res)
#     done += chunksize
#     if (done % (100*chunksize)) == 0: # write only after each 100th iteration
#         pickle.dump(pca_pubchem, open(PATH_TO_PCA_MODELS+'\\PCA_PubChem_model.pickle', 'wb'))
#         print('%.2f percent done' % (done / 634417.97), end='\r')

In [9]:
# Load the previously fitted PubChem PCA model from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load model files that you created yourself.
# The context manager makes sure the file handle is closed right after loading.
with open(PATH_TO_PCA_MODELS+'/PCA_PubChem_model.pickle', 'rb') as model_file:
    pca_pubchem = pickle.load(model_file)

#### Calculate histogram to estimate the PubChem density in PCA space

In [37]:
# out_file = PATH_TO_PCA_MODELS+'\\PCA_PubChem.txt'
# open(out_file, mode='w').close()
# #num_lines = sum(1 for line in open(PATH_TO_EXTRACTED_SMILES+'\\unique_smiles.txt')) #63441797

# chunksize = 10**5
# done = 0
# for chunk in pd.read_csv(PATH_TO_EXTRACTED_SMILES+'\\unique_smiles.txt', header=None, chunksize=chunksize):
#     res, smiles = [], []
#     for i in range(len(chunk)):
#         try:
#             mol = Chem.MolFromSmiles(chunk[0][done+i])
#             maccs = MACCSkeys.GenMACCSKeys(mol)
#             smiles.append(chunk[0][done+i])
#             res.append(maccs)
#         except: 
#             continue
#     pd.DataFrame(pca_pubchem.transform(res)).assign(smiles = smiles).to_csv(out_file, header=False, index=False, mode='a')
#     done += chunksize
#     print('%.2f percent done' % (done / 634417.97), end='\r')

In [15]:
# # find min/max in the PCA space 
# low, high = np.array([np.Inf]*10), np.array([-np.Inf]*10)
# for chunk in pd.read_csv(PATH_TO_PCA_MODELS+'\\PCA_PubChem.txt', chunksize=10**5, header=None):
#     low = np.minimum(chunk[range(10)].min(axis=0), low)
#     high = np.maximum(chunk[range(10)].max(axis=0), high)

In [16]:
# # write to file so you won't have to search for that again
# df = pd.DataFrame(np.vstack((low, high)))
# df['index'] = ['min', 'max']
# df = df.set_index('index')
# df.to_csv(PATH_TO_PCA_MODELS+'\\PCA_PubChem_minmax.txt', header=True, index=True)

In [17]:
# Read the cached per-component minima and maxima of the PubChem PCA projection.
# A forward slash is used as separator so the path resolves on every OS.
df = pd.read_csv(PATH_TO_PCA_MODELS+'/PCA_PubChem_minmax.txt', header=0, index_col='index')
# The rows are labelled 'min' and 'max'; the columns are the component numbers read back as strings.
low_pubchem = df.loc['min'].to_numpy()
high_pubchem = df.loc['max'].to_numpy()
# One (min, max) pair per row for the first two components - the layout np.histogram2d
# expects for its `range` argument.
range_hist_pubchem = df[['0', '1']].to_numpy().transpose()

In [18]:
# # build chunk-wise histogram
# bins = 30
# H, xedges, yedges = np.histogram2d([], [], bins=bins, range=range_hist_pubchem) # empty histogram
# for chunk in pd.read_csv(PATH_TO_PCA_MODELS+'\\PCA_PubChem.txt', chunksize=10**5, header=None):
#     H_chunk, _, _ = np.histogram2d(chunk[0], chunk[1], bins=bins, range=range_hist_pubchem)
#     H += H_chunk

In [19]:
# # store histogram in file for later use
# hist_dict = {'H': H, 'xedges': xedges, 'yedges': yedges}
# np.save(PATH_TO_PCA_MODELS+'\\PCA_PubChem_hist_30.npy', hist_dict)

In [20]:
# Read the cached 2-D PubChem histogram (counts plus bin edges) from file.
# NOTE(review): allow_pickle=True unpickles Python objects - only load files that you created yourself.
# A forward slash is used as separator so the path resolves on every OS.
hist_dict = np.load(PATH_TO_PCA_MODELS+'/PCA_PubChem_hist_30.npy', allow_pickle=True)
# The dict was stored wrapped in a 0-d object array; indexing with () unwraps it again.
H, xedges, yedges = (hist_dict[()][key] for key in ('H', 'xedges', 'yedges'))

**Note**: We similarly obtained the PubChem histogram and min/max for the PCA spaces resulting from the SOIL and BBD datasets (see below).

The results are stored in the same file format with prefix PCA_SOIL or PCA_BBD instead of PCA_PubChem.

## Agrochemical Subset

Download from https://pubchem.ncbi.nlm.nih.gov/#input_type=list&query=VRLzGHyuGRIuPJElE13YCQ42k1bcapT67t-PtvXOnbf116E&collection=compound&alias=PubChem%3A%20PubChem%20Compound%20TOC%3A%20Agrochemical%20Information

The pre-processing of this subset of PubChem is equivalent to that of the PubChem data above. Additionally, we can extract information on the type of each compound and what it is used for. These free-text fields are processed similarly to those in SOIL/BBD (see below).

In [99]:
# # PATH_TO_AGRO_SDF = ''  # set path to downloaded and unpacked Agrochemical sdf files
# DF_AGRO = PandasTools.LoadSDF(PATH_TO_AGRO_SDF, molColName=None)
# DF_AGRO = DF_AGRO.rename(columns={'PUBCHEM_OPENEYE_CAN_SMILES': 'smiles'})
# DF_AGRO = DF_AGRO.drop_duplicates(keep='first', subset='smiles')
# mol = [Chem.MolFromSmiles(s) for s in DF_AGRO.smiles]
# DF_AGRO.drop([i for i, j in enumerate(mol) if j is None], inplace=True)
# DF_AGRO = DF_AGRO.reset_index(drop=True)
# agro_maccs = [np.array(MACCSkeys.GenMACCSKeys(m)) for m in [m for m in mol if m]]

In [100]:
# def get_agro_categories(cid):
#     rest_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/"+str(cid)+"/JSON"
#     with urllib.request.urlopen(rest_url) as url:
#         #data = json.loads(url.read().decode())
#         data = json.loads(url.read().decode("utf-8", errors='ignore'))
#     dicts = data.get('Record').get('Section')
#     dicts_agro = next((item for item in dicts if item["TOCHeading"] == "Agrochemical Information"), None)
#     if dicts_agro is None: return None 
#     else: dicts_agro = dicts_agro.get('Section')
#     list_agro = next((item for item in dicts_agro if item["TOCHeading"] == "Agrochemical Category"), None)
#     if list_agro is None: return None 
#     else: list_agro = list_agro.get('Information', None)
#     if list_agro is None: return None
#     categories = [list_agro[i].get('Value').get('StringWithMarkup')[0].get('String') for i in range(len(list_agro))]
#     return categories

In [101]:
# DF_AGRO['agro_categories'] = [get_agro_categories(cid) for cid in DF_AGRO.PUBCHEM_COMPOUND_CID]
# cats = DF_AGRO.agro_categories.tolist()
# for i in DF_AGRO.PUBCHEM_COMPOUND_CID:
#     cats[DF_AGRO.index[DF_AGRO['PUBCHEM_COMPOUND_CID'] == str(i)].tolist()[0]] = get_agro_categories(i)
# cats = [[] if c is None else c for c in cats]
# DF_AGRO['agro_categories'] = cats

In [102]:
# Unravel the free text

# flattened = [j for sub in cats for j in sub]
# flattened = np.unique(flattened)
# unflat = [f.split(',') for f in flattened]
# flattened = [j for sub in unflat for j in sub]
# unflat = [f.split('->') for f in flattened]
# flattened = [j for sub in unflat for j in sub]
# flattened = [f.strip().lower().removesuffix('s') for f in flattened]
# flattened = np.unique(flattened) # categories

# res = np.empty((len(DF_AGRO), len(flattened)))
# for i, cat in enumerate(flattened):
#     res[:,i] = [str(cat) in str(cats[i]).lower() for i in range(len(cats))]

# df_props = pd.DataFrame(res, columns=flattened, dtype=int)
# df_props['algicide'] = df_props.algicide + df_props.algaecide
# df_props['repellent'] = df_props.repellent + df_props.repellant
# df_props['plant growth regulator'] = df_props['growth reg.'] + df_props['plant growth regulator']
# df_props.drop(['algaecide', 'other substance', 'other treatment', 'repellant', 'special use', 'growth reg.'], axis=1, inplace=True)
# df_agro_prop = DF_AGRO.join(df_props, how='left')

In [None]:
# flag which AGRO compounds are also contained in SOIL / BBD (requires DF_SOIL and DF_BBD - read SOIL/BBD first!)

# mol_agro = [Chem.MolFromSmiles(s) for s in df_agro_prop.smiles]
# mol_bbd = [Chem.MolFromSmiles(s) for s in DF_BBD.SMILES]
# mol_soil = [Chem.MolFromSmiles(s) for s in DF_SOIL.SMILES]

# smiles_agro = [(None if m is None else Chem.MolToSmiles(m)) for m in mol_agro]
# smiles_bbd = [(None if m is None else Chem.MolToSmiles(m)) for m in mol_bbd]
# smiles_soil = [(None if m is None else Chem.MolToSmiles(m)) for m in mol_soil]

# agro_in_bbd = [smiles_agro[i] in smiles_bbd for i in range(len(smiles_agro))]
# agro_in_soil = [smiles_agro[i] in smiles_soil for i in range(len(smiles_agro))]
# df_agro_prop['isInSOIL'] = agro_in_soil
# df_agro_prop['isInBBD'] = agro_in_bbd

# df_agro_prop.to_csv('Data\\AGRO_all.csv', header=True, index=True)

In [106]:
# Load the pre-processed agrochemical subset from its cached CSV.
# The forward slash matches the SOIL/BBD loading cells and resolves on every OS.
DF_AGRO = pd.read_csv('Data/AGRO_all.csv', header=0, index_col=0)

# SOIL

#### Read in Dataset

1. Read all SMILES from the enviPath platform.
2. Obtain the corresponding MACCS keys using RDKit (these will be used as features).
3. Drop those SMILES that cannot be processed using RDKit.

In [15]:
# from enviPath_python.enviPath import *
# from enviPath_python.objects import *

# # extract all smiles from enviPath
# EAWAG_SOIL = 'https://envipath.org/package/5882df9c-dae1-4d80-a40e-db4724271456'
# soil_smiles = get_smiles_from_envipath(EAWAG_SOIL)

In [2]:
# def get_maccs(compound):
#     try:
#         mol = Chem.MolFromSmiles(compound)
#         return MACCSkeys.GenMACCSKeys(mol)
#     except:
#         return np.NaN

In [34]:
# def get_smiles_from_envipath(package):
#     eP = enviPath('https://envipath.org/')
#     p = eP.get_package(package)
#     compounds = p.get_compounds()
#     smiles = []
#     for i, c in enumerate(compounds):
#         smiles.append(c.get_smiles())
#         print(i/len(compounds)*100, '% done', end='\r')
#     return smiles

In [127]:
# DF_SOIL = pd.DataFrame(soil_smiles, columns=['SMILES'])
# DF_SOIL['MACCS'] = [get_maccs(c) for c in DF_SOIL.SMILES]
# DF_SOIL.dropna(subset=['MACCS'], inplace=True)
# soil_maccs = np.stack(DF_SOIL.MACCS.to_numpy())

#### Join Discovery Years for Root Compounds

In [65]:
# # obtain list containing the years for root compounds

# eP = enviPath('https://envipath.org')
# soil_package = Package(eP.requester, id=EAWAG_SOIL)

# # get pathways
# pathways = soil_package.get_pathways()

# # open output file & set header
# outfile = open('SOIL_root_dates.tsv', 'w')
# outfile.write('root_node_SMILES\tdate\tpathway_link\n')

# all_soil_scenarios = soil_package.get_scenarios()
# def list_to_dict(all_scenarios):
#     scen_dict = {}
#     for scen in all_scenarios:
#         scen_dict[scen.get_name()] = scen.get_id()
#     return scen_dict

# def get_main_scen_name(ref_scen_name):
#     return ref_scen_name.split(' (Related Scenario) - ')[0]

# scen_dict = list_to_dict(all_soil_scenarios)

# # iterate through pathways in soil
# for pathway in pathways:
#     for node in pathway._get('nodes'):
#         if node.get('depth') == 0: # only consider root nodes (at depth == 0)
#             smiles = node.get('smiles') # fetch the smiles of the node
#             scenarios = node.get('proposed') # fetch a list of scenario links
#             all_dates = set([]) # container to keep unique dates
#             checked_main_scenarios = []
#             for scenario in scenarios:
#                 this_scenario = Scenario(eP.requester, id=scenario.get('scenarioId')) # load full scenario
#                 all_dates.add(this_scenario._get('date')) # fetch the date
#                 # if it's a related scenario, also check for dates in the main scenario
#                 if '(Related Scenario)' in this_scenario.get_name():
#                     main_scenario_name = get_main_scen_name(this_scenario.get_name())
#                     try:
#                         main_scenario_id = scen_dict[main_scenario_name]
#                     except KeyError:
#                         print("Warning: no id found for main scenario name {} "
#                               "derived from referring scenario id {}".format(main_scenario_name, this_scenario.get_id()))
#                     else:
#                         if main_scenario_id not in checked_main_scenarios: # check if already analyzed
#                             main_scenario = Scenario(eP.requester, id=main_scenario_id)
#                             all_dates.add(main_scenario._get('date'))
#                             checked_main_scenarios.append(main_scenario_id)
#             outfile.write('{}\t{}\t{}\n'.format(smiles, ';'.join(all_dates), pathway.get_id())) # write to output file
#             continue # no need to search further since we found the root node

In [71]:
# # extract years from dates
# df_tmp = pd.read_csv('Data/SOIL_root_dates.tsv', sep='\t', header=0)
# def getdate(s):
#     if s is np.nan: return 0
#     a = re.findall(r'.*([1-3][0-9]{3})', s)
#     if len(a) == 0: return 0
#     return int(a[0])
# df_tmp['year'] = [getdate(df_tmp.date[i]) for i in range(len(df_tmp))]

In [128]:
# DF_SOIL = DF_SOIL.merge(df_tmp[['root_node_SMILES', 'year']], left_on='SMILES', right_on='root_node_SMILES', how='left')
# DF_SOIL = DF_SOIL.drop(['root_node_SMILES'], axis=1)
# DF_SOIL['isRootCompound'] = DF_SOIL['year']>=0

#### Find the compounds in PubChem (if possible)

In [131]:
# cids = np.zeros(len(DF_SOIL))
# for i, smile in enumerate(DF_SOIL.SMILES):
#     try:
#         cids[i] = pcp.get_compounds(smile, 'smiles')[0].cid
#     except:
#         continue
#     print('Completed', i/len(DF_SOIL)*100, '%', end='\r')
# DF_SOIL['cid'] = cids
# np.save('Data\\SOIL_cids.npy', cids)
# DF_SOIL['cid'] = np.load('Data\\SOIL_cids.npy')

#### Query Use Categories

In [108]:
# def get_use_categories(cid):
#     rest_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/"+str(cid)+"/JSON"
#     with urllib.request.urlopen(rest_url) as url:
#         #data = json.loads(url.read().decode())
#         data = json.loads(url.read().decode("utf-8", errors='ignore'))
#     dicts = data.get('Record').get('Section')
#     dicts_agro = next((item for item in dicts if item["TOCHeading"] == "Use and Manufacturing"), None)
#     if dicts_agro is None: return None 
#     else: dicts_agro = dicts_agro.get('Section')
#     list_agro = next((item for item in dicts_agro if item["TOCHeading"] == "Uses"), None)
#     if list_agro is None: return None 
#     else: list_agro = list_agro.get('Section')[0].get('Information')
#     if list_agro is None: return None
#     categories = [list_agro[i].get('Value').get('StringWithMarkup')[0].get('String') for i in range(len(list_agro))]
#     return categories

In [134]:
# # query Use Categories from PubChem
# use_cats = []
# for i, cid in enumerate(cids.astype(np.int64)):
#     try:
#         use_cats.append(get_use_categories(cid))
#     except:
#         use_cats.append([])
#         continue
#     print('Completed', i/len(cids)*100, '%', end='\r')
# DF_SOIL['use_cats'] = use_cats
# np.save('Data\\SOIL_use_cats.npy', use_cats)
# DF_SOIL['use_cats'] = np.load('Data\\SOIL_use_cats.npy', allow_pickle=True)

In [136]:
# # pre-process free-text field
# use_cats = [[] if c is None else c for c in DF_SOIL['use_cats'].values]
# use_cats_concat = [','.join(s) for s in use_cats]
# use_cats_concat = [s.lower() for s in use_cats_concat]

In [141]:
# # search for specific terms
# USE_CATS = ['acaricide', 'attractant', 'biocide', 'fungicide', 'herbicide', 'insecticide', 
#             'pesticide', 'growth reg', 'transformation product', 'drug', 'food', 'health hazard', 'fire hazard']

# soil_categories = np.empty((len(DF_SOIL), len(USE_CATS)))
# for j, c in enumerate(USE_CATS):
#     for i in range(len(DF_SOIL)):
#         soil_categories[i, j] = c in use_cats_concat[i]
        
# DF_SOIL[USE_CATS] = soil_categories

#### Agrochemical subset of PubChem

In [144]:
# # get the canonical SMILES representation for a more reliable comparison
# mol_agro = [Chem.MolFromSmiles(s) for s in DF_AGRO.smiles]
# mol_soil = [Chem.MolFromSmiles(s) for s in DF_SOIL.SMILES]
# smiles_agro = [(None if m is None else Chem.MolToSmiles(m)) for m in mol_agro]
# smiles_soil = [(None if m is None else Chem.MolToSmiles(m)) for m in mol_soil]

In [145]:
# # create a flag indicating if the compound is contained in the agrochemical subset of PubChem
# DF_SOIL['SMILES_canon'] = smiles_soil
# DF_SOIL['isInAgro'] = [i in smiles_agro for i in smiles_soil]

In [148]:
#DF_SOIL.to_csv('Data\\SOIL_all.csv', header=True)
# Load the fully pre-processed SOIL dataset from its cached CSV.
DF_SOIL = pd.read_csv(
    'Data/SOIL_all.csv',
    header=0,     # column names are taken from the first row
    index_col=0,  # the first column becomes the row index
)

# BBD

#### Read in Dataset

1. Read all SMILES from the enviPath platform.
2. Obtain the corresponding MACCS keys using RDKit (these will be used as features).
3. Drop those SMILES that cannot be processed using RDKit.

In [154]:
# # extract all smiles from enviPath
# EAWAG_BBD = 'https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1'
# bbd_smiles = get_smiles_from_envipath(EAWAG_BBD)

In [150]:
# DF_BBD = pd.DataFrame(bbd_smiles, columns=['SMILES'])
# DF_BBD['MACCS'] = [get_maccs(c) for c in DF_BBD.SMILES]
# DF_BBD.dropna(subset=['MACCS'], inplace=True)
# bbd_maccs = np.stack(DF_BBD.MACCS.to_numpy())

#### Join Discovery Years for Root Compounds

For BBD, we use the last date of a professional update (via webserver). This date most likely coincides with the date when the compound was added since "normal" users access the compounds mainly via SQL and trigger different events. For the sake of reproducibility, we provide those dates in the file "BBD_dates.tsv".

In [153]:
# df_tmp = pd.read_csv('Data/BBD_dates.tsv', sep='\t', header=0)
# df_tmp['year'] = [getdate(df_tmp.date[i]) for i in range(len(df_tmp))]

# DF_BBD = DF_BBD.merge(df_tmp[['smiles', 'year']], left_on='SMILES', right_on='smiles', how='left')
# DF_BBD = DF_BBD.drop(['smiles'], axis=1)
# DF_BBD['isRootCompound'] = DF_BBD['year']>=0

#### Find the compounds in PubChem (if possible)

In [162]:
# cids = np.zeros(len(DF_BBD))
# for i, smile in enumerate(DF_BBD.SMILES):
#     try:
#         cids[i] = pcp.get_compounds(smile, 'smiles')[0].cid
#     except:
#         continue
#     print('Completed', i/len(DF_BBD)*100, '%', end='\r')
# DF_BBD['cid'] = cids
# np.save('Data\\BBD_cids.npy', cids)
# DF_BBD['cid'] = np.load('Data\\BBD_cids.npy')

#### Query Use Categories

In [163]:
# # query Use Categories from PubChem
# use_cats = []
# for i, cid in enumerate(cids.astype(np.int64)):
#     try:
#         use_cats.append(get_use_categories(cid))
#     except:
#         use_cats.append([])
#         continue
#     print('Completed', i/len(cids)*100, '%', end='\r')
# DF_BBD['use_cats'] = use_cats
# np.save('Data\\BBD_use_cats.npy', use_cats)
# DF_BBD['use_cats'] = np.load('Data\\BBD_use_cats.npy', allow_pickle=True)

In [157]:
# # pre-process free-text field
# use_cats = [[] if c is None else c for c in DF_BBD['use_cats'].values]
# use_cats_concat = [','.join(s) for s in use_cats]
# use_cats_concat = [s.lower() for s in use_cats_concat]

In [158]:
# # search for specific terms
# bbd_categories = np.empty((len(DF_BBD), len(USE_CATS)))
# for j, c in enumerate(USE_CATS):
#     for i in range(len(DF_BBD)):
#         bbd_categories[i, j] = c in use_cats_concat[i]
        
# DF_BBD[USE_CATS] = bbd_categories

#### Agrochemical subset of PubChem

In [159]:
# # get the canonical SMILES representation for a more reliable comparison
# mol_bbd = [Chem.MolFromSmiles(s) for s in DF_BBD.SMILES]
# smiles_bbd = [(None if m is None else Chem.MolToSmiles(m)) for m in mol_bbd]

In [160]:
# # create a flag indicating if the compound is contained in the agrochemical subset of PubChem
# DF_BBD['SMILES_canon'] = smiles_bbd
# DF_BBD['isInAgro'] = [i in smiles_agro for i in smiles_bbd]

In [161]:
# DF_BBD.to_csv('Data\\BBD_all.csv', header=True)
# Load the fully pre-processed BBD dataset from its cached CSV.
DF_BBD = pd.read_csv(
    'Data/BBD_all.csv',
    header=0,     # column names are taken from the first row
    index_col=0,  # the first column becomes the row index
)