# Import packages

In [58]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
import sys
import yaml

sys.path.insert(1, os.path.join(os.getcwd(), '../scripts'))
from parse_gro_files import read_residue_file, extract_info_from_gro
# Update font size for graphics
plt.rcParams.update({'font.size': 20})

# Load data

In [2]:
def clean_df(df, column_name='file_name'):
    remove = ['.DS_Store', '__MACOSX', '.git']
    df = df[~df[column_name].str.contains('|'.join(remove))].reset_index(drop=True)
    return df

In [3]:
def read_query_file(filename):
    """File query.yml parser.

    This function parses the query file.

    Parameters
    ----------
    filename : str
        Filename for query parameters.
    
    Returns
    -------
    file_types : dict
        Dictionary with type, engine, keywords to use and category
    """
    with open(filename, "r") as param_file:
        data_loaded = yaml.safe_load(param_file)
    file_types = data_loaded["file_types"]
    return file_types

In [4]:
def assign_file_category(file_type):
    """Assign file category based on file type.
    
    Parameters
    ----------
    file_type : str
        File extension.
    
    Returns
    -------
    str
        Category.
    """
    if file_type in list_coordinate:
        return "coordinate"
    elif file_type in list_topology:
        return "topology"
    elif file_type in list_trajectory:
        return "trajectory"
    else:
        return "none"

In [5]:
def assign_md_engine(file_type):
    """Assign MD engine based on file type.
    
    Parameters
    ----------
    file_type : str
        File extension.
    
    Returns
    -------
    str
        Name of MD engine.
    """
    if file_type in ext_gromacs:
        return "gromacs"
    elif file_type in ext_namd:
        return "namd"
    elif file_type in ext_amber:
        return "amber"
    else:
        return "none"

In [6]:
def extract_remote_info_from_gro(
    gro_file_path,
    target_path,
    gro_file_counter,
    gro_file_number,
    protein_residues,
    lipid_residues,
    nucleic_residues,
    water_ion_residues,
    glucid_residues
):
    """Extract information from Gromacs gro file.

    Parameters
    ----------
    gro_file_path : str
        Path to gro file
    target_path : str
        Path to the directory to find gro files
    gro_file_counter : int
        Counter for the current gro file
    gro_file_number : int
        Total number of gro files to parse
    protein_residues : list
        List of protein residues
    lipid_residues : list
        List of lipid residues
    nucleic_residues : lst
        List of nucleic acid residues
    water_ion_residues : lst
        List of water and ions
    glucid_residues : lst
        List of glucid residues

    Returns
    -------
    dict
        Dictionnary of extracted informations
    """
    AA_ref = {
        'ALA': 5,
        'ARG': 11,
        'ASN': 8,
        'ASP': 8,
        'CYS': 6,
        'GLN': 9,
        'GLU': 9,
        'GLY': 4,
        'HIS': 9,
        'HSD': 9,
        'ILE': 8,
        'LEU': 8,
        'LYS': 9,
        'MET': 8,
        'PHE': 11,
        'PRO': 7,
        'SER': 6,
        'THR': 7,
        'TRP': 14,
        'TYR': 12,
        'VAL': 7,
    }
    CG_list = ['BB', 'SC1', 'SC2', 'SC3']
    
    info = {
        "dataset_origin": None,
        "dataset_id": None,
        "atom_number": None,
        "has_protein": False,
        "has_nucleic": False,
        "has_lipid": False,
        "has_water_ion": False,
        "has_glucid": False,
        "filename": None,
        "is_AA": None,
    }
    info["dataset_origin"], info["dataset_id"], info["filename"] = str(
        target_path
    ).split("|")
    print(
        f"Reading {gro_file_counter:{len(str(gro_file_number))}d}/{gro_file_number} {str(gro_file_path)}"
    )
    f = requests.get(gro_file_path)
    prev_residue_number = -999
    all_gro = []
    for index, gro_file in enumerate(f):
        gro_file = gro_file.decode('UTF-8')
        all_gro.append(gro_file)
    gro_file = "".join(all_gro)
    for idx, line in enumerate(gro_file.split('\n')):
        if idx == 1:
            try:
                info["atom_number"] = int(line)
            except:
                pass
        if (idx > 1) and (len(line.rstrip()) <= 70) and (len(line.split()) > 3):
            residue_number = int(line[0:5])
            residue_name = line[5:10].strip()
            atom_name = line[10:15].strip()
            atom_number = int(line[15:20])
            if residue_name in AA_ref.keys():
                if prev_residue_number == -999:
                    prev_residue_number = residue_number
                    prev_residue_name = residue_name
                    num_atoms = 0
            if residue_name in protein_residues:
                info["has_protein"] = True
                if info["is_AA"] == None:
                    if atom_name in CG_list:
                        info["is_AA"] = False
                    else:
                        if prev_residue_number == residue_number:
                            num_atoms = num_atoms+1
                        else:
                            if prev_residue_name in AA_ref.keys():
                                if num_atoms > AA_ref[prev_residue_name]:
                                    info["is_AA"] = True
                                else:
                                    info["is_AA"] = False
                            else:
                                prev_residue_number = -999
            elif residue_name in lipid_residues:
                info["has_lipid"] = True
            elif residue_name in nucleic_residues:
                info["has_nucleic"] = True
            elif residue_name in water_ion_residues:
                info["has_water_ion"] = True
            elif residue_name in glucid_residues:
                info["has_glucid"] = True
            # WALL particles
            elif residue_name in ["WAL"]:
                pass
            else:
                pass
                #print(f"Unknown residue: {residue_name} / {str(gro_file_path)}")
    if info["is_AA"] == None:
        info["is_AA"] = False
    return info
    

In [7]:
def extract_remote_info_from_pdb(
    gro_file_path,
    target_path,
    gro_file_counter,
    gro_file_number,
    protein_residues,
    lipid_residues,
    nucleic_residues,
    water_ion_residues,
    glucid_residues
):
    """Extract information from pdb file.

    Parameters
    ----------
    gro_file_path : str
        Path to gro file
    target_path : str
        Path to the directory to find gro files
    gro_file_counter : int
        Counter for the current gro file
    gro_file_number : int
        Total number of gro files to parse
    protein_residues : list
        List of protein residues
    lipid_residues : list
        List of lipid residues
    nucleic_residues : lst
        List of nucleic acid residues
    water_ion_residues : lst
        List of water and ions
    glucid_residues : lst
        List of glucid residues

    Returns
    -------
    dict
        Dictionnary of extracted informations
    """
    AA_ref = {
        'ALA': 5,
        'ARG': 11,
        'ASN': 8,
        'ASP': 8,
        'CYS': 6,
        'GLN': 9,
        'GLU': 9,
        'GLY': 4,
        'HIS': 9,
        'HSD': 9,
        'ILE': 8,
        'LEU': 8,
        'LYS': 9,
        'MET': 8,
        'PHE': 11,
        'PRO': 7,
        'SER': 6,
        'THR': 7,
        'TRP': 14,
        'TYR': 12,
        'VAL': 7,
    }
    CG_list = ['BB', 'SC1', 'SC2', 'SC3']
    
    info = {
        "dataset_origin": None,
        "dataset_id": None,
        "atom_number": None,
        "has_protein": False,
        "has_nucleic": False,
        "has_lipid": False,
        "has_water_ion": False,
        "has_glucid": False,
        "filename": None,
        "is_AA": None,
    }
    info["dataset_origin"], info["dataset_id"], info["filename"] = str(
        target_path
    ).split("|")
    print(
        f"Reading {gro_file_counter:{len(str(gro_file_number))}d}/{gro_file_number} {str(gro_file_path)}"
    )
    f = requests.get(gro_file_path)
    prev_residue_number = -999
    atom_num = 0
    all_gro = []
    for index, gro_file in enumerate(f):
        gro_file = gro_file.decode('UTF-8')
        all_gro.append(gro_file)
    gro_file = "".join(all_gro)
    for idx, line in enumerate(gro_file.split('\n')):
        if len(line.split()) > 5:
            if line.startswith('ATOM') or line.startswith('HETATM'):
                residue_number = int(line[22:26])
                residue_name = line[17:21].strip()
                atom_name = line[11:16].strip()
                atom_number = int(line[6:11])
                atom_num = atom_num+1
                if residue_name in AA_ref.keys():
                    if prev_residue_number == -999:
                        prev_residue_number = residue_number
                        prev_residue_name = residue_name
                        num_atoms = 0
                if residue_name in protein_residues:
                    info["has_protein"] = True
                    if info["is_AA"] == None:
                        if atom_name in CG_list:
                            info["is_AA"] = False
                        else:
                            if prev_residue_number == residue_number:
                                num_atoms = num_atoms+1
                            else:
                                if prev_residue_name in AA_ref.keys():
                                    if num_atoms > AA_ref[prev_residue_name]:
                                        info["is_AA"] = True
                                    else:
                                        info["is_AA"] = False
                                else:
                                    prev_residue_number = -999
                elif residue_name in lipid_residues:
                    info["has_lipid"] = True
                elif residue_name in nucleic_residues:
                    info["has_nucleic"] = True
                elif residue_name in water_ion_residues:
                    info["has_water_ion"] = True
                elif residue_name in glucid_residues:
                    info["has_glucid"] = True
                # WALL particles
                elif residue_name in ["WAL"]:
                    pass
                else:
                    pass
                    #print(f"Unknown residue: {residue_name} / {str(gro_file_path)}")
    info["atom_number"] = atom_num
    if info["is_AA"] == None:
        info["is_AA"] = False
    return info
    

In [8]:
def get_coord_info(coord_file, source, gro_file_name='', gro_file_idx=0, GRO_FILE_NUMBER=0, ):
    if source=='gro':
        gro_info = extract_remote_info_from_gro(
                coord_file,
                gro_file_name,
                gro_file_idx,
                GRO_FILE_NUMBER,
                PROTEIN_RESIDUES,
                LIPID_RESIDUES,
                NUCLEIC_RESIDUES,
                WATER_ION_RESIDUES,
                GLUCID_RESIDUES
            )
    elif source=='pdb':
        gro_info = extract_remote_info_from_pdb(
            coord_file,
            gro_file_name,
            gro_file_idx,
            GRO_FILE_NUMBER,
            PROTEIN_RESIDUES,
            LIPID_RESIDUES,
            NUCLEIC_RESIDUES,
            WATER_ION_RESIDUES,
            GLUCID_RESIDUES
        )
    return gro_info

In [9]:
dico_file_types = read_query_file("../params/query.yml")

ext_gromacs = []
ext_namd = []
ext_amber = []

list_coordinate = []
list_trajectory = []
list_topology = []

for dico in dico_file_types:
    if dico["engine"] == "gromacs":
        ext_gromacs.append(dico["type"])
    elif dico["engine"] == "namd":
        ext_namd.append(dico["type"])
    elif dico["engine"] == "amber":
        ext_amber.append(dico["type"])
    
    if dico["category"] == "coordinate":
        list_coordinate.append(dico["type"])
    elif dico["category"] == "topology":
        list_topology.append(dico["type"])
    elif dico["category"] == "trajectory":
        list_trajectory.append(dico["type"])

ext_all = ext_gromacs + ext_namd + ext_amber
# The filetype "top" is included in both Gromacs and Amber. 
# And, the filetype "prm" is included in Namd and Amber.
# But, to simplify the analyses, I consider that "top" is only a filetype of Gromacs and prm a filetype of Namd.

In [10]:
PROTEIN_RESIDUES, LIPID_RESIDUES, NUCLEIC_RESIDUES, WATER_ION_RESIDUES, GLUCID_RESIDUES = read_residue_file(os.path.join("../params/residue_names.yml"))


Reading residue definition fom: ../params/residue_names.yml


In [11]:
zenodo_datasets = pd.read_csv(os.path.join(os.getcwd(), '../data', "zenodo_datasets.tsv"), sep="\t")
print(f'Number of datasets in zenodo: {len(zenodo_datasets)}')
figshare_datasets = pd.read_csv(os.path.join(os.getcwd(), '../data', "figshare_datasets.tsv"), sep="\t")
print(f'Number of datasets in figshare: {len(figshare_datasets)}')
all_datasets = pd.concat([zenodo_datasets, figshare_datasets], ignore_index=True)
print(f'Number of datasets in both: {len(all_datasets)}')

all_datasets['year'] = all_datasets['date_creation'].apply(lambda x: int(x.split('-')[0]))
#all_datasets

Number of datasets in zenodo: 873
Number of datasets in figshare: 2006
Number of datasets in both: 2879


In [12]:
zenodo_files = pd.read_csv(os.path.join(os.getcwd(), '../data', "zenodo_files.tsv"), sep="\t")
print(f'Number of files before cleaning in zenodo: {len(zenodo_files)}')
zenodo_files = clean_df(zenodo_files)
print(f'Number of files in zenodo: {len(zenodo_files)}')
figshare_files = pd.read_csv(os.path.join(os.getcwd(), '../data', "figshare_files.tsv"), sep="\t")
print(f'Number of files before cleaning in figshare: {len(figshare_files)}')
figshare_files = clean_df(figshare_files)
print(f'Number of files in figshare: {len(figshare_files)}')
all_files = pd.concat([zenodo_files, figshare_files], ignore_index=True)
print(f'Number of files in both: {len(all_files)}')
#all_files

  interactivity=interactivity, compiler=compiler, result=result)


Number of files before cleaning in zenodo: 152868
Number of files in zenodo: 152862
Number of files before cleaning in figshare: 333283
Number of files in figshare: 311941
Number of files in both: 464803


In [13]:
all_df = pd.merge(all_datasets, all_files, on=['dataset_origin', 'dataset_id'], how='right')
#all_df

In [14]:
all_df["category"] = all_df["file_type"].apply(assign_file_category)
all_df["category"].value_counts()

all_df["engine_MD"] = all_df["file_type"].apply(assign_md_engine)
all_df["engine_MD"].value_counts()

none       395282
gromacs     55080
amber        7658
namd         6783
Name: engine_MD, dtype: int64

In [15]:
all_df['dataset_id_ori'] = all_df['dataset_origin'] + '_' + all_df['dataset_id'].astype(str)

# Statistics

## Numbers

In [16]:
print(f"Number of datasets in zenodo: {len(all_datasets.loc[all_datasets['dataset_origin']=='zenodo', 'dataset_origin'])}")
print(f"Number of datasets in figshare: {len(all_datasets.loc[all_datasets['dataset_origin']=='figshare', 'dataset_origin'])}")
print(f"Number of datasets in both: {len(all_datasets.loc[all_datasets['dataset_origin'].isin(['figshare', 'zenodo']), 'dataset_origin'])}")


Number of datasets in zenodo: 873
Number of datasets in figshare: 2006
Number of datasets in both: 2879


In [17]:
print(f"Number of files in zenodo: {len(all_files.loc[all_files['dataset_origin']=='zenodo', 'dataset_origin'])}")
print(f"Number of files in figshare: {len(all_files.loc[all_files['dataset_origin']=='figshare', 'dataset_origin'])}")
print(f"Number of files in both: {len(all_files.loc[all_files['dataset_origin'].isin(['figshare', 'zenodo']), 'dataset_origin'])}")


Number of files in zenodo: 152862
Number of files in figshare: 311941
Number of files in both: 464803


In [18]:
print(f"Number of files in zenodo: \n{(all_df.loc[all_df['dataset_origin']=='zenodo', 'file_size']).describe()}")
print(f"Number of files in figshare: \n{(all_df.loc[all_df['dataset_origin']=='figshare', 'file_size']).describe()}")
print(f"Number of files in both: \n{(all_df.loc[all_df['dataset_origin'].isin(['figshare', 'zenodo']), 'file_size']).describe()}")


Number of files in zenodo: 
count    1.528620e+05
mean     7.812160e+07
std      9.143179e+08
min      0.000000e+00
25%      6.800000e+03
50%      4.904000e+05
75%      7.700000e+06
max      5.619977e+10
Name: file_size, dtype: float64
Number of files in figshare: 
count    4.513000e+03
mean     1.763161e+08
std      7.375295e+08
min      5.000000e+00
25%      9.161800e+04
50%      3.205243e+06
75%      6.824028e+07
max      1.720746e+10
Name: file_size, dtype: float64
Number of files in both: 
count    1.573750e+05
mean     8.093750e+07
std      9.098725e+08
min      0.000000e+00
25%      7.800000e+03
50%      5.176000e+05
75%      7.837158e+06
max      5.619977e+10
Name: file_size, dtype: float64


In [19]:
print(f"Size of files in zenodo: {(all_df.loc[all_df['dataset_origin']=='zenodo', 'file_size']).sum()}")
print(f"Size of files in figshare: {(all_df.loc[all_df['dataset_origin']=='figshare', 'file_size']).sum()}")
print(f"Size of files in both: {(all_df.loc[all_df['dataset_origin'].isin(['figshare', 'zenodo']), 'file_size']).sum()}")


Size of files in zenodo: 11941824539219.0
Size of files in figshare: 795714657760.0
Size of files in both: 12737539196979.0


In [20]:

print(f"Number of zipfiles in zenodo: {len(all_files.loc[(all_files['dataset_origin']=='zenodo') & (all_files['file_type']=='zip'), 'dataset_origin'])}")
print(f"Number of zipfiles in figshare: {len(all_files.loc[(all_files['dataset_origin']=='figshare') & (all_files['file_type']=='zip'), 'dataset_origin'])}")
print(f"Number of zipfiles in both: {len(all_files.loc[(all_files['dataset_origin'].isin(['figshare', 'zenodo'])) & (all_files['file_type']=='zip'), 'dataset_origin'])}")


Number of zipfiles in zenodo: 1344
Number of zipfiles in figshare: 1823
Number of zipfiles in both: 3167


In [21]:

print(f"Size of zipfiles in zenodo: {(all_files.loc[(all_files['dataset_origin']=='zenodo') & (all_files['file_type']=='zip'), 'file_size']).sum()}")
print(f"Size of zipfiles in figshare: {(all_files.loc[(all_files['dataset_origin']=='figshare') & (all_files['file_type']=='zip'), 'file_size']).sum()}")
print(f"Size of zipfiles in both: {(all_files.loc[(all_files['dataset_origin'].isin(['figshare', 'zenodo'])) & (all_files['file_type']=='zip'), 'file_size']).sum()}")



Size of zipfiles in zenodo: 2372187319823.0
Size of zipfiles in figshare: 185090397617.0
Size of zipfiles in both: 2557277717440.0


In [22]:

print(f"Size of zipfiles in zenodo: \n{(all_files.loc[(all_files['dataset_origin']=='zenodo') & (all_files['file_type']=='zip'), 'file_size']).describe()}")
print(f"Size of zipfiles in figshare: \n{(all_files.loc[(all_files['dataset_origin']=='figshare') & (all_files['file_type']=='zip'), 'file_size']).describe()}")
print(f"Size of zipfiles in both: \n{(all_files.loc[(all_files['dataset_origin'].isin(['figshare', 'zenodo'])) & (all_files['file_type']=='zip'), 'file_size']).describe()}")
print(all_files.loc[(all_files['dataset_origin']=='figshare') & (all_files['file_type']=='zip'), 'file_size'].sum())
print(all_df.loc[(all_df['dataset_origin']=='figshare') & (all_df['file_type']=='zip'), 'file_size'].sum())
all_df.loc[(all_df['dataset_origin']=='figshare') & (all_df['file_type']=='zip') & (all_df['from_zip_file']==True)]


Size of zipfiles in zenodo: 
count    1.344000e+03
mean     1.765020e+09
std      3.733025e+09
min      1.177000e+03
25%      6.034670e+07
50%      3.352578e+08
75%      1.977511e+09
max      4.464433e+10
Name: file_size, dtype: float64
Size of zipfiles in figshare: 
count    1.770000e+03
mean     1.045708e+08
std      6.501742e+08
min      4.580000e+02
25%      2.114065e+05
50%      3.808956e+06
75%      1.776786e+07
max      1.720746e+10
Name: file_size, dtype: float64
Size of zipfiles in both: 
count    3.114000e+03
mean     8.212196e+08
std      2.632263e+09
min      4.580000e+02
25%      8.392685e+05
50%      1.663913e+07
75%      3.260503e+08
max      4.464433e+10
Name: file_size, dtype: float64
185090397617.0
185090397617.0


Unnamed: 0,dataset_origin,dataset_id,doi,date_creation,date_last_modified,date_fetched,file_number,download_number,view_number,license,...,file_type,file_size,file_md5,from_zip_file,file_name,file_url,origin_zip_file,category,engine_MD,dataset_id_ori
158255,figshare,14932766,10.1021/acs.jpcb.1c05320.s002,2021-07-08,2021-07-08,2022-04-29T00:44:50,1,12,43,CC BY-NC 4.0,...,zip,,,True,SI-ZIP/MD_Cavity/ani-HypD-50.pdb.zip,,jp1c05320_si_002.zip,none,none,figshare_14932766
158256,figshare,14932766,10.1021/acs.jpcb.1c05320.s002,2021-07-08,2021-07-08,2022-04-29T00:44:50,1,12,43,CC BY-NC 4.0,...,zip,,,True,SI-ZIP/MD_Cavity/Caver_analysis.zip,,jp1c05320_si_002.zip,none,none,figshare_14932766
158257,figshare,14932766,10.1021/acs.jpcb.1c05320.s002,2021-07-08,2021-07-08,2022-04-29T00:44:50,1,12,43,CC BY-NC 4.0,...,zip,,,True,SI-ZIP/MD_Cavity/zwi-HypD-50.pdb.zip,,jp1c05320_si_002.zip,none,none,figshare_14932766
174767,figshare,12841135,10.1021/acs.jcim.0c00525.s002,2020-08-21,2020-08-21,2022-04-29T00:45:07,1,11,66,CC BY-NC 4.0,...,zip,,,True,Supplementary_Materials/Tables_performance_POP...,,ci0c00525_si_002.zip,none,none,figshare_12841135
174768,figshare,12841135,10.1021/acs.jcim.0c00525.s002,2020-08-21,2020-08-21,2022-04-29T00:45:07,1,11,66,CC BY-NC 4.0,...,zip,,,True,Supplementary_Materials/Partial_Dependence_Plo...,,ci0c00525_si_002.zip,none,none,figshare_12841135
174772,figshare,12841135,10.1021/acs.jcim.0c00525.s002,2020-08-21,2020-08-21,2022-04-29T00:45:07,1,11,66,CC BY-NC 4.0,...,zip,,,True,Supplementary_Materials/gromacs_inputs.zip,,ci0c00525_si_002.zip,none,none,figshare_12841135
175095,figshare,11764158,10.6084/m9.figshare.11764158.v2,2020-01-29,2020-01-29,2022-04-29T00:45:12,10,1139,3605,CC BY 4.0,...,zip,,,True,QHD_LP6/other_files/centroid_QHD.zip,,QHD_LP6.zip,none,none,figshare_11764158
175121,figshare,11764158,10.6084/m9.figshare.11764158.v2,2020-01-29,2020-01-29,2022-04-29T00:45:12,10,1139,3605,CC BY 4.0,...,zip,,,True,QHD_LP5/other_files/centroid_QHD.zip,,QHD_LP5.zip,none,none,figshare_11764158
175147,figshare,11764158,10.6084/m9.figshare.11764158.v2,2020-01-29,2020-01-29,2022-04-29T00:45:12,10,1139,3605,CC BY 4.0,...,zip,,,True,QHD_LP4/other_files/centroid_QHD.zip,,QHD_LP4.zip,none,none,figshare_11764158
175174,figshare,11764158,10.6084/m9.figshare.11764158.v2,2020-01-29,2020-01-29,2022-04-29T00:45:12,10,1139,3605,CC BY 4.0,...,zip,,,True,QHD_LP3/other_files/centroid_QHD.zip,,QHD_LP3.zip,none,none,figshare_11764158


# Get traj datasets

In [94]:
traj_df = all_df.copy()
traj_df = traj_df.loc[traj_df['category']=='trajectory'].reset_index(drop=True)

traj_df_from_zip = traj_df.copy()
traj_df_from_zip = traj_df_from_zip.loc[traj_df_from_zip['from_zip_file']==True].reset_index(drop=True)
traj_df_no_zip = traj_df.copy()
traj_df_no_zip = traj_df_no_zip.loc[traj_df_no_zip['from_zip_file']==False].reset_index(drop=True)
traj_df_no_zip

Unnamed: 0,dataset_origin,dataset_id,doi,date_creation,date_last_modified,date_fetched,file_number,download_number,view_number,license,...,file_type,file_size,file_md5,from_zip_file,file_name,file_url,origin_zip_file,category,engine_MD,dataset_id_ori
0,zenodo,495247,10.5281/zenodo.495247,2017-04-17,2020-01-21,2022-04-28T22:43:20,5,29,22,CC-BY-4.0,...,edr,1.591384e+06,3f592c232aa0487dbd534fe2e309563c,False,md50ns_chunk01.edr,https://zenodo.org/api/files/06706527-b59a-449...,none,trajectory,gromacs,zenodo_495247
1,zenodo,495247,10.5281/zenodo.495247,2017-04-17,2020-01-21,2022-04-28T22:43:20,5,29,22,CC-BY-4.0,...,xtc,5.867509e+08,afe83c617ac83b2923b0e135bfed2f3e,False,md50ns_chunk01.xtc,https://zenodo.org/api/files/06706527-b59a-449...,none,trajectory,gromacs,zenodo_495247
2,zenodo,495247,10.5281/zenodo.495247,2017-04-17,2020-01-21,2022-04-28T22:43:20,5,29,22,CC-BY-4.0,...,edr,1.591384e+06,f9a433c460e08e75c91f18569b632057,False,md50ns_chunk02.edr,https://zenodo.org/api/files/06706527-b59a-449...,none,trajectory,gromacs,zenodo_495247
3,zenodo,495247,10.5281/zenodo.495247,2017-04-17,2020-01-21,2022-04-28T22:43:20,5,29,22,CC-BY-4.0,...,xtc,5.867548e+08,a36878a526ba916a86e2fea18925193f,False,md50ns_chunk02.xtc,https://zenodo.org/api/files/06706527-b59a-449...,none,trajectory,gromacs,zenodo_495247
4,zenodo,4352287,10.5281/zenodo.4352287,2020-12-18,2020-12-19,2022-04-28T22:43:20,2,14,25,CC-BY-4.0,...,xtc,1.362350e+10,de899753a8274a8cb0d6e0e407104dfb,False,purepopccent.xtc,https://zenodo.org/api/files/2f4d99e4-5c1c-493...,none,trajectory,gromacs,zenodo_4352287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5385,figshare,17056145,10.6084/m9.figshare.17056145.v1,2021-11-22,2021-11-22,2022-04-29T00:43:59,15,1,18,CC BY 4.0,...,nc,2.704000e+03,35439bca805eb698d112cf1f28ab9e83,False,Figure10.nc,https://ndownloader.figshare.com/files/31545164,,trajectory,amber,figshare_17056145
5386,figshare,17056145,10.6084/m9.figshare.17056145.v1,2021-11-22,2021-11-22,2022-04-29T00:43:59,15,1,18,CC BY 4.0,...,nc,3.814880e+05,26fc1db119c827f33240fb193fc88bc7,False,Figure8a.nc,https://ndownloader.figshare.com/files/31545179,,trajectory,amber,figshare_17056145
5387,figshare,17056145,10.6084/m9.figshare.17056145.v1,2021-11-22,2021-11-22,2022-04-29T00:43:59,15,1,18,CC BY 4.0,...,nc,3.814880e+05,1817b9bbe5186ba3163c6ce095534e57,False,Figure8d.nc,https://ndownloader.figshare.com/files/31545185,,trajectory,amber,figshare_17056145
5388,figshare,7217726,10.6084/m9.figshare.7217726.v1,2018-10-17,2018-10-17,2022-04-29T00:44:00,1,588,647,CC BY 4.0,...,nc,3.110905e+09,5b18c1ad901a15dc56e519132669a9c8,False,EmissionClump.nc,https://ndownloader.figshare.com/files/13291901,,trajectory,amber,figshare_7217726


In [24]:
# checkout struc for content & number of atoms
# get size of traj
# get number of struc & traj in dataset


In [32]:
if os.path.exists(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv')):
    infos = pd.read_csv(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv'), sep="\t")
    already_checked = list(infos['dataset_id_ori'].unique())
else:
    infos = [['dataset_id_ori', 'category', 'file_name', 'file_type', 'size', 'url', 'atom_num', 
          'has_protein', 'has_nucleic', 'has_lipid', 'has_water_ion', 'has_glucid', 'is_AA']]
    with open(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv'), 'w') as fp:
        fp.write("\t".join(infos[0])+"\n")
    already_checked = []

for ins, dataset_id_ori in enumerate(traj_df_no_zip['dataset_id_ori'].unique()):
    if not (dataset_id_ori in already_checked):
        print(f"{ins}/{len(traj_df_no_zip['dataset_id_ori'].unique())}")
        dataset_files = all_df.loc[all_df['dataset_id_ori']==dataset_id_ori]
        coordinate_files = dataset_files.loc[dataset_files['category']=='coordinate']
        coordinate_files = dataset_files.loc[dataset_files['file_type'].isin(['gro', 'pdb'])].reset_index(drop=True)
        coordinate_files = coordinate_files.dropna(subset=['file_url']).reset_index(drop=True)
        if len(coordinate_files)>0:
            for index, row in coordinate_files.iterrows():
                source = row['file_type']
                infile=row['file_url']
                rdr = get_coord_info(infile, source, 
                               gro_file_name=f'{row["dataset_origin"]}|{row["dataset_id"]}|{infile.split("/")[-1]}', 
                               gro_file_idx=index, GRO_FILE_NUMBER=len(coordinate_files))
                if rdr['is_AA']==True:
                    print(rdr)
                new_line = [
                    dataset_id_ori, 'coordinate', rdr['filename'], source, row['file_size'], infile, rdr['atom_number'],
                    rdr['has_protein'], rdr['has_nucleic'], rdr['has_lipid'], rdr['has_water_ion'], rdr['has_glucid'], rdr['is_AA'],
                ]
                #infos.append(new_line)
                
                with open(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv'), 'a') as fp:
                    fp.write("\t".join([str(el) for el in new_line])+"\n")
            sub_traj_df_no_zip = traj_df_no_zip.loc[traj_df_no_zip['dataset_id_ori']==dataset_id_ori]
            sub_traj_df_no_zip = sub_traj_df_no_zip.loc[sub_traj_df_no_zip['category']=='trajectory']
            sub_traj_df_no_zip = sub_traj_df_no_zip.loc[sub_traj_df_no_zip['file_type'].isin(['xtc', 'trr'])].reset_index(drop=True)
            sub_traj_df_no_zip
            for index, row in sub_traj_df_no_zip.iterrows():
                infile=row['file_url']
                new_line = [
                    dataset_id_ori, 'trajectory', row['file_name'], 'xtc', row['file_size'], row['file_url'], None,
                    None, None, None, None, None, None,
                ]
                #infos.append(new_line)
                with open(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv'), 'a') as fp:
                    fp.write("\t".join([str(el) for el in new_line])+"\n")
            already_checked.append(dataset_id_ori)
# if len(infos) > 1:
#     infos_df = pd.DataFrame(data=infos[1:], columns=infos[0])
#     infos_df.to_csv(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv'), sep="\t")
# infos

['zenodo_3965660', 'zenodo_4115152', 'zenodo_1219366', 'zenodo_4388378', 'zenodo_3956812', 'zenodo_14066', 'zenodo_3956814', 'zenodo_51760', 'zenodo_45007', 'zenodo_4607770', 'zenodo_34415', 'zenodo_12743', 'zenodo_4674308', 'zenodo_14591', 'zenodo_4114065', 'zenodo_30898', 'zenodo_1129435', 'zenodo_4114641', 'zenodo_4501251', 'zenodo_3966452', 'zenodo_32173', 'zenodo_3956709', 'zenodo_13934', 'zenodo_4607833', 'zenodo_13814', 'zenodo_35156', 'zenodo_3956775', 'zenodo_4643634', 'zenodo_4522359', 'zenodo_4114422', 'zenodo_4643914', 'zenodo_229332', 'zenodo_4445375', 'zenodo_2640439', 'zenodo_16291', 'zenodo_3760965', 'zenodo_1468560', 'zenodo_1009094', 'zenodo_887398', 'zenodo_4643862', 'zenodo_1219468', 'zenodo_259392', 'zenodo_15550', 'zenodo_32498', 'zenodo_3483793', 'zenodo_4644028', 'zenodo_35193', 'zenodo_1488102', 'zenodo_2641987', 'zenodo_2653859', 'zenodo_1293891', 'zenodo_44622', 'zenodo_2653735', 'zenodo_45008', 'zenodo_1198454', 'zenodo_3871590', 'zenodo_1402417', 'zenodo_38

KeyboardInterrupt: 

In [None]:
infos = pd.read_csv(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv'), sep="\t")
print(len(infos['dataset_id_ori'].unique()), len(infos))
infos

## get AA + protein

In [119]:
infos = pd.read_csv(os.path.join(os.getcwd(), '../data', 'dataset_info.tsv'), sep="\t")
print(len(infos['dataset_id_ori'].unique()), len(infos))
infos

482 4108


Unnamed: 0,dataset_id_ori,category,file_name,file_type,size,url,atom_num,has_protein,has_nucleic,has_lipid,has_water_ion,has_glucid,is_AA
0,zenodo_3965660,coordinate,coexistence_PSM.gro,gro,7.975367e+06,https://zenodo.org/api/files/37920b54-82ad-436...,115584,False,False,True,True,False,False
1,zenodo_3965660,trajectory,coexistence_PSM.xtc,xtc,4.292043e+10,https://zenodo.org/api/files/37920b54-82ad-436...,,,,,,,
2,zenodo_4115152,coordinate,bar_curved_r1.gro,gro,1.546060e+07,https://zenodo.org/api/files/cc5d7019-f47e-40d...,224066,True,False,True,True,False,False
3,zenodo_4115152,coordinate,bar_curved_r2.gro,gro,1.546060e+07,https://zenodo.org/api/files/cc5d7019-f47e-40d...,224066,True,False,True,True,False,False
4,zenodo_4115152,trajectory,bar_curved_r1_nowater.xtc,xtc,1.998482e+10,https://zenodo.org/api/files/cc5d7019-f47e-40d...,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4103,zenodo_6456372,coordinate,PB-01.pdb,pdb,4.005454e+06,https://zenodo.org/api/files/5d900911-5e28-4d1...,50701,True,False,True,True,False,True
4104,zenodo_6456372,coordinate,PB-02.pdb,pdb,4.005454e+06,https://zenodo.org/api/files/5d900911-5e28-4d1...,50701,True,False,True,True,False,True
4105,zenodo_6456372,coordinate,PB-03.pdb,pdb,4.005454e+06,https://zenodo.org/api/files/5d900911-5e28-4d1...,50701,True,False,True,True,False,True
4106,zenodo_6456372,coordinate,PB-04.pdb,pdb,4.005454e+06,https://zenodo.org/api/files/5d900911-5e28-4d1...,50701,True,False,True,True,False,True


In [120]:
infos_ids = infos.copy()
fitting_ids = infos_ids.loc[((infos_ids['is_AA']=='True') & (infos_ids['has_protein']=='True'))]['dataset_id_ori'].unique()
print(len(fitting_ids))
fitting_df = infos_ids.loc[infos_ids['dataset_id_ori'].isin(fitting_ids)]
not_fitting_ids = fitting_df.loc[((fitting_df['is_AA']=='False'))]['dataset_id_ori'].unique()
fitting_df = fitting_df.loc[~fitting_df['dataset_id_ori'].isin(not_fitting_ids)]
fitting_df.sort_values(by=['size'], ascending=False, inplace=True)
fitting_df.atom_num.replace(to_replace=[None, 'None'], value=None, inplace=True)
fitting_df['atom_num'] = fitting_df['atom_num'].apply(lambda x: None if x in [None, 'None'] else int(x))
print('trajectory:\n', fitting_df.loc[fitting_df['category']=='trajectory'].describe())
print('coordinate:\n', fitting_df.loc[fitting_df['category']=='coordinate'].describe())


94
trajectory:
                size      atom_num
count  5.090000e+02  6.300000e+01
mean   2.324818e+09  7.894806e+05
std    5.176207e+09  2.521899e+05
min    5.619520e+05  8.307000e+03
25%    8.910778e+07  4.980730e+05
50%    3.880190e+08  9.364600e+05
75%    1.926148e+09  9.364600e+05
max    3.763558e+10  1.052627e+06
coordinate:
                size      atom_num
count  4.220000e+02  4.220000e+02
mean   6.388385e+06  1.083017e+05
std    7.480993e+06  1.144924e+05
min    4.973700e+04  6.270000e+02
25%    4.005454e+06  5.070100e+04
50%    4.847897e+06  1.033750e+05
75%    7.132967e+06  1.077290e+05
max    7.398050e+07  1.055753e+06


In [123]:
fitting_df.sort_values(by=['dataset_id_ori', 'file_name'], ascending=True, inplace=True)
fitting_df = fitting_df.reset_index(drop=True)
fitting_df.to_csv(os.path.join(os.getcwd(), '../data', 'AA_prot_info.tsv'), sep="\t")
fitting_df

Unnamed: 0,dataset_id_ori,category,file_name,file_type,size,url,atom_num,has_protein,has_nucleic,has_lipid,has_water_ion,has_glucid,is_AA
0,zenodo_1010142,coordinate,run.gro,gro,1.915140e+06,https://zenodo.org/api/files/f3ad9975-a342-41a...,27755.0,True,False,False,True,False,True
1,zenodo_1010142,trajectory,run1-2.xtc,xtc,7.990016e+09,https://zenodo.org/api/files/f3ad9975-a342-41a...,,,,,,,
2,zenodo_1010232,coordinate,begin.gro,gro,1.249067e+06,https://zenodo.org/api/files/34892030-83ff-4b9...,27755.0,True,False,False,True,False,True
3,zenodo_1010232,coordinate,conf.gro,gro,6.538700e+04,https://zenodo.org/api/files/34892030-83ff-4b9...,1451.0,True,False,False,False,False,True
4,zenodo_1010232,coordinate,run70ns.gro,gro,1.915178e+06,https://zenodo.org/api/files/34892030-83ff-4b9...,27755.0,True,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
926,zenodo_825389,trajectory,md-production_7.xtc,xtc,1.808458e+09,https://zenodo.org/api/files/c9968144-fa39-45f...,,,,,,,
927,zenodo_825389,coordinate,md-production_8.gro,gro,4.466611e+06,https://zenodo.org/api/files/c9968144-fa39-45f...,99256.0,True,False,False,True,True,True
928,zenodo_825389,trajectory,md-production_8.xtc,xtc,1.808572e+09,https://zenodo.org/api/files/c9968144-fa39-45f...,,,,,,,
929,zenodo_825389,coordinate,md-production_9.gro,gro,4.466611e+06,https://zenodo.org/api/files/c9968144-fa39-45f...,99256.0,True,False,False,True,True,True


## get zip files

In [114]:
traj_df_from_zip = all_df.copy()
traj_df_from_zip = traj_df_from_zip.loc[(traj_df_from_zip['from_zip_file']==True) | (traj_df_from_zip['file_type']=='zip')].reset_index(drop=True)
traj_df_from_zip_not = traj_df_from_zip.loc[(traj_df_from_zip['file_name'].str.contains("CG"))]['dataset_id_ori'].unique()

traj_df_from_zip = traj_df_from_zip.loc[~traj_df_from_zip['dataset_id_ori'].isin(traj_df_from_zip_not)]
traj_df_from_zip
traj_df_from_zip_traj = traj_df_from_zip.loc[traj_df_from_zip['file_type'].isin(['xtc', 'trr', 'dcd'])]['dataset_id_ori'].unique()
traj_df_from_zip = traj_df_from_zip.loc[traj_df_from_zip['dataset_id_ori'].isin(traj_df_from_zip_traj)]
print(len(traj_df_from_zip['dataset_id_ori'].unique()))
print(traj_df_from_zip.columns)
traj_df_from_zip = traj_df_from_zip[['dataset_id_ori', 'category', 'file_name', 'file_type', 
                                     'file_size', 'file_url', 'dataset_url', 'from_zip_file', 'origin_zip_file']].reset_index(drop=True)
traj_df_from_zip.sort_values(by=['dataset_id_ori', 'file_name'], ascending=True, inplace=True)
traj_df_from_zip.to_csv(os.path.join(os.getcwd(), '../data', 'zip_info.tsv'), sep="\t")
traj_df_from_zip = traj_df_from_zip.loc[traj_df_from_zip['from_zip_file']==False].reset_index(drop=True)
traj_df_from_zip.to_csv(os.path.join(os.getcwd(), '../data', 'zip_unfold_info.tsv'), sep="\t")

traj_df_from_zip

115
Index(['dataset_origin', 'dataset_id', 'doi', 'date_creation',
       'date_last_modified', 'date_fetched', 'file_number', 'download_number',
       'view_number', 'license', 'dataset_url', 'year', 'file_type',
       'file_size', 'file_md5', 'from_zip_file', 'file_name', 'file_url',
       'origin_zip_file', 'category', 'engine_MD', 'dataset_id_ori'],
      dtype='object')


Unnamed: 0,dataset_id_ori,category,file_name,file_type,file_size,file_url,dataset_url,from_zip_file,origin_zip_file
0,figshare_11764158,none,QHD_LP1.zip,zip,1.112512e+09,https://ndownloader.figshare.com/files/21448860,https://api.figshare.com/v2/articles/11764158,False,
1,figshare_11764158,none,QHD_LP2.zip,zip,1.112907e+09,https://ndownloader.figshare.com/files/21448803,https://api.figshare.com/v2/articles/11764158,False,
2,figshare_11764158,none,QHD_LP3.zip,zip,1.112511e+09,https://ndownloader.figshare.com/files/21448776,https://api.figshare.com/v2/articles/11764158,False,
3,figshare_11764158,none,QHD_LP4.zip,zip,1.112508e+09,https://ndownloader.figshare.com/files/21448701,https://api.figshare.com/v2/articles/11764158,False,
4,figshare_11764158,none,QHD_LP5.zip,zip,1.112129e+09,https://ndownloader.figshare.com/files/21448647,https://api.figshare.com/v2/articles/11764158,False,
...,...,...,...,...,...,...,...,...,...
374,zenodo_6426152,none,PeptideGraphZenodoV2.zip,zip,8.388713e+09,https://zenodo.org/api/files/8588ea90-cacf-4dc...,https://zenodo.org/record/6426152,False,none
375,zenodo_6445362,none,molecular-dynamics.zip,zip,1.032550e+09,https://zenodo.org/api/files/9f47c8ff-0428-45a...,https://zenodo.org/record/6445362,False,none
376,zenodo_6445362,none,scripts.zip,zip,1.605454e+06,https://zenodo.org/api/files/9f47c8ff-0428-45a...,https://zenodo.org/record/6445362,False,none
377,zenodo_6445485,none,molecular-dynamics.zip,zip,9.713144e+08,https://zenodo.org/api/files/7882ea28-bef7-47d...,https://zenodo.org/record/6445485,False,none
