## MGnify notebook: retrieve info from API

In [6]:
# import libraries
import requests
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [7]:
# Ansi coor
RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
MAGENTA = '\033[95m'
CYAN = '\033[96m'
RESET = '\033[0m' 

In [9]:
def fetch_biomes_and_save(output_dir):
    """
    Fetches the list of biomes from the MGnify API and saves it to a text file in the specified output directory.
    The function makes a GET request to the MGnify API's biomes endpoint, extracts the biome IDs from the response,
    and writes them to a file named 'mgnify_biomes_list.txt' within the given output directory.

    Args:
        output_dir (str): The directory path where the biomes list file will be saved. The directory must exist.

    Note:
        This function requires the 'requests' library for making HTTP requests and 'os' library for file path operations.
    """
    url = "https://www.ebi.ac.uk/metagenomics/api/v1/biomes"
    response = requests.get(url)
    if response.status_code == 200:
        biomes_data = response.json()
        biomes_list = [biome['id'] for biome in biomes_data['data']]
        
        # Ensure the output directory exists
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        # Write the biome IDs to a file
        with open(os.path.join(output_dir, "mgnify_biomes_list.txt"), 'w') as file:
            for biome_name in biomes_list:
                file.write(f"{biome_name}\n")
                
        print("Biomes list saved successfully.")
        
    else:
        print("Failed to retrieve biomes. Status code:", response.status_code)


In [10]:
def get_studies_and_analyses_summary(biome_name, experiment_type, output_dir = '../outputs'):
    """
    Fetches and summarizes studies and analyses data from the MGnify API based on the specified biome name
    and experiment type. It saves the raw data as JSON and returns a merged DataFrame summary.

    This function queries the MGnify API for studies and analyses related to a specific biome and experiment type.
    The results are saved in separate JSON files within the specified output directory and then processed to
    create and return a comprehensive DataFrame summary.

    Args:
        biome_name (str): The name of the biome to filter studies and analyses.
        experiment_type (str): The type of experiment to filter analyses.
        output_dir (str, optional): The directory path where the JSON files will be saved. Defaults to '../outputs'.

    Returns:
        pd.DataFrame: A DataFrame summarizing the studies and analyses, including key details like study ID,
                      study name, number of samples, and related analysis information.
    """

    # API URLs for fetching studies and analyses data
    urls = {"studies": "https://www.ebi.ac.uk/metagenomics/api/v1/studies", "analyses": "https://www.ebi.ac.uk/metagenomics/api/v1/analyses"}

    # common parameters for API requests
    common_params = {"biome_name": biome_name}
    all_data = {"studies": [], "analyses": []}

    # connection request
    for key, url in urls.items():
        if key == "studies" and all_data["studies"]:
            continue

        params = common_params.copy()
        if key == "analyses":
            params.update({
                "lineage": biome_name,
                "experiment_type": experiment_type
            })

        page = 1
        
        while True:
            try:
                print(f"Retrieving data for page {page}...")
                params["page"] = page
                response = requests.get(url, params=params)
                response.raise_for_status()  # errors codes HTTP
                
                data = response.json()["data"]
                page_info = response.json()["meta"]["pagination"]
                all_data[key].extend(data)
                print(f"Page {page} retrieved successfully. Total pages: {page_info['pages']}")

                if page >= page_info["pages"]:
                    break
                page += 1
            except requests.exceptions.HTTPError as http_err:
                print(f"HTTP error occurred: {http_err} - Status code: {response.status_code}")
                break
            except Exception as err:
                print(f"An error occurred: {err}")
                break
        
        # save json files
        if key == "studies":
            output_file_path = os.path.join(output_dir, "mgnify_studies.json")
        else:
            output_file_path = os.path.join(output_dir, f"mgnify_analyses_{experiment_type}.json")
        
        with open(output_file_path, "w") as outfile:
            json.dump(all_data[key], outfile)
        print(f"{key.capitalize()} data for {experiment_type} saved to {output_file_path}")


    # building dataframes
    studies_columns = ['study_id', 'study_name', 'n_samples', 'bioproject', 'centre_name', 'biomes']
    studies_data = []
    for item in all_data['studies']:
        studies_data.append({
            'study_id': item['id'],
            'study_name': item['attributes'].get('study-name', ''),
            'n_samples': item['attributes'].get('samples-count', 0),
            'bioproject': item['attributes'].get('bioproject', ''),
            'centre_name': item['attributes'].get('centre-name', ''),
            'biomes': ", ".join([biome['id'] for biome in item['relationships']['biomes']['data']])
            })
    df_studies = pd.DataFrame(studies_data, columns=studies_columns)

    analyses_columns = ['analysis_id', 'experiment_type', 'pipeline_version', 'instrument_platform', 'study_id', 'sample_id', 'assembly_run_id']
    analyses_data = []
    for item in all_data['analyses']:
        analyses_data.append({
            'analysis_id': item['id'],
            'experiment_type': item['attributes'].get('experiment-type', ''),
            'pipeline_version': item['attributes'].get('pipeline-version', ''),
            'instrument_platform': item['attributes'].get('instrument-model', ''),
            'study_id': item['relationships']['study']['data'].get('id', '') if item['relationships'].get('study') else '',
            'sample_id': item['relationships']['sample']['data'].get('id', '') if item['relationships'].get('sample') else '',
            'assembly_run_id': item['relationships'].get('assembly', {}).get('data', {}).get('id', '') if item['attributes'].get('experiment-type') == 'assembly' else item['relationships'].get('run', {}).get('data', {}).get('id', '')
            })
    df_analyses = pd.DataFrame(analyses_data, columns=analyses_columns)

    # merging dataframe and return it
    df_summary = pd.merge(df_analyses, df_studies, on='study_id', how='left')
    
    return df_summary

In [11]:
def explore_dataset(dataset):
    """
    Explores the given dataset by printing out statistics and information related to its composition.
    This includes the total number of unique studies, the distribution of unique assembly run IDs per study,
    the presence of missing values across variables, and the median number of samples per biome.

    Args:
        dataset (pd.DataFrame): The dataset to be explored. It must contain the columns 'study_id',
                                'assembly_run_id', 'biomes', and 'n_samples' among others.

    Note:
        The function assumes 'combined_df' is the dataset passed through the 'dataset' argument for some print statements.
        Replace 'combined_df' with 'dataset' in the actual implementation if 'combined_df' is a typo.
    """
    print("\nTotal number of unique studies")
    display(dataset['study_id'].nunique())
    print(GREEN + "_._" * 25 + RESET)

    print("\nNumber of unique assembly_run_id per study_id")
    print(dataset.groupby('study_id')['assembly_run_id'].nunique())
    print(GREEN + "_._" * 25 + RESET)

    # missing data
    print("\nMissing values per variable")
    print(dataset.isnull().sum())
    any_missing_data = dataset.isnull().values.any()
    print(f"Are there any missing data in the dataframe? {'yes' if any_missing_data else 'no'}")
    print(GREEN + "_._" * 25 + RESET)

    print("\nNumber of samples per biome (median)")
    print(dataset.groupby('biomes')['n_samples'].median().reset_index())
    print(GREEN + "_._" * 25 + RESET)

    experiment_type_counts = dataset["experiment_type"].value_counts()
    biomes_counts = dataset["biomes"].value_counts()

    print("\nDistribuzione di experiment_type:")
    print(experiment_type_counts)

    print("\nDistribuzione di biomes:")
    print(biomes_counts)
    print(GREEN + "_._" * 25 + RESET)



In [12]:
def feature_engineering(dataframe):
    """
    Performs feature engineering on the provided dataframe. It includes mapping pipeline versions
    to a simplified numerical scale, extracting initials from the 'assembly_run_id', and concatenating
    multiple identifiers into a single 'concatenated_ids' column.
    
    The function does the following transformations:
    - Maps 'pipeline_version' to 'pipeline_mapped' using a predefined version mapping for simplification.
    - Extracts the first three characters from 'assembly_run_id' and stores them in 'initials_run'.
    - Concatenates 'study_id', 'sample_id', 'assembly_run_id', and 'bioproject' into a new 'concatenated_ids' column.

    Args:
        dataframe (pd.DataFrame): The input dataframe to process. It must contain the columns 'pipeline_version',
                                  'assembly_run_id', 'study_id', 'sample_id', and 'bioproject'.

    Returns:
        pd.DataFrame: The dataframe with added features based on the original data.
    """

    version_mapping = {1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 4.1: 5, 5.0: 6}
    dataframe['pipeline_mapped'] = dataframe['pipeline_version'].map(version_mapping)

    # extract the first three characters from 'assembly_run_id'
    dataframe['initials_run'] = dataframe['assembly_run_id'].str[:3]

    dataframe['concatenated_ids'] = dataframe['study_id'] + '_' + dataframe['sample_id'] + '_' + dataframe['assembly_run_id'] + '_' + dataframe['bioproject']

    return dataframe

In [13]:
def removing_duplicates(dataframe):
    """
    Removes duplicate rows from the dataframe based on the 'concatenated_ids' column.
    Among duplicates, it retains only the row with the highest value in the 'pipeline_mapped' column.
    
    This function first counts the occurrences of each unique 'concatenated_ids' value to identify duplicates.
    For each set of duplicates, it sorts them by 'pipeline_mapped' in descending order and keeps the top one,
    effectively removing duplicates with lower 'pipeline_mapped' values. Rows without duplicates are preserved as is.

    Args:
        dataframe (pd.DataFrame): The dataframe to process. It must contain the columns 'concatenated_ids' and 'pipeline_mapped'.

    Returns:
        pd.DataFrame: A dataframe with duplicates removed based on the above criteria.
    """
    # Count occurrences of each unique ID in 'concatenated_ids'
    counts = dataframe['concatenated_ids'].value_counts()
    duplicates = (counts > 1).sum()

    # Report the number of duplicate IDs found
    print(f"Number of duplicates in the dataset: {duplicates}")

    # Initialize an empty DataFrame to store the filtered results
    filtered_df = pd.DataFrame()

    # Process each ID with more than one occurrence to identify and keep only the desired row
    for id, count in counts[counts > 1].items():
        # Select rows matching the current duplicate ID
        dup_rows = dataframe[dataframe['concatenated_ids'] == id]
        # Sort these rows by 'pipeline_mapped' in descending order and select the top one
        highest_pipeline_mapped_row = dup_rows.sort_values(by='pipeline_mapped', ascending=False).head(1)
        # Append the selected row to the filtered DataFrame
        filtered_df = pd.concat([filtered_df, highest_pipeline_mapped_row], ignore_index=True)
    
    # Identify and include rows that are not duplicates
    non_duplicate_ids = counts[counts == 1].index
    non_duplicate_rows = dataframe[dataframe['concatenated_ids'].isin(non_duplicate_ids)]
    filtered_df = pd.concat([filtered_df, non_duplicate_rows], ignore_index=True)

    # Return the DataFrame with duplicates removed
    return filtered_df

In [49]:
if __name__ == "__main__":
    # setting the variables 
    biome = "root:Engineered:Wastewater"
    biome_lower = biome.replace(":", "_").lower()
    experiments = ("metagenomic","metatranscriptomic","assembly")
    output_path = '../outputs/'
    df_summary_dict = {}

    print('STARTING STEP 1: fetch_biomes_and_save')
    fetch_biomes_and_save(output_dir= output_path)

    print('STARTING STEP 2: get_studies_and_analyses_summary')
    for exp in experiments:
        print(f"Processing experiment type: {exp}")
        df_summary = get_studies_and_analyses_summary(biome_name=biome, experiment_type=exp)
        df_summary_dict[exp] = df_summary  # Aggiungi il DataFrame al dizionario

        # save the CSV file
        df_summary.to_csv(os.path.join(output_path, f"{biome_lower}_{exp}_summary.csv"), index=False)
        combined_df = pd.concat(df_summary_dict.values(), axis=0)
        combined_df.to_csv(os.path.join(output_path, 'combined_dataframe.csv'), index=False)


STARTING STEP 1: fetch_biomes_and_save
Biomes list saved.
STARTING STEP 2: get_studies_and_analyses_summary
Processing experiment type: metagenomic
Retrieving data for page 1...
Page 1 retrieved successfully. Total pages: 8
Retrieving data for page 2...
Page 2 retrieved successfully. Total pages: 8
Retrieving data for page 3...
Page 3 retrieved successfully. Total pages: 8
Retrieving data for page 4...
Page 4 retrieved successfully. Total pages: 8
Retrieving data for page 5...
Page 5 retrieved successfully. Total pages: 8
Retrieving data for page 6...
Page 6 retrieved successfully. Total pages: 8
Retrieving data for page 7...
Page 7 retrieved successfully. Total pages: 8
Retrieving data for page 8...
Page 8 retrieved successfully. Total pages: 8
Studies data for metagenomic saved to ../outputs/mgnify_studies.json
Retrieving data for page 1...
Page 1 retrieved successfully. Total pages: 40
Retrieving data for page 2...
Page 2 retrieved successfully. Total pages: 40
Retrieving data for p

In [14]:
if __name__ == "__main__":
    # setting the variables 
    biome = "root:Engineered:Wastewater"
    biome_lower = biome.replace(":", "_").lower()
    experiments = ("metagenomic","metatranscriptomic","assembly")
    output_path = '../outputs/'
    df_summary_dict = {}

    combined_df = pd.read_csv(os.path.join(output_path, 'combined_dataframe.csv'))
    print('\033[93m' + 'STARTING STEP 3: explore_dataset' + '\033[0m')
    display(combined_df.dtypes)
    explore_dataset(combined_df)
    print('\033[93m' + 'STARTING STEP 4: feature_engineering' + '\033[0m')

    combined_df_updated = feature_engineering(combined_df)
    display(combined_df_updated['concatenated_ids'].value_counts())

    print('\033[93m' + 'STARTING STEP 5: removing_duplicates' + '\033[0m')
    new_dataframe = removing_duplicates(combined_df_updated)
    print(new_dataframe["initials_run"].value_counts())

    

[93mSTARTING STEP 3: explore_dataset[0m


analysis_id             object
experiment_type         object
pipeline_version       float64
instrument_platform     object
study_id                object
sample_id               object
assembly_run_id         object
study_name              object
n_samples                int64
bioproject              object
centre_name             object
biomes                  object
dtype: object


Total number of unique studies


117

[92m_.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__._[0m

Number of unique assembly_run_id per study_id
study_id
MGYS00000423      1
MGYS00000425      1
MGYS00000555      1
MGYS00000597     16
MGYS00000606      1
               ... 
MGYS00005614      8
MGYS00005769     11
MGYS00005802      6
MGYS00005846    110
MGYS00006570    152
Name: assembly_run_id, Length: 117, dtype: int64
[92m_.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__._[0m

Missing values per variable
analysis_id            0
experiment_type        0
pipeline_version       0
instrument_platform    0
study_id               0
sample_id              0
assembly_run_id        0
study_name             0
n_samples              0
bioproject             0
centre_name            0
biomes                 0
dtype: int64
Are there any missing data in the dataframe? no
[92m_.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__._[0m

Number of samples per biome (

concatenated_ids
MGYS00001976_ERS1874632_ERZ478770_PRJEB22150     2
MGYS00001312_ERS1426853_ERR1713406_PRJEB13831    2
MGYS00001312_ERS1426819_ERR1713372_PRJEB13831    2
MGYS00001312_ERS1426803_ERR1713356_PRJEB13831    2
MGYS00001312_ERS1426793_ERR1713346_PRJEB13831    2
                                                ..
MGYS00001312_ERS1443948_ERR1725972_PRJEB13831    1
MGYS00001312_ERS1443952_ERR1725976_PRJEB13831    1
MGYS00001312_ERS1443923_ERR1725947_PRJEB13831    1
MGYS00001312_ERS1444006_ERR1726030_PRJEB13831    1
MGYS00000777_ERS1107853_ERR1352918_PRJEB13232    1
Name: count, Length: 1515, dtype: int64

[93mSTARTING STEP 5: removing_duplicates[0m
Number of duplicates in the dataset: 99
initials_run
ERR    963
ERZ    524
SRR     20
DRR      8
Name: count, dtype: int64


## Descriptive analysis

In [71]:
combined_df.to_csv(os.path.join(output_path, 'combined_dataframe.csv'), index=False)    

In [15]:
combined_df.experiment_type.unique()

array(['metagenomic', 'metatranscriptomic', 'assembly'], dtype=object)

In [66]:
### do we have different pipeline versions considering different IDs?

analysis_versions = combined_df.groupby('analysis_id')['pipeline_version'].nunique()
study_versions = combined_df.groupby('study_id')['pipeline_version'].nunique()
sample_versions = combined_df.groupby('sample_id')['pipeline_version'].nunique()
run_versions = combined_df.groupby('assembly_run_id')['pipeline_version'].nunique()

multiple_versions_ana = analysis_versions[analysis_versions > 1]
multiple_versions_stu = study_versions[study_versions > 1]
multiple_versions_sam = sample_versions[sample_versions > 1]
multiple_versions_run = run_versions[run_versions > 1]

print(len(multiple_versions_ana))
print(len(multiple_versions_stu))
print(len(multiple_versions_sam))
print(len(multiple_versions_run))

0
4
162
197


In [16]:
combined_df[combined_df['assembly_run_id'] == 'ERR1713331']

Unnamed: 0,analysis_id,experiment_type,pipeline_version,instrument_platform,study_id,sample_id,assembly_run_id,study_name,n_samples,bioproject,centre_name,biomes,pipeline_mapped,initials_run,concatenated_ids
357,MGYA00216627,metagenomic,4.1,Illumina HiSeq 3000,MGYS00001312,ERS1426778,ERR1713331,Global surveillance of infectious diseases and...,179,PRJEB13831,DTU-GE,root:Engineered:Wastewater:Water and sludge,5,ERR,MGYS00001312_ERS1426778_ERR1713331_PRJEB13831
533,MGYA00085654,metagenomic,3.0,Illumina HiSeq 3000,MGYS00001312,ERS1426778,ERR1713331,Global surveillance of infectious diseases and...,179,PRJEB13831,DTU-GE,root:Engineered:Wastewater:Water and sludge,3,ERR,MGYS00001312_ERS1426778_ERR1713331_PRJEB13831


In [17]:
combined_df_updated.head(3)

Unnamed: 0,analysis_id,experiment_type,pipeline_version,instrument_platform,study_id,sample_id,assembly_run_id,study_name,n_samples,bioproject,centre_name,biomes,pipeline_mapped,initials_run,concatenated_ids
0,MGYA00166416,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488844,ERR2586218,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488844_ERR2586218_PRJEB26809
1,MGYA00166417,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488846,ERR2586220,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488846_ERR2586220_PRJEB26809
2,MGYA00166418,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488842,ERR2586216,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488842_ERR2586216_PRJEB26809


In [18]:
display(combined_df_updated['concatenated_ids'].value_counts())

concatenated_ids
MGYS00001976_ERS1874632_ERZ478770_PRJEB22150     2
MGYS00001312_ERS1426853_ERR1713406_PRJEB13831    2
MGYS00001312_ERS1426819_ERR1713372_PRJEB13831    2
MGYS00001312_ERS1426803_ERR1713356_PRJEB13831    2
MGYS00001312_ERS1426793_ERR1713346_PRJEB13831    2
                                                ..
MGYS00001312_ERS1443948_ERR1725972_PRJEB13831    1
MGYS00001312_ERS1443952_ERR1725976_PRJEB13831    1
MGYS00001312_ERS1443923_ERR1725947_PRJEB13831    1
MGYS00001312_ERS1444006_ERR1726030_PRJEB13831    1
MGYS00000777_ERS1107853_ERR1352918_PRJEB13232    1
Name: count, Length: 1515, dtype: int64

In [19]:
new_dataframe = removing_duplicates(combined_df_updated)

Number of duplicates in the dataset: 99


In [21]:
new_dataframe.shape

(1515, 15)

In [22]:
combined_df_updated[combined_df_updated['concatenated_ids']== 'MGYS00001312_ERS1426853_ERR1713406_PRJEB13831']

Unnamed: 0,analysis_id,experiment_type,pipeline_version,instrument_platform,study_id,sample_id,assembly_run_id,study_name,n_samples,bioproject,centre_name,biomes,pipeline_mapped,initials_run,concatenated_ids
236,MGYA00216506,metagenomic,4.1,Illumina HiSeq 3000,MGYS00001312,ERS1426853,ERR1713406,Global surveillance of infectious diseases and...,179,PRJEB13831,DTU-GE,root:Engineered:Wastewater:Water and sludge,5,ERR,MGYS00001312_ERS1426853_ERR1713406_PRJEB13831
526,MGYA00085647,metagenomic,3.0,Illumina HiSeq 3000,MGYS00001312,ERS1426853,ERR1713406,Global surveillance of infectious diseases and...,179,PRJEB13831,DTU-GE,root:Engineered:Wastewater:Water and sludge,3,ERR,MGYS00001312_ERS1426853_ERR1713406_PRJEB13831


In [23]:
print(combined_df_updated.initials_run.unique())

['ERR' 'SRR' 'DRR' 'ERZ']


The identifiers ERR, SRR, and DRR are associated with FASTQ files that can be obtained from public sequencing databases.

These prefixes represent sequencing run identifiers from three of the main sequencing data archives, which are part of the International Nucleotide Sequence Database Collaboration (INSDC).

Here's a brief description:

- ERR: European Nucleotide Archive (ENA)
- SRR: Sequence Read Archive (SRA) of the National Center for Biotechnology Information (NCBI)
- DRR: DNA Data Bank of Japan (DDBJ) Sequence Read Archive

These identifiers are used to access sequencing data, including FASTQ files, which contain raw DNA sequences and their quality scores.

Users can download the FASTQ files associated with these identifiers using tools like fastq-dump from the SRA Toolkit package or web interfaces of the respective archives.

ERZ is not commonly recognized as a standard prefix for identifiers directly associated with FASTQ files in public sequencing databases.

It might refer to a specific format or identifier of a particular database or project, but it is not standardized for FASTQ file download like ERR, SRR, and DRR are.

## Prossimi steps

1. creare una nuova variabile chiamata "name_id" in cui si considerano solo le prime 3 lettere e vengono printate nella nuova colonna. **(DONE)**
2. unire study_id, sample_id, assembly_run_id, n_samples e bioproject in un unica variabile stringa e se doppione considerare solo quella con versione piu alta; per fare cio forse conviene utilizzare un dizionario per convertire i float **(DONE)**
3. creare una nuova variabile in grado di separare le prime 3 lettere di assembly_run_id **(DONE)**
4. creare funzione in grado di scaricare ERR da un sito e altri nominativi da un altro e salvarli in un file txt
5. Possiamo creare i file json al di fuori del ciclo for? in questo modo posso creare solo due json e non avere il problema della sovrascrizione.
6. adattare la funzione al file finale per scaricare i dati fastq

10. scrivere l'analisi in file main.py and functions.py

In [24]:
combined_df_updated.head()

Unnamed: 0,analysis_id,experiment_type,pipeline_version,instrument_platform,study_id,sample_id,assembly_run_id,study_name,n_samples,bioproject,centre_name,biomes,pipeline_mapped,initials_run,concatenated_ids
0,MGYA00166416,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488844,ERR2586218,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488844_ERR2586218_PRJEB26809
1,MGYA00166417,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488846,ERR2586220,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488846_ERR2586220_PRJEB26809
2,MGYA00166418,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488842,ERR2586216,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488842_ERR2586216_PRJEB26809
3,MGYA00166419,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488841,ERR2586215,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488841_ERR2586215_PRJEB26809
4,MGYA00166420,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488845,ERR2586219,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488845_ERR2586219_PRJEB26809


In [25]:
combined_df_updated["initials_run"].value_counts()

initials_run
ERR    1056
ERZ     530
SRR      20
DRR       8
Name: count, dtype: int64

**European Nucleotide Archive (ENA):**

The ENA provides direct access to FASTQ files via FTP or HTTP.

You can construct the URL for direct download if you know the run identifier (e.g., ERR, SRR, DRR). The URLs generally follow a standardized format.

**NCBI Sequence Read Archive (SRA):**

For SRR files, the NCBI does not offer a direct equivalent method to the ENA for downloading via FTP/HTTP without using the SRA Toolkit.

However, some third-party tools and services, such as NCBI's EDirect or web services like the Sequence Read Archive (SRA) Explorer, can facilitate the search for direct download URLs when available.

**DNA Data Bank of Japan (DDBJ):**

Similar to ENA and NCBI, the DDBJ may also offer ways to access data, but the common practice for accessing DRR data is through the SRA Toolkit or web interfaces that facilitate downloading.

In [27]:
combined_df_updated[combined_df_updated['initials_run']== 'ERR']

Unnamed: 0,analysis_id,experiment_type,pipeline_version,instrument_platform,study_id,sample_id,assembly_run_id,study_name,n_samples,bioproject,centre_name,biomes,pipeline_mapped,initials_run,concatenated_ids
0,MGYA00166416,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488844,ERR2586218,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488844_ERR2586218_PRJEB26809
1,MGYA00166417,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488846,ERR2586220,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488846_ERR2586220_PRJEB26809
2,MGYA00166418,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488842,ERR2586216,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488842_ERR2586216_PRJEB26809
3,MGYA00166419,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488841,ERR2586215,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488841_ERR2586215_PRJEB26809
4,MGYA00166420,metagenomic,4.1,Illumina HiSeq 2500,MGYS00002383,ERS2488845,ERR2586219,Antibiotic manufacturing effluent enriches res...,6,PRJEB26809,UNIVERSITY OF GOTHENBURG,root:Engineered:Wastewater:Activated Sludge,5,ERR,MGYS00002383_ERS2488845_ERR2586219_PRJEB26809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1079,MGYA00041075,metatranscriptomic,2.0,Illumina HiSeq 2500,MGYS00000823,ERS1110353,ERR1356740,Studying the presence and selection of antibio...,36,PRJEB13233,EAWAG,root:Engineered:Wastewater:Activated Sludge,2,ERR,MGYS00000823_ERS1110353_ERR1356740_PRJEB13233
1080,MGYA00041076,metatranscriptomic,2.0,Illumina HiSeq 2500,MGYS00000823,ERS1110341,ERR1356716,Studying the presence and selection of antibio...,36,PRJEB13233,EAWAG,root:Engineered:Wastewater:Activated Sludge,2,ERR,MGYS00000823_ERS1110341_ERR1356716_PRJEB13233
1081,MGYA00041077,metatranscriptomic,2.0,Illumina HiSeq 2500,MGYS00000823,ERS1110364,ERR1356762,Studying the presence and selection of antibio...,36,PRJEB13233,EAWAG,root:Engineered:Wastewater:Activated Sludge,2,ERR,MGYS00000823_ERS1110364_ERR1356762_PRJEB13233
1082,MGYA00041078,metatranscriptomic,2.0,Illumina HiSeq 2500,MGYS00000823,ERS1110354,ERR1356742,Studying the presence and selection of antibio...,36,PRJEB13233,EAWAG,root:Engineered:Wastewater:Activated Sludge,2,ERR,MGYS00000823_ERS1110354_ERR1356742_PRJEB13233


In [28]:
combined_df_ERR = combined_df_updated[combined_df_updated['initials_run']== 'ERR']

In [29]:
combined_df_ERR.shape

(1056, 15)

In [32]:
combined_df_ERR.assembly_run_id.unique().count()

AttributeError: 'numpy.ndarray' object has no attribute 'count'

In [None]:

from ftplib import FTP
server_address = 'ftp.sra.ebi.ac.uk'



def download_files_from_list(server, input_ids_file, local_directory, remote_directory = '/vol1/fastq/'):
    try:
        ftp = FTP(server)
        ftp.login()

        with open(input_ids_file, 'r') as id_file:
            ids = id_file.readlines()

            for id_name in ids:
                id_name = id_name.strip()
                folder_name = id_name[:6]

                remote_path = f"{remote_directory}/{folder_name}/{id_name}/"
                local_path = f"{local_directory}/{folder_name}/{id_name}/"

                os.makedirs(local_path, exist_ok=True)

                ftp.cwd(remote_path)

                files_to_download = ftp.nlst()

                for file in files_to_download:
                    with open(os.path.join(local_path, file), 'wb') as local_file:
                        # wb = write - binary 
                        ftp.retrbinary('RETR ' + file, local_file.write)
                        # retrbinary = download files in binary format (retrieve binary)
                    print(f"File {file} successfully downloaded in {local_path}")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        ftp.quit()