In [1]:
import requests
import json
import csv
from tqdm import tqdm  # Import tqdm for the progress bar

# Endpoint queried for the "Disease" field of studies
# (the "ST" ID fragment presumably matches every study — confirm against the REST docs)
disease_url = (
    "https://www.metabolomicsworkbench.org/rest"
    "/study/study_id/ST/disease"
)
# Maps study ID -> {"Disease": <value>}; starts empty and is filled after the request
studies = dict()

In [2]:
# Request the disease listing and index each returned record by its study ID.
# A timeout is passed so the cell cannot hang forever if the server stops responding.
response = requests.get(disease_url, timeout=600)
if response.status_code == 200:
    data = json.loads(response.text)
    # The outer keys of the payload are not used; only the per-study records matter.
    for study_data in data.values():
        study_id = study_data.get("Study ID", "")
        if not study_id:
            # A record without an ID would be stored under "" and later produce
            # a malformed per-study URL, so it is skipped.
            continue
        studies[study_id] = {"Disease": study_data.get("Disease", "")}
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")

# NOTE(review): nothing above filters on disease type; the count printed below is
# the number of all records stored, despite the word "cancer" in the message.
print("Studies filtering done")
print("retrieved cancer studies: ", len(studies))

Studies filtering done
retrieved cancer studies:  1281


In [3]:
# Template for the per-study data endpoint; "{x}" is substituted with a study ID
url_pattern = (
    "https://www.metabolomicsworkbench.org/rest"
    "/study/study_id/{x}/data"
)

In [4]:
# Fetch one study's data over HTTPS and count the entries it reports
def get_study_number_of_samples(study_id):
    """Return a sample count for ``study_id``, or 0 when it cannot be determined.

    The study's data endpoint (built from ``url_pattern``) is requested with a
    600-second timeout and the body is parsed as JSON.

    Args:
        study_id: Study identifier substituted into ``url_pattern``.

    Returns:
        int: ``len(data["1"]["DATA"])`` when record "1" has a "DATA" entry
        (presumably one item per sample — confirm against the API docs);
        otherwise ``len(data["2"])`` when record "2" exists without a "DATA"
        entry; otherwise 0. 0 is also returned, after printing a message, on
        timeout, any other request failure, or a body that is not valid JSON.
    """
    url = url_pattern.format(x=study_id)
    try:
        response = requests.get(url, timeout=600)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        data = json.loads(response.text)
    except requests.exceptions.Timeout:
        print(f"    Skipping study ID {study_id} due to timeout")
        return 0
    except requests.exceptions.RequestException as e:
        print(f"    Failed to retrieve data for study ID {study_id}: {e}")
        return 0
    except ValueError as e:
        # json.JSONDecodeError is a ValueError and is not a RequestException;
        # without this clause one non-JSON body would abort the whole run.
        print(f"    Failed to parse response for study ID {study_id}: {e}")
        return 0

    # Only a JSON object can hold the "1"/"2" records; lists, null, etc. fall
    # through to the "unexpected format" message instead of raising TypeError.
    if isinstance(data, dict):
        if "1" in data and "DATA" in data["1"]:
            return len(data["1"]["DATA"])
        # NOTE(review): this counts the keys of record "2" when it has no "DATA"
        # entry; kept as in the original — confirm this is the intended fallback.
        if "2" in data and "DATA" not in data["2"]:
            return len(data["2"])

    print("   Error parsing study:", study_id, ", data is not in the expected format")
    return 0

In [5]:
# Write one CSV row per study: its ID, its disease and its sample count.
# newline="" is what the csv module asks for on files passed to csv.writer;
# without it, platforms that translate "\n" end up with blank rows between records.
# NOTE: open() does not create directories, so "output" must already exist.
with open("output/study_number_of_samples.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Study ID", "Disease", "Number of Samples"])

    total_studies = len(studies)  # fixed for the whole loop, so computed once
    # Every study is processed (one HTTPS request each); rows are written as they finish
    for index, (study_id, disease) in enumerate(studies.items(), start=1):
        number_of_samples = get_study_number_of_samples(study_id)
        writer.writerow([study_id, disease.get("Disease", ""), number_of_samples])
        # Log a running count and percentage via tqdm.write
        progress = index / total_studies * 100
        tqdm.write(f"{index}/{total_studies} - Progress: {progress:.2f}% - Study ID: {study_id} - Number of Samples: {number_of_samples}")


1/1281 - Progress: 0.08% - Study ID: ST000007 - Number of Samples: 60
2/1281 - Progress: 0.16% - Study ID: ST000010 - Number of Samples: 39
3/1281 - Progress: 0.23% - Study ID: ST000020 - Number of Samples: 88
   Error parsing study: ST000022 , data is not in the expected format
4/1281 - Progress: 0.31% - Study ID: ST000022 - Number of Samples: 0
   Error parsing study: ST000026 , data is not in the expected format
5/1281 - Progress: 0.39% - Study ID: ST000026 - Number of Samples: 0
   Error parsing study: ST000027 , data is not in the expected format
6/1281 - Progress: 0.47% - Study ID: ST000027 - Number of Samples: 0
   Error parsing study: ST000028 , data is not in the expected format
7/1281 - Progress: 0.55% - Study ID: ST000028 - Number of Samples: 0
   Error parsing study: ST000029 , data is not in the expected format
8/1281 - Progress: 0.62% - Study ID: ST000029 - Number of Samples: 0
   Error parsing study: ST000030 , data is not in the expected format
9/1281 - Progress: 0.70% 