In [3]:
# GeneLab AI/ML AWG - Digital Twin SubGroup Project - Data Download and Inspection Template Scripts
# Version: 2024-08-29
# Author: Dr. Jian Gong (University of Wyoming), jian.gong@uwyo.edu (email for questions and comments)

import os
import ast
import urllib.request as urlrequest
# from urllib.parse import quote
import requests
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 4000) # Modify this value to best fit your screen/editor
pd.set_option('display.max_colwidth', 100) # Use this option to display the full content of columns

from pprint import pprint as pp # use pp() to print json formatted text
# from tqdm import tqdm


In [5]:
# Obtain an overview of the RR dataset
# Search parameter: https://osdr.nasa.gov/bio/repo/search?q=RR%20Transcriptional&data_source=cgene,alsda&data_type=study

# Note: there is a limit of 25 records that can be accessed at a time.
data_url = 'https://osdr.nasa.gov/osdr/data/osd/files/1-999'

response = requests.get(data_url)

if response.status_code == 200:
    data = response.json()
    df = pd.json_normalize(data)
else:
    print('Error:', response.status_code)
    df = pd.DataFrame()

print(df.info())
print(df)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   hits                        1 non-null      int64 
 1   input                       1 non-null      object
 2   page_number                 1 non-null      int64 
 3   page_size                   1 non-null      int64 
 4   page_total                  1 non-null      int64 
 5   success                     1 non-null      bool  
 6   total_hits                  1 non-null      int64 
 7   valid_input                 1 non-null      object
 8   studies.OSD-1.file_count    1 non-null      int64 
 9   studies.OSD-1.study_files   1 non-null      object
 10  studies.OSD-11.file_count   1 non-null      int64 
 11  studies.OSD-11.study_files  1 non-null      object
 12  studies.OSD-12.file_count   1 non-null      int64 
 13  studies.OSD-12.study_files  1 non-null      object
 14

In [6]:
# Function to profile a set of datasets

def profile_osdr_study(study_id):
    study_string = f'studies.OSD-{study_id}.study_files'

    file_list = list(df[study_string])

    # pp(file_list)
    # Filter entries within file_list with 'category' containing 'Study Metadata', or 'file_name' containing 'Normalized/Unnormalized'
    rna_seq_files = [file for sublist in file_list for file in sublist if 'RNA-Seq' in file['category']]
    genelab_processed_files = [file for sublist in file_list for file in sublist if 'GeneLab Processed RNA-Seq Files' in file['category']]
    result_files = [file for sublist in file_list for file in sublist if 'Normalized' in file['file_name'] or 'Unnormalized' in file['file_name'] or 'Differential' in file['subcategory']]

    print(f": OSDR-{study_id:{3}} contains {len(rna_seq_files):{5}} raw RNA-Seq files, with {len(result_files):{5}} result files.")
    # pp(genelab_processed_files)
    return len(result_files)

# convert string csv to list
# study_ids = [int(x) for x in df['input'][0].split(',')]
study_ids = df['valid_input'][0]
print(study_ids)

print("===== Study Summary =====")
for id in study_ids:
    profile_osdr_study(int(id))


['1', '2', '3', '4', '5', '6', '7', '8', '9', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '144', '145', '146', '147', '148', '149', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165

KeyError: 'studies.OSD-26.study_files'

In [25]:
# 1. Examine and Download OSD-667 Study Files

study_id = 667
study_string = f'studies.OSD-{study_id}.study_files'

file_list = list(df[study_string])

pp(file_list)

# Filter entries within file_list with 'category' containing 'Study Metadata', or 'file_name' containing 'Normalized/Unnormalized'

rna_seq_files = [file for sublist in file_list for file in sublist if 'RNA-Seq' in file['category']]
genelab_processed_files = [file for sublist in file_list for file in sublist if 'GeneLab Processed RNA-Seq Files' in file['category']]
result_files = [file for sublist in file_list for file in sublist if 'Normalized' in file['file_name'] or 'Unnormalized' in file['file_name'] or 'Differential' in file['subcategory']]

print(f"-- The study contains {len(rna_seq_files)} raw RNA-Seq files, {len(result_files)} result files.")

pp(genelab_processed_files)



[[{'category': 'Study Metadata Files',
   'date_created': 1716363425.069,
   'date_updated': 1716363425.069,
   'file_name': 'OSD-667_metadata_OSD-667-ISA.zip',
   'file_size': 92876,
   'organization': 'OSD',
   'remote_url': '/geode-py/ws/studies/OSD-667/download?source=datamanager&file=OSD-667_metadata_OSD-667-ISA.zip',
   'restricted': False,
   'subcategory': '',
   'subdirectory': '',
   'visible': True},
  {'category': 'RNA-Seq',
   'date_created': 1697675036.182,
   'date_updated': 1697675036.182,
   'file_name': 'GLDS-606_rna-seq_RR10_CLN_VIV_WT_V5_R2_raw.fastq.gz',
   'file_size': 7473164720,
   'organization': 'genelab',
   'remote_url': '/geode-py/ws/studies/OSD-667/download?source=datamanager&file=GLDS-606_rna-seq_RR10_CLN_VIV_WT_V5_R2_raw.fastq.gz',
   'restricted': False,
   'subcategory': 'Raw sequence data',
   'subdirectory': '',
   'visible': True},
  {'category': 'RNA-Seq',
   'date_created': 1697675036.183,
   'date_updated': 1697675036.183,
   'file_name': 'GLDS-6