In [1]:
import pandas as pd
import numpy as np

In [2]:
# task specific imports
from Bio import Entrez

In [3]:
# GEO series accession this notebook works on; it is interpolated into the
# esearch term of the series lookup further down.
SERIES_ACCESSION = "GSE60143"

In [4]:
# Contact address set on the Bio.Entrez module before any request is issued.
# NOTE(review): presumably needed to comply with NCBI's E-utilities usage
# policy — confirm against the Biopython Entrez documentation.
Entrez.email = "carissa.bleker@nib.si"    

In [5]:
# Get series UID
# Search the GEO DataSets ("gds") database for the series accession, restricted
# to GSE entries; retmax=1 because only a single hit is used afterwards.
handle = Entrez.esearch(db="gds", term=f"{SERIES_ACCESSION}[Accession] AND gsm[Filter]".replace("gsm[Filter]", "gse[Filter]"), field="acc", retmax=1) if False else Entrez.esearch(db="gds", term=f"{SERIES_ACCESSION}[Accession] AND gse[Filter]", field="acc", retmax=1)
try:
    series_result = Entrez.read(handle)
finally:
    # Entrez.read() does not close a stream it was handed, so release the
    # underlying network connection explicitly, even if parsing fails.
    handle.close()
series_result

{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['200060143'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'GSE60143[Accession]', 'Field': 'Accession', 'Count': '944', 'Explode': 'N'}, {'Term': 'gse[Filter]', 'Field': 'Filter', 'Count': '224215', 'Explode': 'N'}, 'AND'], 'QueryTranslation': 'GSE60143[Accession] AND gse[Filter]'}

In [6]:
# Extract the UID of the series from the esearch result.
series_id_list = series_result["IdList"]
if not series_id_list:
    # Same exception type as the plain indexing would raise, but with a message
    # that says what actually went wrong (unknown / mistyped accession).
    raise IndexError(f"No GEO series UID found for accession {SERIES_ACCESSION!r}")
series_uid = series_id_list[0]
series_uid

'200060143'

In [7]:
# Get samples from the series
# Fetch the summary record of the series by UID.
# Note: this rebinds `series_result` from the esearch result to the list of
# esummary records; the following cell reads `series_result[0]['Samples']`.
handle = Entrez.esummary(db="gds", id=series_uid, retmax=1000)
try:
    series_result = Entrez.read(handle)
finally:
    # Entrez.read() does not close a stream it was handed, so release the
    # underlying network connection explicitly, even if parsing fails.
    handle.close()
series_result

[{'Item': [], 'Id': '200060143', 'Accession': 'GSE60143', 'GDS': '', 'title': 'In vitro, genomic context identification of transcription factor binding sites', 'summary': 'This SuperSeries is composed of the SubSeries listed below.', 'GPL': '17639;17628', 'GSE': '60143', 'taxon': 'Zea mays; Arabidopsis thaliana', 'entryType': 'GSE', 'gdsType': 'Other; Genome binding/occupancy profiling by high throughput sequencing', 'ptechType': '', 'valType': '', 'SSInfo': '', 'subsetInfo': '', 'PDAT': '2016/05/25', 'suppFile': 'NARROWPEAK', 'Samples': [{'Accession': 'GSM1925639', 'Title': 'SBP_tnt.SPL9_col_a'}, {'Accession': 'GSM1925722', 'Title': 'WRKY_tnt.WRKY47_col_a'}, {'Accession': 'GSM1925888', 'Title': 'NAC_tnt.VND4_col_v3a'}, {'Accession': 'GSM1925473', 'Title': 'MYB_tnt.MYB77_col_a'}, {'Accession': 'GSM1925390', 'Title': 'LOBAS2_tnt.LBD23_colamp_a'}, {'Accession': 'GSM1925261', 'Title': 'C3H_tnt.At1g70910_col_a'}, {'Accession': 'GSM1925075', 'Title': 'AP2EREBP_tnt.DEAR5_colamp_a'}, {'Access

In [8]:
# Take the first esummary record and pull out its 'Samples' entry, then show
# how many samples the series lists.
series_summary = series_result[0]
samples = series_summary['Samples']
len(samples)

941

In [9]:
# Look up every sample of the series and collect its summary record.
#   all_sample_details: sample accession -> esummary record of that sample
#   failed_samples:     entries of `samples` whose lookup raised
all_sample_details = {}
failed_samples = []
for s in samples:
    try:
        accession = s["Accession"]
        # Get sample UID
        handle = Entrez.esearch(db="gds", term=f"{accession}[Accession] AND gsm[Filter]", field="acc", retmax=1)
        try:
            sample = Entrez.read(handle)
        finally:
            # Entrez.read() leaves the stream open; close it so hundreds of
            # iterations do not accumulate open connections.
            handle.close()
        sample_uid = sample["IdList"][0]
        # Get sample details
        handle = Entrez.esummary(db="gds", id=sample_uid, retmax=100)
        try:
            sample_details = Entrez.read(handle)[0]
        finally:
            handle.close()
        all_sample_details[accession] = sample_details
    except Exception as err:
        # Best-effort collection: record the failing sample and carry on.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupts working, and printing the error shows why it failed.
        print(s, repr(err))
        failed_samples.append(s)

In [20]:
import json

In [24]:
# Write the collected per-sample records to a JSON file in the working directory.
with open("./tmp_sample_info_dump.json", "w") as dump_file:
    json.dump(all_sample_details, dump_file)

In [19]:
# Build a table with one row per sample accession (orient="index" turns the
# outer dict keys into the row index) and write it out as CSV.
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html#
# NOTE(review): the output file is named "sample" with no extension — confirm
# whether "sample.csv" was intended before anything else reads this path.
(
    pd.DataFrame
    .from_dict(all_sample_details, orient="index")
    .to_csv("sample")
)

Unnamed: 0,Item,Id,Accession,GDS,title,summary,GPL,GSE,taxon,entryType,...,ExtRelations,n_samples,SeriesTitle,PlatformTitle,PlatformTaxa,SamplesTaxa,PubMedIds,Projects,FTPLink,GEO2R
GSM1925639,[],301925639,GSM1925639,,SBP_tnt.SPL9_col_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,
GSM1925722,[],301925722,GSM1925722,,WRKY_tnt.WRKY47_col_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,
GSM1925888,[],301925888,GSM1925888,,NAC_tnt.VND4_col_v3a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],,
GSM1925473,[],301925473,GSM1925473,,MYB_tnt.MYB77_col_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,
GSM1925390,[],301925390,GSM1925390,,LOBAS2_tnt.LBD23_colamp_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM1925497,[],301925497,GSM1925497,,MYBrelated_tnt.AT4G12670_col_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,
GSM1925102,[],301925102,GSM1925102,,AP2EREBP_tnt.ERF4_col_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,
GSM1925700,[],301925700,GSM1925700,,WRKY_tnt.WRKY27_colamp_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,
GSM1925285,[],301925285,GSM1925285,,E2FDP_tnt.E2FC_col_a,young leaf,17639,60141;60143,Arabidopsis thaliana,GSM,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRX1...",0,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1925...,


In [10]:
# Show the sample entries whose lookup raised in the collection loop;
# an empty list means every lookup succeeded.
failed_samples

[]