In [1]:
!pip install bioservices

Collecting bioservices
  Downloading bioservices-1.12.1-py3-none-any.whl.metadata (19 kB)
Collecting appdirs<2.0.0,>=1.4.4 (from bioservices)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting colorlog<7.0.0,>=6.9.0 (from bioservices)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting easydev<0.14.0,>=0.13.3 (from bioservices)
  Downloading easydev-0.13.3-py3-none-any.whl.metadata (4.0 kB)
Collecting grequests<0.8.0,>=0.7.0 (from bioservices)
  Downloading grequests-0.7.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting lxml<6.0.0,>=5.3.0 (from bioservices)
  Downloading lxml-5.4.0-cp313-cp313-win_amd64.whl.metadata (3.6 kB)
Collecting requests<3.0.0,>=2.32.3 (from bioservices)
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting requests-cache<2.0.0,>=1.2.1 (from bioservices)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting rich-click<2.0.0,>=1.8.5 (from bioservices)
  Downloading


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Step 2 - Explore the files in each FTP folder
from ftplib import FTP #library to interact with FTP servers.

def list_ftp_files(ftp_url):
    """Print a directory listing for an FTP folder given as a URL.

    Parameters
    ----------
    ftp_url : str
        Folder location, e.g.
        ``ftp://ftp.ebi.ac.uk/pub/databases/arrayexpress/data/experiment/GEOD/E-GEOD-7543/``.
        The host to connect to is taken from the URL. A value without a host
        part is used unchanged as a path on ``ftp.ebi.ac.uk``.

    Returns
    -------
    None
        The listing is written to standard output.

    Raises
    ------
    Exception
        Nothing is caught here: any error raised while connecting, logging in,
        changing directory or listing propagates to the caller.
    """
    from urllib.parse import urlparse  # local import keeps this cell self-contained

    parsed = urlparse(ftp_url)
    if parsed.hostname:
        ftp_host = parsed.hostname
        # Drop the leading "/" so the folder is addressed exactly as before
        # (relative to the directory the server puts us in after login).
        path = parsed.path.lstrip("/")
    else:
        # Not a full URL: keep the previous behaviour of treating the value
        # as a path on the EBI server.
        ftp_host = "ftp.ebi.ac.uk"
        path = ftp_url

    with FTP(ftp_host) as ftp:
        ftp.login()  # no credentials passed, so ftplib's defaults are used
        ftp.cwd(path)
        print(f"\n📁 Files in {ftp_url}:\n")
        ftp.retrlines('LIST')  # with no callback, each listing line is printed

# FTP folders of the three ArrayExpress experiments to inspect
ftp_links = [
    "ftp://ftp.ebi.ac.uk/pub/databases/arrayexpress/data/experiment/GEOD/E-GEOD-7543/",
    "ftp://ftp.ebi.ac.uk/pub/databases/arrayexpress/data/experiment/MTAB/E-MTAB-3630/",
    "ftp://ftp.ebi.ac.uk/pub/databases/arrayexpress/data/experiment/MTAB/E-MTAB-3629/"
]

# Show the contents of each folder. A failure on one folder is reported and
# skipped so that the remaining folders are still listed.
for link in ftp_links:
    try:
        list_ftp_files(link)
    except Exception as e:
        print(f"Failed to list {link}: {e}")



📁 Files in ftp://ftp.ebi.ac.uk/pub/databases/arrayexpress/data/experiment/GEOD/E-GEOD-7543/:

-rwxr-xr-x    1 ftp      ftp          3841 Dec 23  2011 E-GEOD-7543.README.txt
-rwxr-xr-x    1 ftp      ftp          3807 Dec 23  2011 E-GEOD-7543.idf.txt
-rwxr-xr-x    1 ftp      ftp       2927136 Dec 23  2011 E-GEOD-7543.processed.1.zip
-rwxr-xr-x    1 ftp      ftp        111217 Dec 23  2011 E-GEOD-7543.sdrf.txt

📁 Files in ftp://ftp.ebi.ac.uk/pub/databases/arrayexpress/data/experiment/MTAB/E-MTAB-3630/:

-r--r--r--    1 ftp      ftp          4366 Jun 01  2016 E-MTAB-3630.idf.txt
-r--r--r--    1 ftp      ftp          4366 May 24  2016 E-MTAB-3630.idf.txt_original
-r--r--r--    1 ftp      ftp       6116215 Jun 08  2015 E-MTAB-3630.raw.1.zip
-rw-rw-r--    1 ftp      ftp          3397 Jun 08  2015 E-MTAB-3630.sdrf.txt

📁 Files in ftp://ftp.ebi.ac.uk/pub/databases/arrayexpress/data/experiment/MTAB/E-MTAB-3629/:

-r--r--r--    1 ftp      ftp          4263 Jun 01  2016 E-MTAB-3629.idf.txt
-r--r--

In [5]:
from ftplib import FTP
import os

# Experiment accession -> name of the SDRF file to fetch for that experiment
datasets = dict(
    [
        ("E-GEOD-7543", "E-GEOD-7543.sdrf.txt"),
        ("E-MTAB-3630", "E-MTAB-3630.sdrf.txt"),
        ("E-MTAB-3629", "E-MTAB-3629.sdrf.txt"),
    ]
)

def download_ftp_file(accession, filename):
    """Download one file of an ArrayExpress experiment from the EBI FTP server.

    The file is read from
    ``pub/databases/arrayexpress/data/experiment/<PREFIX>/<accession>/`` on
    ``ftp.ebi.ac.uk`` and stored as ``downloads/<accession>_<filename>``
    relative to the current working directory.

    Parameters
    ----------
    accession : str
        Experiment accession such as ``"E-GEOD-7543"``. The text between the
        first and second ``-`` (cut to four characters) selects the sub-folder,
        e.g. ``GEOD`` or ``MTAB``.
    filename : str
        Name of the file inside the experiment folder.

    Returns
    -------
    str
        Local path of the downloaded file.

    Raises
    ------
    IndexError
        If ``accession`` contains no ``-``.
    Exception
        Connection, FTP and file-system errors are not caught here and
        propagate to the caller. In that case no file is left under the final
        name, and a copy from an earlier successful run is kept as it was.
    """
    ftp_host = "ftp.ebi.ac.uk"
    ftp_base = "pub/databases/arrayexpress/data/experiment"

    # Sub-folder name (GEOD, MTAB, ...) is the second dash-separated field
    prefix = accession.split("-")[1][:4]
    ftp_folder = f"{ftp_base}/{prefix}/{accession}"

    os.makedirs("downloads", exist_ok=True)
    local_path = f"downloads/{accession}_{filename}"
    # Write to a scratch name first so an interrupted or rejected transfer
    # never leaves an empty or truncated file under the final name.
    partial_path = f"{local_path}.part"

    try:
        with FTP(ftp_host) as ftp:
            ftp.login()
            ftp.cwd(ftp_folder)
            with open(partial_path, "wb") as f:
                ftp.retrbinary(f"RETR {filename}", f.write)
        # Only reached when the transfer finished and the file is closed
        os.replace(partial_path, local_path)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete scratch file.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Downloaded: {accession} → {filename}")
    return local_path

# Fetch the SDRF file of every dataset; a failure is reported and the loop moves on
for accession, sdrf_name in datasets.items():
    try:
        download_ftp_file(accession, sdrf_name)
    except Exception as err:
        print(f"Failed to download {sdrf_name} for {accession}: {err}")


Downloaded: E-GEOD-7543 → E-GEOD-7543.sdrf.txt
Downloaded: E-MTAB-3630 → E-MTAB-3630.sdrf.txt
Downloaded: E-MTAB-3629 → E-MTAB-3629.sdrf.txt


In [9]:
# Step 4 - Open the downloaded SDRF files in Jupyter
import pandas as pd

# Local paths of the SDRF files to inspect
sdrf_files = [
    "downloads/E-GEOD-7543_E-GEOD-7543.sdrf.txt",
    "downloads/E-MTAB-3630_E-MTAB-3630.sdrf.txt",
    "downloads/E-MTAB-3629_E-MTAB-3629.sdrf.txt"
]

# For each file: print a header, the first 6 rows and the list of column names.
# A file that cannot be read is reported and skipped.
for sdrf_path in sdrf_files:
    short_name = sdrf_path.split('/')[-1]
    print(f"\nPreview of {short_name}:")
    print("-" * 60)
    try:
        sdrf_table = pd.read_csv(sdrf_path, sep="\t", low_memory=False)
        print(sdrf_table.head(6))
        print("\nColumns:\n", sdrf_table.columns.tolist())
    except Exception as err:
        print(f"Could not read {sdrf_path}: {err}")



Preview of E-GEOD-7543_E-GEOD-7543.sdrf.txt:
------------------------------------------------------------
            Source Name Characteristics [Organism]  \
0  GSE7543GSM182769_Cy5               Homo sapiens   
1  GSE7543GSM182769_Cy3               Homo sapiens   
2  GSE7543GSM182784_Cy3               Homo sapiens   
3  GSE7543GSM182784_Cy5               Homo sapiens   
4  GSE7543GSM182796_Cy5               Homo sapiens   
5  GSE7543GSM182796_Cy3               Homo sapiens   

                                         Description Protocol REF  \
0  human whole blood collected into PAXgene RNA v...    P-G7543-1   
1  common reference sample compsed of total RNAs ...    P-G7543-1   
2  common reference sample compsed of total RNAs ...    P-G7543-1   
3  human whole blood collected into PAXgene RNA v...    P-G7543-1   
4  human whole blood collected into PAXgene RNA v...    P-G7543-1   
5  common reference sample compsed of total RNAs ...    P-G7543-1   

  Protocol REF.1              

In [10]:
# Print the column names of every SDRF file, one per line
for sdrf_path in sdrf_files:
    short_name = sdrf_path.split('/')[-1]
    print(f"\nColumns in {short_name}:")
    print("-" * 60)
    column_names = pd.read_csv(sdrf_path, sep="\t", low_memory=False).columns
    for col in column_names:
        print("•", col)



🔎 Columns in E-GEOD-7543_E-GEOD-7543.sdrf.txt:
------------------------------------------------------------
• Source Name
• Characteristics [Organism]
• Description
• Protocol REF
• Protocol REF.1
• Sample Name
• Protocol REF.2
• Extract Name
• Material Type
• Protocol REF.3
• Labeled Extract Name
• Label
• Material Type.1
• Protocol REF.4
• Hybridization Name
• Array Design REF
• Comment [Array Design URI]
• Protocol REF.5
• Scan Name
• Protocol REF.6
• Derived Array Data Matrix File
• Comment [Derived ArrayExpress FTP file]

🔎 Columns in E-MTAB-3630_E-MTAB-3630.sdrf.txt:
------------------------------------------------------------
• Source Name
• Characteristics[organism]
• Characteristics[cell line]
• Material Type
• Protocol REF
• Protocol REF.1
• Extract Name
• Protocol REF.2
• Labeled Extract Name
• Label
• Protocol REF.3
• Assay Name
• Technology Type
• Array Design REF
• Term Source REF
• Protocol REF.4
• Array Data File
• Comment [ArrayExpress FTP file]
• Factor Value[compoun

In [12]:
# Report the distinct values of the first column whose name contains "compound"
# (case-insensitive) in each SDRF file; files without such a column print nothing.
for file in sdrf_files:
    df = pd.read_csv(file, sep="\t", low_memory=False)
    compound_cols = [name for name in df.columns if "compound" in name.lower()]
    if not compound_cols:
        continue
    first_compound = compound_cols[0]
    base_name = file.split('/')[-1]
    print(f"\nValues in {first_compound} from {base_name}:")
    print(df[first_compound].unique())



Values in Factor Value[compound] from E-MTAB-3630_E-MTAB-3630.sdrf.txt:
['none' 'PM10' 'PM2.5']

Values in Factor Value[compound] from E-MTAB-3629_E-MTAB-3629.sdrf.txt:
['none' 'PM10' 'PM2.5']
