In [1]:
import requests
import os
import pandas as pd
import gzip
import io
import numpy as np
import GEOparse

In [2]:
# Fetch the GSE25066 series through GEOparse, using "data/" as the destination directory.
gse = GEOparse.get_GEO(geo="GSE25066", destdir="data/")

# Expression matrix: take each sample's VALUE column indexed by ID_REF, assemble the
# frame with one column per sample, then transpose so that every row is one sample.
expression_data = pd.DataFrame(
    {
        sample_id: sample.table.set_index("ID_REF")["VALUE"]
        for sample_id, sample in gse.gsms.items()
    }
).transpose()

# Lower-cased response value -> binary label. Any other value (e.g. "na") is
# treated as missing.
_RESPONSE_CODES = {"pcr": 1, "rd": 0}


def _parse_response_label(characteristics):
    """Derive the binary response label from one sample's characteristics.

    Parameters
    ----------
    characteristics : iterable of str
        Entries expected to look like "key: value" (the sample's
        "characteristics_ch1" metadata).

    Returns
    -------
    int or None
        1 if the value is "pcr", 0 if it is "rd" (case-insensitive, surrounding
        whitespace ignored); None if no entry mentions "response"/"pcr", if the
        matching entry has no ":" separator, or if the value is anything else.
    """
    # Only the first entry mentioning "response" or "pcr" is considered.
    # NOTE(review): the match is on the whole entry (key and value), so an earlier
    # entry whose *value* contains "pcr" would be picked up - confirm this cannot
    # happen for this dataset, or restrict the match to the key.
    response_line = next(
        (
            entry
            for entry in characteristics
            if "response" in entry.lower() or "pcr" in entry.lower()
        ),
        None,
    )
    if not response_line:
        return None

    # The value is the text between the first and second ":"; an entry without
    # any ":" has no value. This is checked explicitly instead of catching every
    # exception, so unrelated errors are no longer silently turned into None.
    parts = response_line.split(":")
    if len(parts) < 2:
        return None
    return _RESPONSE_CODES.get(parts[1].strip().lower())


# One label (1, 0 or None) per sample, keyed by sample name.
labels = {}
for gsm_name, gsm in gse.gsms.items():
    labels[gsm_name] = _parse_response_label(
        gsm.metadata.get("characteristics_ch1", [])
    )

label_series = pd.Series(labels)

# Attach the response label to each sample's expression row, matching on the index.
# merge() performs an inner join by default, so only samples present in both
# `expression_data` and `label_series` are kept.
df = expression_data.merge(
    label_series.rename("pcr_response"), left_index=True, right_index=True
)

# Drop every sample that has a missing value in any column (expression or label).
n_before = df.shape[0]
# The label is cast with astype() chained on the dropna() result instead of being
# assigned into that result afterwards: assigning a column on a frame derived from
# `df` can raise SettingWithCopyWarning.
df_clean = df.dropna().astype({"pcr_response": int})
n_after = df_clean.shape[0]

print(f"Samples before dropping missing: {n_before}")
print(f"Samples after dropping missing: {n_after}")
print(f"Samples discarded: {n_before - n_after}")

# Show how many samples fall in each label class (0 and 1), ordered by label.
print(df_clean["pcr_response"].value_counts().sort_index())

In [44]:
# Persist the cleaned table as CSV; index=True writes the row index as the first column.
df_clean.to_csv(
    path_or_buf="/omics/groups/OE0540/internal/users/de_pietri/code/project/data/df_final3.csv",
    index=True,
)