In [None]:
import json, io, os, sys
import typing as tp
import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pacsv
from pathlib import Path
import json
from tqdm import tqdm


In [None]:
import pandas as pd

## helper funcs

In [None]:
def polars_schema_to_json(df: pl.DataFrame, out: Path) -> None:
    """
    Dump the column -> dtype mapping of a Polars DataFrame to a JSON file.

    Every dtype is stored as its ``str()`` form (e.g. "String", "Int64",
    "Float64", "Boolean"), keyed by column name.

    Args:
        df: frame whose ``schema`` is serialized.
        out: destination file; receives the JSON text (2-space indent).
    """
    dtype_names = {}
    for column, dtype in df.schema.items():
        dtype_names[column] = str(dtype)
    payload = json.dumps(dtype_names, indent=2)
    out.write_text(payload)
def json_schema_to_polars_dtypes(json_path: Path) -> pl.Schema:
    """
    Load a {column: dtype_name} JSON file and rebuild a Polars schema from it.

    dtype names are expected in the ``str(dtype)`` form written by
    ``polars_schema_to_json``. Parameterized names such as
    "Datetime(time_unit='us', time_zone=None)" are matched on their base name
    (the text before the first "("), so they resolve to the corresponding
    unparameterized dtype instead of silently degrading to a string column.
    The parameters themselves (time unit, time zone, ...) are not restored.
    Names missing from the lookup table fall back to Utf8 (conservative).

    Args:
        json_path: path of the JSON file to read.

    Returns:
        A ``pl.Schema`` with one entry per key of the JSON object.
    """
    name_to_pl = {
        "Utf8": pl.Utf8, "String": pl.Utf8,
        "Int64": pl.Int64, "Float64": pl.Float64, "Boolean": pl.Boolean,
        "Int32": pl.Int32, "Float32": pl.Float32, "Date": pl.Date,
        "Datetime": pl.Datetime, "Time": pl.Time, "Categorical": pl.Categorical,
    }
    schema = json.loads(json_path.read_text())
    return pl.Schema({
        # Strip any "(...)" parameter suffix before the lookup.
        c: name_to_pl.get(str(t).split("(", 1)[0].strip(), pl.Utf8)
        for c, t in schema.items()
    })

## paths

In [None]:
# Root directory of the refine_bio/HOMO_SAPIENS archive.
# Machine-specific absolute path: edit this to point at your own copy.
root = Path("/mnt/hdd/jesse_archive/stampformer_archive/refine_bio/HOMO_SAPIENS")

We needed ~60 GB of RAM to read the whole metadata TSV.

In [None]:
# All inputs and outputs of this notebook live in one directory. Reuse `root`
# (defined in the previous cell) instead of repeating the absolute path, so the
# location only has to be edited in one place.
data_dir = root
tsv_meta_path = data_dir / 'metadata_HOMO_SAPIENS.tsv'          # raw metadata table (TSV)
schema_path = data_dir / "metadata_HOMO_SAPIENS_schema.json"    # {column: dtype} dump of the TSV
parquet_path = data_dir / "metadata_HOMO_SAPIENS.parquet"       # parquet copy of the TSV
potential_clin_path = data_dir / 'potential_clin_data.csv'      # rows with response-like columns
sep = '\t'  # field separator of the metadata TSV

In [None]:
parquet_path

PosixPath('/mnt/hdd/jesse_archive/stampformer_archive/refine_bio/HOMO_SAPIENS/metadata_HOMO_SAPIENS.parquet')

## examine the parquet

In [None]:
# Eagerly load the whole metadata parquet into memory as a Polars DataFrame.
data = pl.read_parquet(parquet_path)

In [None]:
# Column names taken from the parquet schema (no row data is needed for this).
columns = pl.read_parquet_schema(str(parquet_path)).names()
# Peek at the first few names.
columns[:4]

['refinebio_accession_code',
 'experiment_accession',
 'refinebio_age',
 'refinebio_cell_line']

In [None]:

# Lazy handle on the parquet file; nothing is read until .collect().
lf = pl.scan_parquet(str(parquet_path))

# One aggregate per column: the number of rows that are non-null AND whose
# text form, once stripped, is not the empty string.
exprs = []
for c in columns:
    stripped_text = pl.col(c).cast(pl.Utf8).str.strip_chars()
    has_value = pl.col(c).is_not_null().and_(stripped_text.ne(""))
    exprs.append(has_value.sum().alias(c))

# Run all counts as one lazy query; the result is {column: [count]}.
counts = lf.select(exprs).collect().to_dict(as_series=False)

# Reshape into a long (colname, num_non_empty) table.
df_counts = pl.DataFrame(
    {
        "colname": [name for name in counts],
        "num_non_empty": [values[0] for values in counts.values()],
    }
)

### find probable response data

In [None]:
######## edit######
import re, json
from pathlib import Path
import polars as pl
from typing import Iterable, Dict, List, Tuple, Set

# 1) Canonical response codes/labels (values)
# Vocabulary that looks_like_response_values() tests normalized cell values against.
# Entries are upper-case, matching the output of normalize_token().
# NOTE: "NON-RESPONDERS" is listed twice below; harmless because this is a set.
# NOTE: entries longer than looks_like_response_values' default max_token_len (40)
# are filtered out by its length guard before the membership test.
RESPONSE_VALUE_TOKENS: Set[str] = {
    # atomic codes
    "PCR","RD","CR","NR","R","SD","PPR","PNC","PD","CRU","IR","PRCR","NON-CR","VGPR","NCR","PR","CRP",
    "MLFS","HI","RCB","CRI","OJBR","NOR","NC","ICCR","ICPR","ISD","ICPD","IUPD","IUPR","NONE","S","HR","TF","ED",
    # common long forms / keywords
    "PATHOLOGICAL COMPLETE REMISSION","RESIDUAL DISEASE","COMPLETE RESPONSE","NONRESPONSE","RESPONSE","STABLE DISEASE",
    "PATHOLOGICAL PARTIAL RESPONDERS","NON-RESPONDERS","PROGRESSIVE DISEASE","UNCONFIRMED COMPLETED REMISSION",
    "INCOMPLETE RESPONSE","LONG-HER","PARTIAL RESPONSE AND COMPLETE RESPONSE","NON-COMPLETE REMISSION",
    "VERY GOOD PARTIAL RESPONSE","GOOD/COMPLETE PCR","CR WITH INCOMPLETE HEMATOLOGIC OR PLATELET RECOVERY",
    "MORPHOLOGIC LEUKEMIA-FREE STATE","HEMATOLOGIC IMPROVEMENT","RESIDUAL CANCER BURDEN",
    "COMPLETE REMISSION WITH INCOMPLETE HEMATOLOGIC RECOVERY","OBJECTIVE RESPONDERS","NON-RESPONDERS","NO CHANGE",
    "CONFIRMED COMPLETE RESPONSE","CONFIRMED PARTIAL RESPONSE","CONFIRMED PROGRESSIVE DISEASE",
    "UNCONFIRMED PROGRESSIVE DISEASE","UNCONFIRMED PARTIAL RESPONSE","NO RESPONSE","SENSITIVE",
    "HIGHLY RESISTANT","TREATMENT FAILURE","EARLY DEATH","RESIDUAL","SURVIVAL","PROGRESSIVE",
    # generic binaries
    # (these make any yes/no or 0/1 column count as response-like, whatever it encodes)
    "YES","NO","Y","N","TRUE","FALSE","T","F","1","0"
}

# 2) Column-name terms (be specific; avoid single-letter codes here)
# Compiled into NAME_RE by _compile_name_regex(); a column is a candidate when
# any one of these terms appears in its name as a delimited word.
NAME_TERMS: Set[str] = {
    # core signals
    "RESPONSE","RESPONDER","RESPONDERS","RESPONDING","RESP","RESPONSES",
    "REMISSION","COMPLETE_REMISSION","NON_COMPLETE_REMISSION","NON-COMPLETE_REMISSION",
    "PROGRESSIVE_DISEASE","STABLE_DISEASE","PARTIAL_RESPONSE","COMPLETE_RESPONSE",
    "PROGRESSION","BURDEN","RESIDUAL","PATHOLOGICAL","PATH","RCB",
    # frequent clinical abbreviations safe for NAMES
    "PCR","CR","PR","SD","PD","VGPR","CRU","CRI","CRP","MLFS","HI","OJBR",
    "ICCR","ICPR","ISD","ICPD","IUPD","IUPR","NR","NONRESPONDER","NONRESPONDERS","NON_RESPONSE",
    "SENSITIVE","RESISTANT","RESISTANCE","TREATMENT_FAILURE","FAILURE","EARLY_DEATH","ED",
    # words that often wrap the above
    "THERAPEUTIC","THERAPY","TREATMENT","OUTCOME","EFFICACY","EFFECTIVENESS",
    "RESP_CAT","RESP_CLASS","RESP_STATUS","CLINICAL_BENEFIT","OBJECTIVE_RESPONSE",
    "RECIST","BOR"  # Best Overall Response
}

def _compile_name_regex(terms: Iterable[str]) -> re.Pattern:
    """
    Build a robust column-name regex from NAME_TERMS.
    - Normalizes underscores/hyphens vs words
    - Word-boundaries to avoid overmatching substrings
    """
    esc = []
    for t in terms:
        t = t.strip().upper().replace("-", "[\\-_]").replace("_", "[\\-_]")
        esc.append(t)
    # word-ish boundaries: (^|[^A-Z0-9]) ... (?=[^A-Z0-9]|$)
    pattern = r"(?i)(^|[^A-Z0-9])(" + "|".join(esc) + r")(?=[^A-Z0-9]|$)"
    return re.compile(pattern)

# Compiled once when this cell runs; used by candidate_response_columns().
NAME_RE = _compile_name_regex(NAME_TERMS)

def normalize_token(x: str) -> str:
    """
    Canonicalize a raw cell value for vocabulary lookup.

    The value is trimmed and upper-cased; every run of characters other than
    A-Z, 0-9, "/", "-" and whitespace is replaced by a single space; whitespace
    runs are then collapsed to one space and the result is trimmed again.
    A falsy input (None or "") yields "".
    """
    raw = x if x else ""
    upper = raw.strip().upper()
    without_punct = re.sub(r"[^A-Z0-9/\-\s]+", " ", upper)
    return re.sub(r"\s+", " ", without_punct).strip()

def looks_like_response_values(tokens: Set[str], max_token_len: int = 40) -> Tuple[bool,float]:
    """
    Heuristic: does this set of normalized values read like response labels?

    Returns ``(is_response_like, coverage)``. ``coverage`` is the share of
    tokens no longer than ``max_token_len`` that belong to
    RESPONSE_VALUE_TOKENS. The flag is True when at least one such token is
    covered and coverage is >= 0.6. A value set made only of "0"/"1" is
    accepted outright with coverage 1.0; an empty set is rejected.
    """
    if not tokens:
        return (False, 0.0)

    # Pure 0/1 columns short-circuit before any vocabulary lookup.
    if not any(tok not in {"1", "0"} for tok in tokens):
        return (True, 1.0)

    # Length guard: long values are treated as free text and ignored.
    short_tokens = set(filter(lambda tok: len(tok) <= max_token_len, tokens))
    n_short = len(short_tokens)
    if n_short == 0:
        return (False, 0.0)

    n_covered = sum(1 for tok in short_tokens if tok in RESPONSE_VALUE_TOKENS)
    coverage = n_covered / n_short
    is_response_like = n_covered > 0 and coverage >= 0.6
    return (is_response_like, coverage)

def candidate_response_columns(all_columns: Iterable[str]) -> List[str]:
    """
    Return, in input order, the column names in which NAME_RE finds a match.

    Each name is upper-cased before it is searched.
    """
    return [name for name in all_columns if NAME_RE.search(name.upper())]

In [None]:
# Pre-filter by column NAME only; the values are profiled in the next section.
candidates = candidate_response_columns(data.columns)
len(candidates)

414

### perform scan in mem

In [None]:

# --- profile candidate columns by their distinct values (eager, in memory) ---
rows = []
for c in tqdm(candidates):
    s = data[c]
    non_null = int(s.is_not_null().sum())
    if non_null == 0:
        # Entirely-null column: record a placeholder row and skip the value scan.
        rows.append({"colname": c, "dtype": str(s.dtype), "n_non_null": 0, "n_unique": 0,
                     "is_binary_numeric": False, "is_response_like": False, "coverage": 0.0,
                     "tokens_sample": []})
        continue
    # All distinct non-null values, rendered as text (not a sample).
    u = s.cast(pl.Utf8, strict=False).drop_nulls().unique()
    # Only the distinct values are pulled into Python. Normalize each one exactly
    # once and drop those that normalize to the empty string.
    toks = {tok for tok in map(normalize_token, u.to_list()) if tok != ""}
    is_bin = bool(toks) and toks.issubset({"0", "1"})
    is_resp, cov = looks_like_response_values(toks)
    rows.append({
        "colname": c, "dtype": str(s.dtype), "n_non_null": non_null,
        # n_unique counts raw distinct values, before normalization.
        "n_unique": int(u.len()), "is_binary_numeric": is_bin,
        "is_response_like": is_resp, "coverage": cov,
        "tokens_sample": sorted(toks)[:20]
    })

# Best candidates first: response-like, then highest coverage, then most populated.
resp_profile = pl.DataFrame(rows).sort(
    ["is_response_like", "coverage", "n_non_null"], descending=[True, True, True]
)
shortlist = resp_profile.filter(pl.col("is_response_like") | pl.col("is_binary_numeric"))

# optional: persist
resp_profile.write_parquet(data_dir / "response_column_profile.parquet")
shortlist.write_parquet(data_dir / "response_column_shortlist.parquet")

100%|██████████████████████████████████████████████| 414/414 [00:00<00:00, 440.39it/s]


In [None]:
resp_profile['n_non_null'].sum()

279266

In [None]:
shortlist

colname,dtype,n_non_null,n_unique,is_binary_numeric,is_response_like,coverage,tokens_sample
str,str,i64,i64,bool,bool,f64,list[str]
"""characteristics_ch1_pr ihc""","""Int64""",941,2,true,true,1.0,"[""0"", ""1""]"
"""characteristics_ch1_pcr""","""String""",554,6,false,true,1.0,"[""0"", ""1"", … ""YES""]"
"""characteristics_ch1_pathologic…","""String""",486,2,false,true,1.0,"[""PCR"", ""RD""]"
"""characteristics_ch1_birth outc…","""Int64""",323,2,true,true,1.0,"[""0"", ""1""]"
"""characteristics_ch1_pasi75 res…","""String""",322,2,false,true,1.0,"[""NR"", ""R""]"
…,…,…,…,…,…,…,…
"""characteristics_ch1_Pathologic…","""Int64""",23,3,false,true,0.666667,"[""0"", ""1"", ""2""]"
"""characteristics_ch1_dopamine-a…","""String""",20,3,false,true,0.666667,"[""-"", ""NO"", ""YES""]"
"""characteristics_ch1_objective …","""String""",11,3,false,true,0.666667,"[""0"", ""1"", ""NA""]"
"""characteristics_ch1_x-ray trea…","""String""",10,3,false,true,0.666667,"[""NO"", ""YES"", ""YES/NO""]"


In [None]:
# Export the shortlist as CSV into the same directory as potential_clin_path.
# NOTE(review): the file name spells "collumn"; left unchanged in case something
# already reads this path — confirm before renaming.
shortlist.to_pandas().to_csv(potential_clin_path.parent /'response_collumn_shortlist.csv', index = False)

## save subset 

1. subset of the data with the shortlist columns, plus the GSE and GSM accessions
2. valuable columns
3. all experiments we already know
4. the table of experiment info

In [None]:
# filter: keep rows where any of those cols is not null
mask = pl.any_horizontal([pl.col(c).is_not_null() for c in shortlist['colname'] if c in data.columns])
subset = data.filter(mask)
subset = subset.select([c for c in subset.columns if not subset[c].is_null().all()])


refinebio_accession_code,experiment_accession,refinebio_age,refinebio_cell_line,refinebio_disease,refinebio_disease_stage,refinebio_organism,refinebio_platform,refinebio_processed,refinebio_processor_id,refinebio_processor_name,refinebio_processor_version,refinebio_race,refinebio_sex,refinebio_source_database,refinebio_specimen_part,refinebio_subject,refinebio_time,refinebio_title,refinebio_treatment,biomaterial_provider_ch1,channel_count,characteristics_ch1_4oht treatment,characteristics_ch1_8_week_disease control_1yes_0no,characteristics_ch1_AJCC Stage,characteristics_ch1_Age,characteristics_ch1_Age,characteristics_ch1_Age (years),characteristics_ch1_Age of onset (years),characteristics_ch1_Alcohol abuse (ratings scale,characteristics_ch1_All patients (1=included in survival analysis),characteristics_ch1_AnnArbor.Stage,characteristics_ch1_B.Symptoms,characteristics_ch1_BCL6.Break,characteristics_ch1_BL.probability,characteristics_ch1_Baseline WBC,characteristics_ch1_Brain pH,…,contact_address,contact_city,contact_country,contact_department,contact_email,contact_fax,contact_institute,contact_laboratory,contact_name,contact_phone,contact_state,contact_web_link,contact_zip/postal_code,data_processing,data_row_count,description,extract_protocol_ch1,geo_accession,growth_protocol_ch1,hyb_protocol,label_ch1,label_protocol_ch1,last_update_date,molecule_ch1,organism_ch1,platform_id,relation,scan_protocol,series_id,source_name_ch1,status,submission_date,supplementary_file,taxid_ch1,title,treatment_protocol_ch1,type
str,str,f64,str,str,str,str,str,bool,i64,str,str,str,str,str,str,str,str,str,str,str,i64,str,i64,str,str,i64,i64,str,str,i64,str,str,str,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""GSM1019241""","""GSE37745""",80.0,,,,"""HOMO_SAPIENS""","""Affymetrix Human Genome U133 P…",true,73,"""Affymetrix SCAN""","""v1.3.7""",,"""male""","""GEO""",,,,"""Patient 10, male, squamous""",,,1,,,,,,,,,,,,,,,,…,"""Rudbeck laboratory""","""Uppsala""","""Sweden""",,"""patrick.micke@igp.uu.se""",,"""Uppsala University""",,"""Patrick,,Micke""",,,,"""75185""","""Biocondoctor affy package stan…",54675,"""gene expression data from non …","""Five to ten sections (10m) wer…","""GSM1019241""","""no in vitro data""","""For each sample, 2g of total R…","""biotin""","""For each sample, 2g of total R…","""Nov 30 2012""","""total RNA""","""Homo sapiens""","""GPL570""",,"""The arrays were washed and sta…","""GSE37745""","""cancer cells from patient 10""","""Public on Oct 12 2012""","""Oct 12 2012""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""Patient 10, male, squamous""","""no treatment""","""RNA"""
"""GSM1030127""","""GSE41998""",40.0,,,,"""HOMO_SAPIENS""","""Affymetrix Human Genome U133A …",true,99,"""Affymetrix SCAN""","""v1.4.7""",,"""female""","""GEO""",,,,"""Br Ca Pt sample #S1141087-1""",,,1,,,,,,,,,,,,,,,,…,"""Route 206 and Province Line Ro…","""Princeton""","""USA""",,"""christine.horak@bms.com""",,"""Bristol-Myers Squibb""",,"""Christine,,Horak""",,"""NJ""",,"""08543""","""RMA background correction and …",22277,"""['Gene expression data from pr…","""Total RNA was extracted from R…","""GSM1030127""",,"""Labeled cRNA targets with grea…","""biotin""","""Biotinylated cRNA were prepare…","""Jan 01 2013""","""total RNA""","""Homo sapiens""","""GPL571""",,"""Affymetrix GeneChip Scanner 30…","""GSE41998""","""pre-treatment breast cancer tu…","""Public on Jan 01 2013""","""Nov 02 2012""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""Br Ca Pt sample #S1141087-1""",,"""RNA"""
"""GSM1030241""","""GSE41998""",33.0,,,,"""HOMO_SAPIENS""","""Affymetrix Human Genome U133A …",true,99,"""Affymetrix SCAN""","""v1.4.7""",,"""female""","""GEO""",,,,"""Br Ca Pt sample #E4343391-4""",,,1,,,,,,,,,,,,,,,,…,"""Route 206 and Province Line Ro…","""Princeton""","""USA""",,"""christine.horak@bms.com""",,"""Bristol-Myers Squibb""",,"""Christine,,Horak""",,"""NJ""",,"""08543""","""RMA background correction and …",22277,"""['Gene expression data from pr…","""Total RNA was extracted from R…","""GSM1030241""",,"""Labeled cRNA targets with grea…","""biotin""","""Biotinylated cRNA were prepare…","""Jan 01 2013""","""total RNA""","""Homo sapiens""","""GPL571""",,"""Affymetrix GeneChip Scanner 30…","""GSE41998""","""pre-treatment breast cancer tu…","""Public on Jan 01 2013""","""Nov 02 2012""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""Br Ca Pt sample #E4343391-4""",,"""RNA"""
"""GSM1030329""","""GSE41998""",41.0,,,,"""HOMO_SAPIENS""","""Affymetrix Human Genome U133A …",true,99,"""Affymetrix SCAN""","""v1.4.7""",,"""female""","""GEO""",,,,"""Br Ca Pt sample #A7181967-1""",,,1,,,,,,,,,,,,,,,,…,"""Route 206 and Province Line Ro…","""Princeton""","""USA""",,"""christine.horak@bms.com""",,"""Bristol-Myers Squibb""",,"""Christine,,Horak""",,"""NJ""",,"""08543""","""RMA background correction and …",22277,"""['Gene expression data from pr…","""Total RNA was extracted from R…","""GSM1030329""",,"""Labeled cRNA targets with grea…","""biotin""","""Biotinylated cRNA were prepare…","""Jan 01 2013""","""total RNA""","""Homo sapiens""","""GPL571""",,"""Affymetrix GeneChip Scanner 30…","""GSE41998""","""pre-treatment breast cancer tu…","""Public on Jan 01 2013""","""Nov 02 2012""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""Br Ca Pt sample #A7181967-1""",,"""RNA"""
"""GSM1186521""","""GSE48905""",,,,,"""HOMO_SAPIENS""","""Affymetrix Human Genome U133 P…",true,89,"""Affymetrix SCAN""","""v1.4.4""",,"""female""","""GEO""",,,,"""PP4_1_004A_U133p2_100ng.CEL""",,,1,,,,,,,,,,,,,,,,…,"""Venlighedsvej 1""","""Hrsholm""","""Denmark""","""lab""","""tj@medical-prognosis.com""",,"""MPI""","""lab""","""Thomas,,Jensen""","""22930837""",,,"""2970""","""R""",54675,"""PP4_1_004A_U133p2_100ng.CEL""","""Rneasy""","""GSM1186521""",,"""affy hyb wash stain""","""biotin""","""ambion messageamp""","""Jan 09 2014""","""total RNA""","""Homo sapiens""","""GPL570""",,"""affy standard""","""GSE48905""","""blood""","""Public on Jan 08 2014""","""Jul 16 2013""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""PP4_1_004A_U133p2_100ng.CEL""",,"""RNA"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""GSM540159""","""GSE21653""",57.0,,,,"""HOMO_SAPIENS""","""Affymetrix Human Genome U133 P…",true,73,"""Affymetrix SCAN""","""v1.3.7""",,,"""GEO""","""breast cancer tumor""",,,"""BC52""",,,1,,,,,,,,,,,,,,,,…,"""232 Bd Ste Marguerite""","""Marseille""","""France""","""Centre de cancrologie de Marse…","""finettip@ipc.unicancer.fnclcc.…",,"""Institut Paoli-Calmettes""","""Molecular Oncology""","""Pascal,,FINETTI""","""(33)+4 91 22 33 04""","""BdR""",,"""13009""","""RMA using a quantile normaliza…",54613,"""n/a""","""Guanidinium isothiocyanate and…","""GSM540159""",,"""Affymetrix standard protocol""","""biotin""","""Affymetrix standard protocol""","""Aug 28 2018""","""total RNA""","""Homo sapiens""","""GPL570""","""['Reanalyzed by: GSM781373', '…","""Affymetrix standard protocol""","""GSE21653""","""human breast cancer tumors""","""Public on May 05 2010""","""May 04 2010""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""BC52""",,"""RNA"""
"""GSM540272""","""GSE21653""",53.0,,,,"""HOMO_SAPIENS""","""Affymetrix Human Genome U133 P…",true,89,"""Affymetrix SCAN""","""v1.4.4""",,,"""GEO""","""breast cancer tumor""",,,"""BC165""",,,1,,,,,,,,,,,,,,,,…,"""232 Bd Ste Marguerite""","""Marseille""","""France""","""Centre de cancrologie de Marse…","""finettip@ipc.unicancer.fnclcc.…",,"""Institut Paoli-Calmettes""","""Molecular Oncology""","""Pascal,,FINETTI""","""(33)+4 91 22 33 04""","""BdR""",,"""13009""","""RMA using a quantile normaliza…",54613,"""n/a""","""Guanidinium isothiocyanate and…","""GSM540272""",,"""Affymetrix standard protocol""","""biotin""","""Affymetrix standard protocol""","""Aug 28 2018""","""total RNA""","""Homo sapiens""","""GPL570""","""['Reanalyzed by: GSM781486', '…","""Affymetrix standard protocol""","""GSE21653""","""human breast cancer tumors""","""Public on May 05 2010""","""May 04 2010""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""BC165""",,"""RNA"""
"""GSM615239""","""GSE25066""",,,,"""3""","""HOMO_SAPIENS""","""Affymetrix Human Genome U133A …",true,259,"""Affymetrix SCAN""","""v1.16.11-hotfix""",,,"""GEO""",,,,"""M283""",,,1,,,,,,,,,,,,,,,,…,"""400 West Cummings Park, Suite …","""Woburn""","""USA""",,"""christos@nuverabio.com""",,"""Nuvera Biosciences""",,"""Christos,,Hatzis""","""781-938-3830""","""MA""","""www.nuverabio.com""","""01801""","""Probe intensities were quantif…",22283,,"""Total RNA was extracted from t…","""GSM615239""",,"""Following fragmentation, 10 ug…","""biotin""","""A single-round T7 amplificatio…","""Oct 26 2011""","""total RNA""","""Homo sapiens""","""GPL96""",,"""GeneChips were scanned using t…","""['GSE25055', 'GSE25066']""","""breast cancer, sample MDACC-M2…","""Public on May 11 2011""","""Nov 01 2010""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""M283""","""Patients prospectively consent…","""RNA"""
"""GSM615806""","""GSE25065""",,,,"""na""","""HOMO_SAPIENS""","""Affymetrix Human Genome U133A …",true,328,"""Affymetrix SCAN""","""v1.23.4-hotfix""",,,"""GEO""","""breast cancer tumor""",,,"""PERU56""",,,1,,,,,,,,,,,,,,,,…,"""400 West Cummings Park, Suite …","""Woburn""","""USA""",,"""christos@nuverabio.com""",,"""Nuvera Biosciences""",,"""Christos,,Hatzis""","""781-938-3830""","""MA""","""www.nuverabio.com""","""01801""","""Probe intensities were quantif…",22283,,"""Total RNA was extracted from t…","""GSM615806""",,"""Following fragmentation, 10 ug…","""biotin""","""A single-round T7 amplificatio…","""Oct 26 2011""","""total RNA""","""Homo sapiens""","""GPL96""",,"""GeneChips were scanned using t…","""['GSE25065', 'GSE25066']""","""breast cancer, sample LBJ/IN/G…","""Public on May 11 2011""","""Nov 01 2010""","""ftp://ftp.ncbi.nlm.nih.gov/geo…","""9606""","""PERU56""","""Patients prospectively consent…","""RNA"""


In [None]:
subset['experiment_accession'].str.starts_with('GSE').sum(), len(subset)

(7639, 7639)

In [None]:
from stampformer.utils.paths import PathList
from stampformer.data.data_utils import load_zarr_group
import zarr

# PathList comes from the project's stampformer.utils.paths module; its
# `.data` attribute is used below to locate project data files.
path_dict = PathList()


# Observations table of the clinical data set (clin_obs.csv in clin_data_dir).
clin_obs = pd.read_csv(path_dict.data.clin_data_dir /'clin_obs.csv') 
clin_obs

Unnamed: 0,data_cancer_name,dataset_name,depth,drug_ids,drug_list,is_microarray,label,num_expressed,num_measured,primary_tissue,sample_id,tcga_subtype,drug_list_canonized
0,Breast cancer,CTR_Microarray_1-I,0.0,"[3874, 2882, 105, 6792, 2790, 0, 0, 0]","[""Doxorubicin"", ""Paclitaxel"", ""Cyclophosphamid...",True,0,19068,19068,Breast,GSM1233067,BRCA,"['DOXORUBICIN', 'PACLITAXEL', 'CYCLOPHOSPHAMID..."
1,Breast cancer,CTR_Microarray_1-I,0.0,"[3874, 2882, 105, 6792, 2790, 0, 0, 0]","[""Doxorubicin"", ""Paclitaxel"", ""Cyclophosphamid...",True,0,19068,19068,Breast,GSM1233069,BRCA,"['DOXORUBICIN', 'PACLITAXEL', 'CYCLOPHOSPHAMID..."
2,Breast cancer,CTR_Microarray_1-I,0.0,"[3874, 2882, 105, 6792, 2790, 0, 0, 0]","[""Doxorubicin"", ""Paclitaxel"", ""Cyclophosphamid...",True,0,19068,19068,Breast,GSM1233072,BRCA,"['DOXORUBICIN', 'PACLITAXEL', 'CYCLOPHOSPHAMID..."
3,Breast cancer,CTR_Microarray_1-I,0.0,"[3874, 2882, 105, 6792, 2790, 0, 0, 0]","[""Doxorubicin"", ""Paclitaxel"", ""Cyclophosphamid...",True,1,19068,19068,Breast,GSM1233085,BRCA,"['DOXORUBICIN', 'PACLITAXEL', 'CYCLOPHOSPHAMID..."
4,Breast cancer,CTR_Microarray_1-I,0.0,"[3874, 2882, 105, 6792, 2790, 0, 0, 0]","[""Doxorubicin"", ""Paclitaxel"", ""Cyclophosphamid...",True,0,19068,19068,Breast,GSM1233086,BRCA,"['DOXORUBICIN', 'PACLITAXEL', 'CYCLOPHOSPHAMID..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12375,,ipsy2_genexp,0.0,"[2882, 6863, 6801, 0, 0, 0, 0, 0]","[""Paclitaxel"", ""Pertuzumab"", ""Trastuzumab""]",True,1,7151,7151,,GSM5859964,BRCA,"['PACLITAXEL', 'PERTUZUMAB', 'TRASTUZUMAB']"
12376,,ipsy2_genexp,0.0,"[2882, 3690, 0, 0, 0, 0, 0, 0]","[""Paclitaxel"", ""Neratinib""]",True,0,7151,7151,,GSM5859965,BRCA,"['PACLITAXEL', 'NERATINIB']"
12377,,ipsy2_genexp,0.0,"[2882, 6804, 0, 0, 0, 0, 0, 0]","[""Paclitaxel"", ""Pembrolizumab""]",True,1,7151,7151,,GSM5859966,BRCA,"['PACLITAXEL', 'PEMBROLIZUMAB']"
12378,,ipsy2_genexp,0.0,"[2882, 6878, 3838, 0, 0, 0, 0, 0]","[""Paclitaxel"", ""ABT 888"", ""Carboplatin""]",True,1,7151,7151,,GSM5859967,BRCA,"['PACLITAXEL', 'ABT-888', 'CARBOPLATIN']"


In [None]:
# Sample ids on both sides, as pandas Series, so they can be compared:
# refine.bio accession codes of the subset vs. sample ids in clin_obs.
rb_ids = pd.Series(subset['refinebio_accession_code'])
clin_ids = clin_obs['sample_id']
# Show the first ten of each.
rb_ids[:10],clin_ids[:10]

(0    GSM1019241
 1    GSM1030127
 2    GSM1030241
 3    GSM1030329
 4    GSM1186521
 5    GSM1379817
 6    GSM1379857
 7    GSM1380021
 8    GSM1619397
 9    GSM1672485
 dtype: object,
 0    GSM1233067
 1    GSM1233069
 2    GSM1233072
 3    GSM1233085
 4    GSM1233086
 5    GSM1233098
 6    GSM1233102
 7    GSM1233103
 8    GSM1233107
 9    GSM1233108
 Name: sample_id, dtype: object)

In [None]:
# Free-text descriptive columns of interest.
cols = [
    'source_name_ch1',
    'title',
    'treatment_protocol_ch1',
    'refinebio_title',
    'refinebio_treatment',
    'refinebio_specimen_part',
]

In [None]:
# refine.bio columns of interest. The cell used to hold these names as bare,
# whitespace-separated text, which is a SyntaxError when run; they are now a
# list (the repeated 'refinebio_cell_line' entry is kept only once).
refinebio_cols = [
    'refinebio_cell_line',
    'refinebio_disease',
    'refinebio_disease_stage',
    'refinebio_organism',
    'refinebio_age',
    'refinebio_accession_code',
    'experiment_accession',
]

In [None]:
# Persist the filtered subset as CSV (converted to pandas for the write).
subset.to_pandas().to_csv(potential_clin_path, index=False)

In [None]:
potential_clin_path

PosixPath('/mnt/hdd/jesse_archive/stampformer_archive/refine_bio/HOMO_SAPIENS/potential_clin_data.csv')

In [None]:
# Count the subset's sample ids that do not appear in clin_obs.
# A single vectorized membership test replaces the per-sample scan of the
# whole clin_ids array, which did len(rb_ids) * len(clin_ids) comparisons.
not_in_clin = int((~rb_ids.isin(clin_ids)).sum())
len(rb_ids),not_in_clin

(7639, 5086)

### save all overlapping GSE data

In [None]:
# Experiment-level table; its 'series' column holds GSE accessions.
gse_data = pd.read_csv(path_dict.data.gse_data)
gse_data['series']

0         GSE24080
1        GSE100666
2        GSE100942
3        GSE101472
4        GSE101607
           ...    
53562     GSE93709
53563     GSE94819
53564     GSE97382
53565     GSE99623
53566     GSE99733
Name: series, Length: 53567, dtype: object

In [None]:
# Experiment accessions present in the subset (one entry per sample row).
accessions = subset["experiment_accession"].to_pandas()

# Keep only the experiments whose 'series' id occurs among those accessions,
# then write them next to the other outputs.
is_in_subset = gse_data["series"].isin(accessions)
gse_subset = gse_data.loc[is_in_subset]
gse_subset.to_csv(data_dir / 'experiment_data.csv', index=False)

## read entire dataset

In [None]:
# One-off eager read of the raw metadata TSV so Polars can infer a dtype per
# column; the resulting schema is saved to JSON in the next section.
# NOTE: this rebinds `data`, which earlier cells use for the parquet-loaded frame.
data = pl.read_csv(
    source=str(tsv_meta_path),        # was the undefined name `path`
    separator=sep,
    has_header=True,                  # first row holds the column names
    infer_schema_length=420000,       # number of rows scanned to infer dtypes
    # The long digit string is a characteristics_ch1_barcode value that cannot
    # be parsed as i64 (see the error quoted below); reading it as null lets
    # the read go through.
    null_values=["", "NA", "NaN", "null", "None", '5503934202250110435328'],
    ignore_errors=False,
    low_memory=True,
)

We manually worked around this error by adding the offending value to `null_values` in the `read_csv` call above:
```
ComputeError: could not parse 5503934202250110435328 as dtype i64 at column 'characteristics_ch1_barcode' (column number 1484) The current offset in the file is 5534434 bytes.

You might want to try: - increasing infer_schema_length (e.g. infer_schema_length=10000), - specifying correct dtype with the schema_overrides argument - setting ignore_errors to True, - adding 5503934202250110435328 to the null_values list.
```

## save the schema

In [None]:
# Write the schema of `data` to schema_path as JSON.
polars_schema_to_json(data, schema_path)

## stream to parquet

In [None]:
# Round-trip check: reload the schema from schema_path and peek at the first dtypes.
saved_schema  =json_schema_to_polars_dtypes(schema_path)
saved_schema.dtypes()[:4]

[String, String, Float64, String]

In [None]:
# Lazily scan the raw TSV with the saved schema (no inference), so it can be
# written to parquet by the sink in the next cell.
lazy_data = pl.scan_csv(
    str(tsv_meta_path),
    separator=sep,
    has_header=True,
    infer_schema=False,
    schema=saved_schema,                  # ← reuse schema
    # Keep this list identical to the one used when the schema was inferred.
    # Without the long digit string, the characteristics_ch1_barcode value that
    # cannot be parsed as i64 would raise the same ComputeError again.
    null_values=["", "NA", "NaN", "null", "None", "5503934202250110435328"],
    ignore_errors=False,
    low_memory=True,
)

In [None]:
# Execute the lazy scan and write the result to parquet_path
# (zstd level 4, row groups of 64,000 rows).
lazy_data.sink_parquet(
    parquet_path,
    compression="zstd",
    compression_level=4,
    row_group_size=64_000,
    #use_pyarrow=True,
)

In [None]:
# Show where the parquet file was written (`parquet_out` was never defined;
# the sink above writes to `parquet_path`).
parquet_path

PosixPath('/mnt/hdd/jesse_archive/stampformer_archive/refine_bio/HOMO_SAPIENS/metadata_HOMO_SAPIENS.parquet')