<a href="https://colab.research.google.com/github/KVincent007/Predicting-age-from-the-transcriptome-of-human-dermal-fibroblasts/blob/master/GSE113957_capstone_parse_metadata%26merge2TPM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Build a lookup from each sample's title to its GSM accession by scanning
# the SOFT file: every "^SAMPLE" line opens a sample section, and the first
# "!Sample_title" line that follows supplies that sample's title.
title_to_gsm = {}
with open('/content/GSE113957_family.soft.txt') as soft_file:
    pending_gsm = None
    for raw_line in soft_file:
        record = raw_line.strip()
        if record.startswith("^SAMPLE"):
            pending_gsm = record.split(" = ", 1)[1]
            continue
        # Only a title line that follows a not-yet-titled sample is of interest.
        if not pending_gsm or not record.startswith("!Sample_title"):
            continue
        sample_title = record.split(" = ", 1)[1].strip()
        title_to_gsm[sample_title] = pending_gsm
        # Clear the accession so later title lines cannot reuse it.
        pending_gsm = None

# Load the TPM matrix; the first CSV column becomes the row index.
tpm = pd.read_csv("/content/GSE113957_norm_counts_TPM_GRCh38.p13_NCBI.csv", index_col=0)

# Replace sample titles with GSM IDs (using the title_to_gsm mapping).
# Columns that are already GSM IDs are left untouched by rename().
tpm.rename(columns=title_to_gsm, inplace=True)

# Drop any columns that couldn't be mapped. rename() keeps unmapped labels
# as they were (it never produces NaN), so a notna() test would keep every
# column; instead keep only columns whose label is a known GSM ID. Labels are
# compared after strip/upper so that whitespace or case differences do not
# cause a valid sample column to be discarded.
known_gsm_ids = {gsm_id.strip().upper() for gsm_id in title_to_gsm.values()}
is_mapped = tpm.columns.astype(str).str.strip().str.upper().isin(known_gsm_ids)
tpm = tpm.loc[:, is_mapped]

# Transpose: make samples rows, genes columns
tpm = tpm.T
tpm.index.name = 'sample_id'

# --- 2. Parse metadata from SOFT file ---
# Walk the SOFT file once and collect one (id, age, sex, disease) record per
# "^SAMPLE" section into four parallel lists. A field stays None when the
# sample section has no matching characteristics line.
sample_ids = []
ages = []
sexes = []
diseases = []

with open('/content/GSE113957_family.soft.txt', 'r') as f:
    current_id = None
    current_age = None
    current_sex = None
    current_disease = None

    for line in f:
        line = line.strip()
        if line.startswith("^SAMPLE"):
            # A new sample section begins: store the previous sample first.
            if current_id:
                sample_ids.append(current_id)
                ages.append(current_age)
                sexes.append(current_sex)
                diseases.append(current_disease)
            current_id = line.split(" = ", 1)[1]
            current_age = None
            current_sex = None
            current_disease = None
        elif line.startswith("!Sample_characteristics_ch1"):
            # Expected shape: "!Sample_characteristics_ch1 = <tag>: <value>".
            characteristic = line.partition(" = ")[2]
            tag, colon, value = characteristic.partition(":")
            if not colon:
                # No "<tag>: <value>" pair on this line; nothing to record.
                continue
            tag = tag.strip().lower()
            # Splitting at the FIRST colon keeps values that contain colons.
            value = value.strip()
            # Match on whole words of the tag only, so that tags such as
            # "passage" or values that merely contain "age"/"sex" are not
            # mistaken for the age or sex field.
            tag_words = "".join(ch if ch.isalnum() else " " for ch in tag).split()
            if "age" in tag_words:
                current_age = value
            elif "sex" in tag_words:
                current_sex = value
            elif "disease" in tag or "diagnosis" in tag:
                current_disease = value

    # Don't forget the last sample
    if current_id:
        sample_ids.append(current_id)
        ages.append(current_age)
        sexes.append(current_sex)
        diseases.append(current_disease)

# --- 3. Build metadata DataFrame ---
# One row per sample parsed from the SOFT file.
meta_df = pd.DataFrame({
    "sample_id": sample_ids,
    "Age": ages,
    "sex": sexes,
    "disease": diseases
})

# Convert age to numeric; missing or non-numeric ages become NaN.
meta_df["Age"] = pd.to_numeric(meta_df["Age"], errors='coerce')

# Normalize label case. na_action="ignore" leaves missing values missing;
# casting with astype(str) would turn None into the literal strings
# "None"/"NONE", which would then be saved as if they were real labels.
meta_df["sex"] = meta_df["sex"].map(str.capitalize, na_action="ignore")
meta_df["disease"] = meta_df["disease"].map(str.upper, na_action="ignore")

# Normalize sample IDs on both sides so the merge keys compare equal.
tpm.index = tpm.index.str.strip().str.upper()
meta_df['sample_id'] = meta_df['sample_id'].str.strip().str.upper()

# --- 4. Merge metadata with TPM ---
# Join on sample ID: the TPM frame carries it as its index, the metadata
# frame as the 'sample_id' column.
merged = tpm.merge(meta_df, left_index=True, right_on='sample_id')

# --- 5. Remove samples where disease == "HGPS" ---
not_hgps = merged["disease"] != "HGPS"
filtered = merged.loc[not_hgps].set_index('sample_id')

# Report the size of the result and preview the metadata columns.
metadata_columns = ["Age", "sex", "disease"]
print("Merged and filtered shape:", filtered.shape)
print(filtered[metadata_columns].head())

# Save the cleaned merged dataset to CSV
filtered.to_csv("/content/merged_TPM_with_metadata_filtered.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/content/GSE113957_family.soft.txt'

In [None]:
# Reload the saved merged table and show its first rows and its dimensions.
df = pd.read_csv("/content/merged_TPM_with_metadata_filtered.csv")
first_rows = df.head()  # or df.sample(5)
print(first_rows)
n_rows_n_cols = df.shape
print(n_rows_n_cols)

    sample_id  100287102  653635  102466751  107985730  100302278  645520  \
0  GSM3124560    0.59030   22.44      34.83        0.0        0.0  0.0000   
1  GSM3124561    0.09679   19.34      16.46        0.0        0.0  0.0000   
2  GSM3124562    0.83440   18.31      14.19        0.0        0.0  0.0000   
3  GSM3124563    0.09767   14.23      14.24        0.0        0.0  0.0000   
4  GSM3124564    0.24130   17.69      17.58        0.0        0.0  0.1764   

   79501  100996442  729737  ...   4568    4540    4541    4556    4519  \
0    0.0     0.5693  1.3230  ...  667.1  3062.0  2172.0  1038.0  3468.0   
1    0.0     1.3720  1.4900  ...  396.4  2958.0  2426.0  1191.0  3231.0   
2    0.0     0.6197  1.2340  ...  380.5  3567.0  2802.0  1464.0  3982.0   
3    0.0     0.5604  0.3242  ...  381.8  2215.0  1728.0   846.5  2202.0   
4    0.0     1.3850  0.9466  ...  401.4  4153.0  3712.0  1745.0  4281.0   

     4576   4571   age     sex  disease  
0   97.09  284.8   1.0    Male   NORMAL  
1 