In [5]:
import numpy as np
import pandas as pd
import os
import torch, torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr

import random
import matplotlib.pyplot as plt

random.seed(1)

np.random.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x7fd503712990>

In [6]:
#import of age metadata
meta_dir='metadata/TCGA-ACC/'
metadata=pd.read_table(f'{meta_dir}nationwidechildrens.org_clinical_patient_acc.txt', header=1 )
metadata=metadata.drop([0,])
#print(metadata)
#print(metadata.columns)

metadata_age=metadata[['bcr_patient_uuid','age_at_initial_pathologic_diagnosis']]
metadata_age

Unnamed: 0,bcr_patient_uuid,age_at_initial_pathologic_diagnosis
1,B3164F7B-C826-4E08-9EE6-8FF96D29B913,58
2,8E7C2E31-D085-4B75-A970-162526DD07A0,44
3,DFD687BC-6E69-42F7-AF94-D17FC150D1A1,23
4,5F3E2974-F1DF-47A2-8A8A-29BB525EEEF6,23
5,802DBD0D-EF07-4C91-AB8D-1DD39532E947,30
...,...,...
88,42EF34CC-FFB4-432C-82BB-7AA56639FF51,42
89,83E7B9F8-04A4-440F-AAFC-7A8E9DDFF284,27
90,5EB5EE1F-ECFF-4427-B717-B91380C043AE,63
91,9A8EA017-0152-48B2-ACCB-D040DEB1C6B6,63


In [None]:
manifest_df = pd.read_csv('gdc_manifest.2025-08-29.112350.txt', sep='\t')

all_data = metadata_age
all_data = all_data.rename(columns={'bcr_patient_uuid': 'ID'})

cpg_columns = None
data_rows = {}

data_path = "/sybig/home/fim/Dokumente/Sysbio/All_models/cancerdata_test/data"


for index, row in manifest_df.iterrows():

    filename = row['filename']
    patient_uuid = row['id'].upper()
    file_path = os.path.join(data_path, filename)
    if os.path.exists(file_path) and patient_uuid:
        try:
            df = pd.read_csv(file_path, sep="\t", header=None, names=['CpG', 'Value'])

            if cpg_columns is None:
                cpg_columns = df['CpG'].tolist()
            values_series = pd.Series(df['Value'].values, index=df['CpG'])
            data_rows[patient_uuid] = values_series
        except pd.errors.EmptyDataError:
            print(f"Warning: Skipping empty file {filename}")
        except Exception as e:
            print(f"Error processing file {filename}: {e}")

# DataFrame from all samples (rows = patient UUIDs, columns = CpGs)
cpg_df = pd.DataFrame.from_dict(data_rows, orient='index')
cpg_df = cpg_df.reset_index().rename(columns={'index': 'ID'})

# Join cpg_df with all_data 
all_data = pd.merge(all_data, cpg_df, on='ID', how='inner')


print(all_data)

Empty DataFrame
Columns: [ID, age_at_initial_pathologic_diagnosis, cg00000029, cg00000108, cg00000109, cg00000165, cg00000236, cg00000289, cg00000292, cg00000321, cg00000363, cg00000622, cg00000658, cg00000714, cg00000721, cg00000734, cg00000769, cg00000807, cg00000884, cg00000905, cg00000924, cg00000948, cg00000957, cg00001099, cg00001245, cg00001249, cg00001261, cg00001269, cg00001349, cg00001364, cg00001446, cg00001510, cg00001534, cg00001582, cg00001583, cg00001593, cg00001594, cg00001687, cg00001747, cg00001791, cg00001793, cg00001809, cg00001854, cg00001874, cg00001930, cg00002028, cg00002033, cg00002080, cg00002116, cg00002145, cg00002190, cg00002224, cg00002236, cg00002406, cg00002426, cg00002449, cg00002464, cg00002473, cg00002490, cg00002531, cg00002591, cg00002593, cg00002597, cg00002646, cg00002660, cg00002719, cg00002749, cg00002769, cg00002808, cg00002809, cg00002810, cg00002837, cg00002930, cg00003014, cg00003091, cg00003173, cg00003181, cg00003187, cg00003202, cg0000328

In [21]:
cpg_df

Unnamed: 0,ID,cg00000029,cg00000108,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,...,rs7746156,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873
0,b2f26e1b-5db0-42c4-8801-ef4453a7cf1a,0.152337,0.964203,0.930357,0.129594,0.767029,0.769537,0.374575,0.719192,0.145885,...,,,,,,,,,,
1,ebc18477-528d-4cf2-b42d-3944b0aeb1e2,0.630360,0.957146,0.933978,0.123027,0.911722,0.676925,0.900762,0.272177,0.730346,...,,,,,,,,,,
2,0a203f6e-57ad-4c49-b948-46adde7cd495,0.874947,0.926756,0.945334,0.198684,0.949381,0.823840,0.893881,0.345828,0.605450,...,,,,,,,,,,
3,34719285-9c88-4a8c-92b4-f722fdbc71db,0.910817,0.968111,0.937723,0.202090,0.938981,0.914248,0.959579,0.862119,0.385371,...,,,,,,,,,,
4,9188d543-e731-471f-8374-1b81051bb93d,0.115733,0.963698,0.916952,0.903669,0.936143,0.778190,0.870168,0.797064,0.541181,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,ed65fc0a-eba4-4dd0-852a-5c4e7f77518a,0.288577,0.966652,0.914195,0.181959,0.916252,0.786245,0.856100,0.881900,0.901078,...,,,,,,,,,,
76,67eea0d6-1be5-4005-a552-5a3839ec1f74,0.140051,0.975727,0.952754,0.178048,0.897952,0.729423,0.888674,0.863389,0.241796,...,,,,,,,,,,
77,19c8f069-a9d1-4501-8c13-be2d8dc15801,0.552129,0.973272,0.948831,0.150518,0.910781,0.838605,0.854544,0.844312,0.860337,...,,,,,,,,,,
78,29d7a41c-b772-4534-b762-584820af4cbc,0.150332,0.971726,0.922560,0.217870,0.918043,,0.749394,0.499272,0.383193,...,,,,,,,,,,


In [9]:
CPG_PATH   = "/sybig/home/fim/Dokumente/Sysbio/Transformer/cpgs.npy"
cpgs           = np.load(CPG_PATH, allow_pickle=True)
list_cpgs = list(cpgs)
#split
all_data.sample(frac=1,random_state=1)

#cleaning up age data
all_data['age_at_initial_pathologic_diagnosis'] = pd.to_numeric(all_data['age_at_initial_pathologic_diagnosis'], errors='coerce')
all_data = all_data.dropna(subset=['age_at_initial_pathologic_diagnosis'])

#filtering by cpgs
columns_to_keep = list(['age_at_initial_pathologic_diagnosis', 'bcr_patient_uuid'])+list_cpgs
all_data_filtered = all_data.loc[:, all_data.columns.intersection(columns_to_keep)]

#keeping only features
meta_cols = ['age_at_initial_pathologic_diagnosis', 'bcr_patient_uuid']
feature_cols = cpgs
num_features = len(feature_cols)
print(f"Using {num_features} CpG features.")

#scaling by training data
scaler = RobustScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(all_data_filtered[feature_cols]),
    index=all_data_filtered.index,
    columns=feature_cols
)

print(data_scaled)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['age_at_initial_pathologic_diagnosis'] = pd.to_numeric(all_data['age_at_initial_pathologic_diagnosis'], errors='coerce')


Using 20318 CpG features.


  self.center_ = np.nanmedian(X, axis=0)
  return _nanquantile_unchecked(


     cg00000292  cg00002426  cg00003994  cg00007981  cg00008493  cg00008713  \
1           NaN         NaN         NaN         NaN         NaN         NaN   
3           NaN         NaN         NaN         NaN         NaN         NaN   
4           NaN         NaN         NaN         NaN         NaN         NaN   
5           NaN         NaN         NaN         NaN         NaN         NaN   
6           NaN         NaN         NaN         NaN         NaN         NaN   
..          ...         ...         ...         ...         ...         ...   
166         NaN         NaN         NaN         NaN         NaN         NaN   
167         NaN         NaN         NaN         NaN         NaN         NaN   
169         NaN         NaN         NaN         NaN         NaN         NaN   
170         NaN         NaN         NaN         NaN         NaN         NaN   
171         NaN         NaN         NaN         NaN         NaN         NaN   

     cg00009407  cg00011459  cg00012199  cg00012386

In [None]:
#important filepaths
CPG_PATH   = "/sybig/home/fim/Dokumente/Sysbio/Transformer/cpgs.npy"

#data loading
train_val_data = pd.read_pickle(TRAIN_PATH)
test_data      = pd.read_pickle(TEST_PATH)
cpgs           = np.load(CPG_PATH, allow_pickle=True)

#split
all_data=pd.concat([train_val_data,test_data])
all_data.sample(frac=1,random_state=1)
print(all_data.head)
train_val_data,test_data=train_test_split(all_data, test_size=splitB, random_state=1)

#cleaning up age data
train_val_data['age'] = pd.to_numeric(train_val_data['age'], errors='coerce')
train_val_data = train_val_data.dropna(subset=['age'])
test_data['age'] = pd.to_numeric(test_data['age'], errors='coerce')
test_data = test_data.dropna(subset=['age'])

#split 2, adjusted by tissue types for the transformer
try:
    train_df, val_df = train_test_split(
        train_val_data,
        test_size=splitB,
        stratify=train_val_data['tissue_type'],
        random_state=1
    )
    print("Stratified split successful.")
except ValueError:
    # If stratification fails due to small class sizes
    train_df, val_df = train_test_split(
        train_val_data,
        test_size=splitB,
        random_state=1
    )
    print("Random split used (too few samples per tissue).")


#keeping only features
meta_cols = ['age', 'tissue_type', 'tissue_idx', 'dataset', 'gender']
feature_cols = cpgs
num_features = len(feature_cols)
print(f"Using {num_features} CpG features.")

#scaling by training data
scaler = RobustScaler()
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_df[feature_cols]),
    index=train_df.index,
    columns=feature_cols
)
val_scaled = pd.DataFrame(
    scaler.transform(val_df[feature_cols]),
    index=val_df.index,
    columns=feature_cols
)
test_scaled = pd.DataFrame(
    scaler.transform(test_data[feature_cols]),
    index=test_data.index,
    columns=feature_cols
)

#get tissue mapping for embedding (transformer)
all_tissues = pd.concat([
    train_df['tissue_type'],
    val_df['tissue_type'],
    test_data['tissue_type']
]).unique()
tissue_mapping = {t: i for i, t in enumerate(all_tissues)}
num_tissues = len(tissue_mapping)
print("Total tissue types:", num_tissues)

def map_tissue(df):
    return df['tissue_type'].map(tissue_mapping)

train_df['tissue_idx'] = map_tissue(train_df)
val_df['tissue_idx']   = map_tissue(val_df)
test_data['tissue_idx'] = map_tissue(test_data)


#converting to tensors for transformer
def df_to_tensor(df_scaled, tissue_idx, ages):
    X = torch.tensor(df_scaled.values.astype(np.float32), dtype=torch.float32)
    t = torch.tensor(tissue_idx.values, dtype=torch.long)
    y = torch.tensor(ages.values, dtype=torch.float32)
    return X, t, y

X_train, tissue_train, y_train = df_to_tensor(train_scaled, train_df['tissue_idx'], train_df['age'])
X_val, tissue_val, y_val       = df_to_tensor(val_scaled,   val_df['tissue_idx'],   val_df['age'])
X_test, tissue_test, y_test    = df_to_tensor(test_scaled,  test_data['tissue_idx'], test_data['age'])

#converting to np arrays for ElasticNet
X_train_np, y_train_np = train_scaled.values, train_df['age'].values
X_val_np,   y_val_np   = val_scaled.values,   val_df['age'].values
X_test_np,  y_test_np  = test_scaled.values,  test_data['age'].values

print("Final shapes:")
print(" Train:", X_train.shape, y_train.shape)
print(" Val:  ", X_val.shape, y_val.shape)
print(" Test: ", X_test.shape, y_test.shape)
