In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Testicular_Cancer/GSE28094'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"A DNA Methylation Fingerprint of  1,628 Human Samples"
!Series_summary	"DNA methylation is the best characterized of the different layers that make up the epigenetic setting. Most of the studies characterizing DNA methylation patterns have been restricted to particular genomic loci in a limited number of human samples and pathological conditions. The recently arrived single-base-resolution technologies for DNA methylation are extremely helpful tools, but are not yet applicable and affordable for studying large groups of subjects. Herein, we present a compromise between an extremely comprehensive study of a human sample population with an intermediate level of resolution of CpGs at the genomic level. We obtained a DNA methylation fingerprint of 1,628 human samples where we interrogated 1,505 CpG sites. The DNA methylation patterns revealed show this epigenetic mark to be critical in tissue-type definition and stemness, particularly around transcrip

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability
is_gene_available = False  # Since it's explicitly mentioned as DNA methylation data

# Determine the availability of variables
# Testicular_Cancer data availability
for key, values in {3: ['disease state: testis cancer', 'disease state: healthy', 'disease state: lung cancer'], 4: ["biomaterial: testis tumor tissue", "biomaterial: testis normal tissue"]}.items():
    if "disease state: testis cancer" in values or "biomaterial: testis tumor tissue" in values:
        trait_row = key

# Age data availability
for key, values in {1: ['age: N/A', 'age: 65', 'age: 79', 'age: 51']}.items():
    if any("age: " in v for v in values):
        age_row = key

# Gender data availability
for key, values in {0: ['gender: N/A', 'gender: male', 'gender: female']}.items():
    if any("gender: male" in v or "gender: female" in v for v in values):
        gender_row = key

# Define data type conversion functions
def convert_trait(value):
    return 1 if "testis cancer" in value.split(":")[1].strip() else 0

def convert_age(value):
    try:
        return float(value.split(":")[1].strip())
    except (ValueError, IndexError):
        return None

def convert_gender(value):
    gender = value.split(":")[1].strip().lower()
    if gender == "male":
        return 1
    elif gender == "female":
        return 0
    return None

# Save cohort metadata
save_cohort_info('GSE28094', './preprocessed/Testicular_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction if trait_row is not None
if trait_row is not None: 
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Testicular_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Testicular_Cancer/trait_data/GSE28094.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM694425': [0, None, None], 'GSM694426': [0, None, None], 'GSM694427': [0.0, 65.0, 1.0], 'GSM694428': [0, None, None], 'GSM694429': [0, None, None], 'GSM694430': [0.0, 79.0, 1.0], 'GSM694431': [0, None, None], 'GSM694432': [0, None, None], 'GSM694433': [0.0, 51.0, 1.0], 'GSM694434': [0, None, None], 'GSM694435': [0, None, None], 'GSM694436': [0.0, 69.0, 1.0], 'GSM694437': [0.0, 83.0, 1.0], 'GSM694438': [0.0, 72.0, 1.0], 'GSM694439': [0, None, None], 'GSM694440': [0, None, None], 'GSM694441': [0.0, 70.0, 1.0], 'GSM694442': [0, None, None], 'GSM694443': [0, None, None], 'GSM694444': [0.0, 78.0, 1.0], 'GSM694445': [0.0, 74.0, 1.0], 'GSM694446': [0, None, None], 'GSM694447': [0.0, 69.0, 1.0], 'GSM694448': [0, None, None], 'GSM694449': [0, None, None], 'GSM694450': [0, None, None], 'GSM694451': [0, None, None], 'GSM694452': [0, None, None], 'GSM694453': [0.0, 64.0, 1.0], 'GSM694454': [0, None, None], 'GSM694455': [0.0, 59.0, 0.0], 'GSM694456': [0.0, 0.1, 1.0], 'GSM694457': [0, None, None