In [1]:

import sys

# Add the project checkout to the import search path (presumably where the
# `utils` package imported in the next cell lives -- confirm).
# The membership test keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path on every execution.
_A4S_ROOT = '/home/techt/Desktop/a4s'
if _A4S_ROOT not in sys.path:
    sys.path.append(_A4S_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# --- Locate the cohort's SOFT and series-matrix files ---
cohort_dir = '/media/techt/DATA/GEO/Telomere_Length/GSE80435'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# --- Read background text and the clinical table from the matrix file ---
# The two prefix lists are handed to the helper unchanged; presumably they
# select which matrix-file lines feed each result (helper not shown here).
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# --- Summarise the clinical table as a dictionary of unique values per row ---
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# --- Display both results, each under its own label line ---
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Whole genome landscapes of major melanoma subtypes"
!Series_summary	"Cutaneous, acral and mucosal subtypes of melanoma were evaluated by whole-genome sequencing, revealing genes affected by novel recurrent mutations to the promoter (TERT, DPH3, OXNAD1, RPL13A, RALY, RPL18A, AP2A1), 5’-UTR (HNRNPUL1, CCDC77, PES1), and 3’-UTR (DYNAP, CHIT1, FUT9, CCDC141, CDH9, PTPRT) regions. TERT promoter mutations had the highest frequency of any mutation, but neither they nor ATRX mutations, associated with the alternative telomere lengthening mechanism, were correlated with greater telomere length. Genomic landscapes largely reflected ultraviolet radiation mutagenesis in cutaneous melanoma and provided novel insights into melanoma pathogenesis. In contrast, acral and mucosal melanomas exhibited predominantly structural changes, and mutation signatures of unknown aetiology not previously identified in melanoma. The majority of melanomas had potentially actionab

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability, judged from the Background Information above.
is_gene_available = True

# Row keys in the sample characteristics dictionary for each clinical variable.
# The dictionary holds a single 'region' field:
# {0: ['region: Lymph nodes- Inguinal', 'region: Lymph nodes- Axilla', 'region: Lymph nodes- Groin']}
# so nothing is available for 'Telomere_Length', 'age', or 'gender' and every
# row key stays None.
trait_row, age_row, gender_row = None, None, None

# Converter placeholders; replaced by the function definitions that follow.
convert_trait, convert_age, convert_gender = None, None, None

def extract_value(header_value):
    """Return the value part of a 'header: value' string.

    Args:
        header_value: Raw entry expected to look like 'age: 45'. Anything
            that is not a string (None, a NaN float standing in for a
            missing cell, a bare number, ...) is treated as missing.

    Returns:
        The text after the first ':' with surrounding whitespace stripped,
        or None when the input is not a string or contains no ':'.
    """
    # Guard on type rather than truthiness: a truthy non-string such as
    # float('nan') would otherwise raise TypeError on the ':' membership test.
    if not isinstance(header_value, str):
        return None
    # partition() splits on the first ':' only, so values that themselves
    # contain ':' are returned intact.
    _, separator, value = header_value.partition(':')
    return value.strip() if separator else None

def convert_trait(raw_value):
    """Parse a Telomere_Length entry into a float (continuous trait).

    The value part of the 'header: value' entry is taken via extract_value
    and passed to float(). Returns None when there is no value part or it
    cannot be parsed as a float.
    """
    text = extract_value(raw_value)
    try:
        measurement = float(text)
    except (TypeError, ValueError):
        # TypeError: no value part (None); ValueError: non-numeric text.
        return None
    return measurement

def convert_age(raw_value):
    """Convert an age entry to a number (continuous).

    Args:
        raw_value: Raw 'header: value' entry, e.g. 'age: 45' or 'age: 45.5'.

    Returns:
        An int when the value part is a whole number, a float when it is a
        finite decimal, or None when the value is missing, non-numeric, or
        not finite ('nan', 'inf').
    """
    import math  # local import: only needed for the finiteness check below

    value = extract_value(raw_value)
    if value is None:
        return None
    try:
        # Whole-number ages keep their original int result.
        return int(value)
    except (TypeError, ValueError):
        pass
    try:
        # int() rejects decimal strings such as '45.5'; previously those ages
        # were silently dropped, so fall back to float parsing.
        age = float(value)
    except (TypeError, ValueError):
        return None
    # float() accepts 'nan' and 'inf'; neither is a usable age.
    return age if math.isfinite(age) else None

def convert_gender(raw_value):
    """Encode a gender entry as a binary code.

    The value part of the 'header: value' entry is matched case-insensitively:
    'female' maps to 0 and 'male' maps to 1. Returns None when there is no
    value part or it is any other text.
    """
    label = extract_value(raw_value)
    if label is None:
        return None
    gender_codes = {'female': 0, 'male': 1}
    # dict.get yields None for any label outside the two recognised ones.
    return gender_codes.get(label.lower())

# Hand this cohort's availability flags to save_cohort_info (presumably it
# records them in the JSON file named here -- helper not shown, confirm).
# The last argument is True only when a trait row was identified; trait_row
# is still None in the cells shown, so False is passed.
# NOTE(review): the JSON path is relative to the current working directory --
# confirm the notebook is run from the intended directory.
save_cohort_info('GSE80435', './preprocessed/Telomere_Length/cohort_info.json', is_gene_available, trait_row is not None)
