In [1]:

import os
import sys
# Extend the module search path with the project checkout, presumably so that
# `utils.preprocess` (imported in the next cell) can be resolved.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere without editing it.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
# NOTE(review): wildcard import -- every helper called in this notebook
# (geo_get_relevant_filepaths, get_background_and_clinical_data,
# get_unique_values_by_row, ...) is presumably provided by this module.
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): machine-specific absolute path to the GSE55852 cohort folder.
cohort_dir = '/media/techt/DATA/GEO/Thymoma/GSE55852'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefixes select which '!Series_*' and '!Sample_*' lines of the matrix file are kept.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# As the printed output shows, the result maps a row index to the list of
# distinct 'field: value' strings found in that row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
# These printouts are what the row choices in Step 2 are based on.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"GTF2I Mutations are Common in Thymic Epithelial Tumors"
!Series_summary	"Within a project aim to define the genomic aberration of thymic epithelial tumors, we performed array CGH in 65 thymic epithelial tumors. Tumor samples were collected during surgery or by image-guided biopsies and immediately frozen. Section from frozen material were cut and stained with Haematoxylin and Eosin. A pathologist reviewed the slides and selected only cases with >80% of cancer cells."
!Series_overall_design	"Copy number aberrations of a series of 65 thymic epithelial tumors were evaluated using array CGH. Differences in copy number aberrations between different histotypes were evaluated. Significant regions of CN aberrations were defined using GISTIC algorithms."
Sample Characteristics Dictionary:
{0: ['tumor: Thymic Carcinoma', 'tumor: Thymoma'], 1: ['Sex: M', 'Sex: F'], 2: ['who: TC', 'who: NEC', 'who: B3', 'who: B2', 'who: AB', 'who: A'], 3: ['Stage: IVB', 'gtf2

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
# The series summary and overall design describe array CGH copy-number profiling
# rather than expression profiling, so no gene expression data is expected.
is_gene_available = False

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# Row indices are the keys of the sample characteristics dictionary printed in Step 1.
# The values are stated directly: the previous `len(set([...])) > 1` guards compared
# hard-coded literals and were therefore always true.
# The converter functions are defined unconditionally further down in this cell,
# so no None placeholders are needed for them.

# Row 0 holds the tumor type with two distinct values
# ('tumor: Thymic Carcinoma', 'tumor: Thymoma'), so it serves as the binary trait.
trait_row = 0

# No age field was identified in the printed sample characteristics.
age_row = None

# Row 1 holds sex with two distinct values ('Sex: M', 'Sex: F').
gender_row = 1

# 2.3 Data Type Conversion
def convert_trait(value):
    """Map a tumor-type characteristic to a binary Thymoma label.

    Args:
        value: Raw characteristic cell such as 'tumor: Thymoma'. Only the text
            after the last ':' is compared, with surrounding whitespace removed.

    Returns:
        1 for 'Thymoma', 0 for 'Thymic Carcinoma', and None for anything else,
        including non-string input such as None or NaN.
    """
    # A missing cell may arrive as None/NaN instead of text; report it as
    # unknown rather than raising AttributeError on .split().
    if not isinstance(value, str):
        return None
    label = value.split(':')[-1].strip()
    return {'Thymoma': 1, 'Thymic Carcinoma': 0}.get(label)

def convert_age(value):
    """Placeholder age converter.

    No age row is selected for this cohort (age_row is None), so every input,
    whatever it is, maps to None.
    """
    return None

def convert_gender(value):
    """Map a sex characteristic to a binary code.

    Args:
        value: Raw characteristic cell such as 'Sex: M'. Only the text after
            the last ':' is compared, with surrounding whitespace removed.

    Returns:
        1 for 'M', 0 for 'F', and None for anything else, including
        non-string input such as None or NaN.
    """
    # A missing cell may arrive as None/NaN instead of text; report it as
    # unknown rather than raising AttributeError on .split().
    if not isinstance(value, str):
        return None
    code = value.split(':')[-1].strip()
    return {'M': 1, 'F': 0}.get(code)

# 3. Save Metadata
# Passes the cohort id, the metadata JSON path, and the two availability flags
# (gene data, trait data) to save_cohort_info.
save_cohort_info('GSE55852', './preprocessed/Thymoma/cohort_info.json', is_gene_available, trait_row is not None)

# 4. Clinical Feature Extraction
# Only attempted when a trait row was identified.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Thymoma', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Thymoma/trait_data/GSE55852.csv'
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing (no-op when it already does).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1347258': [0, 1], 'GSM1347259': [0, 1], 'GSM1347260': [0, 0], 'GSM1347261': [0, 1], 'GSM1347262': [1, 1], 'GSM1347263': [0, 1], 'GSM1347264': [1, 0], 'GSM1347265': [0, 0], 'GSM1347266': [1, 1], 'GSM1347267': [0, 0], 'GSM1347268': [0, 0], 'GSM1347269': [0, 1], 'GSM1347270': [1, 0], 'GSM1347271': [0, 0], 'GSM1347272': [0, 0], 'GSM1347273': [1, 1], 'GSM1347274': [1, 1], 'GSM1347275': [1, 1], 'GSM1347276': [1, 1], 'GSM1347277': [1, 1], 'GSM1347278': [1, 0], 'GSM1347279': [1, 0], 'GSM1347280': [1, 0], 'GSM1347281': [1, 0], 'GSM1347282': [1, 1], 'GSM1347283': [1, 0], 'GSM1347284': [0, 1], 'GSM1347285': [1, 0], 'GSM1347286': [1, 1], 'GSM1347287': [1, 1], 'GSM1347288': [1, 0], 'GSM1347289': [1, 1], 'GSM1347290': [1, 1], 'GSM1347291': [1, 0], 'GSM1347292': [1, 0], 'GSM1347293': [1, 1], 'GSM1347294': [1, 1], 'GSM1347295': [1, 0], 'GSM1347296': [1, 0], 'GSM1347297': [1, 0], 'GSM1347298': [0, 1], 'GSM1347299': [1, 0], 'GSM1347300': [1, 1], 'GSM1347301': [1, 0], 'GSM1347302': [1, 1], 'GSM13473