In [1]:

import sys
# Add the project checkout to the module search path; presumably this is what makes
# the `utils.preprocess` import in the next cell resolvable — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *

# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Bladder_Cancer/GSE201395'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes that select series-level background lines and per-sample lines
# from the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']

# Read the matrix file once, splitting it into background text and clinical data.
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical data.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information, each under its own heading line.
print(
    "Background Information:",
    background_info,
    "Sample Characteristics Dictionary:",
    sample_characteristics_dict,
    sep="\n",
)


Background Information:
!Series_title	"An in vitro study of the development of resistance to FGFR inhibition in two urothelial carcinoma cell lines"
!Series_summary	"Expression array data was used to compare parental FGFR3-TACC3 fusion-driven urothelial cell lines with their FGFR inhibitor-resistant derivatives."
!Series_summary	"In this dataset, we include RT112 and RT4 parental cells, RT112 cells acutely treated with PD173074 (24 h), RT112 and RT4 resistant derivatives cultured with drug and their resistant derivatives cultured for four to six passages out of drug."
!Series_overall_design	"11 cell lines were analysed on the Affymetrix HTA 2.0 platform: RT112 and RT4 parental cells (RT112 parental no PD; RT4 parental no PD), RT112 cells acutely treated with PD173074 (24 h) (RT112 parental PD), RT112 and RT4 resistant derivatives cultured with drug (RT112 R1 PD; RT112 R2 PD; RT112 R3 PD; RT4 R1 PD) and their resistant derivatives cultured for four to six passages out of drug (RT112 R1 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults; each one is overwritten below only when the dataset provides the information.
is_gene_available = False
trait_row = None
age_row = None
gender_row = None
convert_trait = None
convert_age = None
convert_gender = None

# --- Gene expression data availability ---
# The series design reports the Affymetrix HTA 2.0 platform, so the dataset is
# assumed to contain gene expression data.
is_gene_available = True

# --- Clinical variable availability ---
# `Bladder_Cancer`, `age`, and `gender` are not explicitly listed for this dataset,
# so all three row indices stay None.
# For the sake of completion, let's write conversion functions assuming we had found relevant keys

# Conversion functions
def convert_trait(value):
    """Placeholder trait converter for a dataset without trait data.

    Args:
        value: Ignored.

    Returns:
        Always None, because there is no trait information to convert.
    """
    return None

def convert_age(value):
    """Parse a numeric age out of a "key: value" characteristic string.

    The text between the first and second colon is stripped and converted
    with float().

    Args:
        value: A characteristic string such as "age: 63". Non-string inputs
            (for example None or a float from a missing cell) are tolerated.

    Returns:
        The age as a float, or None when the input is not a string, has no
        colon, or the field after the colon is not a number.
    """
    try:
        return float(value.split(':')[1].strip())
    except (AttributeError, TypeError, IndexError, ValueError):
        # AttributeError/TypeError: input is not a str (None, float, bytes, ...).
        # IndexError: no ':' separator. ValueError: field is not numeric.
        return None

def convert_gender(value):
    """Map a "key: value" gender characteristic string to a binary code.

    The text between the first and second colon is stripped and lower-cased
    before matching.

    Args:
        value: A characteristic string such as "gender: Male". Non-string
            inputs (for example None or a float from a missing cell) are
            tolerated.

    Returns:
        1 for 'male'/'m', 0 for 'female'/'f', and None for anything else,
        including input that is not a string or has no colon separator.
    """
    try:
        label = value.split(':')[1].strip().lower()
    except (AttributeError, TypeError, IndexError):
        # Malformed or missing field: return None, as convert_age does,
        # rather than raising.
        return None
    if label in ('male', 'm'):
        return 1
    if label in ('female', 'f'):
        return 0
    return None

# Save metadata for this cohort. The third argument is the gene-availability flag;
# the fourth is True only when a trait row was identified.
save_cohort_info(
    'GSE201395',
    './preprocessed/Bladder_Cancer/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)
