In [1]:

import os
import sys
# Extend the module search path with a local project directory; the
# `utils.preprocess` import in the next cell presumably resolves from here
# — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs unchanged on a machine that has this exact directory.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Resolve the SOFT file and the matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Bladder_Cancer/GSE203149'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes that select the series-level background lines and the
# per-sample characteristic lines when the matrix file is read.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a {row: unique values} dictionary.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print the background text and the characteristics dictionary, each under
# its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression data from muscle-invasive bladder cancer samples"
!Series_summary	"Gene signatures based on the median expression of a preselected set of genes can provide prognostic and treatment outcome prediction and so be valuable clinically."
!Series_summary	"Different health care services use different gene expression platforms to derive gene expression data. Here we have derived gene expression data using a microarray platform."
!Series_overall_design	"RNA extracted from FFPE blocks from patients with muscle-invasive bladder cancer and full transcriptome analysis on Clariom S microarray platform.  Sample blocks were collected for platform comparison and a heterogeneity gene signature study without any associated patient information."
Sample Characteristics Dictionary:
{0: ['disease: Muscle-invasive bladder cancer']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is treated as available, and no converter is defined,
# until the checks below decide otherwise.
is_gene_available = False
trait_row, age_row, gender_row = None, None, None
convert_trait, convert_age, convert_gender = None, None, None

# Step 1: gene expression data is marked as available for this cohort.
is_gene_available = True

# Step 2: the sample characteristics observed for this cohort, and the rows
# that carry each clinical variable.
sample_characteristics = {0: ['disease: Muscle-invasive bladder cancer']}

# Row 0 carries the disease label and is used as the 'Bladder_Cancer' trait.
# NOTE(review): row 0 holds a single unique value, so every sample receives
# the same trait label — confirm this cohort is usable for the trait.
trait_row = 0
# No row describes age or gender.
age_row = None
gender_row = None
# Step 3: Defining data type conversion functions
def convert_trait(value):
    """Map a sample-characteristics entry to the binary trait label.

    Parameters
    ----------
    value : str
        Raw characteristics cell, e.g.
        'disease: Muscle-invasive bladder cancer'.

    Returns
    -------
    int or None
        1 when the entry contains the muscle-invasive bladder cancer label.
        None for any other text, and for input that does not support the
        containment test (e.g. None or a float NaN from a missing cell).
    """
    try:
        is_case = 'disease: Muscle-invasive bladder cancer' in value
    except TypeError:
        # Missing cells arrive as None/NaN; treat them as "no label" instead
        # of letting the containment test raise.
        return None
    return 1 if is_case else None

def convert_age(value):
    """Parse the numeric age out of a 'key: value' characteristics entry.

    Parameters
    ----------
    value : str
        Raw characteristics cell, e.g. 'age: 67'. Only the text after the
        last ':' is parsed.

    Returns
    -------
    float or None
        The age as a float, or None when the input is not a string or the
        text after the last ':' is not a valid number.
    """
    try:
        return float(value.split(':')[-1].strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the cell is not a text string
        # (None, NaN, bytes). ValueError: the text is not numeric.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

def convert_gender(value):
    """Map a 'key: value' gender entry to a binary code.

    Parameters
    ----------
    value : str
        Raw characteristics cell, e.g. 'gender: Male'. Only the text after
        the last ':' is inspected, case-insensitively.

    Returns
    -------
    int or None
        1 for 'male'/'m', 0 for 'female'/'f', and None for any other text
        or for non-string input (e.g. None or NaN from a missing cell).
    """
    try:
        gender_str = value.split(':')[-1].strip().lower()
    except (AttributeError, TypeError):
        # Non-text cells cannot be parsed; report them as missing, matching
        # how convert_age treats unparseable input.
        return None

    gender_codes = {'male': 1, 'm': 1, 'female': 0, 'f': 0}
    return gender_codes.get(gender_str)

# Step 4: Save cohort information
# Create the output directory first so a fresh checkout (no ./preprocessed
# tree yet) does not fail when the JSON file is written.
cohort_info_path = './preprocessed/Bladder_Cancer/cohort_info.json'
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE203149', cohort_info_path, is_gene_available, trait_row is not None)

# Step 5: Clinical Feature Extraction
# Only runs when a trait row was identified; otherwise there is nothing to
# extract or save.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Bladder_Cancer',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Bladder_Cancer/trait_data/GSE203149.csv'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM6160439': [1], 'GSM6160440': [1], 'GSM6160441': [1], 'GSM6160442': [1], 'GSM6160443': [1], 'GSM6160444': [1], 'GSM6160445': [1], 'GSM6160446': [1], 'GSM6160447': [1], 'GSM6160448': [1], 'GSM6160449': [1], 'GSM6160450': [1], 'GSM6160451': [1], 'GSM6160452': [1], 'GSM6160453': [1], 'GSM6160454': [1], 'GSM6160455': [1], 'GSM6160456': [1], 'GSM6160457': [1], 'GSM6160458': [1], 'GSM6160459': [1], 'GSM6160460': [1], 'GSM6160461': [1], 'GSM6160462': [1], 'GSM6160463': [1], 'GSM6160464': [1], 'GSM6160465': [1], 'GSM6160466': [1], 'GSM6160467': [1], 'GSM6160468': [1], 'GSM6160469': [1], 'GSM6160470': [1], 'GSM6160471': [1], 'GSM6160472': [1], 'GSM6160473': [1], 'GSM6160474': [1], 'GSM6160475': [1], 'GSM6160476': [1], 'GSM6160477': [1], 'GSM6160478': [1], 'GSM6160479': [1], 'GSM6160480': [1], 'GSM6160481': [1], 'GSM6160482': [1], 'GSM6160483': [1], 'GSM6160484': [1], 'GSM6160485': [1], 'GSM6160486': [1], 'GSM6160487': [1], 'GSM6160488': [1], 'GSM6160489': [1], 'GSM6160490': [1], 'GSM6160491

### Step 3: Gene Data Extraction

In [4]:
# Load the gene expression data from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers; the next step inspects them to decide
# what kind of identifier the rows use.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['AFFX-BkGr-GC03_st', 'AFFX-BkGr-GC04_st', 'AFFX-BkGr-GC05_st',
       'AFFX-BkGr-GC06_st', 'AFFX-BkGr-GC07_st', 'AFFX-BkGr-GC08_st',
       'AFFX-BkGr-GC09_st', 'AFFX-BkGr-GC10_st', 'AFFX-BkGr-GC11_st',
       'AFFX-BkGr-GC12_st', 'AFFX-BkGr-GC13_st', 'AFFX-BkGr-GC14_st',
       'AFFX-BkGr-GC15_st', 'AFFX-BkGr-GC16_st', 'AFFX-BkGr-GC17_st',
       'AFFX-BkGr-GC18_st', 'AFFX-BkGr-GC19_st', 'AFFX-BkGr-GC20_st',
       'AFFX-BkGr-GC21_st', 'AFFX-BkGr-GC22_st'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in the previous step (e.g. 'AFFX-BkGr-GC03_st') look
# like platform probe-set identifiers rather than gene symbols, so the rows
# are flagged as needing a mapping to gene symbols.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the gene annotation table from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Print a heading, then a compact preview of the annotation columns so the
# identifier and gene-description columns can be picked in the next step.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['TC0100006437.hg.1', 'TC0100006476.hg.1', 'TC0100006479.hg.1', 'TC0100006480.hg.1', 'TC0100006483.hg.1'], 'probeset_id': ['TC0100006437.hg.1', 'TC0100006476.hg.1', 'TC0100006479.hg.1', 'TC0100006480.hg.1', 'TC0100006483.hg.1'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['69091', '924880', '960587', '966497', '1001138'], 'stop': ['70008', '944581', '965719', '975865', '1014541'], 'total_probes': [10.0, 10.0, 10.0, 10.0, 10.0], 'category': ['main', 'main', 'main', 'main', 'main'], 'SPOT_ID': ['Coding', 'Multiple_Complex', 'Multiple_Complex', 'Multiple_Complex', 'Multiple_Complex'], 'SPOT_ID.1': ['NM_001005484 // RefSeq // Homo sapiens olfactory receptor, family 4, subfamily F, member 5 (OR4F5), mRNA. // chr1 // 100 // 100 // 0 // --- // 0 /// ENST00000335137 // ENSEMBL // olfactory receptor, family 4, subfamily F, member 5 [gene_biotype:protein_coding transcript_biotype:protein_coding] // chr1 // 10

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Annotation columns used for the mapping: the probe identifier column and
#    the column holding the gene description text.
identifier_key = 'probeset_id'
gene_symbol_key = 'SPOT_ID.1'

# 2. Build the two-column mapping table: keep only the two columns, rename
#    them to 'ID' and 'Gene', and drop rows with a missing value in either.
column_renames = {identifier_key: 'ID', gene_symbol_key: 'Gene'}
mapping_df = gene_annotation[[identifier_key, gene_symbol_key]]
mapping_df = mapping_df.rename(columns=column_renames)
mapping_df = mapping_df.dropna()

# Quick visual check of the first mapping rows.
print("Mapping DataFrame Head:")
print(mapping_df.head())

# 3. Apply the mapping to the expression data.
# NOTE(review): this rebinds `gene_data`, so re-running this cell without
# first re-running the gene data extraction cell feeds already-mapped data
# back into the mapping.
gene_data = apply_gene_mapping(gene_data, mapping_df)


Mapping DataFrame Head:
                  ID                                               Gene
0  TC0100006437.hg.1  NM_001005484 // RefSeq // Homo sapiens olfacto...
1  TC0100006476.hg.1  NM_152486 // RefSeq // Homo sapiens sterile al...
2  TC0100006479.hg.1  NM_198317 // RefSeq // Homo sapiens kelch-like...
3  TC0100006480.hg.1  NM_001160184 // RefSeq // Homo sapiens pleckst...
4  TC0100006483.hg.1  NM_005101 // RefSeq // Homo sapiens ISG15 ubiq...
