In [1]:

import sys

# Add the project checkout to the module search path; the following cell
# imports `utils.preprocess`, presumably from this directory.
sys.path.extend(['/home/techt/Desktop/a4s'])


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the SOFT file and the matrix file for this cohort directory
cohort_dir = '/media/techt/DATA/GEO/Cystic_Fibrosis/GSE139038'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Extract background information and sample characteristics from the matrix
#    file, selecting lines by the prefixes listed here
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the clinical dataframe as a dictionary of unique values per row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text and the characteristics dictionary so the
#    trait/age/gender rows can be chosen by inspection in the next step
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression profiling in paired normal, apparently normal and breast tumour tissues"
!Series_summary	"The main objective of the study was to identify potential diagnostic and follow up markers along with therapeutic targets for breast cancer. We performed gene expression studies using the microarray technology on 65 samples including 41 breast tumours [24 early stage, 17 locally advanced, 18 adjacent normal tissue [paired normal] and 6 apparently normal from breasts which had been operated for non-malignant conditions. All the samples had frozen section done – tumours needed to have 70% or more tumour cells; paired normal and apparently normal had to be morphologically normal with no tumour cells."
!Series_overall_design	"Two-dye experiments using Universal Control RNA (Stratagene) and RNA from tissues."
!Series_overall_design	"Biological replicates - Apparently normal = 6, Paired normal = 18, Breast tumor tissues = 41"
Sample Characteristics 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders: each stays None unless a function is defined for it below.
convert_trait, convert_age, convert_gender = None, None, None

# Gene expression availability: the series description mentions gene expression
# profiling using microarray technology.
is_gene_available = True

# Keys of the sample characteristics dictionary that hold each variable.
# None marks a variable that is not recorded for this cohort.
trait_row = None  # 'Cystic_Fibrosis' is not available in the described data
age_row = 0       # age is assigned to key 0 -- confirm against the printed dictionary
gender_row = 1    # gender is assigned to key 1 -- confirm against the printed dictionary

# Define conversion functions

def convert_age(value: str):
    """Parse a sample-characteristics cell such as ``'age: 45'`` into a number.

    Args:
        value: Raw cell text in ``'<label>: <value>'`` form. Non-string input
            (e.g. ``None`` or a float NaN from a missing cell) is tolerated.

    Returns:
        An ``int`` when the text after the first ``': '`` is a whole number,
        a ``float`` when it is a finite decimal (e.g. ``'age: 45.5'``), and
        ``None`` when the input is not a string, has no ``': '`` delimiter,
        or the value is not a finite number.
    """
    # Missing cells can arrive as None/NaN; calling .split on them would raise
    # AttributeError, which the original except clause did not cover.
    if not isinstance(value, str):
        return None

    parts = value.split(': ')
    if len(parts) < 2:
        return None
    raw = parts[1].strip()

    # Whole-number ages keep their int type, exactly as before.
    try:
        return int(raw)
    except ValueError:
        pass

    # Fall back to decimal ages instead of discarding them.
    try:
        age = float(raw)
    except ValueError:
        return None

    # float() accepts 'nan' and 'inf'; neither is a usable age.
    if age != age or age in (float('inf'), float('-inf')):
        return None
    return age

def convert_gender(value: str):
    """Parse a sample-characteristics cell such as ``'gender: male'`` into a binary code.

    Args:
        value: Raw cell text in ``'<label>: <value>'`` form. Non-string input
            (e.g. ``None`` or a float NaN from a missing cell) is tolerated.

    Returns:
        ``1`` for male, ``0`` for female (case-insensitive; ``'m'``/``'f'``
        are also accepted), and ``None`` when the input is not a string, has
        no ``': '`` delimiter, or the value is anything else.
    """
    # Missing cells can arrive as None/NaN; calling .split on them would raise
    # AttributeError, which the original except clause did not cover.
    if not isinstance(value, str):
        return None

    parts = value.split(': ')
    if len(parts) < 2:
        return None
    gender = parts[1].strip().lower()

    # Only recognised labels are coded. Unknown or empty values map to None
    # rather than being counted as female (0), which the original code did.
    codes = {'male': 1, 'm': 1, 'female': 0, 'f': 0}
    return codes.get(gender)

# 'Cystic_Fibrosis' is not recorded for this cohort, so no trait converter is defined.
convert_trait = None

# Save cohort information: cohort id, output JSON path, gene-data flag, and
# whether a trait row was identified (False here, since trait_row is None).
save_cohort_info(
    'GSE139038',
    './preprocessed/Cystic_Fibrosis/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)
