In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Stroke/GSE161533'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Expression data from esophageal squamous cell carcinoma patients"
!Series_summary	"we conducted microarray experiments of 28 stage I-III ESCC patients based on Affymetrix Gene Chip Human Genome U133 plus 2.0 Array, performed enrichment analysis of differentially expressed genes (DEGs) as well as gene set enrichment analysis of all valid genes. Moreover, we summarized the secreted protein-encoding DEGs as well as esophagus-specific DEGs, hoping to offer some hints for early diagnosis and target for more efficacious treatment for ESCC in near future."
!Series_overall_design	"In total, there were 84 paired  normal tissues, paratumor tissues, and tumor tissues from 28 ESCC patients were chosen to perform microarray analysis."
Sample Characteristics Dictionary:
{0: ['tissue: normal tissue', 'tissue: paratumor tissue', 'tissue: tumor tissue'], 1: ['Stage: IB', 'Stage: I', 'Stage: IA', 'Stage: IIA', 'Stage: IIB', 'Stage: II', 'Stage: IIIA', 'Stage: IIIB'

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = True
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

from utils.preprocess import save_cohort_info, geo_select_clinical_features, preview_df

# Assuming sample_characteristics has been provided as shown in the instructions
sample_characteristics = {
    0: ['tissue: normal tissue', 'tissue: paratumor tissue', 'tissue: tumor tissue'],
    1: ['Stage: IB', 'Stage: I', 'Stage: IA', 'Stage: IIA', 'Stage: IIB', 'Stage: II', 'Stage: IIIA', 'Stage: IIIB'],
    2: ['age: 56', 'age: 57', 'age: 51', 'age: 64', 'age: 54', 'age: 73', 'age: 61', 'age: 71', 'age: 65', 'age: 60', 'age: 69', 'age: 63', 'age: 67', 'age: 70', 'age: 53', 'age: 75', 'age: 74'],
    3: ['gender: Male', 'gender: Female'],
    4: ['smoking history: None', 'smoking history: 30 years', 'smoking history: 20 years', 'smoking history: 36 years', 'smoking history: 50 years', 'smoking history: 40 years'],
    5: ['drinking history: None', 'drinking history: Seldom', 'drinking history: 36 years', 'drinking history: 40 years', 'drinking history: 50 years'], 
    6: ['disease history: None', 'disease history: Hypertension', 'disease history: Breast cancer', 'disease history: Cerebral  infarction', 'disease history: Lymphoma', 'disease history: Hypertension, coronary heart disease, cerebral  infarction'], 
    7: ['family history of cancer: ESCC', 'family history of cancer: None', 'family history of cancer: lung cancer', 'family history of cancer: liver cancer', 'family history of cancer: none', 'family history of cancer: Colorectal cancer', 'family history of cancer: Gastric cancer', 'family history of cancer: cancer']
}

# Setting the appropriate keys for variable availability
trait_row = 6 if any('cerebral infarction' in val.lower() for val in sample_characteristics[6]) else None
age_row = 2 if len(set(sample_characteristics[2])) > 1 else None
gender_row = 3 if len(set(sample_characteristics[3])) > 1 else None

# Function to convert values for 'Stroke'
def convert_trait(value):
    value = value.split(':')[1].strip().lower()
    if 'cerebral infarction' in value:
        return 1
    elif 'none' in value:
        return 0
    return None

# Function to convert values for 'age'
def convert_age(value):
    try:
        return float(value.split(':')[1].strip())
    except:
        return None

# Function to convert values for 'gender'
def convert_gender(value):
    value = value.split(':')[1].strip().lower()
    if value == "male":
        return 1
    elif value == "female":
        return 0
    return None

# Save cohort information
save_cohort_info('GSE161533', './preprocessed/Stroke/cohort_info.json', is_gene_available, trait_row is not None)

# Assuming clinical_data has been previously defined and loaded
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Stroke', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Stroke/trait_data/GSE161533.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Stroke/cohort_info.json
