In [1]:

import sys
# Extend the module search path so the project-local imports in later cells
# can resolve (presumably the `utils` package lives under this directory —
# TODO confirm). NOTE(review): this is a machine-specific absolute path and
# must be adjusted when running the notebook elsewhere.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Stroke/GSE125771'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes passed to the matrix-file reader: the first list selects the
# series-level background records, the second the per-sample clinical records.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces of information, each preceded by its heading on its own line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"RNA expression data from calcified human carotid atherosclerotic plaques"
!Series_summary	"Although unstable atherosclerosis in the carotid bifurcation is a significant etiology behind ischemic stroke, clinical imaging methods to distinguish stable from vulnerable lesions are lacking and selection of patients for stroke-preventive intervention still relies on surrogate variables with moderate predictive power, such as the degree of luminal narrowing. Here we combined clinical and diagnostic imaging information by comuted tomography to select patients with calcified plaques for large scale molecular analysis, in an effort to increase our understanding of the pathophysiology behind carotid plaque instability as related to patient- and plaque- phenotype."
!Series_overall_design	"Patients undergoing surgery for high-grade (>50% NASCET) carotid stenosis at the Department of Vascular Surgery, Karolinska University Hospital, Stockholm, Sweden were consec

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Placeholders for the converter callables; the real implementations are
# defined further down in this cell.
convert_trait, convert_age, convert_gender = None, None, None

# Gene expression availability: the series summary describes RNA expression
# data, so it is likely available.
is_gene_available = True

# Row keys into the Sample Characteristics Dictionary for 'Stroke', 'age'
# and 'gender' (None means the variable is not recorded for this cohort).
trait_row = None   # stroke information is not explicitly available
age_row = 3        # age information is available
gender_row = 2     # gender information is available

# Define data type conversion functions
def convert_trait(value):
    """Placeholder trait converter: no stroke data is available for this cohort.

    Args:
        value: Ignored.

    Returns:
        Always None.
    """
    return None

def convert_age(value):
    """Convert a raw sample-characteristics entry (e.g. ``'age: 65'``) to a float.

    Args:
        value: Raw cell value. Expected to be a string of the form
            ``'<label>: <number>'``; a bare ``'<number>'`` is also accepted.

    Returns:
        The text after the first colon parsed as a float (or the whole string
        when there is no colon). None when ``value`` is not a string or the
        selected text is not numeric.
    """
    # Missing cells (None / NaN) are not strings and have no .split(); treat
    # them as unknown instead of raising AttributeError.
    if not isinstance(value, str):
        return None

    # Keep the original "second colon-separated field" semantics, but fall
    # back to the whole string when no label is present rather than raising
    # IndexError.
    fields = value.split(':')
    raw = fields[1] if len(fields) > 1 else fields[0]

    try:
        return float(raw.strip())
    except ValueError:
        return None

def convert_gender(value):
    """Convert a raw sample-characteristics entry (e.g. ``'Sex: Male'``) to a binary code.

    Args:
        value: Raw cell value. Expected to be a string of the form
            ``'<label>: <gender>'``; a bare ``'<gender>'`` is also accepted.

    Returns:
        1 for ``'male'``, 0 for ``'female'`` (case-insensitive, surrounding
        whitespace ignored). None for any other text or when ``value`` is not
        a string.
    """
    # Missing cells (None / NaN) are not strings and have no .split(); treat
    # them as unknown instead of raising AttributeError.
    if not isinstance(value, str):
        return None

    # Keep the original "second colon-separated field" semantics, but fall
    # back to the whole string when no label is present rather than raising
    # IndexError.
    fields = value.split(':')
    label = (fields[1] if len(fields) > 1 else fields[0]).strip().lower()

    # Unrecognised labels map to None via dict.get's default.
    return {'male': 1, 'female': 0}.get(label)

# Save this cohort's information; the final argument is True only when a
# trait row was identified above.
save_cohort_info(
    'GSE125771',
    './preprocessed/Stroke/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical feature extraction runs only when a trait row exists; otherwise
# nothing is selected, written, or printed.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Stroke',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Stroke/trait_data/GSE125771.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
