In [1]:

import sys
# Extend the module search path with the project checkout; presumably this is what lets
# the `utils` package imported in the next cell resolve — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will not
# run unchanged on another machine.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Step 1 loads the GSE8157 cohort and prints its background text plus the
# per-row unique sample characteristics. All helpers come from the star import
# of `utils.preprocess`; their exact behaviour is not visible in this notebook.

# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): absolute, machine-specific data directory.
cohort_dir = '/media/techt/DATA/GEO/Polycystic_Ovary_Syndrome/GSE8157'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefixes select which '!Series_*' / '!Sample_*' lines of the matrix file are
# treated as background and as clinical data, respectively.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# Presumably maps each characteristics row index to its unique values — verify against the helper.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
# These printed values are what Step 2 is decided from, so keep them in the saved output.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Gene expression profiling in skeletal muscle of PCOS after pioglitazone therapy"
!Series_summary	"Insulin resistance is a common metabolic abnormality in women with PCOS and leads to an elevated risk of type 2 diabetes. Studies have shown that thiazolidinediones (TZD) improve metabolic disturbances in PCOS patients. We hypothesized that the effect of TZD in PCOS is in part mediated by changes in the transcriptional profile of muscle favoring insulin sensitivity. "
!Series_summary	"Using Affymetrix microarrays, we examined the effect of pioglitazone (30 mg/day for 16 weeks) on gene expression in skeletal muscle of 10 obese women with PCOS metabolically characterized by a euglycemic-hyperinsulinemic clamp. Moreover, we explored gene expression changes between these PCOS patients before treatment and 13 healthy control women. Treatment with pioglitazone improved insulin-stimulated total, oxidative and non-oxidative glucose metabolism, and reduced fas

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Placeholders for the clinical-feature configuration: row indices into the
# sample characteristics, and the converter callables for each feature.
# They stay None unless a matching characteristics row is identified.
trait_row, age_row, gender_row = None, None, None
convert_trait, convert_age, convert_gender = None, None, None

# Gene expression availability: the series summary describes an Affymetrix
# microarray study, i.e. gene expression measurements, so the flag is set.
is_gene_available = True

# Hand-copied sample characteristics (a single row holding the tissue type),
# assumed to be complete.
# NOTE(review): this rebinds the dictionary computed in Step 1 to a literal;
# confirm it matches the Step 1 output.
sample_characteristics_dict = {0: ["Vastus lateralis muscle"]}

# Define conversion functions based on typical values
def convert_trait(value):
    """Map a raw trait cell to a binary PCOS indicator.

    The text after the last ':' in ``value`` is stripped of surrounding
    whitespace and compared case-insensitively against the known labels.

    Args:
        value: Raw cell content, e.g. ``'disease state: PCOS'``.

    Returns:
        1 for 'PCOS', 0 for 'Control', and None for any other label or for a
        non-string cell (such as None or a float NaN).
    """
    if not isinstance(value, str):
        # Non-string cells have no .split(); treat them as missing instead of raising.
        return None
    label = value.split(':')[-1].strip().lower()
    return {'pcos': 1, 'control': 0}.get(label)

def convert_age(value):
    """Parse a raw age cell into a float.

    For string input, the text after the last ':' is stripped and parsed.
    Numeric input is converted directly.

    Args:
        value: Raw cell content, e.g. ``'age: 34'``, or an already numeric value.

    Returns:
        The age as a float, or None when the cell is missing, is not
        parseable as a number, or parses to NaN.
    """
    text = value.split(':')[-1].strip() if isinstance(value, str) else value
    try:
        age = float(text)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects; ValueError covers bad text.
        return None
    # NaN is the only float that is not equal to itself; report it as missing.
    if age != age:
        return None
    return age

def convert_gender(value):
    """Map a raw gender cell to a binary indicator.

    The text after the last ':' in ``value`` is stripped of surrounding
    whitespace and compared case-insensitively against the known labels.

    Args:
        value: Raw cell content, e.g. ``'gender: female'``.

    Returns:
        1 for 'male', 0 for 'female', and None for any other label or for a
        non-string cell (such as None or a float NaN).
    """
    if not isinstance(value, str):
        # Non-string cells have no .split(); treat them as missing instead of raising.
        return None
    label = value.split(':')[-1].strip().lower()
    # Exact-match lookup, so 'female' can never be mistaken for 'male'.
    return {'male': 1, 'female': 0}.get(label)

# The sample characteristics dictionary has a single row (the muscle tissue type),
# so it offers no row for the trait, age, or gender; all three indices stay None.

trait_row = age_row = gender_row = None

# Record this cohort in the shared JSON file. The last two arguments are the
# gene-availability flag and whether a trait row was found (False here).
# NOTE(review): relative output path — depends on the kernel's working directory.
save_cohort_info('GSE8157', './preprocessed/Polycystic_Ovary_Syndrome/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical feature extraction runs only when a trait row exists; with trait_row
# set to None above, this branch is skipped for this cohort.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Polycystic_Ovary_Syndrome', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    # NOTE(review): to_csv does not create missing directories — confirm 'trait_data/' exists.
    csv_path = './preprocessed/Polycystic_Ovary_Syndrome/trait_data/GSE8157.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Polycystic_Ovary_Syndrome/cohort_info.json
