In [1]:

import sys

# Extend the module search path with the local project checkout
# (presumably where the `utils` package imported by later cells lives — TODO confirm).
_PROJECT_ROOT = '/home/techt/Desktop/a4s'
sys.path.append(_PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Endometriosis/GSE111974'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes used to pick series-level background text and per-sample
# clinical annotations out of the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as the unique values found in each row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information so the next step can be decided by inspection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Endometrial Tissue RNA expression in Recurrent Implantation Failure vs. Conrol"
!Series_summary	"We aimed to identify altered biological processes in the endometrium that may be potential markers of receptive endometrium. RNA expression profiling of the endometrium during the window of implantation was performed in patients with Recurrent Implantation Failure (RIF) versus fertile controls."
!Series_overall_design	"24 patients with RIF treated at the IVF clinic and 24 fertile control patients recruited from the gynecology clinic of Istanbul University School of Medicine during 2014-2015 were involved in this prospective cohort study. RIF was determined as failure of pregnancy in ≥ 3 consecutive IVF cycles with ≥1 transfer(s) of good quality embryo in each cycle. Exclusion criteria for this group were active pelvic infections, undiagnosed vaginal bleeding, uterine anomalies, endometriosis, karyotype anomalies in one or both partners. Fertile control

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Availability flag for gene expression data; passed to save_cohort_info below.
is_gene_available = True
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Scan the dictionary actually extracted from the matrix file in the previous
# cell rather than a hand-written placeholder, so the recorded availability
# reflects the real data.
# NOTE(review): assumes the mapping is {row index: iterable of 'field: value'
# strings}, the same shape the former placeholder used — confirm against
# get_unique_values_by_row.
sample_char_dict = sample_characteristics_dict

# Check for trait, age, gender availability. Every unique value of a row is
# inspected (not only the first one), and the first matching row wins.
for key, values in sample_char_dict.items():
    entries = [str(v) for v in ([values] if isinstance(values, str) else values)]
    # Lower-cased text before the first ':' of each entry, e.g. 'age' for 'age: 34'.
    fields = {entry.split(':', 1)[0].strip().lower() for entry in entries}

    # Trait: any mention of the condition, in either the field name or the value.
    if trait_row is None and any('endometriosis' in entry.lower() for entry in entries):
        trait_row = key
    # Age: the field name must start with the word 'age' ('age', 'age (years)'),
    # which avoids accidental substring hits such as 'stage' or 'passage'.
    if age_row is None and any(field.split()[:1] == ['age'] for field in fields):
        age_row = key
    # Gender: accept either of the two common field names.
    if gender_row is None and fields & {'gender', 'sex'}:
        gender_row = key

def convert_trait(value):
    """Parse the integer trait code from a 'field: value' characteristics entry.

    Args:
        value: Entry such as 'endometriosis: 1'.

    Returns:
        The integer after the first ': ' separator, or None when the entry is
        not a string, has no separator, or its value is not an integer.

    NOTE(review): textual labels (e.g. 'yes'/'control') are not mapped and
    yield None — extend here if the cohort encodes the trait as text.
    """
    try:
        return int(value.split(': ')[1])
    # Only parsing failures are treated as "missing"; KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by a bare `except:`.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_age(value):
    """Parse the numeric age from a 'field: value' characteristics entry.

    Args:
        value: Entry such as 'age: 34'.

    Returns:
        The float after the first ': ' separator, or None when the entry is
        not a string, has no separator, or its value is not numeric.
    """
    try:
        return float(value.split(': ')[1])
    # Only parsing failures are treated as "missing"; KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by a bare `except:`.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Encode the gender from a 'field: value' characteristics entry.

    Args:
        value: Entry such as 'gender: Female'.

    Returns:
        1 for 'male', 0 for 'female' (case-insensitive, surrounding whitespace
        ignored), and None for any other label, for entries without a ': '
        separator, or for non-string input.
    """
    try:
        gender = value.split(': ')[1].strip().lower()
    # Malformed or missing entries are reported as None, matching the
    # behaviour of convert_trait and convert_age instead of raising.
    except (AttributeError, IndexError):
        return None
    if gender == 'male':
        return 1
    if gender == 'female':
        return 0
    return None

# Record this cohort's gene/trait availability in the shared cohort-info JSON.
_has_trait = trait_row is not None
save_cohort_info(
    'GSE111974',
    './preprocessed/Endometriosis/cohort_info.json',
    is_gene_available,
    _has_trait,
)

# Clinical features are only extracted when a trait row was identified.
if _has_trait:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data, 'Endometriosis',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Endometriosis/trait_data/GSE111974.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Endometriosis/cohort_info.json
