In [1]:

import sys
# Extend the module search path so that project-local packages can be imported
# (presumably this is where the `utils` package used by the next cell lives — TODO confirm).
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Endometriosis/GSE37837'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file: series-level lines give the background information,
#    sample-level lines give the clinical (sample characteristics) data
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information and the sample characteristics dictionary,
#    each preceded by its caption on a separate line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Genome-wide expression analysis of autologous eutopic and ectopic endometrium from fertile Indian women with endometriosis"
!Series_summary	"Whole genome expression analyses of autologous, paired eutopic and ectopic endometrial samples obtained during proliferative and secretory phases of menstrual cycles from eighteen (n=18) fertile women suffering from  confirmed stage 3 (moderate) and stage 4 (severe) ovarian endometriosis were performed using whole human genome oligo microarray Agilent paltform (Cat. No. G4112F)."
!Series_overall_design	"In the present study, genome-wide expression analysis of autologous, paired eutopic and ectopic endometrial samples obtained during proliferative (n=13) and secretory (n=5) phases of menstrual cycle from fertile women (n=18) suffering from moderate (stage 3; n=8) or severe (stage 4; n=10) endometrioma was performed by using Agilent single color oligo microarray platform (G4112, 4X44K). Thus eighteen (18) eutop

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
def row_if_informative(characteristics, row, keyword=None):
    """Return `row` if it can serve as a variable row, otherwise None.

    A row qualifies when all of the following hold:
      * `row` is a key of `characteristics` and its value list is non-empty;
      * `keyword` is None, or `keyword` occurs in the first entry of the list;
      * the list contains more than one distinct value (a constant row carries
        no information for an association analysis).

    Args:
        characteristics: dict mapping row index -> list of characteristic strings.
        row: row index to test.
        keyword: optional substring expected in the first entry of the row.

    Returns:
        `row` when the row qualifies, else None.
    """
    values = characteristics.get(row)
    if not values:
        return None
    if keyword is not None and keyword not in values[0]:
        return None
    return row if len(set(values)) > 1 else None


# Placeholders; the converter functions are defined further down in this cell.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

sample_characteristics = {
    0: ['age (y): 29', 'age (y): 40', 'age (y): 33', 'age (y): 45', 'age (y): 24', 'age (y): 38', 'age (y): 28', 'age (y): 25', 'age (y): 31', 'age (y): 37', 'age (y): 30', 'age (y): 34'],
    1: ['gender: female (fertile)'],
    2: ['tissue: Autologous_eutopic', 'tissue: Endometrioma_ectopic'],
    3: ['subject id: E17', 'subject id: E20', 'subject id: E23', 'subject id: E26', 'subject id: E31', 'subject id: E32', 'subject id: E33', 'subject id: E40', 'subject id: E43', 'subject id: E48', 'subject id: E49', 'subject id: E52', 'subject id: E56', 'subject id: E57', 'subject id: E68', 'subject id: E70', 'subject id: E73', 'subject id: E75'],
    4: ['menstrual phase: Proliferative', 'menstrual phase: Secretory'],
    5: ['endometrioma severity stage: Severe (stage 4)', 'endometrioma severity stage: Moderate (stage 3)'],
    6: ['parity: Pregnancy_1; live offspriing_1', 'parity: Pregnancy_6; live offspriing_6', 'parity: Pregnancy_3; live offspriing_3', 'parity: Pregnancy_3; live offspriing_2', 'parity: Pregnancy_2; live offspriing_1', 'parity: Pregnancy_4; live offspriing_2', 'parity: Pregnancy_2; live offspriing_2', 'parity: Pregnancy_4; live offspriing_4']
}

# 1. Gene Expression Data Availability
# Based on the background information and the platform used, it is likely to contain gene expression data.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion

# 2.1 Data Availability
# Every variable goes through the same rule (row present, expected keyword, more than
# one distinct value) instead of hard-coding the outcome per variable.

# For Endometriosis
# NOTE(review): row 5 is the severity stage (moderate vs severe). According to the series
# summary every subject has confirmed endometriosis, so the trait derived from this row
# distinguishes severity, not disease presence — confirm this proxy is intended.
trait_row = row_if_informative(sample_characteristics, 5)

# For Age
age_row = row_if_informative(sample_characteristics, 0, 'age (y)')

# For Gender (row 1 holds a single value here, so this evaluates to None)
gender_row = row_if_informative(sample_characteristics, 1, 'gender')

# 2.2 Data Type Conversion

# Function to convert the endometrioma severity stage into a binary trait value
def convert_trait(value):
    """Map a severity-stage characteristic string to a binary label.

    Args:
        value: raw characteristic, e.g.
            'endometrioma severity stage: Severe (stage 4)'.

    Returns:
        1 if the field following the first colon contains 'Severe',
        0 if it contains 'Moderate', otherwise None. None is also returned
        when `value` is not a string (e.g. a missing cell) or has no colon,
        mirroring the tolerant behaviour of `convert_age`.
    """
    try:
        value = value.split(':')[1].strip()
    except (AttributeError, IndexError):
        # AttributeError: non-string input; IndexError: no ':' separator.
        return None
    if 'Severe' in value:
        return 1
    elif 'Moderate' in value:
        return 0
    return None

# Function to convert age data
def convert_age(value):
    """Parse the age from a characteristic string such as 'age (y): 29'.

    Args:
        value: raw characteristic string.

    Returns:
        The field following the first colon as a float, or None when `value`
        is not a string (e.g. a missing cell), has no colon, or the field is
        not numeric.
    """
    try:
        value = value.split(':')[1].strip()
        return float(value)
    except (AttributeError, IndexError, ValueError):
        # AttributeError: non-string input; IndexError: no ':' separator;
        # ValueError: the field is not a number.
        return None

# Function to convert gender data
def convert_gender(value):
    """Map a gender characteristic string to a binary label.

    Args:
        value: raw characteristic, e.g. 'gender: female (fertile)'.

    Returns:
        0 if the field following the first colon starts with 'female',
        1 if it starts with 'male' (both case-insensitive), otherwise None.
        None is also returned when `value` is not a string or has no colon.
    """
    try:
        value = value.split(':')[1].strip().lower()
    except (AttributeError, IndexError):
        # AttributeError: non-string input; IndexError: no ':' separator.
        return None
    # Both labels use a prefix match so that annotated values such as
    # 'female (fertile)' or 'male (...)' are treated symmetrically.
    if value.startswith('female'):
        return 0
    elif value.startswith('male'):
        return 1
    return None

# 3. Save Metadata
# Pass the gene-data flag and the trait-availability flag for this cohort to save_cohort_info.
is_trait_available = trait_row is not None
save_cohort_info('GSE37837', './preprocessed/Endometriosis/cohort_info.json', is_gene_available, is_trait_available)

# 4. Clinical Feature Extraction
# Only attempted when a trait row was identified; the selected features are
# written to CSV and previewed.
if is_trait_available:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Endometriosis',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Endometriosis/trait_data/GSE37837.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM928779': [1.0, 29.0], 'GSM928780': [1.0, 29.0], 'GSM928781': [1.0, 40.0], 'GSM928782': [1.0, 40.0], 'GSM928783': [1.0, 33.0], 'GSM928784': [1.0, 33.0], 'GSM928785': [0.0, 45.0], 'GSM928786': [0.0, 45.0], 'GSM928787': [0.0, 24.0], 'GSM928788': [0.0, 24.0], 'GSM928789': [0.0, 38.0], 'GSM928790': [0.0, 38.0], 'GSM928791': [1.0, 28.0], 'GSM928792': [1.0, 28.0], 'GSM928793': [1.0, 25.0], 'GSM928794': [1.0, 25.0], 'GSM928795': [0.0, 40.0], 'GSM928796': [0.0, 40.0], 'GSM928797': [1.0, 31.0], 'GSM928798': [1.0, 31.0], 'GSM928799': [0.0, 37.0], 'GSM928800': [0.0, 37.0], 'GSM928801': [1.0, 30.0], 'GSM928802': [1.0, 30.0], 'GSM928803': [1.0, 30.0], 'GSM928804': [1.0, 30.0], 'GSM928805': [0.0, 37.0], 'GSM928806': [0.0, 37.0], 'GSM928807': [0.0, 31.0], 'GSM928808': [0.0, 31.0], 'GSM928809': [1.0, 34.0], 'GSM928810': [1.0, 34.0], 'GSM928811': [1.0, 25.0], 'GSM928812': [1.0, 25.0], 'GSM928813': [0.0, 40.0], 'GSM928814': [0.0, 40.0]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Load the gene data from the matrix file located in Step 1 via the library's get_genetic_data.
gene_data = get_genetic_data(matrix_file)

# 2. Show the first 20 row ids; the following step inspects them.
leading_row_ids = gene_data.index[:20]
print(leading_row_ids)


Index(['(+)E1A_r60_1', '(+)E1A_r60_3', '(+)E1A_r60_a104', '(+)E1A_r60_a107',
       '(+)E1A_r60_a135', '(+)E1A_r60_a20', '(+)E1A_r60_a22', '(+)E1A_r60_a97',
       '(+)E1A_r60_n11', '(+)E1A_r60_n9', '(+)eQC-39', '(+)eQC-40',
       '(+)eQC-41', '(+)eQC-42', '(-)3xSLv1', 'A_23_P100001', 'A_23_P100011',
       'A_23_P100022', 'A_23_P100056', 'A_23_P100074'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in the previous step (e.g. '(+)E1A_r60_1', 'A_23_P100001') are
# platform identifiers rather than gene symbols, so they need to be mapped to gene symbols.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Obtain the gene annotation data from the soft file with the library's get_gene_annotation.
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a captioned preview (via preview_df) so the identifier and symbol columns can be picked.
print("Gene annotation preview:", preview_df(gene_annotation), sep="\n")


Gene annotation preview:
{'ID': ['A_23_P100001', 'A_23_P100011', 'A_23_P100022', 'A_23_P100056', 'A_23_P100074'], 'SPOT_ID': ['A_23_P100001', 'A_23_P100011', 'A_23_P100022', 'A_23_P100056', 'A_23_P100074'], 'CONTROL_TYPE': ['FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE'], 'REFSEQ': ['NM_207446', 'NM_005829', 'NM_014848', 'NM_194272', 'NM_020371'], 'GB_ACC': ['NM_207446', 'NM_005829', 'NM_014848', 'NM_194272', 'NM_020371'], 'GENE': [400451.0, 10239.0, 9899.0, 348093.0, 57099.0], 'GENE_SYMBOL': ['FAM174B', 'AP3S2', 'SV2B', 'RBPMS2', 'AVEN'], 'GENE_NAME': ['family with sequence similarity 174, member B', 'adaptor-related protein complex 3, sigma 2 subunit', 'synaptic vesicle glycoprotein 2B', 'RNA binding protein with multiple splicing 2', 'apoptosis, caspase activation inhibitor'], 'UNIGENE_ID': ['Hs.27373', 'Hs.632161', 'Hs.21754', 'Hs.436518', 'Hs.555966'], 'ENSEMBL_ID': ['ENST00000557398', nan, 'ENST00000557410', 'ENST00000300069', 'ENST00000306730'], 'TIGR_ID': [nan, nan, nan, nan, nan]

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Annotation columns that store the gene identifiers and the gene symbols
identifier_key, gene_symbol_key = 'ID', 'GENE_SYMBOL'

# 2. Build the dataframe mapping probe IDs to genes with get_gene_mapping
mapping_df = get_gene_mapping(
    gene_annotation,
    prob_col=identifier_key,
    gene_col=gene_symbol_key,
)

# 3. Apply the mapping with apply_gene_mapping; the result replaces "gene_data".
# NOTE(review): gene_data is overwritten here, so re-running this cell would feed the
# already-mapped table back in — presumably Step 3 must be re-run first; confirm.
gene_data = apply_gene_mapping(gene_data, mapping_df)

# Preview the mapped gene_data to confirm
print("Gene Data Preview:", preview_df(gene_data), sep="\n")


Gene Data Preview:
{'GSM928779': [-0.1643, 1.95485, 3.8214, -0.1706, -0.2091], 'GSM928780': [0.0053, -0.01875, -0.0163, -0.0010500000000000002, 0.4378], 'GSM928781': [4.9247, 0.1353, 1.9029, -0.00065, -0.0391], 'GSM928782': [4.6111, -0.010250000000000002, -0.0078, 0.0075, -0.031], 'GSM928783': [-0.0669, 0.29350000000000004, -0.0885, 3.3375, 3.6198], 'GSM928784': [2.6271, 0.19235000000000002, 0.1948, 0.21005000000000001, 0.1715], 'GSM928785': [0.0, 0.6910499999999999, -0.0216, -0.00635, -0.0448], 'GSM928786': [0.1892, 0.16515, 0.1676, 0.18285, 0.1444], 'GSM928787': [0.0821, 1.9476499999999999, 7.0638, 0.0758, 2.4767], 'GSM928788': [0.0704, 2.9129, 5.2274, 2.7564, 2.4131], 'GSM928789': [3.0583, 2.8262, 1.0544, 1.86565, -0.0299], 'GSM928790': [-1.3279, -2.23745, -2.3902, -3.9145000000000003, 0.4964], 'GSM928791': [-0.6216, -0.6456500000000001, 2.0405, 0.53315, -0.6665], 'GSM928792': [-4.2139, -1.8730499999999999, -1.6899, -3.4855, -0.7708], 'GSM928793': [0.2164, 2.0675, 0.1948, 3.168, 0.1

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the gene symbols in the index of the gene data and write the result to CSV.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Endometriosis/gene_data/GSE37837.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and the genetic data.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Judge whether the trait or any demographic attribute is severely biased;
#    the second return value is the merged data after that step.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Endometriosis')

# 4. Save the cohort information (done regardless of the bias verdict).
save_cohort_info(
    'GSE37837',
    './preprocessed/Endometriosis/cohort_info.json',
    True,
    True,
    trait_biased,
    merged_data,
)

# 5. Save the merged data to a csv file only when the trait is not severely biased.
if not trait_biased:
    csv_path = './preprocessed/Endometriosis/GSE37837.csv'
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Endometriosis', the least common label is '0.0' with 16 occurrences. This represents 44.44% of the dataset.
The distribution of the feature 'Endometriosis' in this dataset is fine.

Quartiles for 'Age':
  25%: 29.0
  50% (Median): 32.0
  75%: 38.0
Min: 24.0
Max: 45.0
The distribution of the feature 'Age' in this dataset is fine.

