In [1]:

import os
import sys
# Add the project checkout to the module search path; the `utils.preprocess`
# import in the next cell presumably resolves from here — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Arrhythmia/GSE93101'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes to pull out of the matrix file: series-level background text
# and the per-sample accession / characteristics rows.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# Build the row-index -> unique-values dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print the background text and the characteristics dictionary, each under its
# own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Molecular Prognosis of Cardiogenic Shock Patients under Extracorporeal Membrane Oxygenation"
!Series_summary	"Prognosis for cardiogenic shock patients under ECMO was our study goal. Success defined as survived more than 7 days after ECMO installation and failure died or had multiple organ failure in 7 days. Total 34 cases were enrolled, 17 success and 17 failure."
!Series_summary	"Peripheral blood mononuclear cells collected at ECMO installation were used analyzed."
!Series_overall_design	"Analysis of the cardiogenic shock patients at extracorporeal membrane oxygenation treatment by genome-wide expression and methylation. Transcriptomic profiling and DNA methylation between successful and failure groups were analyzed."
!Series_overall_design	"This submission represents the transcriptome data."
Sample Characteristics Dictionary:
{0: ['course: Acute myocarditis', 'course: Acute myocardial infarction', 'course: Dilated cardiomyopathy, DCMP', 'course:

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: a row index of None means "variable not available in this cohort";
# the converter names are placeholders until the functions are defined below.
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Step 1: Check if gene expression data is available
# The series description states that this submission is the transcriptome data.
is_gene_available = True

# Step 2.1: Data Availability
# Unique values per sample-characteristics row, keyed by row index.
sample_characteristics = {
    0: [
        'course: Acute myocarditis', 'course: Acute myocardial infarction',
        'course: Dilated cardiomyopathy, DCMP', 'course: Congestive heart failure',
        'course: Dilated cardiomyopathy', 'course: Arrhythmia', 'course: Aortic dissection',
    ],
    1: [
        'age: 33.4', 'age: 51.2', 'age: 51.9', 'age: 47.8', 'age: 41.5', 'age: 67.3',
        'age: 52.8', 'age: 16.1', 'age: 78.9', 'age: 53.2', 'age: 70.9', 'age: 59.9',
        'age: 21.9', 'age: 45.2', 'age: 52.4', 'age: 32.3', 'age: 55.8', 'age: 47',
        'age: 57.3', 'age: 31.7', 'age: 49.3', 'age: 66.1', 'age: 55.9', 'age: 49.1',
        'age: 63', 'age: 21', 'age: 53.6', 'age: 50.1', 'age: 37.4', 'age: 71.5',
    ],
    2: ['gender: F', 'gender: M'],
    3: ['outcome: Success', 'outcome: Failure', 'outcome: failure'],
}

# Arrhythmia variable: first row that contains an 'Arrhythmia' course entry.
trait_row = next(
    (
        row
        for row, entries in sample_characteristics.items()
        if any('course: Arrhythmia' in entry for entry in entries)
    ),
    None,
)

# Age variable: first row with 'age:' entries and more than one distinct value.
age_row = next(
    (
        row
        for row, entries in sample_characteristics.items()
        if len(set(entries)) > 1 and any('age:' in entry for entry in entries)
    ),
    None,
)

# Gender variable: first row with 'gender:' entries and more than one distinct value.
gender_row = next(
    (
        row
        for row, entries in sample_characteristics.items()
        if len(set(entries)) > 1 and any('gender:' in entry for entry in entries)
    ),
    None,
)
# Step 2.3: Data Type Conversion Functions
def convert_trait(value):
    """Convert a sample-characteristics cell into a binary Arrhythmia label.

    Args:
        value: Raw cell content, e.g. 'course: Arrhythmia'.

    Returns:
        1 if the cell contains 'course: Arrhythmia', 0 for any other string,
        and None when the cell is not a string (e.g. a missing/NaN cell), so
        that missing data is not silently counted as a negative case.
    """
    # A non-string cell would make the `in` test below raise TypeError.
    if not isinstance(value, str):
        return None
    if 'course: Arrhythmia' in value:
        return 1
    return 0

def convert_age(value):
    """Convert an 'age: <number>' sample-characteristics cell into a float.

    Args:
        value: Raw cell content, e.g. 'age: 33.4'.

    Returns:
        The age as a float, or None when the cell is not a string, has no
        ': ' separator, or the part after the separator is not numeric.
    """
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    # Without the separator there is no second field to parse.
    if len(parts) < 2:
        return None
    try:
        return float(parts[1])
    except ValueError:
        return None

def convert_gender(value):
    """Convert a 'gender: <label>' sample-characteristics cell into a binary code.

    Args:
        value: Raw cell content, e.g. 'gender: F'.

    Returns:
        0 for female ('F'/'female'), 1 for male ('M'/'male'), matched
        case-insensitively and ignoring surrounding whitespace; None when the
        cell is not a string, has no ': ' separator, or holds another label.
    """
    if not isinstance(value, str):
        return None
    parts = value.split(': ')
    # Without the separator there is no label field to inspect.
    if len(parts) < 2:
        return None
    label = parts[1].strip().upper()
    if label in ('F', 'FEMALE'):
        return 0
    if label in ('M', 'MALE'):
        return 1
    return None

# Step 3: Save Metadata
# Create the output directory up front so a fresh environment does not fail on
# a missing './preprocessed/Arrhythmia' tree (harmless if it already exists).
os.makedirs('./preprocessed/Arrhythmia', exist_ok=True)
save_cohort_info('GSE93101', './preprocessed/Arrhythmia/cohort_info.json', is_gene_available, trait_row is not None)

# Step 4: Clinical Feature Extraction
# Only run when a trait row was found in the sample characteristics.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Arrhythmia', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Arrhythmia/trait_data/GSE93101.csv'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2443799': [0.0, 33.4, 0.0], 'GSM2443800': [0.0, 51.2, 1.0], 'GSM2443801': [0.0, 51.9, 0.0], 'GSM2443802': [0.0, 47.8, 1.0], 'GSM2443803': [0.0, 41.5, 0.0], 'GSM2443804': [0.0, 67.3, 1.0], 'GSM2443805': [0.0, 52.8, 1.0], 'GSM2443806': [0.0, 16.1, 1.0], 'GSM2443807': [0.0, 78.9, 1.0], 'GSM2443808': [0.0, 53.2, 1.0], 'GSM2443809': [0.0, 70.9, 1.0], 'GSM2443810': [0.0, 59.9, 1.0], 'GSM2443811': [0.0, 21.9, 0.0], 'GSM2443812': [0.0, 45.2, 0.0], 'GSM2443813': [0.0, 52.4, 1.0], 'GSM2443814': [0.0, 32.3, 1.0], 'GSM2443815': [0.0, 52.8, 1.0], 'GSM2443816': [1.0, 55.8, 1.0], 'GSM2443817': [0.0, 47.0, 1.0], 'GSM2443818': [0.0, 55.8, 1.0], 'GSM2443819': [0.0, 57.3, 0.0], 'GSM2443820': [1.0, 31.7, 0.0], 'GSM2443821': [0.0, 49.3, 1.0], 'GSM2443822': [0.0, 66.1, 1.0], 'GSM2443823': [0.0, 55.9, 1.0], 'GSM2443824': [0.0, 49.1, 0.0], 'GSM2443825': [0.0, 63.0, 1.0], 'GSM2443826': [0.0, 21.0, 1.0], 'GSM2443827': [0.0, 53.6, 1.0], 'GSM2443828': [0.0, 50.1, 0.0], 'GSM2443829': [0.0, 37.4, 1.0], 'GSM244

### Step 3: Gene Data Extraction

In [4]:
# Read the expression matrix from the series-matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# Show the first 20 row identifiers; the next step inspects them to decide
# whether they need to be mapped to gene symbols.
first_row_ids = gene_data.index[:20]
print(first_row_ids)


Index(['ILMN_1343291', 'ILMN_1651209', 'ILMN_1651228', 'ILMN_1651229',
       'ILMN_1651235', 'ILMN_1651236', 'ILMN_1651237', 'ILMN_1651238',
       'ILMN_1651254', 'ILMN_1651260', 'ILMN_1651262', 'ILMN_1651268',
       'ILMN_1651278', 'ILMN_1651282', 'ILMN_1651285', 'ILMN_1651286',
       'ILMN_1651292', 'ILMN_1651303', 'ILMN_1651309', 'ILMN_1651315'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in the previous step are 'ILMN_...' probe identifiers
# rather than gene symbols, so they have to be mapped to genes.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# Extract the platform annotation table from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Print a heading, then a compact preview of the annotation columns so the
# identifier and gene-symbol columns can be picked in the next step.
print("Gene annotation preview:")
annotation_preview = preview_df(gene_annotation)
print(annotation_preview)


Gene annotation preview:
{'ID': ['ILMN_3166687', 'ILMN_3165566', 'ILMN_3164811', 'ILMN_3165363', 'ILMN_3166511'], 'Transcript': ['ILMN_333737', 'ILMN_333646', 'ILMN_333584', 'ILMN_333628', 'ILMN_333719'], 'Species': ['ILMN Controls', 'ILMN Controls', 'ILMN Controls', 'ILMN Controls', 'ILMN Controls'], 'Source': ['ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls'], 'Search_Key': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'ILMN_Gene': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'Source_Reference_ID': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'RefSeq_ID': [nan, nan, nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan, nan, nan], 'GI': [nan, nan, nan, nan, nan], 'Accession': ['DQ516750', 'DQ883654', 'DQ668364', 'DQ516785', 'DQ854995'], 'Symbol': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'Protein_Product': [nan, nan, nan, nan, nan], 'Array_Addres

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Choose the annotation columns used for the mapping: 'ID' carries 'ILMN_...'
#    identifiers like the expression index, and 'Symbol' is taken as the gene
#    symbol column (both appear in the annotation preview above).
identifier_key = 'ID'
gene_symbol_key = 'Symbol'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library
# NOTE(review): this rebinds `gene_data`, so re-running this cell on its own
# feeds already-mapped data back into the mapping; re-run Step 3 first.
gene_data = apply_gene_mapping(gene_data, mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Arrhythmia/gene_data/GSE93101.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before writing.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Arrhythmia')

# 4. Save the cohort information.
save_cohort_info('GSE93101', './preprocessed/Arrhythmia/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Arrhythmia/GSE93101.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Arrhythmia', the least common label is '1.0' with 2 occurrences. This represents 6.06% of the dataset.
The distribution of the feature 'Arrhythmia' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 45.2
  50% (Median): 52.4
  75%: 56.5
Min: 16.1
Max: 78.9
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 10 occurrences. This represents 30.30% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

