In [1]:

import sys
# Extend the module search path with a project checkout on this machine.
# NOTE(review): presumably this is what makes `utils.preprocess` importable in the
# next cell -- confirm. The absolute path is machine-specific; a configurable
# project root would make the notebook portable.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the soft file and the matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Mitochondrial_Disorders/GSE17078'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Prefixes handed to the matrix-file reader: one list for the series-level
# background fields, one for the per-sample clinical fields.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both results, each on the line after its heading.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Cell Adhesion Molecule 1 (CADM1): A Novel Risk Factor for Venous Thrombosis"
!Series_summary	"Protein C (PC) deficiency increases the risk of venous thrombosis (VT) among members of Kindred Vermont II, but fails to fully account for the inheritance pattern. A genome scan of the pedigree supported the presence of a prothrombotic gene on chromosome 11q23 with weaker support on chromosomes 10p12 and 18p11.2-q11."
!Series_summary	"Preliminary data from Affimetrix microarray expression analysis of Blood Outgrowth Endothelial Cells of 3 members of Kindred Vermont II compared to a well established normal control group indicated that IgsF4 was decreased in patients versus controls. In addition, both statistical and pathway analysis results suggested that these genes are associated protein C.  Further studies indicated that Cell Adhesion Molecule 1 (CADM1), a member of the IgsF4 superfamily, may be associated with VT."
!Series_overall_design	"We obtained B

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converter placeholders; each name is rebound to a real function further down in this cell.
convert_trait = None
convert_age = None
convert_gender = None

# Gene expression data is treated as available for this series (set from the context).
is_gene_available = True

# Rows of the sample characteristics dictionary holding each variable
# (None means the variable is not recorded).
trait_row = None  # no direct information about Mitochondrial_Disorders
age_row = 3  # age information is in row 3
gender_row = 4  # gender information is in row 4

# Define conversion functions
def convert_trait(value):
    """Placeholder trait converter for a cohort with no Mitochondrial_Disorders data.

    Args:
        value: A sample characteristics entry; it is ignored.

    Returns:
        Always None.
    """
    return None

def convert_age(value):
    """Parse the age from a 'key: value' sample characteristics entry.

    Args:
        value: A string such as 'age: 45'. The text after the first ': '
            separator is converted to a float.

    Returns:
        The age as a float, or None when the entry is not a string, has no
        ': ' separator, or its value part is not numeric.
    """
    try:
        return float(value.split(': ')[1])
    # Catch only the failures parsing can produce, so KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by a bare `except:`.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Encode the gender from a 'key: value' sample characteristics entry.

    Args:
        value: A string such as 'gender: F'. The text after the first ': '
            separator is stripped and compared case-insensitively.

    Returns:
        0 for female ('f' or 'female'), 1 for male ('m' or 'male'), and None
        for any other label or for an entry that is not a string or has no
        ': ' separator.
    """
    try:
        gender = value.split(': ')[1].strip().lower()
    # A malformed or missing cell yields None, matching convert_age, instead
    # of aborting the extraction with an IndexError or AttributeError.
    except (AttributeError, IndexError, TypeError):
        return None
    if gender in ('f', 'female'):
        return 0
    if gender in ('m', 'male'):
        return 1
    return None

# Record this cohort's availability flags; the trait flag is derived from
# whether a trait row was identified.
save_cohort_info(
    'GSE17078',
    './preprocessed/Mitochondrial_Disorders/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Select, save, and preview the clinical features only when a trait row exists.
# trait_row is None for this cohort, so this branch is skipped here.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Mitochondrial_Disorders',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Mitochondrial_Disorders/trait_data/GSE17078.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
