In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Autism_spectrum_disorder_(ASD)/GSE89596'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"MicroRNA biomarkers in blood for Autism spectrum disorder"
!Series_summary	"Autism spectrum disorder (ASD) is a neurodevelopmental disorder characterized by social communication deficits and repetitive behaviors. MicroRNAs (miRNAs) have been recently recognized as potential biomarkers of ASD as they are dysregulated in various tissues of individuals with ASD. However, it remains unclear whether miRNA expression is altered in individuals with high-functioning ASD. Here, we investigated the miRNA expression profile in peripheral blood from adults with high-functioning ASD, and age and gender-matched healthy controls. Our findings may provide insights regarding the molecular clues for recognizing high-functioning ASD."
!Series_overall_design	"We profiled miRNA expression using peripheral blood from 30 ASD patients and 30 controls."
Sample Characteristics Dictionary:
{0: ['diagnosis: autism spectrum disorder (ASD)', 'diagnosis: control'], 1: ['tissue:

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Determine the availability of gene expression data
is_gene_available = False

# The dataset focuses on miRNA data, not suitable for gene expression analysis
# Update the variables and set them as available if applicable
age_row = None  # age is not mentioned in the dataset characteristics
trait_row = 0  # The key for 'Autism_spectrum_disorder_(ASD)' is 0
gender_row = 2  # The key for 'gender' is 2 

# Define conversion functions
def convert_trait(value):
    value = value.split(':')[1].strip()
    return 1 if 'autism spectrum disorder (ASD)' in value.lower() else 0

def convert_gender(value):
    value = value.split(':')[1].strip()
    return 1 if value.lower() == 'male' else 0

def convert_age(value):
    # Placeholder function since age details are not available
    return None

# Save cohort information
save_cohort_info('GSE89596', './preprocessed/Autism_spectrum_disorder_(ASD)/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Autism_spectrum_disorder_(ASD)', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Autism_spectrum_disorder_(ASD)/trait_data/GSE89596.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Autism_spectrum_disorder_(ASD)/cohort_info.json
{'GSM2385148': [0, 0], 'GSM2385149': [0, 1], 'GSM2385150': [0, 1], 'GSM2385151': [0, 0], 'GSM2385152': [0, 1], 'GSM2385153': [0, 1], 'GSM2385154': [0, 1], 'GSM2385155': [0, 1], 'GSM2385156': [0, 0], 'GSM2385157': [0, 0], 'GSM2385158': [0, 0], 'GSM2385159': [0, 1], 'GSM2385160': [0, 1], 'GSM2385161': [0, 1], 'GSM2385162': [0, 1], 'GSM2385163': [0, 1], 'GSM2385164': [0, 0], 'GSM2385165': [0, 0], 'GSM2385166': [0, 0], 'GSM2385167': [0, 1], 'GSM2385168': [0, 1], 'GSM2385169': [0, 0], 'GSM2385170': [0, 1], 'GSM2385171': [0, 1], 'GSM2385172': [0, 0], 'GSM2385173': [0, 0], 'GSM2385174': [0, 1], 'GSM2385175': [0, 1], 'GSM2385176': [0, 0], 'GSM2385177': [0, 1], 'GSM2385178': [0, 0], 'GSM2385179': [0, 0], 'GSM2385180': [0, 1], 'GSM2385181': [0, 0], 'GSM2385182': [0, 0], 'GSM2385183': [0, 1], 'GSM2385184': [0, 1], 'GSM2385185': [0, 0], 'GSM2385186': [0, 1], 'GSM2385187': [0, 0], 'GSM2385188': [0, 1], 'G