In [1]:

import sys
# Extend the module search path; presumably this is the project checkout that
# provides the `utils.preprocess` module imported by the next cell.
# NOTE(review): hard-coded absolute, machine-specific path — the notebook will not
# import its helpers on another machine unless this is adjusted.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Resolve the SOFT file and the series matrix file for this cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Craniosynostosis/GSE27976'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes handed to the loader: the first list selects the series-level
# background lines, the second the per-sample clinical lines of the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as a dictionary of unique values per row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information; they drive the row choices made in Step 2.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Calvarial osteoblast transcriptome analysis identifies genetic targets and extracellular matrix-mediated focal adhesion as potential biomarkers for single-suture craniosynostosis"
!Series_summary	"Craniosynostosis is a disease defined by premature fusion of one or more cranial sutures. The mechanistic pathology of isolated single-suture craniosynostosis is complex and while a number of genetic biomarkers and environmental predispositions have been identified, in many cases the causes remain controversial and inconclusive at best.  After controlling for variables contributing to potential bias, FGF7, SFRP4, and VCAM1 emerged as potential genetic biomarkers for single-suture craniosynostosis due to their significantly large changes in gene expression compared to the control population.  Furthermore, pathway analysis implicated focal adhesion and extracellular matrix (ECM)-receptor interaction as differentially regulated gene networks when comparing 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Flag forwarded to save_cohort_info below as the gene-availability argument.
is_gene_available = True

# Row selectors forwarded to geo_select_clinical_features together with their
# converters, listed here in ascending row order.
# NOTE(review): presumably these index the rows of the sample characteristics
# printed in Step 1 — confirm against that output.
age_row = 0
gender_row = 1
trait_row = 2

def convert_trait(value):
    """Convert a raw sample-characteristics entry into a binary trait label.

    Args:
        value: Raw entry of the form ``'<header>: <label>'``.

    Returns:
        1 for any of the listed synostosis subtypes, 0 for ``'Control'``, and
        None when the entry is not a string, has no ``':'`` separator, or
        carries a label that is not in the mapping.
    """
    trait_mapping = {
        'Metopic Synostosis': 1,
        'Coronal Synostosis R': 1,
        'Coronal Synostosis L': 1,
        'Sagittal Synostosis': 1,
        'Control': 0
    }
    # Missing cells (None / NaN) have no string methods; treat them as unknown.
    if not isinstance(value, str):
        return None
    # Split on the first colon only, so an entry with no colon or with several
    # colons yields None instead of raising on tuple unpacking.
    _, separator, label = value.partition(':')
    if not separator:
        return None
    return trait_mapping.get(label.strip(), None)

def convert_age(value):
    """Convert a raw sample-characteristics entry into a numeric age.

    Args:
        value: Raw entry of the form ``'<header>: <number>'``.

    Returns:
        The part after the first ``':'`` parsed as a float, or None when the
        entry is not a string, has no ``':'`` separator, or the remainder is
        not a valid number.
    """
    # Missing cells (None / NaN) have no string methods; treat them as unknown.
    if not isinstance(value, str):
        return None
    # Split on the first colon only, so malformed entries fall through to the
    # float() check below instead of raising on tuple unpacking.
    _, separator, raw_age = value.partition(':')
    if not separator:
        return None
    try:
        return float(raw_age.strip())
    except ValueError:
        return None

def convert_gender(value):
    """Convert a raw sample-characteristics entry into a binary gender code.

    Args:
        value: Raw entry of the form ``'<header>: <code>'``.

    Returns:
        1 for ``'M'``, 0 for ``'F'`` (matched case-insensitively), and None
        when the entry is not a string, has no ``':'`` separator, or carries
        any other code.
    """
    gender_mapping = {
        'M': 1,
        'F': 0
    }
    # Missing cells (None / NaN) have no string methods; treat them as unknown.
    if not isinstance(value, str):
        return None
    # Split on the first colon only, so an entry with no colon or with several
    # colons yields None instead of raising on tuple unpacking.
    _, separator, code = value.partition(':')
    if not separator:
        return None
    # Upper-case so that 'm' / 'f' are accepted alongside 'M' / 'F'.
    return gender_mapping.get(code.strip().upper(), None)

import os

# Record this cohort's status: gene data availability and whether a trait row
# was identified.
save_cohort_info('GSE27976', './preprocessed/Craniosynostosis/cohort_info.json', is_gene_available, trait_row is not None)

# Build the clinical feature table from the selected rows and their converters.
selected_clinical_data = geo_select_clinical_features(clinical_data, 'Craniosynostosis', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
csv_path = './preprocessed/Craniosynostosis/trait_data/GSE27976.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh checkout fails on the write below.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{'GSM692146': [1.0, 12.87, 0.0], 'GSM692147': [1.0, 10.4, 0.0], 'GSM692148': [1.0, 12.3, 1.0], 'GSM692149': [1.0, 11.4, 1.0], 'GSM692150': [1.0, 10.1, 0.0], 'GSM692151': [1.0, 11.0, 1.0], 'GSM692152': [1.0, 4.27, 1.0], 'GSM692153': [1.0, 7.97, 1.0], 'GSM692154': [1.0, 4.33, 0.0], 'GSM692155': [1.0, 9.33, 1.0], 'GSM692156': [1.0, 7.93, 1.0], 'GSM692157': [1.0, 10.27, 1.0], 'GSM692158': [1.0, 10.87, 0.0], 'GSM692159': [1.0, 3.87, 0.0], 'GSM692160': [1.0, 3.2, 1.0], 'GSM692161': [1.0, 13.27, 1.0], 'GSM692162': [1.0, 5.6, 0.0], 'GSM692163': [1.0, 14.9, 0.0], 'GSM692164': [1.0, 3.03, 1.0], 'GSM692165': [1.0, 12.4, 1.0], 'GSM692166': [1.0, 8.9, 1.0], 'GSM692167': [1.0, 14.17, 0.0], 'GSM692168': [1.0, 6.33, 0.0], 'GSM692169': [1.0, 14.87, 1.0], 'GSM692170': [1.0, 8.4, 0.0], 'GSM692171': [1.0, 9.07, 0.0], 'GSM692172': [1.0, 13.33, 1.0], 'GSM692173': [1.0, 10.0, 0.0], 'GSM692174': [1.0, 13.23, 0.0], 'GSM692175': [1.0, 10.33, 1.0], 'GSM692176': [1.0, 14.33, 0.0], 'GSM692177': [1.0, 6.67, 1.0], '

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the gene expression data from the matrix file located in Step 1,
#    using the library's get_genetic_data function.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs; Step 4 reviews them to decide whether the
#    identifiers still need to be mapped to genes.
print(gene_data.index[:20])


Index(['7892501', '7892502', '7892503', '7892504', '7892505', '7892506',
       '7892507', '7892508', '7892509', '7892510', '7892511', '7892512',
       '7892513', '7892514', '7892515', '7892516', '7892517', '7892518',
       '7892519', '7892520'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 (e.g. '7892501') are numeric identifiers rather
# than gene symbols — presumably platform probe IDs — so a mapping step is needed.
# NOTE(review): this flag is not read by any later cell visible here; Steps 5-6
# run regardless of its value.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Read the gene annotation data from the SOFT file located in Step 1, using
#    the library's get_gene_annotation function.
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a preview of the annotation (via the library's preview_df) so the
#    identifier and gene-symbol columns can be chosen in Step 6.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the keys for probe IDs and gene symbols in the gene annotation dictionary
# NOTE(review): 'gene_assignment' is assumed to be the annotation column carrying
# gene symbols — confirm against the Step 5 preview output.

identifier_key = 'ID'
gene_symbol_key = 'gene_assignment'

# 2. Get the dataframe storing the mapping between probe IDs and genes using the 'get_gene_mapping' function from the library
gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library, and name the resulting gene expression dataframe "gene_data"
# NOTE(review): this rebinds `gene_data`, so the cell is not idempotent — running
# it a second time passes the already-mapped frame back into apply_gene_mapping.
# Re-run Step 3 first if this cell has to be executed again.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
