In [1]:

import sys

# Make the project checkout importable from this notebook (presumably where the
# `utils` package used by later cells lives — verify on your machine).
# NOTE(review): this is a machine-specific absolute path; adjust it when the
# notebook is run elsewhere.
# The membership check keeps the cell idempotent: re-running it does not append
# a duplicate entry to sys.path.
if '/home/techt/Desktop/a4s' not in sys.path:
    sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *

# Locate the SOFT file and the matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Epilepsy/GSE199759'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Read the matrix file, using these line prefixes to pick out the series-level
# background information and the per-sample characteristics data.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print the background information and the characteristics dictionary, each
# under its own heading line.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Integrative analysis of expression profile in the glioma-related epilepsy"
!Series_summary	"To investigate the potential pathogenic mechanism of glioma-related epilepsy (GRE), we have employed analyzing of the dynamic expression profiles of microRNA/ mRNA/ lncRNA in brain tissues of glioma patients. Brain tissues of 16 patients with GRE and nine patients with glioma without epilepsy (GNE) were collected. The total RNA was dephosphorylated, labeled, and hybridized to the Agilent Human miRNA Microarray, Release 19.0, 8x60K. The cDNA was labeled and hybridized to the Agilent LncRNA+mRNA Human Gene Expression Microarray V3.0, 4x180K. The raw data was extracted from hybridized images using Agilent Feature Extraction, and quantile normalization was performed using the Agilent GeneSpring. We found that three differentially expressed miRNAs (miR-10a-5p, miR-10b-5p, miR-629-3p), six differentially expressed lncRNAs (TTN-AS1, LINC00641, SNHG14, LINC00894, S

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Availability flag for gene expression data in this cohort.
is_gene_available = True

# Row indices into the sample characteristics for each clinical variable;
# None means the variable could not be located. The converter functions are
# defined right below, so no placeholder assignments are needed for them.
trait_row = None  # No clear trait key given in the data
age_row = 2       # for 'age'
gender_row = 1    # for 'gender'

# Define conversion functions
def convert_trait(value):
    """Convert a sample-characteristics entry into a binary trait label.

    Only the text after the last ':' is inspected. The match is
    case-sensitive.

    Args:
        value: Raw characteristics entry, e.g. ``"group: GRE"``.

    Returns:
        1 if the entry contains 'GRE' (glioma-related epilepsy),
        0 if it contains 'GNE' (glioma without epilepsy),
        None for anything else, including non-string input such as a
        missing (NaN/None) cell.
    """
    # Missing cells may arrive as NaN/None rather than text; treat them as
    # unknown instead of failing on .split().
    if not isinstance(value, str):
        return None

    label = value.split(':')[-1].strip()
    if 'GRE' in label:
        return 1
    if 'GNE' in label:
        return 0
    return None

def convert_age(value):
    """Convert a sample-characteristics entry into an age as a float.

    The text after the last ':' is taken, every 'y' character is removed
    (so a value such as ``"45y"`` parses), and the remainder is parsed with
    ``float``.

    Args:
        value: Raw characteristics entry, e.g. ``"age: 45y"``.

    Returns:
        The age as a float, or None when the text is not numeric or the
        input is not a string (e.g. a missing NaN/None cell).
    """
    # Missing cells may arrive as NaN/None rather than text; treat them as
    # unknown instead of failing on .split().
    if not isinstance(value, str):
        return None

    age_text = value.split(':')[-1].strip().replace('y', '')
    try:
        return float(age_text)
    except ValueError:
        return None

def convert_gender(value):
    """Convert a sample-characteristics entry into a binary gender code.

    The text after the last ':' is stripped and lower-cased before being
    compared, so the match is case-insensitive.

    Args:
        value: Raw characteristics entry, e.g. ``"gender: Male"``.

    Returns:
        1 for 'male', 0 for 'female', None for anything else, including
        non-string input such as a missing (NaN/None) cell.
    """
    # Missing cells may arrive as NaN/None rather than text; treat them as
    # unknown instead of failing on .split().
    if not isinstance(value, str):
        return None

    gender_text = value.split(':')[-1].strip().lower()
    if gender_text == 'male':
        return 1
    if gender_text == 'female':
        return 0
    return None

# Save cohort information: the gene-availability flag and whether a trait row
# was identified.
save_cohort_info(
    'GSE199759',
    './preprocessed/Epilepsy/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical feature extraction only makes sense when a trait row exists;
# otherwise just report that it is missing.
if trait_row is None:
    print("Trait row for Epilepsy key was not defined or correctly identifiable.")
else:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Epilepsy',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Epilepsy/trait_data/GSE199759.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Epilepsy/cohort_info.json
Trait row for Epilepsy key was not defined or correctly identifiable.
