In [1]:

import sys

# NOTE(review): machine-specific absolute path; presumably the project root that
# provides the `utils` package imported by the next cell -- confirm, and consider
# making it configurable.
_A4S_ROOT = '/home/techt/Desktop/a4s'

# Guard the append so that re-running this cell does not stack duplicate
# entries onto sys.path.
if _A4S_ROOT not in sys.path:
    sys.path.append(_A4S_ROOT)


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the SOFT file and matrix file paths for this cohort directory
cohort_dir = '/media/techt/DATA/GEO/LDL_Cholesterol_Levels/GSE28893'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Pull the series-level background lines and the per-sample clinical lines
#    out of the matrix file, selected by their line prefixes
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show the background text and the characteristics dictionary, each under
#    its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Genetic identification, replication, and functional fine-mapping of expression quantitative trait loci in primary human liver tissue [Illumina Expression Array]"
!Series_summary	"Most loci identified in genome wide association studies (GWAS) of complex traits reside in non-coding DNA and may contribute to phenotype via changes in gene regulation. The discovery of expression quantitative trait loci (?eQTLs?) can thus be used to more precisely identify modest but real disease associations and provide insights into their underlying molecular mechanisms. This is particularly true for analyses of expression in non-transformed cells from tissues relevant to the complex traits of interest. We have conducted two independent studies to identify genetic, including both SNPs and copy-number variants, and environmental determinants of human liver gene expression variation. We analyzed two sets of primary livers (primary dataset: n=220; replication dataset: n=

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is considered available until the checks below say otherwise.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
# One placeholder per converter; the original line assigned `convert_age` twice
# and never initialised `convert_gender`.
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability
is_gene_available = True  # The data comes from Illumina Expression Arrays.

# Check the availability of 'LDL_Cholesterol_Levels', 'age', and 'gender'
# NOTE(review): the row keys below index the sample characteristics dictionary
# printed by the previous cell; that output is truncated here, so verify them.
trait_row = None  # No specific information available for 'LDL_Cholesterol_Levels'
age_row = 1  # Key for 'age' variable
gender_row = 2  # Key for 'gender' variable

# Define conversion functions
def convert_age(value):
    """Parse an age from a 'key: value' sample-characteristics string.

    Args:
        value: Raw cell content, expected to look like ``'age: 45'``.

    Returns:
        The text after the first ``': '`` separator as an ``int``, or ``None``
        when the input is not a string, has no separator, or the text is not
        an integer literal.
    """
    try:
        return int(value.split(': ')[1])
    # Catch only the failures parsing can produce, so KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by the bare `except:`.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Encode a 'key: value' gender string as a binary label.

    Args:
        value: Raw cell content, expected to look like ``'gender: M'``.

    Returns:
        ``1`` when the text after the first ``': '`` separator is ``M`` and
        ``0`` when it is ``F`` (case-insensitive); ``None`` for any other
        text, for a string without the separator, or for a non-string input.
    """
    try:
        gender_val = value.split(': ')[1].upper()
    # Malformed or missing cells (no separator, NaN, None) yield None instead
    # of raising, matching how convert_age treats unparseable input.
    except (AttributeError, IndexError, TypeError):
        return None
    return {'M': 1, 'F': 0}.get(gender_val)

# Save cohort information; the last argument is True only when a trait row
# was identified for this cohort.
save_cohort_info(
    'GSE28893',
    './preprocessed/LDL_Cholesterol_Levels/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features are extracted, written to CSV and previewed only when a
# trait row exists; otherwise this step is skipped entirely.
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'LDL_Cholesterol_Levels',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/LDL_Cholesterol_Levels/trait_data/GSE28893.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))
