In [1]:

import sys
# Extend the module search path with a machine-specific absolute directory.
# NOTE(review): presumably this is the project root that contains the `utils`
# package imported by later cells — confirm, and prefer a configurable path.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Resolve the SOFT file and the matrix file that belong to this cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Hemochromatosis/GSE159676'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes used to pull background information and sample characteristics
# out of the matrix file.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# Build the sample characteristics dictionary from the clinical dataframe.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print both pieces in full so they can be inspected before feature selection.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Portal fibroblasts with mesenchymal stem cell features form a reservoir of proliferative myofibroblasts in liver fibrosis"
!Series_summary	"Based on the identification  of a transcriptomic signature, including Slit2,  characterizing portal mesenchymal stem cells  (PMSC)  and derived myofibroblast (MF), we examined  the gene expression profile of in liver tissue derived  from multiple human liver disorders, including  primary sclerosing cholangitis (PSC) (n=12), non-alcoholic steatohepatitis (NASH)  (n=7) and other liver diseases (i.e., primary biliary cholangitis, autoimmune hepatitis, alcoholic liver disease and haemochromatosis) (n=8) and  compared them to healthy controls (tumor free tissue from livers with metastasis from colorectal cancer) (n=5). We found that  SLIT2 was overexpressed in the liver of patients with NASH, PSC and other chronic liver diseases. We also examined the microarray data of the human liver tissue samples for the transcr

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Cohort-level flags and row keys; each stays at its default unless set below.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # replaced by the converter definitions in this cell

# Checking for gene expression data availability.
# The platform phrase must be searched for in the loaded background text; testing
# it against the literal prefix string "!Series_overall_design" can never match.
# NOTE(review): assumes `background_info` from the data-loading cell is the raw
# background text (it is printed as such there) — confirm. The comparison is
# case-insensitive so capitalisation variants of the platform name still match.
if "human gene 1.0 st array" in str(background_info).lower():
    is_gene_available = True

# Identifying keys related to variables in the sample characteristics dictionary.
# This literal mirrors the dictionary printed by the data-loading cell.
sample_dict = {0: ['condition: Liver tissue healthy', 'condition: Non alcoholic steatohepatitis', 'condition: Primary sclerosing cholangitis', 'condition: Primary biliary cirrhosis', 'condition: Haemochromatosis', 'condition: Autoimmune hepatitis', 'condition: Alcohol related']}

# Hemochromatosis corresponds to the term 'Haemochromatosis' in the dictionary.
if any("condition: Haemochromatosis" in condition for condition in sample_dict[0]):
    trait_row = 0

# No data for age and gender can be inferred from the available information,
# so age_row and gender_row keep their None defaults from above.

# Define conversion functions
def convert_trait(value):
    """Map a sample characteristic to the binary Hemochromatosis trait.

    Args:
        value: Characteristic text such as 'condition: Haemochromatosis'.

    Returns:
        1 if 'Haemochromatosis' occurs in ``value``, 0 if it does not, and
        None when ``value`` does not support a membership test (for example
        None or a float).
    """
    try:
        return 1 if "Haemochromatosis" in value else 0
    except TypeError:
        # Only the "cannot search this value" failure is treated as missing
        # data; anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

def convert_age(value):
    """Parse the numeric age from a 'key: value' characteristic string.

    Args:
        value: Characteristic text such as 'age: 45'.

    Returns:
        The text after the first colon converted to float, or None when
        ``value`` is not a string, has no colon, or the text after the
        colon is not a number.
    """
    try:
        return float(value.split(":")[1].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str; IndexError: no colon;
        # ValueError: the field is not numeric. All of these mean "no usable age".
        return None

def convert_gender(value):
    """Parse a binary gender code from a 'key: value' characteristic string.

    Args:
        value: Characteristic text such as 'gender: Female'.

    Returns:
        0 for 'female', 1 for 'male' (compared case-insensitively after
        stripping whitespace), and None for any other label, for a string
        without a colon, or for a non-string ``value``.
    """
    codes = {"female": 0, "male": 1}
    try:
        gender = value.split(":")[1].strip().lower()
    except (AttributeError, IndexError, TypeError):
        # AttributeError/TypeError: value is not a str; IndexError: no colon.
        return None
    # Unrecognised labels fall through to None via dict.get.
    return codes.get(gender)

# Persist cohort metadata: the gene-availability flag and whether a trait row was found.
has_trait = trait_row is not None
save_cohort_info(
    'GSE159676',
    './preprocessed/Hemochromatosis/cohort_info.json',
    is_gene_available,
    has_trait,
)

# When a trait row exists, select the clinical features, write them to CSV
# and print a preview of the result.
if has_trait:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Hemochromatosis',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Hemochromatosis/trait_data/GSE159676.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM4837490': [0], 'GSM4837491': [0], 'GSM4837492': [0], 'GSM4837493': [0], 'GSM4837494': [0], 'GSM4837495': [0], 'GSM4837496': [0], 'GSM4837497': [0], 'GSM4837498': [0], 'GSM4837499': [0], 'GSM4837500': [0], 'GSM4837501': [0], 'GSM4837502': [0], 'GSM4837503': [0], 'GSM4837504': [0], 'GSM4837505': [0], 'GSM4837506': [0], 'GSM4837507': [0], 'GSM4837508': [0], 'GSM4837509': [0], 'GSM4837510': [0], 'GSM4837511': [0], 'GSM4837512': [0], 'GSM4837513': [0], 'GSM4837514': [0], 'GSM4837515': [0], 'GSM4837516': [0], 'GSM4837517': [1], 'GSM4837518': [0], 'GSM4837519': [0], 'GSM4837520': [0], 'GSM4837521': [0], 'GSM4837522': [0]}
