In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Liver_cirrhosis"
cohort = "GSE139602"

# Input paths
in_trait_dir = "../../input/GEO/Liver_cirrhosis"
in_cohort_dir = "../../input/GEO/Liver_cirrhosis/GSE139602"

# Output paths
out_data_file = "../../output/preprocess/Liver_cirrhosis/GSE139602.csv"
out_gene_data_file = "../../output/preprocess/Liver_cirrhosis/gene_data/GSE139602.csv"
out_clinical_data_file = "../../output/preprocess/Liver_cirrhosis/clinical_data/GSE139602.csv"
json_path = "../../output/preprocess/Liver_cirrhosis/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Molecular characterization of chronic liver disease dynamics: from liver fibrosis to acute-on-chronic liver failure"
!Series_summary	"BACKGROUND: The molecular mechanisms driving the progression from early chronic liver disease (eCLD) to cirrhosis and, finally, acute-on-chronic liver failure (ACLF) are largely unknown. Thus, the aim of this work is to develop a network-based approach to investigate molecular pathways driving progression from eCLD to ACLF. We created 9 liver-specific biological networks capturing key pathophysiological processes potentially related to CLD. We used these networks as framework to perform gene set enrichment analyses(GSEA) and create dynamic profiles of disease progression.  RESULTS: Principal component analyses revealed that samples clustered according to the disease stage. GSEA analyses of the defined processes showed an up-regulation of inflammation, fibrosis and apoptosis networks throughout disease progression. I

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Step 1: Assess gene expression data availability
# Given the background information, this dataset appears to contain gene expression data
# from liver biopsies of patients at different liver disease stages.
is_gene_available = True

# Step 2: Variable availability and data type conversion
# 2.1 Data Availability
# For trait (cirrhosis), we can use the disease state information from key 0
trait_row = 0
age_row = None  # Age information is not available
gender_row = None  # Gender information is not available

# 2.2 Data Type Conversion
def convert_trait(value):
    """
    Convert disease state to binary: 1 for cirrhosis, 0 for non-cirrhosis.
    """
    if value is None:
        return None
    
    # Extract the value after colon if present
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    # Map disease states to binary values for liver cirrhosis
    if value == "Healthy" or value == "eCLD":
        return 0  # No cirrhosis
    elif "Cirrhosis" in value or "ACLF" in value or "liver failure" in value:
        return 1  # Has cirrhosis
    else:
        return None

def convert_age(value):
    """Placeholder function for age conversion."""
    return None

def convert_gender(value):
    """Placeholder function for gender conversion."""
    return None

# Step 3: Save metadata
# Trait data is available since trait_row is not None
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# Step 4: Clinical Feature Extraction
# Since trait_row is not None, we need to extract clinical features
if is_trait_available:
    # Create properly formatted clinical data DataFrame
    sample_characteristics = {0: ['disease state: Healthy', 'disease state: eCLD', 
                              'disease state: Compensated Cirrhosis', 
                              'disease state: Decompesated Cirrhosis', 
                              'disease state: Acute-on-chronic liver failure']}
    
    clinical_data = pd.DataFrame()
    for key, values in sample_characteristics.items():
        clinical_data[key] = values
    
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Preview the extracted clinical data
    preview_result = preview_df(selected_clinical_df)
    print("Preview of selected clinical features:")
    print(preview_result)

    # Save the clinical data to CSV
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    selected_clinical_df.to_csv(out_clinical_data_file, index=False)
else:
    print("No trait data available. Skipping clinical feature extraction.")


Preview of selected clinical features:
{0: [0.0]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Get the SOFT and matrix file paths again 
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data
try:
    gene_data = get_genetic_data(matrix_file)
    print(f"Gene data shape: {gene_data.shape}")
    
    # 3. Print the first 20 row IDs (gene or probe identifiers)
    print("First 20 gene/probe identifiers:")
    print(gene_data.index[:20])
except Exception as e:
    print(f"Error extracting gene data: {e}")


Matrix file found: ../../input/GEO/Liver_cirrhosis/GSE139602/GSE139602_series_matrix.txt.gz


Gene data shape: (49386, 39)
First 20 gene/probe identifiers:
Index(['11715100_at', '11715101_s_at', '11715102_x_at', '11715103_x_at',
       '11715104_s_at', '11715105_at', '11715106_x_at', '11715107_s_at',
       '11715108_x_at', '11715109_at', '11715110_at', '11715111_s_at',
       '11715112_at', '11715113_x_at', '11715114_x_at', '11715115_s_at',
       '11715116_s_at', '11715117_x_at', '11715118_s_at', '11715119_s_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Analyze the gene identifiers based on their format
# The identifiers follow a pattern like "11715100_at", "11715101_s_at", etc.
# These appear to be Affymetrix probe IDs rather than standard human gene symbols
# Standard human gene symbols would be named like BRCA1, TP53, etc.
# Therefore, these identifiers will need to be mapped to human gene symbols

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Analyze the gene annotation dataframe to identify which columns contain the gene identifiers and gene symbols
print("\nGene annotation preview:")
print(f"Columns in gene annotation: {gene_annotation.columns.tolist()}")
print(preview_df(gene_annotation, n=5))

# Check for gene information in the SPOT_ID.1 column which appears to contain gene names
print("\nAnalyzing SPOT_ID.1 column for gene symbols:")
if 'SPOT_ID.1' in gene_annotation.columns:
    # Extract a few sample values
    sample_values = gene_annotation['SPOT_ID.1'].head(3).tolist()
    for i, value in enumerate(sample_values):
        print(f"Sample {i+1} excerpt: {value[:200]}...")  # Print first 200 chars
        # Test the extract_human_gene_symbols function on these values
        symbols = extract_human_gene_symbols(value)
        print(f"  Extracted gene symbols: {symbols}")

# Try to find the probe IDs in the gene annotation
gene_data_id_prefix = gene_data.index[0].split('_')[0]  # Get prefix of first gene ID
print(f"\nGene data ID prefix: {gene_data_id_prefix}")

# Look for columns that might match the gene data IDs
for col in gene_annotation.columns:
    if gene_annotation[col].astype(str).str.contains(gene_data_id_prefix).any():
        print(f"Column '{col}' contains values matching gene data ID pattern")

# Check if there's any column that might contain transcript or gene IDs
print("\nChecking for columns containing transcript or gene related terms:")
for col in gene_annotation.columns:
    if any(term in col.upper() for term in ['GENE', 'TRANSCRIPT', 'SYMBOL', 'NAME', 'DESCRIPTION']):
        print(f"Column '{col}' may contain gene-related information")
        # Show sample values
        print(f"Sample values: {gene_annotation[col].head(3).tolist()}")



Gene annotation preview:
Columns in gene annotation: ['ID', 'GeneChip Array', 'Species Scientific Name', 'Annotation Date', 'Sequence Type', 'Sequence Source', 'Transcript ID(Array Design)', 'Target Description', 'Representative Public ID', 'Archival UniGene Cluster', 'UniGene ID', 'Genome Version', 'Alignments', 'Gene Title', 'Gene Symbol', 'Chromosomal Location', 'GB_LIST', 'SPOT_ID', 'Unigene Cluster Type', 'Ensembl', 'Entrez Gene', 'SwissProt', 'EC', 'OMIM', 'RefSeq Protein ID', 'RefSeq Transcript ID', 'FlyBase', 'AGI', 'WormBase', 'MGI Name', 'RGD Name', 'SGD accession number', 'Gene Ontology Biological Process', 'Gene Ontology Cellular Component', 'Gene Ontology Molecular Function', 'Pathway', 'InterPro', 'Trans Membrane', 'QTL', 'Annotation Description', 'Annotation Transcript Cluster', 'Transcript Assignments', 'Annotation Notes']
{'ID': ['11715100_at', '11715101_s_at', '11715102_x_at', '11715103_x_at', '11715104_s_at'], 'GeneChip Array': ['Human Genome HG-U219 Array', 'Human 

Column 'ID' contains values matching gene data ID pattern



Checking for columns containing transcript or gene related terms:
Column 'GeneChip Array' may contain gene-related information
Sample values: ['Human Genome HG-U219 Array', 'Human Genome HG-U219 Array', 'Human Genome HG-U219 Array']
Column 'Species Scientific Name' may contain gene-related information
Sample values: ['Homo sapiens', 'Homo sapiens', 'Homo sapiens']
Column 'Transcript ID(Array Design)' may contain gene-related information
Sample values: ['g21264570', 'g21264570', 'g21264570']
Column 'Target Description' may contain gene-related information
Sample values: ['g21264570 /TID=g21264570 /CNT=1 /FEA=FLmRNA /TIER=FL /STK=0 /DEF=g21264570 /REP_ORG=Homo sapiens', 'g21264570 /TID=g21264570 /CNT=1 /FEA=FLmRNA /TIER=FL /STK=0 /DEF=g21264570 /REP_ORG=Homo sapiens', 'g21264570 /TID=g21264570 /CNT=1 /FEA=FLmRNA /TIER=FL /STK=0 /DEF=g21264570 /REP_ORG=Homo sapiens']
Column 'Archival UniGene Cluster' may contain gene-related information
Sample values: ['---', '---', '---']
Column 'UniGen

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the columns for probe IDs and gene symbols in the annotation data
probe_col = 'ID'  # Column containing probe identifiers (matching gene_data.index format)
gene_col = 'Gene Symbol'  # Column containing gene symbols

print(f"Using {probe_col} for probe identifiers and {gene_col} for gene symbols")

# 2. Extract the mapping dataframe with the get_gene_mapping function
mapping_df = get_gene_mapping(gene_annotation, probe_col, gene_col)
print(f"Created mapping dataframe with shape: {mapping_df.shape}")
print(f"Sample of mapping data:")
print(mapping_df.head())

# 3. Apply gene mapping to convert probe-level expressions to gene-level expressions
gene_data = apply_gene_mapping(gene_data, mapping_df)
print(f"Successfully converted probe-level to gene-level expression data")
print(f"Gene expression data shape: {gene_data.shape}")
print(f"First 5 gene symbols:")
print(gene_data.index[:5])

# Save the gene expression data to a CSV file
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Using ID for probe identifiers and Gene Symbol for gene symbols


Created mapping dataframe with shape: (49384, 2)
Sample of mapping data:


              ID       Gene
0    11715100_at   HIST1H3G
1  11715101_s_at   HIST1H3G
2  11715102_x_at   HIST1H3G
3  11715103_x_at  TNFAIP8L1
4  11715104_s_at      OTOP2
Successfully converted probe-level to gene-level expression data
Gene expression data shape: (19521, 39)
First 5 gene symbols:
Index(['A1BG', 'A1CF', 'A2BP1', 'A2LD1', 'A2M'], dtype='object', name='Gene')


Gene expression data saved to ../../output/preprocess/Liver_cirrhosis/gene_data/GSE139602.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data
# Use normalize_gene_symbols_in_index to standardize gene symbols
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape before normalization: {gene_data.shape}")
print(f"Gene data shape after normalization: {normalized_gene_data.shape}")

# Save the normalized gene data to file
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene expression data saved to {out_gene_data_file}")

# Load the actual clinical data from the matrix file that was previously obtained in Step 1
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
background_info, clinical_data = get_background_and_clinical_data(matrix_file)

# Get preview of clinical data to understand its structure
print("Original clinical data preview:")
print(clinical_data.head())

# 2. If we have trait data available, proceed with linking
if trait_row is not None:
    # Extract clinical features using the original clinical data
    selected_clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    print(f"Selected clinical data shape: {selected_clinical_df.shape}")
    print("Clinical data preview:")
    print(selected_clinical_df.head())

    # Link the clinical and genetic data
    linked_data = geo_link_clinical_genetic_data(selected_clinical_df, normalized_gene_data)
    print(f"Linked data shape before processing: {linked_data.shape}")
    print("Linked data preview (first 5 rows, 5 columns):")
    print(linked_data.iloc[:5, :5] if not linked_data.empty else "Empty dataframe")

    # 3. Handle missing values
    try:
        linked_data = handle_missing_values(linked_data, trait)
        print(f"Data shape after handling missing values: {linked_data.shape}")
    except Exception as e:
        print(f"Error handling missing values: {e}")
        linked_data = pd.DataFrame()  # Create empty dataframe if error occurs

    # 4. Check for bias in features
    if not linked_data.empty and linked_data.shape[0] > 0:
        is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait)
        print(f"Data shape after removing biased features: {linked_data.shape}")
    else:
        is_biased = True
        print("Cannot check for bias as dataframe is empty or has no rows after missing value handling")

    # 5. Validate and save cohort information
    note = ""
    if linked_data.empty or linked_data.shape[0] == 0:
        note = "Dataset contains gene expression data related to liver fibrosis progression, but linking clinical and genetic data failed, possibly due to mismatched sample IDs."
    else:
        note = "Dataset contains gene expression data for liver fibrosis progression, which is relevant to liver cirrhosis research."
    
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=True,
        is_biased=is_biased,
        df=linked_data,
        note=note
    )

    # 6. Save the linked data if usable
    if is_usable:
        os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
        linked_data.to_csv(out_data_file)
        print(f"Linked data saved to {out_data_file}")
    else:
        print("Dataset is not usable for analysis. No linked data file saved.")
else:
    # If no trait data available, validate with trait_available=False
    is_usable = validate_and_save_cohort_info(
        is_final=True,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=True,
        is_trait_available=False,
        is_biased=True,  # Set to True since we can't use data without trait
        df=pd.DataFrame(),  # Empty DataFrame
        note="Dataset contains gene expression data but lacks proper clinical trait information for liver cirrhosis analysis."
    )
    
    print("Dataset is not usable for liver cirrhosis analysis due to lack of clinical trait data. No linked data file saved.")

Gene data shape before normalization: (19521, 39)
Gene data shape after normalization: (19298, 39)


Normalized gene expression data saved to ../../output/preprocess/Liver_cirrhosis/gene_data/GSE139602.csv
Original clinical data preview:
         !Sample_geo_accession              GSM4144550  \
0  !Sample_characteristics_ch1  disease state: Healthy   

               GSM4144551              GSM4144552              GSM4144553  \
0  disease state: Healthy  disease state: Healthy  disease state: Healthy   

               GSM4144554              GSM4144555           GSM4144556  \
0  disease state: Healthy  disease state: Healthy  disease state: eCLD   

            GSM4144557           GSM4144558  ...  \
0  disease state: eCLD  disease state: eCLD  ...   

                              GSM4144579  \
0  disease state: Decompesated Cirrhosis   

                              GSM4144580  \
0  disease state: Decompesated Cirrhosis   

                                      GSM4144581  \
0  disease state: Acute-on-chronic liver failure   

                                      GSM4144582  \
0 

Data shape after handling missing values: (39, 19299)
For the feature 'Liver_cirrhosis', the least common label is '0.0' with 11 occurrences. This represents 28.21% of the dataset.
The distribution of the feature 'Liver_cirrhosis' in this dataset is fine.

Data shape after removing biased features: (39, 19299)
A new JSON file was created at: ../../output/preprocess/Liver_cirrhosis/cohort_info.json


Linked data saved to ../../output/preprocess/Liver_cirrhosis/GSE139602.csv
