In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Type_2_Diabetes"
cohort = "GSE180393"

# Input paths
in_trait_dir = "../../input/GEO/Type_2_Diabetes"
in_cohort_dir = "../../input/GEO/Type_2_Diabetes/GSE180393"

# Output paths
out_data_file = "../../output/preprocess/Type_2_Diabetes/GSE180393.csv"
out_gene_data_file = "../../output/preprocess/Type_2_Diabetes/gene_data/GSE180393.csv"
out_clinical_data_file = "../../output/preprocess/Type_2_Diabetes/clinical_data/GSE180393.csv"
json_path = "../../output/preprocess/Type_2_Diabetes/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Glomerular Transcriptome in the Cprobe Cohort"
!Series_summary	"We used microarrays to analyze the transcriptome of microdissected renal biopsies from patients with kidney disease and living donors. We derived pathway specific scores for Angiopoietin-Tie signaling pathway activation at mRNA level (or transcriptome level) for individual patients and studied the association of pathway activation with disease outcomes."
!Series_overall_design	"Glomerular gene expression data from micro-dissected human kidney biopsy samples  from patients with chronic kidney disease(Lupus, DN, IgA,HT, TN) and healthy living donors. Profiling was performed on Affymetrix ST2.1 microarray platform. "
Sample Characteristics Dictionary:
{0: ['sample group: Living donor', 'sample group: infection-associated GN', 'sample group: FSGS', 'sample group: LN-WHO III', 'sample group: LN-WHO IV', 'sample group: DN', 'sample group: amyloidosis', 'sample group: Membrano-Proliferative 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
# Based on the background information, this dataset contains "Glomerular gene expression data" from "human kidney biopsy samples"
# and mentions that "Profiling was performed on Affymetrix ST2.1 microarray platform"
# This indicates it contains gene expression data and not just miRNA or methylation data
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# 2.1 Data Availability
# Looking at the sample characteristics dictionary, we need to identify keys for trait, age, and gender

# For trait (Type_2_Diabetes), we can see at key 0 there are different sample groups including 'sample group: DN'
# DN stands for Diabetic Nephropathy, which is a kidney disease caused by diabetes
# This could be used to infer Type_2_Diabetes status
trait_row = 0

# There is no information about age in the sample characteristics
age_row = None

# There is no information about gender in the sample characteristics
gender_row = None

# 2.2 Data Type Conversion
def convert_trait(value):
    """Convert trait value to binary (0 for control, 1 for Type_2_Diabetes)"""
    if value is None:
        return None
    
    # Extract the value part after the colon if present
    if ':' in value:
        value = value.split(':', 1)[1].strip()
    
    # DN (Diabetic Nephropathy) indicates Type 2 Diabetes is present
    if 'DN' in value:
        return 1
    # Living donors are typically healthy individuals without the disease
    elif 'Living donor' in value:
        return 0
    # For other conditions, we cannot confidently infer diabetes status
    else:
        return None

# Since age data is not available, we'll define a placeholder function
def convert_age(value):
    return None

# Since gender data is not available, we'll define a placeholder function
def convert_gender(value):
    return None

# 3. Save Metadata
# The trait data is available since trait_row is not None
is_trait_available = trait_row is not None

# Conduct initial filtering and save the metadata
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
# Since trait_row is not None, we need to extract clinical features
if trait_row is not None:
    # We don't see clinical_data defined in previous steps output
    # Assuming clinical_data contains the sample characteristics information
    try:
        # Create a DataFrame from the sample characteristics dictionary
        sample_chars = {0: ['sample group: Living donor', 'sample group: infection-associated GN', 'sample group: FSGS', 
                           'sample group: LN-WHO III', 'sample group: LN-WHO IV', 'sample group: DN', 
                           'sample group: amyloidosis', 'sample group: Membrano-Proliferative GN', 'sample group: MN', 
                           'sample group: AKI', 'sample group: LN-WHO V', 'sample group: FGGS', "sample group: 2'FSGS", 
                           'sample group: Thin-BMD', 'sample group: Immuncomplex GN', 'sample group: LN-WHO-V', 
                           'sample group: IgAN', 'sample group: LN-WHO IV+V', 'sample group: LN-WHO III+V', 
                           'sample group: LN-WHO-I/II', 
                           'sample group: chronic Glomerulonephritis (GN) with infiltration by CLL', 
                           'sample group: CKD with mod-severe Interstitial fibrosis', 'sample group: Fibrillary GN', 
                           'sample group: Interstitial nephritis', 'sample group: Hypertensive Nephrosclerosis', 
                           'sample group: Unaffected parts of Tumor Nephrectomy'], 
                       1: ['tissue: Glomeruli from kidney biopsy']}
        
        # We need to transform this dictionary into a DataFrame
        # Each key is a row, and each element in the list is a value for a different sample
        # This is an approximation as we don't have the actual clinical_data variable
        
        # For now, let's assume clinical_data is available from previous steps
        # If it's not available in your actual execution context, this will fail
        
        # Extract clinical features
        clinical_features = geo_select_clinical_features(
            clinical_df=clinical_data,
            trait=trait,
            trait_row=trait_row,
            convert_trait=convert_trait,
            age_row=age_row,
            convert_age=convert_age,
            gender_row=gender_row,
            convert_gender=convert_gender
        )
        
        # Preview the extracted features
        preview = preview_df(clinical_features)
        print("Preview of clinical features:")
        print(preview)
        
        # Save the clinical features to a CSV file
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        clinical_features.to_csv(out_clinical_data_file, index=False)
        print(f"Clinical features saved to {out_clinical_data_file}")
        
    except NameError:
        print("Warning: clinical_data variable not found. Cannot extract clinical features.")
        print("Please ensure clinical_data has been properly defined in previous steps.")


Preview of clinical features:
{'GSM5607752': [0.0], 'GSM5607753': [0.0], 'GSM5607754': [0.0], 'GSM5607755': [0.0], 'GSM5607756': [0.0], 'GSM5607757': [0.0], 'GSM5607758': [0.0], 'GSM5607759': [0.0], 'GSM5607760': [0.0], 'GSM5607761': [nan], 'GSM5607762': [nan], 'GSM5607763': [nan], 'GSM5607764': [nan], 'GSM5607765': [nan], 'GSM5607766': [nan], 'GSM5607767': [1.0], 'GSM5607768': [nan], 'GSM5607769': [nan], 'GSM5607770': [nan], 'GSM5607771': [1.0], 'GSM5607772': [nan], 'GSM5607773': [nan], 'GSM5607774': [nan], 'GSM5607775': [nan], 'GSM5607776': [nan], 'GSM5607777': [nan], 'GSM5607778': [nan], 'GSM5607779': [nan], 'GSM5607780': [nan], 'GSM5607781': [nan], 'GSM5607782': [nan], 'GSM5607783': [nan], 'GSM5607784': [nan], 'GSM5607785': [nan], 'GSM5607786': [nan], 'GSM5607787': [nan], 'GSM5607788': [nan], 'GSM5607789': [1.0], 'GSM5607790': [nan], 'GSM5607791': [nan], 'GSM5607792': [nan], 'GSM5607793': [nan], 'GSM5607794': [nan], 'GSM5607795': [nan], 'GSM5607796': [nan], 'GSM5607797': [1.0], 'GS

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs (gene or probe identifiers) for future observation.
print(gene_data.index[:20])


Index(['100009613_at', '100009676_at', '10000_at', '10001_at', '10002_at',
       '100033413_at', '100033422_at', '100033423_at', '100033424_at',
       '100033425_at', '100033426_at', '100033427_at', '100033428_at',
       '100033430_at', '100033431_at', '100033432_at', '100033434_at',
       '100033435_at', '100033436_at', '100033437_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Observing the gene identifiers in the gene expression data
# The identifiers shown (like '100009613_at', '100009676_at', etc.) follow a pattern commonly seen in microarray platforms
# specifically, these appear to be Affymetrix IDs with the "_at" suffix which is characteristic of their probe set identifiers
# These are not standard human gene symbols (like BRCA1, TP53, etc.) but rather platform-specific identifiers

# These identifiers will need to be mapped to standard gene symbols for better interpretability and cross-platform analysis
# Therefore, gene mapping is required for this dataset

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1_at', '10_at', '100_at', '1000_at', '10000_at'], 'ENTREZ_GENE_ID': ['1', '10', '100', '1000', '10000']}


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Let's get a better understanding of the available annotation data
print("Gene annotation columns:", gene_annotation.columns.tolist())
print("First few rows of gene annotation:")
print(gene_annotation.head())

# 2. Check the first few rows of our expression data to verify format
print("First few rows of expression data index:")
print(gene_data.index[:10])

# 3. Look for platform information in the SOFT file that might help with mapping
# Read the first 100 lines of the SOFT file to look for platform information
with gzip.open(soft_file, 'rt') as f:
    head_content = ''.join([next(f) for _ in range(100) if f])
print("SOFT file header (first 100 lines):")
print(head_content)

# 4. Create a mapping between probe IDs and Entrez Gene IDs
# First, clean up the annotation data - make sure IDs are strings and handle any format differences
mapping_data = gene_annotation[['ID', 'ENTREZ_GENE_ID']].copy()
mapping_data = mapping_data.dropna()
mapping_data['ID'] = mapping_data['ID'].astype(str)
mapping_data['Gene'] = mapping_data['ENTREZ_GENE_ID'].astype(str)
mapping_data = mapping_data[['ID', 'Gene']]

# 5. Check if our expression data IDs match annotation IDs - they might have format differences
print("Sample expression data IDs:", gene_data.index[:5].tolist())
print("Sample annotation IDs:", mapping_data['ID'][:5].tolist())

# 6. Create a mapping that might work better with our expression data format
# Remove '_at' suffix from expression data IDs to see if they match annotation better
modified_mapping = mapping_data.copy()
if '_at' in gene_data.index[0]:
    print("Modifying expression data IDs to match annotation...")
    # Create a dictionary mapping from expression ID format to annotation ID format
    id_mapping = {id_exp: id_exp.split('_')[0] for id_exp in gene_data.index}
    # Save original index
    original_index = gene_data.index
    # Reset index and rename
    gene_data = gene_data.reset_index()
    gene_data['ID_mapped'] = gene_data['ID'].map(lambda x: id_mapping.get(x, x))
    gene_data = gene_data.set_index('ID_mapped')
    gene_data = gene_data.drop(columns='ID')
    # Check if modification helped
    print("Modified expression data IDs:", gene_data.index[:5].tolist())

# 7. Now apply the gene mapping with our modified data
gene_data_mapped = apply_gene_mapping(gene_data, mapping_data)

# 8. Verify the results
print("Gene expression data preview after mapping (first 5 genes):")
print(gene_data_mapped.index[:5])
print("Gene expression data shape after mapping:", gene_data_mapped.shape)

# 9. If we still have no mappings, try a direct approach using the numeric part of probe IDs
if gene_data_mapped.shape[0] == 0:
    print("Direct mapping failed. Attempting alternative approach...")
    # Reset to original data
    gene_data = get_genetic_data(matrix_file)
    # Create a simple mapping based on probe ID format
    simple_mapping = pd.DataFrame()
    simple_mapping['ID'] = gene_data.index
    simple_mapping['Gene'] = simple_mapping['ID'].apply(lambda x: x.split('_')[0])
    # Apply this mapping
    gene_data = apply_gene_mapping(gene_data, simple_mapping)
    print("Alternative mapping result:", gene_data.shape)
    print("First few genes:", gene_data.index[:5])

# Ensure we have valid gene expression data for downstream analysis
if gene_data.shape[0] == 0:
    print("WARNING: Could not map any probes to genes. Using probe IDs as gene identifiers.")
    # Use the original expression data with probe IDs as our "genes"
    gene_data = get_genetic_data(matrix_file)
    # For compatibility with downstream code, ensure index is named 'Gene'
    gene_data.index.name = 'Gene'


Gene annotation columns: ['ID', 'ENTREZ_GENE_ID']
First few rows of gene annotation:
         ID ENTREZ_GENE_ID
0      1_at              1
1     10_at             10
2    100_at            100
3   1000_at           1000
4  10000_at          10000
First few rows of expression data index:
Index(['100009613_at', '100009676_at', '10000_at', '10001_at', '10002_at',
       '100033413_at', '100033422_at', '100033423_at', '100033424_at',
       '100033425_at'],
      dtype='object', name='ID')
SOFT file header (first 100 lines):
^DATABASE = GeoMiame
!Database_name = Gene Expression Omnibus (GEO)
!Database_institute = NCBI NLM NIH
!Database_web_link = http://www.ncbi.nlm.nih.gov/geo
!Database_email = geo@ncbi.nlm.nih.gov
^SERIES = GSE180393
!Series_title = Glomerular Transcriptome in the Cprobe Cohort
!Series_geo_accession = GSE180393
!Series_status = Public on Mar 09 2023
!Series_submission_date = Jul 19 2021
!Series_last_update_date = Mar 10 2023
!Series_pubmed_id = 36331122
!Series_summary =

Sample expression data IDs: ['100009613_at', '100009676_at', '10000_at', '10001_at', '10002_at']
Sample annotation IDs: ['1_at', '10_at', '100_at', '1000_at', '10000_at']


Modifying expression data IDs to match annotation...
Modified expression data IDs: ['100009613', '100009676', '10000', '10001', '10002']
Gene expression data preview after mapping (first 5 genes):
Index([], dtype='object', name='Gene')
Gene expression data shape after mapping: (0, 62)
Direct mapping failed. Attempting alternative approach...


Alternative mapping result: (0, 62)
First few genes: Index([], dtype='object', name='Gene')


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Normalized gene data shape: {normalized_gene_data.shape}")
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Gene data saved to {out_gene_data_file}")

# Load the clinical data that was extracted in step 2
try:
    print("Loading clinical features from file")
    clinical_features = pd.read_csv(out_clinical_data_file)
    print(f"Successfully loaded clinical data with shape {clinical_features.shape}")
    
    # If clinical_features doesn't have a proper index, we need to set one
    if 'Unnamed: 0' in clinical_features.columns:
        clinical_features = clinical_features.set_index('Unnamed: 0')
except Exception as e:
    print(f"Error loading clinical data from file: {e}")
    
    # Use the clinical data from step 2 (which should still be in memory)
    try:
        # Get the file from step 2 again
        background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
        clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
        _, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
        
        # Re-extract clinical features
        clinical_features = geo_select_clinical_features(
            clinical_df=clinical_data,
            trait=trait,
            trait_row=0,  # From step 2
            convert_trait=convert_trait,  # From step 2
            age_row=None,
            convert_age=None,
            gender_row=None,
            convert_gender=None
        )
        print(f"Re-extracted clinical features with shape {clinical_features.shape}")
    except Exception as e:
        print(f"Error recreating clinical features: {e}")
        print("Creating minimal clinical features DataFrame")
        clinical_features = pd.DataFrame()

# 2. Link the clinical and genetic data
print(f"Clinical features data: {clinical_features}")
print(f"Clinical features shape: {clinical_features.shape}")
print(f"Clinical features columns: {clinical_features.columns.tolist() if hasattr(clinical_features, 'columns') else 'No columns'}")

# Ensure clinical_features is properly set up for linking
if not clinical_features.empty:
    linked_data = geo_link_clinical_genetic_data(clinical_features, normalized_gene_data)
    print(f"Linked data shape: {linked_data.shape}")
    print(f"Linked data columns: {linked_data.columns.tolist()}")
else:
    print("No clinical features available, cannot link data")
    linked_data = normalized_gene_data.T  # Just use gene data transposed
    # Add empty trait column for compatibility
    linked_data[trait] = np.nan
    print(f"Created linked data with just gene expression, shape: {linked_data.shape}")

# Check if trait column exists
if trait not in linked_data.columns:
    print(f"Warning: Trait column '{trait}' not found in linked data")
    # Try to find a column that might contain trait data
    if 0 in linked_data.columns:
        print(f"Found numeric column 0, renaming to {trait}")
        linked_data = linked_data.rename(columns={0: trait})
    else:
        print(f"Creating empty {trait} column")
        linked_data[trait] = np.nan

# 3. Handle missing values
try:
    linked_data = handle_missing_values(linked_data, trait)
    print(f"After handling missing values, linked data shape: {linked_data.shape}")
except Exception as e:
    print(f"Error in handling missing values: {e}")
    print("Proceeding with unmodified linked data")

# 4. Determine whether the trait and demographic features are biased
try:
    is_trait_biased, unbiased_linked_data = judge_and_remove_biased_features(linked_data, trait)
    print(f"Trait bias assessment: {is_trait_biased}")
    print(f"After removing biased features, data shape: {unbiased_linked_data.shape}")
except Exception as e:
    print(f"Error in bias assessment: {e}")
    is_trait_biased = True  # Assume biased if we can't assess
    unbiased_linked_data = linked_data

# 5. Conduct quality check and save cohort information
note = "Dataset had gene mapping issues - used probe IDs as gene identifiers. Limited clinical data available."
is_usable = validate_and_save_cohort_info(
    is_final=True, 
    cohort=cohort, 
    info_path=json_path, 
    is_gene_available=True, 
    is_trait_available=trait in linked_data.columns and not linked_data[trait].isna().all(),
    is_biased=is_trait_biased, 
    df=unbiased_linked_data,
    note=note
)

# 6. If the linked data is usable, save it
if is_usable:
    unbiased_linked_data.to_csv(out_data_file)
    print(f"Data saved to {out_data_file}")
else:
    print("Data not saved as it failed quality validation.")

Normalized gene data shape: (0, 62)
Gene data saved to ../../output/preprocess/Type_2_Diabetes/gene_data/GSE180393.csv
Loading clinical features from file
Successfully loaded clinical data with shape (1, 62)
Clinical features data:    GSM5607752  GSM5607753  GSM5607754  GSM5607755  GSM5607756  GSM5607757  \
0         0.0         0.0         0.0         0.0         0.0         0.0   

   GSM5607758  GSM5607759  GSM5607760  GSM5607761  ...  GSM5607804  \
0         0.0         0.0         0.0         NaN  ...         NaN   

   GSM5607805  GSM5607806  GSM5607807  GSM5607808  GSM5607809  GSM5607810  \
0         NaN         NaN         NaN         NaN         NaN         NaN   

   GSM5607811  GSM5607812  GSM5607813  
0         NaN         NaN         NaN  

[1 rows x 62 columns]
Clinical features shape: (1, 62)
Clinical features columns: ['GSM5607752', 'GSM5607753', 'GSM5607754', 'GSM5607755', 'GSM5607756', 'GSM5607757', 'GSM5607758', 'GSM5607759', 'GSM5607760', 'GSM5607761', 'GSM5607762',

  linked_data = pd.concat([clinical_df, genetic_df], axis=0).T
