In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context
trait = "Substance_Use_Disorder"
cohort = "GSE138297"

# Input paths
in_trait_dir = "../../input/GEO/Substance_Use_Disorder"
in_cohort_dir = "../../input/GEO/Substance_Use_Disorder/GSE138297"

# Output paths
out_data_file = "../../output/preprocess/Substance_Use_Disorder/GSE138297.csv"
out_gene_data_file = "../../output/preprocess/Substance_Use_Disorder/gene_data/GSE138297.csv"
out_clinical_data_file = "../../output/preprocess/Substance_Use_Disorder/clinical_data/GSE138297.csv"
json_path = "../../output/preprocess/Substance_Use_Disorder/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Identify the paths to the SOFT file and the matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"The host response of IBS patients to allogenic and autologous faecal microbiota transfer"
!Series_summary	"In this randomised placebo-controlled trial, irritable bowel syndrome (IBS) patients were treated with faecal material from a healthy donor (n=8, allogenic FMT) or with their own faecal microbiota (n=8, autologous FMT). The faecal transplant was administered by whole colonoscopy into the caecum (30 g of stool in 150 ml sterile saline). Two weeks before the FMT (baseline) as well as two and eight weeks after the FMT, the participants underwent a sigmoidoscopy, and biopsies were collected at a standardised location (20-25 cm from the anal verge at the crossing with the arteria iliaca communis) from an uncleansed sigmoid. In patients treated with allogenic FMT, predominantly immune response-related genes sets were induced, with the strongest response two weeks after FMT. In patients treated with autologous FMT, predominantly metabolism-related g

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Determine gene expression data availability
# From the background info, we see this is microarray data of sigmoid biopsies.
# Microarray typically contains gene expression data, so it's likely available.
is_gene_available = True

# 2. Identify variable availability and create conversion functions

# 2.1 For trait (Substance Use Disorder)
# This dataset is about Irritable Bowel Syndrome (IBS) patients who received FMT.
# The experimental condition (row 6) indicates whether they received allogenic or autologous FMT.
# This is not directly about substance use disorder, but we can use the FMT type as our trait.
trait_row = 6

def convert_trait(value):
    if value is None:
        return None
    if ':' in str(value):
        value = value.split(':', 1)[1].strip()
    if 'Allogenic' in value:
        return 1  # Allogenic FMT
    elif 'Autologous' in value:
        return 0  # Autologous FMT
    return None

# 2.2 For age
# Age is available in row 3
age_row = 3

def convert_age(value):
    if value is None:
        return None
    if ':' in str(value):
        value = value.split(':', 1)[1].strip()
    try:
        return float(value)
    except (ValueError, TypeError):
        return None

# 2.3 For gender
# Gender is available in row 1 as "sex (female=1, male=0)"
# Note: This is opposite of our convention (we want male=1, female=0)
gender_row = 1

def convert_gender(value):
    if value is None:
        return None
    if ':' in str(value):
        value = value.split(':', 1)[1].strip()
    try:
        # The dataset uses 1 for female, 0 for male
        # But we need to convert to our convention: 0 for female, 1 for male
        gender_value = int(value)
        return 1 - gender_value  # Invert the value
    except (ValueError, TypeError):
        return None

# 3. Save metadata
# Determine trait data availability
is_trait_available = trait_row is not None
validate_and_save_cohort_info(is_final=False, cohort=cohort, info_path=json_path, 
                             is_gene_available=is_gene_available, 
                             is_trait_available=is_trait_available)

# 4. Clinical Feature Extraction (only if trait_row is not None)
if trait_row is not None:
    # Use the library function to extract clinical features
    clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,  # This is assumed to be defined in a previous step
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )
    
    # Preview the dataframe
    preview = preview_df(clinical_df)
    print("Clinical Data Preview:")
    print(preview)
    
    # Save clinical data to CSV
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    clinical_df.to_csv(out_clinical_data_file, index=False)
    print(f"Clinical data saved to {out_clinical_data_file}")


Clinical Data Preview:
{'GSM4104672': [1.0, 49.0, 0.0], 'GSM4104673': [1.0, 49.0, 0.0], 'GSM4104674': [1.0, 49.0, 0.0], 'GSM4104675': [1.0, 21.0, 1.0], 'GSM4104676': [1.0, 21.0, 1.0], 'GSM4104677': [1.0, 21.0, 1.0], 'GSM4104678': [0.0, 31.0, 1.0], 'GSM4104679': [0.0, 31.0, 1.0], 'GSM4104680': [0.0, 31.0, 1.0], 'GSM4104681': [0.0, 59.0, 1.0], 'GSM4104682': [0.0, 59.0, 1.0], 'GSM4104683': [0.0, 59.0, 1.0], 'GSM4104684': [1.0, 40.0, 1.0], 'GSM4104685': [1.0, 40.0, 1.0], 'GSM4104686': [1.0, 40.0, 1.0], 'GSM4104687': [0.0, 33.0, 0.0], 'GSM4104688': [0.0, 33.0, 0.0], 'GSM4104689': [0.0, 33.0, 0.0], 'GSM4104690': [1.0, 28.0, 1.0], 'GSM4104691': [1.0, 28.0, 1.0], 'GSM4104692': [1.0, 28.0, 1.0], 'GSM4104693': [0.0, 40.0, 0.0], 'GSM4104694': [0.0, 40.0, 0.0], 'GSM4104695': [0.0, 40.0, 0.0], 'GSM4104696': [1.0, 36.0, 0.0], 'GSM4104697': [1.0, 36.0, 0.0], 'GSM4104698': [1.0, 36.0, 0.0], 'GSM4104699': [1.0, 50.0, 1.0], 'GSM4104700': [1.0, 50.0, 1.0], 'GSM4104701': [1.0, 50.0, 1.0], 'GSM4104702': [0

### Step 3: Gene Data Extraction

In [4]:
# 1. Get the file paths for the SOFT file and matrix file
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. First, let's examine the structure of the matrix file to understand its format
import gzip

# Peek at the first few lines of the file to understand its structure
with gzip.open(matrix_file, 'rt') as file:
    # Read first 100 lines to find the header structure
    for i, line in enumerate(file):
        if '!series_matrix_table_begin' in line:
            print(f"Found data marker at line {i}")
            # Read the next line which should be the header
            header_line = next(file)
            print(f"Header line: {header_line.strip()}")
            # And the first data line
            first_data_line = next(file)
            print(f"First data line: {first_data_line.strip()}")
            break
        if i > 100:  # Limit search to first 100 lines
            print("Matrix table marker not found in first 100 lines")
            break

# 3. Now try to get the genetic data with better error handling
try:
    gene_data = get_genetic_data(matrix_file)
    print(gene_data.index[:20])
except KeyError as e:
    print(f"KeyError: {e}")
    
    # Alternative approach: manually extract the data
    print("\nTrying alternative approach to read the gene data:")
    with gzip.open(matrix_file, 'rt') as file:
        # Find the start of the data
        for line in file:
            if '!series_matrix_table_begin' in line:
                break
                
        # Read the headers and data
        import pandas as pd
        df = pd.read_csv(file, sep='\t', index_col=0)
        print(f"Column names: {df.columns[:5]}")
        print(f"First 20 row IDs: {df.index[:20]}")
        gene_data = df


Found data marker at line 67
Header line: "ID_REF"	"GSM4104672"	"GSM4104673"	"GSM4104674"	"GSM4104675"	"GSM4104676"	"GSM4104677"	"GSM4104678"	"GSM4104679"	"GSM4104680"	"GSM4104681"	"GSM4104682"	"GSM4104683"	"GSM4104684"	"GSM4104685"	"GSM4104686"	"GSM4104687"	"GSM4104688"	"GSM4104689"	"GSM4104690"	"GSM4104691"	"GSM4104692"	"GSM4104693"	"GSM4104694"	"GSM4104695"	"GSM4104696"	"GSM4104697"	"GSM4104698"	"GSM4104699"	"GSM4104700"	"GSM4104701"	"GSM4104702"	"GSM4104703"	"GSM4104704"	"GSM4104705"	"GSM4104706"	"GSM4104707"	"GSM4104708"	"GSM4104709"	"GSM4104710"	"GSM4104711"	"GSM4104712"	"GSM4104713"	"GSM4104714"	"GSM4104715"	"GSM4104716"
First data line: 16650001	1.605655144	1.843828454	1.899724264	1.617480923	1.920638396	3.63385594	1.93408369	1.573835005	1.409345167	2.211837425	1.692594326	1.483653161	1.036818679	1.672944967	3.680559475	2.979731225	3.205596204	3.460458409	1.751848193	0.744824546	3.075519289	2.189577606	1.730044194	2.292415021	2.369373599	2.584867499	3.099427478	1.189063212	1.32

Index(['16650001', '16650003', '16650005', '16650007', '16650009', '16650011',
       '16650013', '16650015', '16650017', '16650019', '16650021', '16650023',
       '16650025', '16650027', '16650029', '16650031', '16650033', '16650035',
       '16650037', '16650041'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Upon examination of the gene identifiers (like 16650001, 16650003, etc.), 
# these appear to be Illumina probe IDs rather than standard human gene symbols.
# Illumina IDs typically need to be mapped to gene symbols for biological interpretation.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Let's first examine the structure of the SOFT file before trying to parse it
import gzip

# Look at the first few lines of the SOFT file to understand its structure
print("Examining SOFT file structure:")
try:
    with gzip.open(soft_file, 'rt') as file:
        # Read first 20 lines to understand the file structure
        for i, line in enumerate(file):
            if i < 20:
                print(f"Line {i}: {line.strip()}")
            else:
                break
except Exception as e:
    print(f"Error reading SOFT file: {e}")

# 2. Now let's try a more robust approach to extract the gene annotation
# Instead of using the library function which failed, we'll implement a custom approach
try:
    # First, look for the platform section which contains gene annotation
    platform_data = []
    with gzip.open(soft_file, 'rt') as file:
        in_platform_section = False
        for line in file:
            if line.startswith('^PLATFORM'):
                in_platform_section = True
                continue
            if in_platform_section and line.startswith('!platform_table_begin'):
                # Next line should be the header
                header = next(file).strip()
                platform_data.append(header)
                # Read until the end of the platform table
                for table_line in file:
                    if table_line.startswith('!platform_table_end'):
                        break
                    platform_data.append(table_line.strip())
                break
    
    # If we found platform data, convert it to a DataFrame
    if platform_data:
        import pandas as pd
        import io
        platform_text = '\n'.join(platform_data)
        gene_annotation = pd.read_csv(io.StringIO(platform_text), delimiter='\t', 
                                      low_memory=False, on_bad_lines='skip')
        print("\nGene annotation preview:")
        print(preview_df(gene_annotation))
    else:
        print("Could not find platform table in SOFT file")
        
        # Try an alternative approach - extract mapping from other sections
        with gzip.open(soft_file, 'rt') as file:
            for line in file:
                if 'ANNOTATION information' in line or 'annotation information' in line:
                    print(f"Found annotation information: {line.strip()}")
                if line.startswith('!Platform_title') or line.startswith('!platform_title'):
                    print(f"Platform title: {line.strip()}")
            
except Exception as e:
    print(f"Error processing gene annotation: {e}")


Examining SOFT file structure:
Line 0: ^DATABASE = GeoMiame
Line 1: !Database_name = Gene Expression Omnibus (GEO)
Line 2: !Database_institute = NCBI NLM NIH
Line 3: !Database_web_link = http://www.ncbi.nlm.nih.gov/geo
Line 4: !Database_email = geo@ncbi.nlm.nih.gov
Line 5: ^SERIES = GSE138297
Line 6: !Series_title = The host response of IBS patients to allogenic and autologous faecal microbiota transfer
Line 7: !Series_geo_accession = GSE138297
Line 8: !Series_status = Public on Oct 14 2019
Line 9: !Series_submission_date = Oct 02 2019
Line 10: !Series_last_update_date = Oct 16 2019
Line 11: !Series_pubmed_id = 31597320
Line 12: !Series_summary = In this randomised placebo-controlled trial, irritable bowel syndrome (IBS) patients were treated with faecal material from a healthy donor (n=8, allogenic FMT) or with their own faecal microbiota (n=8, autologous FMT). The faecal transplant was administered by whole colonoscopy into the caecum (30 g of stool in 150 ml sterile saline). Two wee


Gene annotation preview:
{'ID': [16657436, 16657440, 16657445, 16657447, 16657450], 'probeset_id': [16657436, 16657440, 16657445, 16657447, 16657450], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'strand': ['+', '+', '+', '+', '+'], 'start': ['12190', '29554', '69091', '160446', '317811'], 'stop': ['13639', '31109', '70008', '161525', '328581'], 'total_probes': [25, 28, 8, 13, 36], 'gene_assignment': ['NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1 // 1p36.33 // 100287102 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 9 // 15q26.3 // 100288486 /// NR_051985 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 9 // 15q26.3 // 100288486 /// NR_045117 // DDX11L10 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 10 // 16p13.3 // 100287029 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 2 // 2q13 // 84771 /// NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 l

### Step 6: Gene Identifier Mapping

In [7]:
# Looking at the preview data, we see:
# - Gene expression data has probes like '16650001' as index
# - Gene annotation has 'ID' column that looks like the same probe IDs 
# - 'gene_assignment' column contains gene symbol information

# 1. Define the column names that map to probe IDs and gene symbols
prob_col = 'ID'  # The column containing probe IDs
gene_col = 'gene_assignment'  # The column containing gene symbol information

# 2. Get the gene mapping dataframe
mapping_df = get_gene_mapping(gene_annotation, prob_col, gene_col)

# Preview the mapping dataframe
print("Gene mapping preview:")
print(mapping_df.head())
print(f"Number of mappings: {len(mapping_df)}")

# 3. Convert probe-level measurements to gene expression data
gene_data = apply_gene_mapping(gene_data, mapping_df)
print("\nGene expression data preview:")
print(f"Shape: {gene_data.shape}")
print(f"First few genes: {list(gene_data.index[:5])}")

# Save the gene expression data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Gene mapping preview:
         ID                                               Gene
0  16657436  NR_046018 // DDX11L1 // DEAD/H (Asp-Glu-Ala-As...
1  16657440  ENST00000473358 // MIR1302-11 // microRNA 1302...
2  16657445  NM_001005484 // OR4F5 // olfactory receptor, f...
3  16657447                                                ---
4  16657450  AK302511 // LOC100132062 // uncharacterized LO...
Number of mappings: 53617



Gene expression data preview:
Shape: (81076, 45)
First few genes: ['A-', 'A-2', 'A-52', 'A-E', 'A-I']


Gene expression data saved to ../../output/preprocess/Substance_Use_Disorder/gene_data/GSE138297.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the obtained gene expression data
print(f"Gene data shape before normalization: {gene_data.shape}")

# Normalize gene symbols using NCBI Gene database
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Gene data shape after normalization: {normalized_gene_data.shape}")

# Save the normalized gene data
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
normalized_gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")

# 2. Check if trait data is available before proceeding with clinical data extraction
if trait_row is None:
    print("Trait row is None. Cannot extract trait information from clinical data.")
    # Create an empty dataframe for clinical features
    clinical_features = pd.DataFrame()
    
    # Create an empty dataframe for linked data
    linked_data = pd.DataFrame()
    
    # Validate and save cohort info
    validate_and_save_cohort_info(
        is_final=True, 
        cohort=cohort, 
        info_path=json_path, 
        is_gene_available=True, 
        is_trait_available=False,  # Trait data is not available
        is_biased=True,  # Not applicable but required
        df=pd.DataFrame(),  # Empty dataframe
        note=f"Dataset contains gene expression data but lacks clear trait indicators for {trait} status."
    )
    print("Data was determined to be unusable due to missing trait indicators and was not saved")
else:
    try:
        # Get the file paths for the matrix file to extract clinical data
        _, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
        
        # Get raw clinical data from the matrix file
        _, clinical_raw = get_background_and_clinical_data(matrix_file)
        
        # Verify clinical data structure
        print("Raw clinical data shape:", clinical_raw.shape)
        
        # Extract clinical features using the defined conversion functions
        clinical_features = geo_select_clinical_features(
            clinical_df=clinical_raw,
            trait=trait,
            trait_row=trait_row,
            convert_trait=convert_trait,
            age_row=age_row,
            convert_age=convert_age,
            gender_row=gender_row,
            convert_gender=convert_gender
        )
        
        print("Clinical features:")
        print(clinical_features)
        
        # Save clinical features to file
        os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
        clinical_features.to_csv(out_clinical_data_file)
        print(f"Clinical features saved to {out_clinical_data_file}")
        
        # 3. Link clinical and genetic data
        linked_data = geo_link_clinical_genetic_data(clinical_features, normalized_gene_data)
        print(f"Linked data shape: {linked_data.shape}")
        print("Linked data preview (first 5 rows, first 5 columns):")
        print(linked_data.iloc[:5, :5])
        
        # 4. Handle missing values
        print("Missing values before handling:")
        print(f"  Trait ({trait}) missing: {linked_data[trait].isna().sum()} out of {len(linked_data)}")
        if 'Age' in linked_data.columns:
            print(f"  Age missing: {linked_data['Age'].isna().sum()} out of {len(linked_data)}")
        if 'Gender' in linked_data.columns:
            print(f"  Gender missing: {linked_data['Gender'].isna().sum()} out of {len(linked_data)}")
        
        gene_cols = [col for col in linked_data.columns if col not in [trait, 'Age', 'Gender']]
        print(f"  Genes with >20% missing: {sum(linked_data[gene_cols].isna().mean() > 0.2)}")
        print(f"  Samples with >5% missing genes: {sum(linked_data[gene_cols].isna().mean(axis=1) > 0.05)}")
        
        cleaned_data = handle_missing_values(linked_data, trait)
        print(f"Data shape after handling missing values: {cleaned_data.shape}")
        
        # 5. Evaluate bias in trait and demographic features
        is_trait_biased = False
        if len(cleaned_data) > 0:
            trait_biased, cleaned_data = judge_and_remove_biased_features(cleaned_data, trait)
            is_trait_biased = trait_biased
        else:
            print("No data remains after handling missing values.")
            is_trait_biased = True
        
        # 6. Final validation and save
        is_usable = validate_and_save_cohort_info(
            is_final=True, 
            cohort=cohort, 
            info_path=json_path, 
            is_gene_available=True, 
            is_trait_available=True, 
            is_biased=is_trait_biased, 
            df=cleaned_data,
            note=f"Dataset contains gene expression data for {trait} analysis."
        )
        
        # 7. Save if usable
        if is_usable and len(cleaned_data) > 0:
            os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
            cleaned_data.to_csv(out_data_file)
            print(f"Linked data saved to {out_data_file}")
        else:
            print("Data was determined to be unusable or empty and was not saved")
            
    except Exception as e:
        print(f"Error processing data: {e}")
        # Handle the error case by still recording cohort info
        validate_and_save_cohort_info(
            is_final=True, 
            cohort=cohort, 
            info_path=json_path, 
            is_gene_available=True, 
            is_trait_available=False,  # Mark as not available due to processing issues
            is_biased=True, 
            df=pd.DataFrame(),  # Empty dataframe
            note=f"Error processing data for {trait}: {str(e)}"
        )
        print("Data was determined to be unusable and was not saved")

Gene data shape before normalization: (81076, 45)
Gene data shape after normalization: (23274, 45)


Normalized gene data saved to ../../output/preprocess/Substance_Use_Disorder/gene_data/GSE138297.csv
Raw clinical data shape: (7, 46)
Clinical features:
                        GSM4104672  GSM4104673  GSM4104674  GSM4104675  \
Substance_Use_Disorder         1.0         1.0         1.0         1.0   
Age                           49.0        49.0        49.0        21.0   
Gender                         0.0         0.0         0.0         1.0   

                        GSM4104676  GSM4104677  GSM4104678  GSM4104679  \
Substance_Use_Disorder         1.0         1.0         0.0         0.0   
Age                           21.0        21.0        31.0        31.0   
Gender                         1.0         1.0         1.0         1.0   

                        GSM4104680  GSM4104681  ...  GSM4104707  GSM4104708  \
Substance_Use_Disorder         0.0         0.0  ...         1.0         0.0   
Age                           31.0        59.0  ...        23.0        50.0   
Gender          

Data shape after handling missing values: (45, 23277)
For the feature 'Substance_Use_Disorder', the least common label is '0.0' with 21 occurrences. This represents 46.67% of the dataset.
The distribution of the feature 'Substance_Use_Disorder' in this dataset is fine.

Quartiles for 'Age':
  25%: 28.0
  50% (Median): 36.0
  75%: 49.0
Min: 21.0
Max: 59.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 21 occurrences. This represents 46.67% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



Linked data saved to ../../output/preprocess/Substance_Use_Disorder/GSE138297.csv
