In [1]:
import sys
import os
# Put the directory two levels above the current working directory on sys.path
# (presumably the project root that contains the `tools` package — confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Project preprocessing helpers. The star import is what brings the helper
# functions called in later cells into the notebook namespace.
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed
trait = "Psoriasis"
cohort = "GSE254707"

# Input paths (relative to the notebook's working directory)
in_trait_dir = "../../input/GEO/Psoriasis"
in_cohort_dir = "../../input/GEO/Psoriasis/GSE254707"

# Output paths: linked data, gene expression data, clinical data, and the
# JSON file that stores cohort metadata (written via validate_and_save_cohort_info)
out_data_file = "../../output/preprocess/Psoriasis/GSE254707.csv"
out_gene_data_file = "../../output/preprocess/Psoriasis/gene_data/GSE254707.csv"
out_clinical_data_file = "../../output/preprocess/Psoriasis/clinical_data/GSE254707.csv"
json_path = "../../output/preprocess/Psoriasis/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Locate the SOFT file and the series-matrix file for this cohort
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Pull the background lines and the per-sample clinical lines out of the matrix file,
#    selecting them by their line prefixes
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Summarise the unique values found in each row of the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Show both pieces of information so the row keys can be chosen in the next step
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Single-cell transcriptomic analysis identifies infiltrating plasmacytoid dendritic cells in psoriasis epidermis"
!Series_summary	"The study focuses on the cellular composition of the psoriasis epidermis, using single-cell transcriptomics to identify cell subsets and their interactions in both healthy and psoriatic skin. The research uncovers three keratinocyte populations and seven immune cell subsets exclusive to psoriatic lesions. A significant finding is the identification of a previously undetected population of plasmacytoid dendritic cells (pDCs) in the psoriatic epidermis, suggesting their role in the disease's pathogenesis. The study also highlights enhanced keratinocyte-immune cell interactions in psoriatic lesions, contributing to our understanding of psoriasis at the cellular level."
!Series_overall_design	"Epidermal sheets from biopsies obtained from lesional and nonlesional skin of 5 untreated psoriasis patients and 3 healthy donors we

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
import os
import pandas as pd
import numpy as np
import json
from typing import Dict, Any, Optional, Callable

# 1. Gene expression data availability
# The series information describes RNA-seq data, so expression data is assumed present.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion

# 2.1 Row keys (into the sample characteristics dictionary) for each variable
trait_row = 5  # 'diagnosis: Psoriasis', 'diagnosis: Healthy'
age_row = gender_row = None  # Neither age nor gender information is available in the provided data

# 2.2 Define conversion functions for each variable

def convert_trait(value: str) -> Optional[int]:
    """Map a sample-characteristics cell to a binary psoriasis label.

    Args:
        value: Raw cell text such as ``'diagnosis: Psoriasis'``. Anything before
            the first colon is treated as the field name and discarded; matching
            is case-insensitive and ignores surrounding whitespace.

    Returns:
        1 for ``psoriasis``, 0 for ``healthy``, and None for non-string input
        or any other label.
    """
    if not isinstance(value, str):
        return None

    # split(":", 1)[-1] yields the text after the first colon, or the whole
    # string when there is no colon at all.
    label = value.split(":", 1)[-1].strip().lower()
    return {"psoriasis": 1, "healthy": 0}.get(label)

def convert_age(value: str) -> Optional[float]:
    """Placeholder age converter that always returns None.

    This notebook sets ``age_row`` to None because age is not available for the
    cohort, so there is nothing to parse. The function exists only so that a
    ``convert_age`` callable can be passed to ``geo_select_clinical_features``.

    Args:
        value: Ignored.

    Returns:
        Always None.
    """
    return None

def convert_gender(value: str) -> Optional[int]:
    """Placeholder gender converter that always returns None.

    This notebook sets ``gender_row`` to None because gender is not available
    for the cohort, so there is nothing to parse. The function exists only so
    that a ``convert_gender`` callable can be passed to
    ``geo_select_clinical_features``. The intended encoding, were data present,
    is 0 for female and 1 for male.

    Args:
        value: Ignored.

    Returns:
        Always None.
    """
    return None

# 3. Record the initial availability flags for this cohort
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
)

# 4. Clinical Feature Extraction (only when a trait row was identified)
if is_trait_available:
    # Make sure the destination folder exists before anything is written
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)

    # `clinical_data` is the dataframe produced in Step 1; the row keys and
    # converters defined above tell the helper which rows to keep and how to encode them
    clinical_feature_args = dict(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender,
    )
    selected_clinical_df = geo_select_clinical_features(**clinical_feature_args)

    # Show a compact preview rather than the full dataframe
    print("Preview of extracted clinical data:")
    preview = preview_df(selected_clinical_df)
    print(preview)

    # Persist the extracted clinical features
    selected_clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")


Preview of extracted clinical data:
{'GSM8049207': [1.0], 'GSM8049208': [1.0], 'GSM8049209': [1.0], 'GSM8049210': [1.0], 'GSM8049211': [1.0], 'GSM8049212': [1.0], 'GSM8049213': [1.0], 'GSM8049214': [1.0], 'GSM8049215': [1.0], 'GSM8049216': [1.0], 'GSM8049217': [1.0], 'GSM8049218': [1.0], 'GSM8049219': [1.0], 'GSM8049220': [1.0], 'GSM8049221': [1.0], 'GSM8049222': [1.0], 'GSM8049223': [1.0], 'GSM8049224': [1.0], 'GSM8049225': [1.0], 'GSM8049226': [1.0], 'GSM8049227': [1.0], 'GSM8049228': [1.0], 'GSM8049229': [1.0], 'GSM8049230': [1.0], 'GSM8049231': [1.0], 'GSM8049232': [1.0], 'GSM8049233': [1.0], 'GSM8049234': [1.0], 'GSM8049235': [1.0], 'GSM8049236': [1.0], 'GSM8049237': [1.0], 'GSM8049238': [1.0], 'GSM8049239': [1.0], 'GSM8049240': [1.0], 'GSM8049241': [1.0], 'GSM8049242': [1.0], 'GSM8049243': [1.0], 'GSM8049244': [1.0], 'GSM8049245': [1.0], 'GSM8049246': [1.0], 'GSM8049247': [1.0], 'GSM8049248': [1.0], 'GSM8049249': [1.0], 'GSM8049250': [1.0], 'GSM8049251': [1.0], 'GSM8049252': [1.0

### Step 3: Gene Data Extraction

In [4]:
# Step 3 tries to obtain a genes x samples expression table for this cohort:
# first from the series-matrix file, then (only if that yields nothing) from the
# first per-sample table in the SOFT file. If neither works, the cohort is
# flagged as having no usable gene data and the cohort metadata is updated.

# Explicit imports so this cell does not rely on names leaking in through
# `from tools.preprocess import *`.
import gzip
import io
import os

GENE_ID_PREFIXES = ('ENSG', 'NM_', 'XM_')  # identifier prefixes used as a "looks like a gene row" heuristic
PREVIEW_LINE_COUNT = 5                     # how many raw lines / candidate rows to inspect
PREVIEW_LINE_WIDTH = 200                   # clip very long lines (e.g. the sample header) when printing


def _unquote(field: str) -> str:
    """Strip surrounding whitespace and the double quotes GEO puts around text fields."""
    return field.strip().strip('"')


def _clip(text: str, width: int = PREVIEW_LINE_WIDTH) -> str:
    """Shorten `text` to `width` characters for display, marking the cut with '...'."""
    return text if len(text) <= width else text[:width] + '...'


def _read_matrix_table(path):
    """Parse the table between the series-matrix begin/end markers.

    Returns a numeric DataFrame (row index = first column, columns = sample IDs)
    or None when the marker is missing or the table has no data rows.
    Non-numeric cells become NaN.
    """
    with gzip.open(path, 'rt') as handle:
        for line in handle:
            if '!series_matrix_table_begin' in line:
                break
        else:
            return None  # marker never found

        # rstrip only the line ending so empty trailing cells are not dropped
        header = [_unquote(cell) for cell in handle.readline().rstrip('\r\n').split('\t')]

        row_ids, rows = [], []
        for line in handle:
            if '!series_matrix_table_end' in line:
                break
            if not line.strip():
                continue
            cells = [_unquote(cell) for cell in line.rstrip('\r\n').split('\t')]
            if len(cells) > 1:  # need an ID plus at least one value
                row_ids.append(cells[0])
                rows.append(cells[1:])

    if not row_ids:
        return None
    table = pd.DataFrame(rows, index=row_ids, columns=header[1:])
    return table.apply(pd.to_numeric, errors='coerce')


def _find_gene_like_lines(path, limit: int = PREVIEW_LINE_COUNT):
    """Return up to `limit` lines whose first (unquoted) field starts with a known gene-ID prefix."""
    hits = []
    with gzip.open(path, 'rt') as handle:
        for line in handle:
            first_field = _unquote(line.split('\t', 1)[0])
            if first_field.startswith(GENE_ID_PREFIXES):
                hits.append(line.strip())
                if len(hits) >= limit:
                    break
    return hits


def _read_first_soft_sample_table(path):
    """Parse the first per-sample data table of a SOFT file.

    The begin/end markers are matched case-insensitively. Returns a DataFrame
    indexed by the first column, or None when no table with at least one data
    row is present.
    """
    table_lines = []
    inside_table = False
    with gzip.open(path, 'rt', encoding='utf-8', errors='ignore') as handle:
        for line in handle:
            marker = line.strip().lower()
            if not inside_table:
                inside_table = marker.startswith('!sample_table_begin')
                continue
            if marker.startswith('!sample_table_end'):
                break
            if marker:
                table_lines.append(line.rstrip('\r\n'))

    if len(table_lines) < 2:  # nothing, or a header without data rows
        return None
    return pd.read_csv(io.StringIO('\n'.join(table_lines)), delimiter='\t', index_col=0)


# 1. Re-identify the SOFT and matrix files to ensure we have the correct paths
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Print the file size to verify it's accessible
file_size = os.path.getsize(matrix_file)
print(f"Matrix file size: {file_size} bytes")

# 2. Peek at the raw lines that follow the table marker to see how the data is laid out
with gzip.open(matrix_file, 'rt') as f:
    marker_line_found = False
    for line in f:
        if '!series_matrix_table_begin' in line:
            marker_line_found = True
            break

    if marker_line_found:
        print("\nLines immediately after the marker:")
        for _ in range(PREVIEW_LINE_COUNT):
            print(_clip(f.readline().strip()))

# 3. Parse the matrix table; on failure, report whether the file contains anything gene-like at all
gene_data = None
try:
    gene_data = _read_matrix_table(matrix_file)
    if gene_data is not None:
        print(f"\nManually extracted gene data with shape: {gene_data.shape}")
    else:
        print("\nNo gene data found after the marker.")
        gene_like_lines = _find_gene_like_lines(matrix_file)
        if gene_like_lines:
            print(f"Sample gene line: {_clip(gene_like_lines[0])}")
            print(f"Found {len(gene_like_lines)} potential gene lines.")
        else:
            print("No obvious gene identifiers found in the file.")
except Exception as e:
    # Best-effort exploration: report the problem and fall through to the SOFT file
    print(f"Error in manual parsing: {str(e)}")

# 4. Inspect the first 100 lines of the SOFT file for gene-related content
print("\nChecking SOFT file for gene data...")
with gzip.open(soft_file, 'rt', encoding='utf-8', errors='ignore') as f:
    for i, line in enumerate(f):
        if i >= 100:
            break
        if 'EXPR' in line or 'ID_REF' in line or line.startswith('ENSG') or 'gene' in line.lower():
            print(f"Line {i}: {line.strip()}")

# 5. Fall back to the SOFT file only when the matrix file produced nothing,
#    so a successful matrix parse is never overwritten by a single sample's table
if gene_data is None or gene_data.empty:
    try:
        print("\nAttempting to extract gene data from SOFT file...")
        soft_table = _read_first_soft_sample_table(soft_file)
        if soft_table is not None:
            gene_data = soft_table
            print(f"Successfully extracted gene data from SOFT file. Shape: {gene_data.shape}")
    except Exception as e:
        print(f"Error extracting gene data from SOFT file: {str(e)}")

# 6. Report the outcome and keep the cohort metadata in sync with it
if gene_data is not None and not gene_data.empty:
    print("\nFirst 20 gene/probe identifiers:")
    print(gene_data.index[:20].tolist())
    print(f"\nGene data dimensions: {gene_data.shape[0]} genes × {gene_data.shape[1]} samples")
else:
    print("\nFailed to extract gene data from both matrix and SOFT files.")
    print("This dataset appears to have single-cell RNA-seq data which may not be in the standard GEO matrix format.")
    print("The dataset likely requires special parsing for single-cell data.")

    # Set is_gene_available to False since we couldn't extract the gene data in the expected format
    is_gene_available = False

    # Step 2 saved the cohort info with is_gene_available=True; save it again
    # (same call as in Step 2) so the JSON reflects the corrected flag.
    validate_and_save_cohort_info(
        is_final=False,
        cohort=cohort,
        info_path=json_path,
        is_gene_available=is_gene_available,
        is_trait_available=is_trait_available,
    )

Matrix file size: 394363 bytes



Lines immediately after the marker:
"ID_REF"	"GSM8049207"	"GSM8049208"	"GSM8049209"	"GSM8049210"	"GSM8049211"	"GSM8049212"	"GSM8049213"	"GSM8049214"	"GSM8049215"	"GSM8049216"	"GSM8049217"	"GSM8049218"	"GSM8049219"	"GSM8049220"	"GSM8049221"	"GSM8049222"	"GSM8049223"	"GSM8049224"	"GSM8049225"	"GSM8049226"	"GSM8049227"	"GSM8049228"	"GSM8049229"	"GSM8049230"	"GSM8049231"	"GSM8049232"	"GSM8049233"	"GSM8049234"	"GSM8049235"	"GSM8049236"	"GSM8049237"	"GSM8049238"	"GSM8049239"	"GSM8049240"	"GSM8049241"	"GSM8049242"	"GSM8049243"	"GSM8049244"	"GSM8049245"	"GSM8049246"	"GSM8049247"	"GSM8049248"	"GSM8049249"	"GSM8049250"	"GSM8049251"	"GSM8049252"	"GSM8049253"	"GSM8049254"	"GSM8049255"	"GSM8049256"	"GSM8049257"	"GSM8049258"	"GSM8049259"	"GSM8049260"	"GSM8049261"	"GSM8049262"	"GSM8049263"	"GSM8049264"	"GSM8049265"	"GSM8049266"	"GSM8049267"	"GSM8049268"	"GSM8049269"	"GSM8049270"	"GSM8049271"	"GSM8049272"	"GSM8049273"	"GSM8049274"	"GSM8049275"	"GSM8049276"	"GSM8049277"	"GSM8049278"	"GSM8049279"	"GSM8

No obvious gene identifiers found in the file.

Checking SOFT file for gene data...
Line 1: !Database_name = Gene Expression Omnibus (GEO)
Line 11: !Series_summary = The study focuses on the cellular composition of the psoriasis epidermis, using single-cell transcriptomics to identify cell subsets and their interactions in both healthy and psoriatic skin. The research uncovers three keratinocyte populations and seven immune cell subsets exclusive to psoriatic lesions. A significant finding is the identification of a previously undetected population of plasmacytoid dendritic cells (pDCs) in the psoriatic epidermis, suggesting their role in the disease's pathogenesis. The study also highlights enhanced keratinocyte-immune cell interactions in psoriatic lesions, contributing to our understanding of psoriasis at the cellular level.

Attempting to extract gene data from SOFT file...



Failed to extract gene data from both matrix and SOFT files.
This dataset appears to have single-cell RNA-seq data which may not be in the standard GEO matrix format.
The dataset likely requires special parsing for single-cell data.
