In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series being processed
trait = "Type_1_Diabetes"
cohort = "GSE182870"

# Input locations, built from the identifiers above so they cannot drift apart
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"../../input/GEO/{trait}/{cohort}"

# Output locations for the linked data, gene data, clinical data and cohort metadata
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# Locate the SOFT file and the series matrix file inside the cohort directory
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# Line prefixes that select the series-level background lines and the
# per-sample characteristic lines from the matrix file
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Collapse the clinical dataframe to the unique values seen in each row
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Show both pieces of information, each under its own heading
for heading, payload in (
    ("Background Information:", background_info),
    ("Sample Characteristics Dictionary:", sample_characteristics_dict),
):
    print(heading)
    print(payload)


Background Information:
!Series_title	"Germline-like TCR alpha chains dominate shared self-reactive T cell receptors in type 1 diabetes"
!Series_summary	"Human islet antigen reactive CD4+ memory T cells (IAR T cells) play a key role in the pathogenesis of autoimmune type 1 diabetes (T1D). Using single cell RNA-sequencing (scRNA-seq) to identify T cell receptors (TCRs) in IAR T cells, we have identified a class of TCRs that share TCR alpha chains between individuals (“public”)."
!Series_overall_design	"2767 cells total were sequenced. Cells were collected from 12 healthy, 24 new-onset, and 12 established T1D donors."
Sample Characteristics Dictionary:
{0: ['library id: lib10600', 'library id: lib10601', 'library id: lib10602', 'library id: lib10603', 'library id: lib10604', 'library id: lib10605', 'library id: lib10606', 'library id: lib10607', 'library id: lib10608', 'library id: lib10609', 'library id: lib10610', 'library id: lib10611', 'library id: lib10612', 'library id: lib10613', 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene expression availability
# The series summary describes single cell RNA-sequencing, so gene expression
# data is assumed to be present at this stage.
is_gene_available = True

# 2. Variable availability: row indices into the sample characteristics
#    - trait:  row 1, "study group" (HC = healthy control; T1D and
#              new onset T1D = Type 1 Diabetes)
#    - age:    row 3
#    - gender: row 4, "Sex"
trait_row, age_row, gender_row = 1, 3, 4

# 2.2 Data Type Conversion Functions

def convert_trait(value):
    """Convert a study-group cell to a binary trait label.

    Args:
        value: Raw cell content, typically "<field>: <label>" or just the
            label. Non-string values are tolerated and coerced to text.

    Returns:
        0 for a healthy control ("HC"), 1 for Type 1 Diabetes ("T1D" or
        "new onset T1D"), and None for missing or unrecognized values.
        Matching ignores case and surrounding whitespace.
    """
    if value is None or pd.isna(value):
        return None

    # Coerce to text first: a numeric cell would otherwise make the ':'
    # membership test raise TypeError.
    text = str(value)
    if ":" in text:
        # Keep only what follows the first colon ("study group: HC" -> " HC")
        text = text.split(":", 1)[1]
    label = text.strip().lower()

    if label == "hc":
        return 0  # Healthy control
    if label in ("t1d", "new onset t1d"):
        return 1  # Type 1 Diabetes
    return None

def convert_age(value):
    """Convert an age cell to a finite float.

    Args:
        value: Raw cell content, typically "<field>: <number>" or just the
            number. Non-string values are tolerated and coerced to text.

    Returns:
        The age as a float, or None when the value is missing, is not
        numeric, or parses to a non-finite number (NaN / infinity).
    """
    import math  # local import keeps the cell self-contained

    if value is None or pd.isna(value):
        return None

    # Coerce to text first: a numeric cell would otherwise make the ':'
    # membership test raise TypeError before the try block is reached.
    text = str(value)
    if ":" in text:
        # Keep only what follows the first colon ("age: 30" -> " 30")
        text = text.split(":", 1)[1]

    try:
        age = float(text.strip())
    except ValueError:
        return None

    # float() accepts "nan" and "inf"; neither is a usable age
    return age if math.isfinite(age) else None

def convert_gender(value):
    """Convert a sex cell to a binary gender label.

    Args:
        value: Raw cell content, typically "<field>: <label>" or just the
            label. Non-string values are tolerated and coerced to text.

    Returns:
        0 for "female", 1 for "male" (case-insensitive, surrounding
        whitespace ignored), and None for missing or unrecognized values.
    """
    if value is None or pd.isna(value):
        return None

    # Coerce to text first: a numeric cell would otherwise make the ':'
    # membership test raise TypeError.
    text = str(value)
    if ":" in text:
        # Keep only what follows the first colon ("Sex: male" -> " male")
        text = text.split(":", 1)[1]
    label = text.strip().lower()

    # Unknown labels fall through to None via dict.get
    return {"female": 0, "male": 1}.get(label)

# 3. Save metadata (initial filtering)
# The trait counts as available whenever a characteristics row was chosen for it.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available
)

# 4. Clinical Feature Extraction
if trait_row is not None:
    # Extract trait, age and gender using the rows and converters defined above
    clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age,
        gender_row=gender_row,
        convert_gender=convert_gender
    )

    # Preview the extracted clinical features
    print("Clinical Features Preview:")
    preview = preview_df(clinical_df)
    print(preview)

    # Save clinical data to CSV.
    # The index is written on purpose: the preview is keyed by sample ID with
    # one value per feature, which suggests the rows are the features, so
    # their labels live in the index and index=False would discard them.
    # NOTE(review): confirm the frame layout against geo_select_clinical_features.
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")


Clinical Features Preview:


{'GSM5538916': [0.0, 30.0, 1.0], 'GSM5538917': [0.0, 30.0, 1.0], 'GSM5538918': [0.0, 30.0, 1.0], 'GSM5538919': [0.0, 30.0, 1.0], 'GSM5538920': [0.0, 30.0, 1.0], 'GSM5538921': [0.0, 30.0, 1.0], 'GSM5538922': [0.0, 30.0, 1.0], 'GSM5538923': [0.0, 30.0, 1.0], 'GSM5538924': [0.0, 30.0, 1.0], 'GSM5538925': [0.0, 30.0, 1.0], 'GSM5538926': [0.0, 30.0, 1.0], 'GSM5538927': [0.0, 30.0, 1.0], 'GSM5538928': [0.0, 30.0, 1.0], 'GSM5538929': [0.0, 30.0, 1.0], 'GSM5538930': [0.0, 30.0, 1.0], 'GSM5538931': [0.0, 30.0, 1.0], 'GSM5538932': [0.0, 30.0, 1.0], 'GSM5538933': [0.0, 30.0, 1.0], 'GSM5538934': [0.0, 30.0, 1.0], 'GSM5538935': [0.0, 30.0, 1.0], 'GSM5538936': [0.0, 30.0, 1.0], 'GSM5538937': [0.0, 30.0, 1.0], 'GSM5538938': [0.0, 30.0, 1.0], 'GSM5538939': [0.0, 30.0, 1.0], 'GSM5538940': [0.0, 30.0, 1.0], 'GSM5538941': [0.0, 30.0, 1.0], 'GSM5538942': [0.0, 30.0, 1.0], 'GSM5538943': [0.0, 30.0, 1.0], 'GSM5538944': [0.0, 30.0, 1.0], 'GSM5538945': [0.0, 30.0, 1.0], 'GSM5538946': [0.0, 30.0, 1.0], 'GSM553

### Step 3: Gene Data Extraction

In [4]:
# gzip is imported here explicitly so this cell does not depend on whatever
# the star import from tools.preprocess happens to re-export.
import gzip

# 1. Get the SOFT and matrix file paths again
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# Inspect the first few lines of the file to understand its structure
print("Inspecting matrix file structure:")
with gzip.open(matrix_file, 'rt') as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i > 10:  # Print just enough lines to understand the structure
            break

# 2. Decide from the file itself whether the matrix table holds expression rows.
# The table sits between "!series_matrix_table_begin" and
# "!series_matrix_table_end". A table with nothing but an "ID_REF" header row
# (or nothing at all) between the markers carries no expression values.
marker_found = False
has_expression_rows = False
with gzip.open(matrix_file, 'rt') as f:
    for i, line in enumerate(f):
        if "!series_matrix_table_begin" not in line.lower():
            continue
        print(f"Found marker at line {i}: {line.strip()}")
        marker_found = True
        # Continue reading from the same handle: these are the table lines
        for table_line in f:
            row = table_line.strip()
            if "!series_matrix_table_end" in row.lower():
                print(f"Found end marker before any data rows: {row}")
                print("This indicates the matrix file does not contain gene expression data in the expected format.")
                break
            if not row or row.lstrip('"').upper().startswith("ID_REF"):
                continue  # blank line or the column header, not a data row
            has_expression_rows = True
            break  # one data row is enough; no need to read the whole table
        break

if not marker_found:
    print("No series matrix table marker found in the matrix file.")

# 3. Update the availability flags from what the scan actually found
is_gene_available = has_expression_rows
is_trait_available = trait_row is not None

if is_gene_available:
    print("\nConclusion: The matrix file contains expression rows between the table markers.")
    note_kwargs = {}
else:
    print("\nConclusion: This dataset contains single-cell RNA sequencing data focused on T cell receptors,")
    print("but does not contain a standard gene expression matrix required for our analysis.")
    note_kwargs = {
        "note": "Dataset contains scRNA-seq data of T cell receptors but not standard gene expression matrix."
    }

# 4. Save this information to the metadata (the note is only attached when
# the expression matrix is missing)
validate_and_save_cohort_info(
    is_final=False,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
    **note_kwargs
)

if is_gene_available:
    print("\nMetadata updated: gene expression data is available.")
else:
    print("\nMetadata updated to reflect that gene expression data is not available in the required format.")

Matrix file found: ../../input/GEO/Type_1_Diabetes/GSE182870/GSE182870_series_matrix.txt.gz
Inspecting matrix file structure:
!Series_title	"Germline-like TCR alpha chains dominate shared self-reactive T cell receptors in type 1 diabetes"
!Series_geo_accession	"GSE182870"
!Series_status	"Public on Dec 15 2021"
!Series_submission_date	"Aug 26 2021"
!Series_last_update_date	"Jun 28 2024"
!Series_pubmed_id	"34806648"
!Series_pubmed_id	"37886513"
!Series_pubmed_id	"38871688"
!Series_summary	"Human islet antigen reactive CD4+ memory T cells (IAR T cells) play a key role in the pathogenesis of autoimmune type 1 diabetes (T1D). Using single cell RNA-sequencing (scRNA-seq) to identify T cell receptors (TCRs) in IAR T cells, we have identified a class of TCRs that share TCR alpha chains between individuals (“public”)."
!Series_overall_design	"2767 cells total were sequenced. Cells were collected from 12 healthy, 24 new-onset, and 12 established T1D donors."
!Series_type	"Expression profiling by