In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

# Path Configuration
from tools.preprocess import *

# Processing context: the trait under study and the GEO series (cohort) being processed.
# Every path below is derived from these two values, so they only need changing in one place.
trait = "Type_1_Diabetes"
cohort = "GSE156035"

# Input paths
in_trait_dir = f"../../input/GEO/{trait}"
in_cohort_dir = f"{in_trait_dir}/{cohort}"

# Output paths
out_data_file = f"../../output/preprocess/{trait}/{cohort}.csv"
out_gene_data_file = f"../../output/preprocess/{trait}/gene_data/{cohort}.csv"
out_clinical_data_file = f"../../output/preprocess/{trait}/clinical_data/{cohort}.csv"
json_path = f"../../output/preprocess/{trait}/cohort_info.json"


### Step 1: Initial Data Loading

In [2]:
from tools.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort directory
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)

# 2. Pull the series-level background lines and the per-sample clinical lines out of the matrix file,
#    selecting them by their line prefixes
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information and the sample characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Inflammatory pathways in peripheral blood expression profile of recent-onset type 1 diabetes"
!Series_summary	"Changes in innate and adaptive immunity occurring in and around pancreatic islets can also be observed in peripheral  blood mononuclear cells (PBMC) of T1D patients  in Caucasians. The aim of our study was to investigate whether gene expression patterns of PBMC could complement islet autoantibodies for T1D pathogenic mechanisms in the higlty admixed Brazilian population. Methods: We assessed global gene expression in PBMC from  two groups mached for age, sex and BMI: The T1D group with 20 patients with recent-onset T1D (≤ 6 months from diagnosis, in a time  when the autoimmune process is still highly active), testing positive for one or more  islet autoantibodies and 20 islet autoantibody-negative healthy controls (Control group). Results: we identified 474 differentially expressed genes between groups. The most expressed genes in T1D gro

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# 1. Gene Expression Data Availability
is_gene_available = True  # Based on the background info, this dataset includes gene expression from PBMCs

# 2. Data Availability and Type Conversion
# 2.1 Trait (Type 1 Diabetes)
trait_row = 2  # "diagnosis" is at key 2 in the sample characteristics dictionary

# Define conversion function for the trait variable
def convert_trait(value):
    """Convert a raw "key: value" characteristics string into a binary trait label.

    Args:
        value: One cell of the sample characteristics table. Anything that is
            not a string (e.g. a NaN placeholder for a missing cell) is treated
            as unknown instead of raising.

    Returns:
        1 if the text after the first ':' mentions "type 1 diabetes",
        0 if it mentions "control" (this also covers "healthy control"),
        None if the cell is missing, has no ':' separator, or matches neither.
    """
    if not isinstance(value, str) or ":" not in value:
        return None
    # Keep only the part after the first ':' and compare case-insensitively.
    label = value.split(":", 1)[1].strip().lower()
    if "type 1 diabetes" in label:
        return 1
    if "control" in label:
        return 0
    return None

# 2.2 Age
age_row = None  # Age information is not available in the sample characteristics

def convert_age(value):
    """Convert a raw "key: value" characteristics string into a numeric age.

    Defined for completeness; it is not used for this cohort because
    ``age_row`` is None.

    Args:
        value: One cell of the sample characteristics table. Non-string cells
            are treated as unknown instead of raising.

    Returns:
        The text after the first ':' parsed as a float, or None when the cell
        is missing, has no ':' separator, or is not a valid number.
    """
    if not isinstance(value, str) or ":" not in value:
        return None
    age_text = value.split(":", 1)[1].strip()
    try:
        return float(age_text)
    except ValueError:
        # float() on a string only fails with ValueError; anything else should surface.
        return None

# 2.3 Gender
gender_row = 0  # "gender" is at key 0 in the sample characteristics dictionary

def convert_gender(value):
    """Convert a raw "key: value" characteristics string into a binary gender code.

    Args:
        value: One cell of the sample characteristics table. Non-string cells
            are treated as unknown instead of raising.

    Returns:
        0 for female, 1 for male, None when the cell is missing, has no ':'
        separator, or names neither.
    """
    if not isinstance(value, str) or ":" not in value:
        return None
    label = value.split(":", 1)[1].strip().lower()
    # "female" must be tested first because "male" is a substring of it.
    if "female" in label:
        return 0
    if "male" in label:
        return 1
    return None

# 3. Save Metadata - Initial Filtering
# The trait counts as available whenever a characteristics row was identified for it.
is_trait_available = trait_row is not None
validate_and_save_cohort_info(is_final=False,
                             cohort=cohort,
                             info_path=json_path,
                             is_gene_available=is_gene_available,
                             is_trait_available=is_trait_available)

# 4. Clinical Feature Extraction (only possible when a trait row exists)
if trait_row is not None:
    # Converters are passed only for the features whose row was identified.
    clinical_df = geo_select_clinical_features(
        clinical_df=clinical_data,  # This is provided from previous steps
        trait=trait,
        trait_row=trait_row,
        convert_trait=convert_trait,
        age_row=age_row,
        convert_age=convert_age if age_row is not None else None,
        gender_row=gender_row,
        convert_gender=convert_gender if gender_row is not None else None
    )

    # Preview the extracted clinical data
    preview = preview_df(clinical_df)
    print("Preview of clinical data:")
    print(preview)

    # Save the clinical data to CSV.
    # The frame is feature-by-sample: its index holds the feature names (trait, Gender)
    # and its columns are the GSM sample ids. The index must therefore be written out;
    # dropping it would leave the rows of the saved file unlabeled.
    os.makedirs(os.path.dirname(out_clinical_data_file), exist_ok=True)
    clinical_df.to_csv(out_clinical_data_file)
    print(f"Clinical data saved to {out_clinical_data_file}")


Preview of clinical data:
{'GSM4720871': [0.0, 0.0], 'GSM4720872': [0.0, 0.0], 'GSM4720873': [0.0, 1.0], 'GSM4720874': [0.0, 0.0], 'GSM4720875': [0.0, 1.0], 'GSM4720876': [0.0, 0.0], 'GSM4720877': [0.0, 1.0], 'GSM4720878': [0.0, 1.0], 'GSM4720879': [0.0, 0.0], 'GSM4720880': [0.0, 0.0], 'GSM4720881': [0.0, 1.0], 'GSM4720882': [0.0, 0.0], 'GSM4720883': [0.0, 0.0], 'GSM4720884': [0.0, 0.0], 'GSM4720885': [0.0, 0.0], 'GSM4720886': [0.0, 0.0], 'GSM4720887': [0.0, 1.0], 'GSM4720888': [0.0, 1.0], 'GSM4720889': [0.0, 0.0], 'GSM4720890': [0.0, 0.0], 'GSM4720891': [1.0, 0.0], 'GSM4720892': [1.0, 0.0], 'GSM4720893': [1.0, 1.0], 'GSM4720894': [1.0, 1.0], 'GSM4720895': [1.0, 1.0], 'GSM4720896': [1.0, 0.0], 'GSM4720897': [1.0, 0.0], 'GSM4720898': [1.0, 0.0], 'GSM4720899': [1.0, 0.0], 'GSM4720900': [1.0, 1.0], 'GSM4720901': [1.0, 1.0], 'GSM4720902': [1.0, 0.0], 'GSM4720903': [1.0, 0.0], 'GSM4720904': [1.0, 0.0], 'GSM4720905': [1.0, 0.0], 'GSM4720906': [1.0, 0.0], 'GSM4720907': [1.0, 1.0], 'GSM4720908

### Step 3: Gene Data Extraction

In [4]:
# 1. Get the SOFT and matrix file paths again
soft_file, matrix_file = geo_get_relevant_filepaths(in_cohort_dir)
print(f"Matrix file found: {matrix_file}")

# 2. Use the get_genetic_data function from the library to get the gene_data.
#    Every later step needs `gene_data`, so a failure here is logged and then re-raised:
#    continuing would only surface later as a NameError on an undefined variable.
try:
    gene_data = get_genetic_data(matrix_file)
except Exception as e:
    print(f"Error extracting gene data: {e}")
    raise

print(f"Gene data shape: {gene_data.shape}")

# 3. Print the first 20 row IDs (gene or probe identifiers)
print("First 20 gene/probe identifiers:")
print(gene_data.index[:20])


Matrix file found: ../../input/GEO/Type_1_Diabetes/GSE156035/GSE156035_series_matrix.txt.gz


Gene data shape: (60901, 40)
First 20 gene/probe identifiers:
Index(['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16',
       '17', '18', '19', '20', '21', '22', '23'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# Decision: do the row identifiers of gene_data need to be mapped to gene symbols?
# The identifiers printed in the previous step are plain numeric strings ('4', '5', '6', ...).
# Standard human gene symbols are alphanumeric (e.g. BRCA1, TP53), so these are not gene symbols
# and must be mapped to gene symbols before any gene-level processing.

requires_gene_mapping = True


### Step 5: Gene Annotation

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Report the platform title declared in the SOFT file, if any.
# The file is streamed line by line and scanning stops at the first hit, so the
# decompressed SOFT file is never loaded into memory as a whole.
platform_title_pattern = re.compile(r'^!Platform_title\s*=\s*(.+)$')
platform_title = None
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        title_match = platform_title_pattern.match(line)
        if title_match:
            platform_title = title_match.group(1)
            break
if platform_title is not None:
    print(f"Platform title found: {platform_title}")

# Try to extract more annotation data by reading directly from the SOFT file:
# look for header-like lines that might introduce a probe-to-symbol table.
symbol_pattern = re.compile(r'ID_REF\s+Symbol|ID\s+Gene Symbol', re.IGNORECASE)
annotation_lines = []
with gzip.open(soft_file, 'rt') as f:
    for line in f:
        if symbol_pattern.search(line):
            annotation_lines.append(line)
            # Collect the next few lines to see the annotation structure.
            # next(f, '') yields '' once the file is exhausted instead of raising.
            for _ in range(10):
                annotation_lines.append(next(f, ''))

if annotation_lines:
    print("Found potential gene symbol mappings:")
    for line in annotation_lines:
        print(line.strip())

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("\nGene annotation preview:")
print(preview_df(gene_annotation, n=10))

# If we need an alternative source of mapping, check if there are any other annotation files in the cohort directory
cohort_files = os.listdir(in_cohort_dir)
annotation_files = [f for f in cohort_files if 'annotation' in f.lower() or 'platform' in f.lower()]
if annotation_files:
    print("\nAdditional annotation files found in the cohort directory:")
    for file in annotation_files:
        print(file)


Platform title found: Agilent-072363 SurePrint G3 Human GE v3 8x60K Microarray 039494 [Feature Number Version]



Gene annotation preview:
{'ID': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], 'COL': ['192', '192', '192', '192', '192', '192', '192', '192', '192', '192'], 'ROW': [328.0, 326.0, 324.0, 322.0, 320.0, 318.0, 316.0, 314.0, 312.0, 310.0], 'NAME': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'A_21_P0014386', 'A_33_P3396872', 'A_33_P3267760', 'A_32_P194264', 'A_23_P153745', 'A_33_P3352837', 'A_21_P0011260'], 'SPOT_ID': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'A_21_P0014386', 'A_33_P3396872', 'A_33_P3267760', 'A_32_P194264', 'A_23_P153745', 'A_33_P3352837', 'A_21_P0011260'], 'CONTROL_TYPE': ['pos', 'pos', 'pos', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE'], 'REFSEQ': [nan, nan, nan, nan, 'NM_001105533', nan, 'NM_001008708', 'NM_006332', nan, nan], 'GB_ACC': [nan, nan, nan, nan, 'NM_001105533', nan, 'NM_001008708', 'NM_006332', nan, nan], 'LOCUSLINK_ID': [nan, nan, nan, nan, 79974.0, 54880.0, 494143.0, 10437.0, nan, nan], 'GENE_SYMBOL': [nan, nan, nan, nan

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Column choice, based on the annotation preview:
#    - 'ID' holds the numeric identifiers (the same kind seen in gene_data's index)
#    - 'GENE_SYMBOL' holds gene symbols such as 'CPED1', 'BCOR'

# 2. Build the probe-ID -> gene-symbol mapping dataframe from those two columns
mapping_df = get_gene_mapping(annotation=gene_annotation, prob_col='ID', gene_col='GENE_SYMBOL')

print(f"Mapping dataframe shape: {mapping_df.shape}")
print("First 5 rows of mapping data:", mapping_df.head(), sep="\n")

# 3. Apply the mapping to turn probe-level measurements into gene expression data.
#    NOTE: this rebinds `gene_data`, so re-running this cell without re-running
#    the gene data extraction step would feed already-mapped data back in.
gene_data = apply_gene_mapping(expression_df=gene_data, mapping_df=mapping_df)

# Inspect the resulting gene expression data
print(f"\nGene expression data shape: {gene_data.shape}")
print("First 5 gene symbols:", gene_data.index[:5], sep="\n")
print("First 3 samples with first 3 genes:", gene_data.iloc[:3, :3], sep="\n")

# Write the gene expression data to CSV (index = gene symbols is kept)
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Gene expression data saved to {out_gene_data_file}")


Mapping dataframe shape: (51544, 2)
First 5 rows of mapping data:
    ID    Gene
4    5   CPED1
5    6    BCOR
6    7   CHAC2
7    8   IFI30
11  12  GPR146



Gene expression data shape: (29222, 40)
First 5 gene symbols:
Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A1CF-2', 'A1CF-3'], dtype='object', name='Gene')
First 3 samples with first 3 genes:
          GSM4720871  GSM4720872  GSM4720873
Gene                                        
A1BG       10.020543   10.814271   10.521009
A1BG-AS1    7.112705    7.259896    7.453378
A1CF        2.146216    2.144462    2.158102


Gene expression data saved to ../../output/preprocess/Type_1_Diabetes/gene_data/GSE156035.csv


### Step 7: Data Normalization and Linking

In [8]:
# 1. Normalize gene symbols in the gene expression data and overwrite the saved gene file
gene_data = normalize_gene_symbols_in_index(gene_data)
print(f"Normalized gene data shape: {gene_data.shape}")
print(f"First 5 gene symbols after normalization: {gene_data.index[:5]}")
os.makedirs(os.path.dirname(out_gene_data_file), exist_ok=True)
gene_data.to_csv(out_gene_data_file)
print(f"Normalized gene data saved to {out_gene_data_file}")

# 2. Reuse the clinical features extracted in Step 2.
#    `clinical_df` was built there from the same matrix file, row indices and converters,
#    so re-reading the file and redefining the converters here would only duplicate that
#    logic and risk the two copies drifting apart.
print(f"Clinical data shape: {clinical_df.shape}")
print("Clinical data sample:")
print(clinical_df.iloc[:, :5])  # Show first 5 columns

# 3. Link clinical and genetic data
linked_data = geo_link_clinical_genetic_data(clinical_df, gene_data)
print(f"Linked data shape: {linked_data.shape}")
print("Linked data first 5 columns:")

# Identify the column names for verification
all_columns = linked_data.columns.tolist()
print(f"First few column names: {all_columns[:5]}")
print(linked_data.iloc[:5, :5])

# 4. Handle missing values.
#    The trait column is addressed by name rather than by position, so a change in
#    column order cannot silently select the wrong column.
trait_col = trait
assert trait_col in linked_data.columns, f"Trait column '{trait_col}' missing from linked data"
print(f"Trait column name detected: {trait_col}")

linked_data = handle_missing_values(linked_data, trait_col)
print(f"Data shape after handling missing values: {linked_data.shape}")

# 5. Check for bias in features
is_biased, linked_data = judge_and_remove_biased_features(linked_data, trait_col)

# 6. Validate and save cohort information, using the availability flags decided in Step 2
is_usable = validate_and_save_cohort_info(
    is_final=True,
    cohort=cohort,
    info_path=json_path,
    is_gene_available=is_gene_available,
    is_trait_available=is_trait_available,
    is_biased=is_biased,
    df=linked_data,
    note="Dataset contains gene expression data from peripheral blood mononuclear cells of 20 Type 1 Diabetes patients and 20 healthy controls."
)

# 7. Save the linked data if usable
if is_usable:
    os.makedirs(os.path.dirname(out_data_file), exist_ok=True)
    linked_data.to_csv(out_data_file)
    print(f"Linked data saved to {out_data_file}")
else:
    print("Dataset is not usable for analysis. No linked data file saved.")

Normalized gene data shape: (20778, 40)
First 5 gene symbols after normalization: Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1'], dtype='object', name='Gene')


Normalized gene data saved to ../../output/preprocess/Type_1_Diabetes/gene_data/GSE156035.csv


Clinical data shape: (2, 40)
Clinical data sample:
                 GSM4720871  GSM4720872  GSM4720873  GSM4720874  GSM4720875
Type_1_Diabetes         0.0         0.0         0.0         0.0         0.0
Gender                  0.0         0.0         1.0         0.0         1.0
Linked data shape: (40, 20780)
Linked data first 5 columns:
First few column names: ['Type_1_Diabetes', 'Gender', 'A1BG', 'A1BG-AS1', 'A1CF']
            Type_1_Diabetes  Gender       A1BG  A1BG-AS1      A1CF
GSM4720871              0.0     0.0  10.020543  7.112705  2.146216
GSM4720872              0.0     0.0  10.814271  7.259896  2.144462
GSM4720873              0.0     1.0  10.521009  7.453378  2.158102
GSM4720874              0.0     0.0  10.423078  7.020040  2.142261
GSM4720875              0.0     1.0   9.777265  5.883278  2.113389
Trait column name detected: Type_1_Diabetes


Data shape after handling missing values: (40, 20780)
For the feature 'Type_1_Diabetes', the least common label is '0.0' with 20 occurrences. This represents 50.00% of the dataset.
The distribution of the feature 'Type_1_Diabetes' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 15 occurrences. This represents 37.50% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



Linked data saved to ../../output/preprocess/Type_1_Diabetes/GSE156035.csv
