In [9]:
import pandas as pd
from pybiomart import Server
import numpy as np
import re
from io import StringIO
from sklearn.feature_selection import mutual_info_classif

## Find the intersection genes between Training and Testing datasets

In [10]:
# Read Gene from the Train data
file_path = "GSE282742_TPM.txt"
train_df = pd.read_csv(file_path, sep='\t')
# display(train_df.head(5))
train_genes = train_df['gene_id'].tolist()
print(f"Genes in Train Data: {len(train_genes)}")

# Read Gene from the Test data
test_file_path = 'GSE249477_raw_count_normalize_04-10-2025.csv' 
test_df = pd.read_csv(test_file_path)
# display(test_df.head(5))
gene_test_df = test_df[['Identifier', 'Name']].copy()
test_genes = gene_test_df['Identifier'].tolist()
print(f"Genes in Test Data: {len(test_genes)}")

# Get the intersection genes
intersection_genes = list(set(train_genes) & set(test_genes))
print(f"Intersection genes: {len(intersection_genes)}")
genes_id_and_symbol = gene_test_df[gene_test_df['Identifier'].isin(intersection_genes)].reset_index(drop=True)
genes_id_and_symbol = genes_id_and_symbol.rename(columns={'Identifier':'gene_id','Name':'gene_symbol'})
print(genes_id_and_symbol.shape)
display(genes_id_and_symbol.head(5))

Genes in Train Data: 61860
Genes in Test Data: 21492
Intersection genes: 21462
(21462, 2)


  test_df = pd.read_csv(test_file_path)


Unnamed: 0,gene_id,gene_symbol
0,ENSG00000186092,OR4F5
1,ENSG00000284733,OR4F29
2,ENSG00000284662,OR4F16
3,ENSG00000187634,SAMD11
4,ENSG00000188976,NOC2L


## Training dataset preprocessing

In [19]:
# Preprocess Train Series Matrix
def extract_geo_metadata_to_dataframe(file_path):
    """
    Parses metadata lines (!Sample_title, !Sample_description, 
    !Sample_characteristics_ch1) from a GEO Series Matrix file content 
    and returns a pandas DataFrame.
    """
    
    # 1. Initialize lists to hold the data
    metadata = {}
    
    # 2. Define the exact line prefixes we are interested in
    TARGET_LINES = [
        "!Sample_geo_accession", 
        "!Sample_title",
        "!Sample_description",
        "!Sample_characteristics_ch1"
    ]
    
    # 3. Read content line by line
    with open(file_path, 'r') as f:
        file_content = f.read()

    # 4. Process each line to extract relevant metadata
    for line in file_content.split('\n'):
        # Check if the line is one of our target metadata lines and is not blank
        if line.startswith(tuple(TARGET_LINES)):
            # Split the line by the first tab to separate the field name from the values
            parts = line.split('\t', 1)
            
            if len(parts) < 2:
                continue
            
            field_name = parts[0]
            raw_values = parts[1]
            
            # Use regex to strip the quotes and split by tab
            # This handles cases where values themselves contain spaces
            values = re.findall(r'"(.*?)"', raw_values)
            
            if not values:
                continue

            # Store the Sample Accession IDs (GSMs) separately for column headers
            if field_name == "!Sample_geo_accession":
                sample_ids = values
                metadata['Sample_ID'] = sample_ids
            
            # Handle Sample Characteristics (multiple rows with key: value pairs)
            elif field_name == "!Sample_characteristics_ch1" or field_name == "!Sample_description":
                # The characteristic is always stored as "key: value" (e.g., "age: 81y.o.")
                # We need to split the key and value and create new dictionary entries
                
                # The first sample determines the key structure
                if ':' in values[0]:
                    key_value = values[0].split(':', 1)
                    key = key_value[0].strip().replace(' ', '_')
                else:
                    # Skip if the characteristic line doesn't conform (e.g., just 'blood')
                    continue 

                # Store the key-value pairs. Since keys can repeat (e.g., 'tissue', 'disease state', 'age', 'Sex'),
                # we use the value of the first element of the pair as the unique column name
                # E.g., 'disease state' -> 'disease state'
                #       'disease state: Alzheimer\'s disease' -> 'disease state'
                # We use the full label (e.g., 'disease state') as the column name for clarity.
                
                if key not in metadata:
                    metadata[key] = []
                    
                for val in values:
                    # Extract the value part after the first colon
                    try:
                        if key == 'age':
                            # Special handling for age to remove 'y.o.' suffix
                            value_only = val.split(':', 1)[1].strip().replace('y', '').strip()
                        else:
                            value_only = val.split(':', 1)[1].strip()
                        metadata[key].append(value_only)
                    except IndexError:
                        # Handle cases where value might be missing the colon separator
                        metadata[key].append(val)
            
            # Handle single-row metadata like Sample_title and Sample_description
            else:
                # Clean the field name for the column header (e.g., remove '!' and use 'Sample_title')
                col_name = field_name.replace('!', '')
                metadata[col_name] = values
    # print(metadata)
    # 5. Convert the collected dictionary into a DataFrame
    # If the Sample_ID list exists, use it for the index
    if 'Sample_ID' in metadata and metadata['Sample_ID']:
        
        # Prepare data for DataFrame, ensuring all lists have the same length as sample_ids
        data_for_df = {}
        expected_len = len(metadata['Sample_ID'])
        
        for key, val_list in metadata.items():
            if key != 'Sample_ID' and len(val_list) == expected_len:
                data_for_df[key] = val_list

        # Create DataFrame
        df = pd.DataFrame(data_for_df, index=metadata['Sample_ID'])
        df.index.name = "Sample_ID"
        return df.reset_index()

    return pd.DataFrame()

#  === Extract the metadata
test_series_matrix_file = 'GSE282742_series_matrix.txt'
metadata_df = extract_geo_metadata_to_dataframe(test_series_matrix_file)
metadata_df = metadata_df.rename(columns = {'Library_name' :'Sample_description'})
metadata_df = metadata_df[['Sample_description','disease_state']]
metadata_df = metadata_df.rename(columns={'Sample_description':'samples'})
metadata_df = metadata_df.set_index('samples')
print("--- Sample Metadata DataFrame ---")
# display(metadata_df.head())
print(f"\nDataFrame shape: {metadata_df.shape}")

# === Collect only shared genes
filtered_train_df = train_df[train_df['gene_id'].isin(intersection_genes)].reset_index(drop=True)
filtered_train_df = filtered_train_df.rename(columns={'gene_id':'samples'})
filtered_train_df = filtered_train_df.set_index('samples')
transposed_train_df = filtered_train_df.T
train_normalized = np.log2(transposed_train_df + 1) # Normalization step
print(f"Filtered Train Data shape: {train_normalized.shape}") 

# === Merge gene expression with metadata
print("--- Train with label DataFrame ---")
train_with_label_df = metadata_df.merge(train_normalized, left_index=True, right_index=True)
# display(train_with_label_df.head(5))
print(f"Merged Train Data shape: {train_with_label_df.shape}")

# === Filter high mutual information genes
mi_scores = mutual_info_classif(
        train_normalized.values, 
        train_with_label_df['disease_state'],
        discrete_features=False,
        n_neighbors=3,
        random_state=42
    )

mi_series = pd.Series(mi_scores, index=train_normalized.columns)
top_genes = mi_series.sort_values(ascending=False).head(2000).index
keep_columns = ['disease_state'] + top_genes.tolist()
df_filtered = train_with_label_df[keep_columns]
print("--- Final Train DataFrame ---")
print(f"Final Train Data shape: {df_filtered.shape}")

--- Sample Metadata DataFrame ---

DataFrame shape: (116, 1)
Filtered Train Data shape: (116, 21462)
--- Train with label DataFrame ---
Merged Train Data shape: (116, 21463)
--- Final Train DataFrame ---
Final Train Data shape: (116, 2001)


In [23]:
df_filtered.to_csv('train_mi_data.csv', index=True)

## Test dataset preprocessing

In [27]:
# Preprocess Test Series Matrix
def extract_geo_metadata_to_dataframe(file_path):
    """
    Parses metadata lines (!Sample_title, !Sample_description, 
    !Sample_characteristics_ch1) from a GEO Series Matrix file content 
    and returns a pandas DataFrame.
    """
    
    # 1. Initialize lists to hold the data
    metadata = {}
    
    # 2. Define the exact line prefixes we are interested in
    TARGET_LINES = [
        "!Sample_geo_accession", 
        "!Sample_title",
        "!Sample_description",
        "!Sample_characteristics_ch1"
    ]
    
    # 3. Read content line by line
    with open(file_path, 'r') as f:
        file_content = f.read()

    # 4. Process each line to extract relevant metadata
    for line in file_content.split('\n'):
        # Check if the line is one of our target metadata lines and is not blank
        if line.startswith(tuple(TARGET_LINES)):
            # Split the line by the first tab to separate the field name from the values
            parts = line.split('\t', 1)
            
            if len(parts) < 2:
                continue
            
            field_name = parts[0]
            raw_values = parts[1]
            
            # Use regex to strip the quotes and split by tab
            # This handles cases where values themselves contain spaces
            values = re.findall(r'"(.*?)"', raw_values)
            
            if not values:
                continue

            # Store the Sample Accession IDs (GSMs) separately for column headers
            if field_name == "!Sample_geo_accession":
                sample_ids = values
                metadata['Sample_ID'] = sample_ids
            
            # Handle Sample Characteristics (multiple rows with key: value pairs)
            elif field_name == "!Sample_characteristics_ch1":
                # The characteristic is always stored as "key: value" (e.g., "age: 81y.o.")
                # We need to split the key and value and create new dictionary entries
                
                # The first sample determines the key structure
                if ':' in values[0]:
                    key_value = values[0].split(':', 1)
                    key = key_value[0].strip().replace(' ', '_')
                else:
                    # Skip if the characteristic line doesn't conform (e.g., just 'blood')
                    continue 

                # Store the key-value pairs. Since keys can repeat (e.g., 'tissue', 'disease state', 'age', 'Sex'),
                # we use the value of the first element of the pair as the unique column name
                # E.g., 'disease state' -> 'disease state'
                #       'disease state: Alzheimer\'s disease' -> 'disease state'
                # We use the full label (e.g., 'disease state') as the column name for clarity.
                
                if key not in metadata:
                    metadata[key] = []
                    
                for val in values:
                    # Extract the value part after the first colon
                    try:
                        if key == 'age':
                            # Special handling for age to remove 'y.o.' suffix
                            value_only = val.split(':', 1)[1].strip().replace('y.o.', '').strip()
                        else:
                            value_only = val.split(':', 1)[1].strip()
                        metadata[key].append(value_only)
                    except IndexError:
                        # Handle cases where value might be missing the colon separator
                        metadata[key].append(val)
            
            # Handle single-row metadata like Sample_title and Sample_description
            else:
                # Clean the field name for the column header (e.g., remove '!' and use 'Sample_title')
                col_name = field_name.replace('!', '')
                metadata[col_name] = values
    # print(metadata)
    # 5. Convert the collected dictionary into a DataFrame
    # If the Sample_ID list exists, use it for the index
    if 'Sample_ID' in metadata and metadata['Sample_ID']:
        
        # Prepare data for DataFrame, ensuring all lists have the same length as sample_ids
        data_for_df = {}
        expected_len = len(metadata['Sample_ID'])
        
        for key, val_list in metadata.items():
            if key != 'Sample_ID' and len(val_list) == expected_len:
                data_for_df[key] = val_list

        # Create DataFrame
        df = pd.DataFrame(data_for_df, index=metadata['Sample_ID'])
        df.index.name = "Sample_ID"
        return df.reset_index()

    return pd.DataFrame()

# === Extract the information
test_series_matrix_file = 'GSE249477_series_matrix.txt'
metadata_df = extract_geo_metadata_to_dataframe(test_series_matrix_file)
metadata_df = metadata_df[['Sample_description','disease_state']]
metadata_df = metadata_df.rename(columns={'Sample_description':'samples'})
metadata_df = metadata_df.set_index('samples')
disease_state_mapping_dict = {"Alzheimer's disease": 'AD', 
                              "mild cognitive impairment due to Alzheimer's disease": 'MCI',
                              "cognitively normal control":'C'}
metadata_df['disease_state'] = metadata_df['disease_state'].map(disease_state_mapping_dict)
metadata_df = metadata_df[(metadata_df['disease_state'] != 'C')]
print("--- Sample Metadata DataFrame ---")
# display(metadata_df.head())
print(f"\nDataFrame shape: {metadata_df.shape}")

# === Collect only shared genes
filtered_test_df = test_df[test_df['Identifier'].isin(top_genes.tolist())].reset_index(drop=True)
gene_id_column = 'Identifier' 
test_col_list = filtered_test_df.columns.tolist()
regex_pattern = r".*TPM.*"
gene_expressions = [item for item in test_col_list if re.search(regex_pattern, item, re.IGNORECASE)]
columns_to_select = [gene_id_column] + gene_expressions
test_subset_df = filtered_test_df[columns_to_select].copy()
test_subset_df = test_subset_df.rename(columns={'Identifier':'samples'})
test_subset_df = test_subset_df.set_index('samples')
transposed_test_df = test_subset_df.T
transposed_test_df.index = transposed_test_df.index.str.split(' ').str[0]
test_normalized = np.log2(transposed_test_df + 1) # Normalization step
print(f"Filtered Test Data shape: {test_normalized.shape}") 


# === Merge gene expression with metadata
print("--- Test with label DataFrame ---")
test_with_label_df = metadata_df.merge(test_normalized, left_index=True, right_index=True)
# display(train_with_label_df.head(5))
print(f"Merged Test Data shape: {test_with_label_df.shape}")

--- Sample Metadata DataFrame ---

DataFrame shape: (41, 1)
Filtered Test Data shape: (62, 2000)
--- Test with label DataFrame ---
Merged Test Data shape: (41, 2001)


In [28]:
test_with_label_df.to_csv('test_mi_data.csv', index=True)

In [29]:
# Check evaluation
train_genes = df_filtered.columns.tolist()
test_genes = test_with_label_df.columns.tolist()
shared_genes = list(set(train_genes) & set(test_genes))
print(f"Number of shared genes between Train and Test after MI filtering: {len(shared_genes)}")

Number of shared genes between Train and Test after MI filtering: 2001


## Code for searching gene symbol

In [None]:
CORRECT_FILTER_NAME = 'link_ensembl_gene_id'

def map_ensembl_to_symbol_final(ensembl_ids):
    """
    Maps a list of Ensembl Gene IDs to their HGNC Gene Symbols using 
    the correct filter name ('gene_id').

    Args:
        ensembl_ids (list): A list of Ensembl Gene IDs (e.g., ['ENSG...']).

    Returns:
        pandas.DataFrame: A DataFrame with 'Ensembl_ID' and 'Gene_Symbol' columns.
    """
    try:
        # Use the default server which worked for filter discovery
        server = Server(host='http://www.ensembl.org')
    except Exception as e:
        print(f"Failed to connect to Ensembl server: {e}")
        return pd.DataFrame()

    # 2. Select the Mart and Dataset
    dataset = (server.marts['ENSEMBL_MART_ENSEMBL']
                     .datasets['hsapiens_gene_ensembl'])

    # 3. Define the desired attributes (output columns)
    attributes = ['ensembl_gene_id', 'hgnc_symbol']

    # 4. CRITICAL FIX: Use the discovered filter name 'gene_id'
    filters = {CORRECT_FILTER_NAME: ensembl_ids} 

    print(f"Querying Ensembl BioMart using filter '{CORRECT_FILTER_NAME}'...")
    
    try:
        # 5. Execute the query
        results_df = dataset.query(attributes=attributes, filters=filters)
        
        # Rename columns for clarity 
        results_df.columns = ['gene_id', 'gene_symbol']
        
        # Remove any rows where the Gene_Symbol is blank (unmapped IDs)
        results_df = results_df[results_df['gene_symbol'] != ''].reset_index(drop=True)
        
        print(f"Successfully retrieved mappings for {len(results_df)} IDs.")
        return results_df

    except Exception as e:
        print(f"An error occurred during the BioMart query: {e}")
        return pd.DataFrame()

In [35]:
# Save the DataFrame "genes_id_and_symbol name" to a CSV file
genes_id_and_symbolresult = map_ensembl_to_symbol_final(top_genes.tolist())
genes_id_and_symbol.to_csv('gene_id_and_gene_symbols.csv', index=False)

Querying Ensembl BioMart using filter 'link_ensembl_gene_id'...
An error occurred during the BioMart query: 414 Client Error: Request-URI Too Large for url: http://www.ensembl.org:80/biomart/martservice?query=%3CQuery+virtualSchemaName%3D%22default%22+formatter%3D%22TSV%22+header%3D%221%22+uniqueRows%3D%221%22+datasetConfigVersion%3D%220.6%22%3E%3CDataset+name%3D%22hsapiens_gene_ensembl%22+interface%3D%22default%22%3E%3CAttribute+name%3D%22ensembl_gene_id%22+%2F%3E%3CAttribute+name%3D%22hgnc_symbol%22+%2F%3E%3CFilter+name%3D%22link_ensembl_gene_id%22+value%3D%22ENSG00000211967%2CENSG00000074416%2CENSG00000179546%2CENSG00000146453%2CENSG00000169429%2CENSG00000185220%2CENSG00000180777%2CENSG00000148680%2CENSG00000169057%2CENSG00000064601%2CENSG00000175455%2CENSG00000187994%2CENSG00000110881%2CENSG00000164970%2CENSG00000124333%2CENSG00000211968%2CENSG00000124818%2CENSG00000182600%2CENSG00000255009%2CENSG00000186522%2CENSG00000174010%2CENSG00000064218%2CENSG00000163382%2CENSG00000280196%2C

In [36]:
# Check valid gene symbols
original_list = genes_id_and_symbol['gene_symbol'].unique().tolist()
print(f"Original list length: {len(original_list)}")
# Filter the list
filtered_list = [item for item in original_list if len(item) >= 10]
print(f"Filtered list length (gene symbols >=10 chars): {len(filtered_list)}")

Original list length: 21462
Filtered list length (gene symbols >=10 chars): 1795
