In [1]:

import sys
# Extend the module search path so the `from utils.preprocess import *` in the
# next cell can resolve (presumably the `utils` package lives here — confirm).
# NOTE(review): absolute, machine-specific path; the notebook will not import
# its helpers on another machine without editing this line.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
# NOTE(review): absolute, machine-specific path; every later cell depends on
# `soft_file` and `matrix_file` derived from it.
cohort_dir = '/media/techt/DATA/GEO/Hutchinson-Gilford_Progeria_Syndrome/GSE84360'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
# The prefix lists name the matrix-file line types handed to the helper
# (series-level text vs. per-sample accession/characteristics rows).
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
# The printed result is keyed by row index (0..3 here), each mapping to the
# list of distinct 'field: value' strings seen in that row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
# This output is what Step 2 reads to choose the trait/age/gender rows.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Reprogramming Hutchinson-Gilford Progeria Syndrome fibroblasts resets epigenomic landscape in patient-derived induced pluripotent stem cells Jan 01, 2018   pending   None"
!Series_summary	"This SuperSeries is composed of the SubSeries listed below."
!Series_overall_design	"Refer to individual Series"
Sample Characteristics Dictionary:
{0: ['Sex: Male', 'Sex: Female', 'Sex: ?'], 1: ['cell line: HGADFN003', 'cell line: HGMDFN090', 'cell line: HGADFN167', 'cell line: HGFDFN168', 'cell line: AG01972', 'cell line: BJ1', 'cell line: H9'], 2: ['condition: HGPS', 'condition: Normal'], 3: ['cell type: iPSC', 'cell type: Vascular Smooth Muscle', 'cell type: Fibroblast', 'cell type: Embryonic Stem Cell']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Pessimistic defaults: nothing is assumed available until established below.
is_gene_available = False
trait_row = None
age_row = None
gender_row = None
convert_trait = None
convert_age = None
convert_gender = None

# 1. Gene Expression Data Availability
# The matrix file is assumed to hold gene expression data unless stated otherwise.
is_gene_available = True

# 2. Variable Availability and Data Type Conversion
# Copy of the sample characteristics dictionary printed in Step 1.
sample_characteristics = {
    0: ['Sex: Male', 'Sex: Female', 'Sex: ?'],
    1: ['cell line: HGADFN003', 'cell line: HGMDFN090', 'cell line: HGADFN167', 'cell line: HGFDFN168', 'cell line: AG01972', 'cell line: BJ1', 'cell line: H9'],
    2: ['condition: HGPS', 'condition: Normal'],
    3: ['cell type: iPSC', 'cell type: Vascular Smooth Muscle', 'cell type: Fibroblast', 'cell type: Embryonic Stem Cell']
}


def _distinct_labels(entries, ignored=()):
    """Collect the distinct second ': '-separated field of each entry.

    Entries without a ': ' separator are skipped, as are labels listed in
    `ignored`.
    """
    labels = set()
    for entry in entries:
        pieces = entry.split(": ")
        if len(pieces) > 1 and pieces[1] not in ignored:
            labels.add(pieces[1])
    return labels


# Hutchinson-Gilford_Progeria_Syndrome: row 2 is usable only if it distinguishes
# at least two conditions.
if len(_distinct_labels(sample_characteristics[2])) > 1:
    trait_row = 2
else:
    trait_row = None

# Age is not present in the sample characteristics, hence not available.
age_row = None

# Gender: row 0 is usable only if, ignoring the unknown marker '?', at least
# two distinct sexes remain.
if len(_distinct_labels(sample_characteristics[0], ignored=['?'])) > 1:
    gender_row = 0
else:
    gender_row = None

# Data Type Conversion Functions

def convert_trait(value):
    """Convert a 'condition: <label>' entry to a binary HGPS indicator.

    Args:
        value: Raw sample-characteristics cell, e.g. 'condition: HGPS'.

    Returns:
        1 for 'HGPS', 0 for 'Normal', and None for anything else: non-string
        cells (such as NaN or None), entries without a ': ' separator, or
        unrecognised labels.
    """
    # Non-string cells have no .split(); previously they raised AttributeError,
    # which the IndexError-only handler did not catch.
    if not isinstance(value, str):
        return None
    parts = value.split(": ")
    if len(parts) < 2:
        return None
    # Strip so stray whitespace around the label does not turn a valid
    # condition into a missing value.
    return {"HGPS": 1, "Normal": 0}.get(parts[1].strip())

def convert_gender(value):
    """Convert a 'Sex: <label>' entry to a binary gender code.

    Args:
        value: Raw sample-characteristics cell, e.g. 'Sex: Male'.

    Returns:
        1 for 'Male', 0 for 'Female', and None for anything else: the unknown
        marker '?', non-string cells (such as NaN or None), entries without a
        ': ' separator, or unrecognised labels.
    """
    # Non-string cells have no .split(); previously they raised AttributeError,
    # which the IndexError-only handler did not catch.
    if not isinstance(value, str):
        return None
    parts = value.split(": ")
    if len(parts) < 2:
        return None
    # '?' needs no explicit entry: any label missing from the mapping is None.
    return {"Male": 1, "Female": 0}.get(parts[1].strip())

import os

# Age data not available, hence no conversion function needed
convert_age = None

# Output locations for this cohort, defined once so the metadata file and the
# trait CSV stay under the same root.
output_dir = './preprocessed/Hutchinson-Gilford_Progeria_Syndrome'
cohort_info_path = './preprocessed/Hutchinson-Gilford_Progeria_Syndrome/cohort_info.json'
csv_path = './preprocessed/Hutchinson-Gilford_Progeria_Syndrome/trait_data/GSE84360.csv'

# Saving metadata
# Ensure the output root exists first; harmless if save_cohort_info already
# creates it itself.
os.makedirs(output_dir, exist_ok=True)
save_cohort_info('GSE84360', cohort_info_path, is_gene_available, trait_row is not None)

# Clinical Feature Extraction if trait data is available
if trait_row is not None:
    try:
        selected_clinical_data = geo_select_clinical_features(clinical_data, 'Hutchinson-Gilford_Progeria_Syndrome', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
        # DataFrame.to_csv does not create missing parent directories; without
        # this the write fails on a fresh checkout and the handler below
        # reduces the failure to a printed message.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        selected_clinical_data.to_csv(csv_path)
        print(preview_df(selected_clinical_data))
    except Exception as e:
        # Best-effort by design: report the failure and let the notebook continue.
        print(f"Error during clinical feature extraction: {e}")


{'GSM2232606': [1, 1], 'GSM2232607': [1, 1], 'GSM2232608': [1, 1], 'GSM2232609': [1, 1], 'GSM2232610': [1, 1], 'GSM2232611': [1, 1], 'GSM2232612': [1, 1], 'GSM2232613': [1, 1], 'GSM2232614': [1, 1], 'GSM2232615': [0, 0], 'GSM2232616': [0, 0], 'GSM2232617': [0, 0], 'GSM2232618': [0, 0], 'GSM2232619': [0, 0], 'GSM2232620': [0, 0], 'GSM2232621': [0, 0], 'GSM2232622': [0, 0], 'GSM2232623': [0, 0], 'GSM2232624': [1, 1], 'GSM2232625': [1, 1], 'GSM2232626': [1, 1], 'GSM2232627': [1, 1], 'GSM2232628': [1, 1], 'GSM2232629': [1, 1], 'GSM2232630': [1, 1], 'GSM2232631': [1, 1], 'GSM2232632': [1, 1], 'GSM2232633': [0, 1], 'GSM2232634': [0, 1], 'GSM2232635': [0, 1], 'GSM2232636': [0, 1], 'GSM2232637': [0, 1], 'GSM2232638': [0, 1], 'GSM2232639': [0, 1], 'GSM2232640': [0, 1], 'GSM2232641': [1, 0], 'GSM2232642': [1, 0], 'GSM2232643': [1, 0], 'GSM2232644': [1, 0], 'GSM2232645': [1, 0], 'GSM2232646': [1, 0], 'GSM2232647': [1, 0], 'GSM2232648': [1, 0], 'GSM2232649': [0, 1], 'GSM2232650': [0, 1], 'GSM22326

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
# `matrix_file` comes from the Step 1 cell, so that cell must have been run first.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
# The ids are inspected in Step 4 to decide whether they need mapping to genes.
print(gene_data.index[:20])


Index(['16650001', '16650003', '16650005', '16650007', '16650009', '16650011',
       '16650013', '16650015', '16650017', '16650019', '16650021', '16650023',
       '16650025', '16650027', '16650029', '16650031', '16650033', '16650035',
       '16650037', '16650041'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in Step 3 are numeric strings (e.g. '16650001') rather
# than gene symbols, so the expression rows are mapped via the annotation in
# Steps 5-6.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
# `soft_file` comes from the Step 1 cell.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
# The previewed column names are what Step 6 chooses its identifier and gene keys from.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['16657436', '16657440', '16657445', '16657447', '16657450'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': [12190.0, 29554.0, 69091.0, 160446.0, 317811.0], 'RANGE_END': [13639.0, 31109.0, 70008.0, 161525.0, 328581.0], 'total_probes': [25.0, 28.0, 8.0, 13.0, 36.0], 'GB_ACC': ['NR_046018', nan, nan, nan, 'NR_024368'], 'SPOT_ID': ['chr1:12190-13639', 'chr1:29554-31109', 'chr1:69091-70008', 'chr1:160446-161525', 'chr1:317811-328581'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10']}


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the keys for the identifier and gene symbol
# NOTE(review): in the Step 5 preview, 'GB_ACC' holds accession-style values
# (e.g. 'NR_046018') and NaN, not gene symbols, and no symbol column is visible
# among the previewed columns. Confirm that apply_gene_mapping can work with
# accessions, or obtain symbols from another annotation source.
identifier_key = 'ID'
gene_symbol_key = 'GB_ACC'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to get the gene expression dataframe
# NOTE(review): this rebinds `gene_data` from the probe-level frame of Step 3 to
# the mapped frame, so re-running this cell alone would feed already-mapped
# data back in; re-run Step 3 first.
gene_data = apply_gene_mapping(gene_data, mapping_df)
