In [1]:

import os
import sys

sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Resolve the soft file and the matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Werner_Syndrome/GSE62877'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read background information and sample characteristics from the matrix
#    file, selected by the line prefixes listed below
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information and the sample characteristics
#    dictionary, each under its own heading line
print(
    "Background Information:",
    background_info,
    "Sample Characteristics Dictionary:",
    sample_characteristics_dict,
    sep="\n",
)


Background Information:
!Series_title	"Werner syndrome WRN helicase alters gene expression in a G-quadruplex DNA-dependent manner to antagonize a pro-senescence gene expression program"
!Series_summary	"Werner syndrome (WS) is a human adult progeroid syndrome caused by loss-of-function mutations in the WRN RECQ helicase gene. We analyzed mRNA and miRNA expression in fibroblasts from WS patients and in fibroblasts depleted of WRN protein in order to determine the role of WRN in transcription regulation, and to identify genes and miRNAs that might drive WS disease pathogenesis. Genes altered in WS cells participate in cellular growth, proliferation and survival; in tRNA charging and in oncogenic signaling; and in connective tissue and developmental networks. Genes down-regulated in WS cells were highly enriched in Gquadruplex (G4) DNA motifs, indicating G4 motifs are physiologic substrates for WRN. In contrast, there was a remarkable, coordinate up-regulation of nearly all of the cytopla

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The series summary describes mRNA (and miRNA) expression profiling, so the
# dataset is treated as containing gene expression data.
is_gene_available = True


def _characteristic_values(row):
    """Return the string values stored for `row` in sample_characteristics_dict.

    A row that is absent yields an empty list instead of raising, and
    non-string entries (e.g. NaN) are dropped so substring checks cannot fail
    with a TypeError.
    Assumes each entry is an iterable of values (presumably a list) -- TODO
    confirm against get_unique_values_by_row.
    """
    try:
        values = sample_characteristics_dict[row]
    except (KeyError, IndexError):
        return []
    return [v for v in values if isinstance(v, str)]


def _first_row_with(field, candidate_rows):
    """Return the first row in `candidate_rows` with a value containing `field`, else None."""
    for row in candidate_rows:
        if any(field in v for v in _characteristic_values(row)):
            return row
    return None


# Identify the keys for each variable (None means the variable is unavailable).
# The candidate rows are the same ones the original hard-coded checks probed.
trait_row = 2 if 'group: WRN' in _characteristic_values(2) else None
age_row = _first_row_with('age:', (1, 4))
gender_row = _first_row_with('gender:', (2, 5))

# Define conversion functions
def convert_trait(value):
    """Convert a 'key: value' characteristic string into a binary trait label.

    The text between the first and second colon is stripped and lower-cased,
    then mapped: 'wrn' -> 1, 'control' -> 0.

    Returns None when `value` is not a string (e.g. None or NaN), has no
    colon, or carries any other label.
    """
    # Missing cells may arrive as None/NaN; treat them as unknown instead of
    # failing on `.split`.
    if not isinstance(value, str):
        return None
    parts = value.split(':')
    if len(parts) < 2:
        return None
    label = parts[1].strip().lower()
    return {'wrn': 1, 'control': 0}.get(label)

def convert_age(value):
    """Convert a 'key: value' characteristic string into a numeric age.

    The text between the first and second colon is stripped and parsed with
    float().

    Returns None when `value` is not a string (e.g. None or NaN), has no
    colon, or the extracted text is not a valid float literal.
    """
    # Missing cells may arrive as None/NaN; treat them as unknown instead of
    # failing on `.split`.
    if not isinstance(value, str):
        return None
    parts = value.split(':')
    if len(parts) < 2:
        return None
    try:
        return float(parts[1].strip())
    except ValueError:
        return None

def convert_gender(value):
    """Convert a 'key: value' characteristic string into a binary gender code.

    The text between the first and second colon is stripped and lower-cased,
    then mapped: 'm'/'male' -> 1, 'f'/'female' -> 0.

    Returns None when `value` is not a string (e.g. None or NaN), has no
    colon, or carries any other label.
    """
    # Missing cells may arrive as None/NaN; treat them as unknown instead of
    # failing on `.split`.
    if not isinstance(value, str):
        return None
    parts = value.split(':')
    if len(parts) < 2:
        return None
    label = parts[1].strip().lower()
    if label in ('m', 'male'):
        return 1
    if label in ('f', 'female'):
        return 0
    return None

# Save cohort information; the last argument records whether a trait row was found
save_cohort_info('GSE62877', './preprocessed/Werner_Syndrome/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction (skipped entirely when no trait row was identified)
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Werner_Syndrome', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Werner_Syndrome/trait_data/GSE62877.csv'
    # to_csv does not create missing parent directories, so make sure the
    # trait_data folder exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Werner_Syndrome/cohort_info.json


### Step 3: Gene Data Extraction

In [4]:
# 1. Load the gene data from the matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# 2. Show the first 20 row ids; the next step inspects them.
leading_ids = gene_data.index[:20]
print(leading_ids)


Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in Step 3 ('1', '2', '3', ...) are plain numeric strings
# rather than gene symbols, so the identifiers are flagged as needing mapping.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Obtain the gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Build a preview of the annotation data and print it under a heading line.
annotation_preview = preview_df(gene_annotation)
print("Gene annotation preview:", annotation_preview, sep="\n")


Gene annotation preview:
{'ID': ['2315100', '2315106', '2315109', '2315111', '2315113'], 'GB_LIST': ['NR_024005,NR_034090,NR_024004,AK093685', 'DQ786314', nan, nan, 'DQ786265'], 'SPOT_ID': ['chr1:11884-14409', 'chr1:14760-15198', 'chr1:19408-19712', 'chr1:25142-25532', 'chr1:27563-27813'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': ['11884', '14760', '19408', '25142', '27563'], 'RANGE_STOP': ['14409', '15198', '19712', '25532', '27813'], 'total_probes': ['20', '8', '4', '4', '4'], 'gene_assignment': ['NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 9 // 15q26.3 // 100288486 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// AK093685 // DDX11L2 // D

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Annotation keys chosen from the preview: 'ID' for the identifiers and
#    'gene_assignment' for the gene symbols
identifier_key, gene_symbol_key = 'ID', 'gene_assignment'

# 2. Build the dataframe that maps probe IDs to genes
gene_mapping = get_gene_mapping(
    gene_annotation,
    identifier_key,
    gene_symbol_key,
)

# 3. Apply the mapping to obtain the gene expression dataframe
# NOTE(review): the expression row ids printed in Step 3 begin '1', '2', ...
# while the annotation preview shows IDs such as '2315100' -- confirm that the
# two identifier sets actually overlap.
# NOTE(review): `gene_data` is rebound from itself, so re-running this cell
# passes the already-mapped frame back into apply_gene_mapping.
gene_data = apply_gene_mapping(gene_data, gene_mapping)
