In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the SOFT file and the series-matrix file inside the cohort directory.
cohort_dir = '/media/techt/DATA/GEO/Esophageal_Cancer/GSE75241'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Prefixes passed to the matrix-file reader: the first group selects the
# series-level background lines, the second the per-sample clinical lines.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# Summarise the clinical table as {row index: unique values} for inspection.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print each piece of context under its own heading.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression profile of esophageal squamous cell carcinoma"
!Series_summary	"The goal was to identify the differently expressed genes between esophageal tumor and nonmalignant surrounding mucosa"
!Series_overall_design	"15 paired ESCC samples and matched nonmalignant mucosa were analyzed"
Sample Characteristics Dictionary:
{0: ['patient: 1', 'patient: 2', 'patient: 3', 'patient: 4', 'patient: 5', 'patient: 6', 'patient: 7', 'patient: 8', 'patient: 9', 'patient: 10', 'patient: 11', 'patient: 12', 'patient: 14', 'patient: 15', 'patient: 16'], 1: ['tissue: nonmalignant surrounding mucosa', 'tissue: esophageal tumor'], 2: [nan, 'tumor differentiation: poor', 'tumor differentiation: moderate', 'tumor differentiation: well']}


### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# The series is a gene expression profile, so gene data is treated as available.
is_gene_available = True
# Row 1 of the characteristics dictionary holds the tissue type
# (esophageal tumor vs. nonmalignant surrounding mucosa), which encodes the trait.
trait_row = 1
# Neither age nor gender appears in the sample characteristics dictionary.
age_row = gender_row = None

# Define the conversion functions
def convert_trait(value):
    """Map a raw tissue characteristic to a binary esophageal-cancer label.

    Args:
        value: A sample-characteristics cell such as
            'tissue: esophageal tumor'. Non-string input (for example a
            NaN or None standing in for a missing cell) is tolerated.

    Returns:
        1 when the text mentions "esophageal tumor", 0 when it mentions
        "nonmalignant surrounding mucosa", and None when the value is
        missing or matches neither label.
    """
    # A substring test on a float NaN / None raises TypeError, so treat any
    # non-string cell as "unknown" instead of crashing the extraction.
    if not isinstance(value, str):
        return None
    if "esophageal tumor" in value:
        return 1
    if "nonmalignant surrounding mucosa" in value:
        return 0
    return None

def convert_age(value):
    """Placeholder age converter for a cohort with no age information.

    The argument is ignored; every call yields None.
    """
    return

def convert_gender(value):
    """Placeholder gender converter for a cohort with no gender information.

    The argument is ignored; every call yields None.
    """
    return

# Save cohort information: record gene-data availability and whether a trait
# row was identified for this cohort.
save_cohort_info('GSE75241', './preprocessed/Esophageal_Cancer/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
selected_clinical_data = geo_select_clinical_features(clinical_data, 'Esophageal_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
csv_path = './preprocessed/Esophageal_Cancer/trait_data/GSE75241.csv'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
selected_clinical_data.to_csv(csv_path)
print(preview_df(selected_clinical_data))


{'GSM1946756': [0], 'GSM1946757': [1], 'GSM1946758': [0], 'GSM1946759': [1], 'GSM1946760': [0], 'GSM1946761': [1], 'GSM1946762': [0], 'GSM1946763': [1], 'GSM1946764': [0], 'GSM1946765': [1], 'GSM1946766': [0], 'GSM1946767': [1], 'GSM1946768': [0], 'GSM1946769': [1], 'GSM1946770': [0], 'GSM1946771': [1], 'GSM1946772': [0], 'GSM1946773': [1], 'GSM1946774': [0], 'GSM1946775': [1], 'GSM1946776': [0], 'GSM1946777': [1], 'GSM1946778': [0], 'GSM1946779': [1], 'GSM1946780': [0], 'GSM1946781': [1], 'GSM1946782': [0], 'GSM1946783': [1], 'GSM1946784': [0], 'GSM1946785': [1]}


### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
# `matrix_file` is assigned in the Step 1 cell, so that cell must have been run first.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
# These row ids are what Step 4 inspects to decide whether gene mapping is required.
print(gene_data.index[:20])


Index(['2315554', '2315633', '2315674', '2315739', '2315894', '2315918',
       '2315951', '2316218', '2316245', '2316379', '2316558', '2316605',
       '2316746', '2316905', '2316953', '2317246', '2317317', '2317434',
       '2317472', '2317512'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row ids printed in Step 3 (e.g. '2315554') are numeric identifiers rather than
# gene symbols, so they are flagged as needing a mapping to genes.
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
# `soft_file` is assigned in the Step 1 cell, so that cell must have been run first.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
# The preview exposes the annotation columns (e.g. 'ID', 'gene_assignment') that Step 6 picks its keys from.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['2315100', '2315106', '2315109', '2315111', '2315113'], 'GB_LIST': ['NR_024005,NR_034090,NR_024004,AK093685', 'DQ786314', nan, nan, 'DQ786265'], 'SPOT_ID': ['chr1:11884-14409', 'chr1:14760-15198', 'chr1:19408-19712', 'chr1:25142-25532', 'chr1:27563-27813'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': ['11884', '14760', '19408', '25142', '27563'], 'RANGE_STOP': ['14409', '15198', '19712', '25532', '27813'], 'total_probes': ['20', '8', '4', '4', '4'], 'gene_assignment': ['NR_024005 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// NR_034090 // DDX11L9 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 9 // 15q26.3 // 100288486 /// NR_024004 // DDX11L2 // DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide 11 like 2 // 2q13 // 84771 /// AK093685 // DDX11L2 // D

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Identify the keys storing probe IDs and gene symbols
# 'ID' is the annotation column of numeric ids in the same style as the gene_data row ids;
# 'gene_assignment' holds ' // '-separated text that includes the gene symbol.
identifier_key = 'ID'
gene_symbol_key = 'gene_assignment'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to obtain the gene expression dataframe
# NOTE(review): this overwrites `gene_data` from Step 3 in place. Re-running this cell
# without re-running Step 3 would presumably feed already-mapped data back into the
# mapping - confirm, and re-run Step 3 first when repeating this step.
gene_data = apply_gene_mapping(gene_data, mapping_df)

# Preview the mapped expression values per sample.
print(preview_df(gene_data))


{'GSM1946756': [8.047910212076582, 8.291397941860465, 8.07868114224138, 9.352761222222222, 8.106042998585572], 'GSM1946757': [8.203152427098674, 8.419225224418605, 8.158900000000001, 9.398239555555556, 8.21486630834512], 'GSM1946758': [8.029133254786451, 8.28807554011628, 8.063632327586207, 9.314703555555557, 8.089297284299859], 'GSM1946759': [8.253346886597937, 8.344949365697675, 8.107376745689656, 9.310331777777778, 8.15006135785007], 'GSM1946760': [7.958182530191458, 8.17378798488372, 8.00489926724138, 9.074196, 8.002949151343705], 'GSM1946761': [8.289586662739323, 8.527258953488372, 8.363034827586207, 9.251420666666668, 8.458198132956152], 'GSM1946762': [7.923090097201767, 8.128155701744186, 7.976488512931034, 9.140863555555557, 7.931781202263083], 'GSM1946763': [8.136237508100148, 8.247879956395348, 8.113118297413793, 9.160349666666667, 8.147282475247525], 'GSM1946764': [8.15346029455081, 8.466311275581397, 8.262526681034483, 9.662303444444444, 8.249127482319661], 'GSM1946765': [8