In [1]:

import os
import sys
# Extend the import search path with the project checkout; presumably this is where the
# `utils` package imported further down lives — confirm.
# NOTE(review): this absolute path is machine-specific, so the notebook will not import
# `utils` on another machine unless the same directory layout exists there.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *  # NOTE(review): wildcard import hides where the helpers below come from
# Kept after the wildcard import so this explicit binding of `np` takes precedence
# over anything the wildcard may bring in under the same name.
import numpy as np

# Locate the SOFT file and the series-matrix file of this cohort.
cohort_dir = '/media/techt/DATA/GEO/Cervical_Cancer/GSE192897'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Line prefixes passed to the reader: the first list selects series-level background
# lines, the second selects per-sample accession and characteristics lines.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Collect the unique values of each row of the clinical table for inspection.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Print the background text and the characteristics dictionary, each under its heading.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Identifying molecular changes in early cervical cancer samples of patients that developed metastasis"
!Series_summary	"Cervical cancer is one of the most common cancers in women worldwide. Patients  diagnosed with early-stage cervical cancer have a good prognosis, however, 10-20%  suffer from local or distant recurrent disease after primary treatment. Treatment options for  recurrent cervical cancer are limited. Therefore, it is crucial to identify factors that can  predict patients with an increased risk of recurrence to optimize treatment to prevent the  recurrence of cervical cancer. We aimed to identify biomarkers in early-stage primary  cervical cancer which recurred after surgery. Formalin-Fixed, Paraffin-Embedded surgical  specimens of 34 patients with early-stage cervical cancer (FIGO 2009 stage 1B1) and 7  healthy controls were analyzed. Targeted gene expression profiling using the PanCancer  IO 360 panel of NanoString Technology was perf

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Converters start out undefined; they are bound to real functions when applicable.
convert_trait = None
convert_age = None
convert_gender = None

# Gene expression availability: the series summary describes targeted gene expression
# profiling, so expression data is treated as available.
is_gene_available = True

# Unique values per sample-characteristics row, keyed by row index.
sample_characteristics = {
    0: ['tissue: cervix'],
    1: ['stage at last follow-up: Alive', 'stage at last follow-up: Death', 'stage at last follow-up: n/a'],
    2: ['age: 54', 'age: 37', 'age: 41', 'age: 60', 'age: 47', 'age: 59', 'age: 42', 'age: 43', 'age: 35', 'age: 65', 'age: 39', 'age: 32', 'age: 44', 'age: 55', 'age: 38', 'age: 40', 'age: 45', 'age: 61', 'age: 33', np.nan],
    3: ['surgical approach: Open', 'surgical approach: Robot', np.nan],
    4: ['histological subtype: adeno', 'histological subtype: squamous', np.nan],
    5: ['lvsi: no', 'lvsi: yes', 'lvsi: 0', 'lvsi: 1', np.nan],
    6: ['tnm-stage: 1b1', 'tnm-stage: 1b1 N1', 'tnm-stage: 1b2', np.nan],
    7: ['hpv: 18', 'hpv: 16', 'hpv: 68 or 73', 'hpv: no HPV', 'hpv: 16 en 18', 'hpv: 17', 'hpv: 52', np.nan],
    8: ['adjuvant therapy: radiotherapy', 'adjuvant therapy: chemoradiation', 'adjuvant therapy: Follow up', np.nan],
    9: ['time to recurrence: 90', 'time to recurrence: 89', 'time to recurrence: 22', 'time to recurrence: 39', 'time to recurrence: 45', 'time to recurrence: 23', 'time to recurrence: 13', 'time to recurrence: 10', 'time to recurrence: 57', 'time to recurrence: 12', 'time to recurrence: 17', 'time to recurrence: 15', 'time to recurrence: 11', 'time to recurrence: 55', 'time to recurrence: 73', 'time to recurrence: 139', 'time to recurrence: 18', np.nan]
}

# A row is only selected when it holds more than one distinct value.
# NOTE(review): row 1 records vital status at last follow-up (Alive / Death / n/a), yet it
# is used as the trait row for 'Cervical_Cancer'. The 'n/a' entries look like the healthy
# controls mentioned in the series summary — confirm this is the intended trait definition.
trait_row = None
if len(set(sample_characteristics[1])) > 1:
    trait_row = 1

age_row = None
if len(set(sample_characteristics[2])) > 1:
    age_row = 2

# None of the characteristics rows above carries gender information.
gender_row = None

# Step 3: Define data type conversion functions
def convert_trait(value):
    """Map a 'stage at last follow-up: <status>' cell to a binary label.

    Args:
        value: Raw sample-characteristics cell, expected in the form 'key: value'.
            Missing cells (None, NaN or any other non-string) are accepted.

    Returns:
        0 when the status is 'Alive', 1 when it is 'Death', and None for any
        other status (e.g. 'n/a'), for cells without a 'key:' prefix, and for
        non-string input.

    NOTE(review): the label encodes vital status at last follow-up, while the
    column it feeds is named 'Cervical_Cancer' — confirm this is the intended
    trait definition.
    """
    # Missing cells arrive as NaN/None rather than text; treat them as unknown
    # instead of failing on a string method.
    if not isinstance(value, str):
        return None

    # Split on the first colon only, so a colon inside the value cannot break
    # the key/value unpacking.
    _, separator, status = value.partition(':')
    if not separator:
        return None

    status = status.strip()
    if status == 'Alive':
        return 0
    if status == 'Death':
        return 1
    return None

def convert_age(value):
    """Parse an 'age: <number>' cell into a float.

    Args:
        value: Raw sample-characteristics cell, expected in the form 'age: 54'.
            Missing cells (None, NaN or any other non-string) are accepted.

    Returns:
        The age as a float, or None when the cell is missing, has no 'key:'
        prefix, or does not hold a finite number.
    """
    # Missing cells arrive as NaN/None rather than text.
    if not isinstance(value, str):
        return None

    _, separator, raw_age = value.partition(':')
    if not separator:
        return None

    # Catch only the parse failure; a bare `except` would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    try:
        age = float(raw_age)
    except ValueError:
        return None

    # float() accepts 'nan' and 'inf'; neither is a usable age. NaN never
    # equals itself, and abs() folds both infinities into one comparison.
    if age != age or abs(age) == float('inf'):
        return None
    return age

def convert_gender(value):
    """Placeholder converter: gender is not available in this dataset, so every input maps to None."""
    return None

# Save cohort metadata: record whether gene expression data and a trait row are available.
cohort_info_path = './preprocessed/Cervical_Cancer/cohort_info.json'
# Ensure the output directory exists up front; whether save_cohort_info creates it
# itself is not visible from this notebook, and creating it twice is harmless.
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE192897', cohort_info_path, is_gene_available, trait_row is not None)

# Step 4: Extract and save clinical features if available
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Cervical_Cancer', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Cervical_Cancer/trait_data/GSE192897.csv'
    # to_csv does not create missing parent directories; without this the write
    # fails on a fresh checkout where 'trait_data/' does not exist yet.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM5768223': [0.0, 54.0], 'GSM5768224': [1.0, 37.0], 'GSM5768225': [0.0, 41.0], 'GSM5768226': [1.0, 60.0], 'GSM5768227': [0.0, 47.0], 'GSM5768228': [1.0, 59.0], 'GSM5768229': [1.0, 42.0], 'GSM5768230': [1.0, 41.0], 'GSM5768231': [0.0, 60.0], 'GSM5768232': [0.0, 43.0], 'GSM5768233': [0.0, 35.0], 'GSM5768234': [0.0, 37.0], 'GSM5768235': [1.0, 65.0], 'GSM5768236': [0.0, 39.0], 'GSM5768237': [0.0, 32.0], 'GSM5768238': [0.0, 44.0], 'GSM5768239': [0.0, 55.0], 'GSM5768240': [1.0, 38.0], 'GSM5768241': [0.0, 43.0], 'GSM5768242': [1.0, 41.0], 'GSM5768243': [1.0, 37.0], 'GSM5768244': [1.0, 37.0], 'GSM5768245': [0.0, 44.0], 'GSM5768246': [0.0, 40.0], 'GSM5768247': [0.0, 45.0], 'GSM5768248': [0.0, 40.0], 'GSM5768249': [0.0, 45.0], 'GSM5768250': [0.0, 55.0], 'GSM5768251': [0.0, 39.0], 'GSM5768252': [0.0, 61.0], 'GSM5768253': [0.0, 33.0], 'GSM5768254': [0.0, 35.0], 'GSM5768255': [0.0, 40.0], 'GSM5768256': [0.0, 32.0], 'GSM5768257': [None, None], 'GSM5768258': [None, None], 'GSM5768259': [None, None

### Step 3: Gene Data Extraction

In [4]:
# 1. Load the gene expression table from the series-matrix file located in Step 1.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row identifiers; the next step inspects them to decide
#    whether they need to be mapped to gene symbols.
print(gene_data.index[:20])


Index(['A2M', 'ACVR1C', 'ADAM12', 'ADGRE1', 'ADM', 'ADORA2A', 'AKT1', 'ALDOA',
       'ALDOC', 'ANGPT1', 'ANGPT2', 'ANGPTL4', 'ANLN', 'APC', 'APH1B', 'API5',
       'APLNR', 'APOE', 'APOL6', 'AQP9'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row identifiers printed in the previous step ('A2M', 'ACVR1C', 'ADAM12', ...) already
# look like gene symbols rather than probe IDs, so no identifier-to-gene mapping is requested.
requires_gene_mapping = False


### Step 5: Gene Annotation (Conditional)

In [6]:
# Read the gene annotation table from the SOFT file.
gene_annotation = get_gene_annotation(soft_file)

# Show a short preview of the annotation under its heading.
print("Gene annotation preview:", preview_df(gene_annotation), sep="\n")

# requires_gene_mapping is False, so STEP6 (identifier mapping) is skipped and the
# notebook continues with the next step.


Gene annotation preview:
{'ID': ['A2M', 'ABCF1', 'ACVR1C', 'ADAM12', 'ADGRE1'], 'ORF': ['A2M', 'ABCF1', 'ACVR1C', 'ADAM12', 'ADGRE1'], 'SPOT_ID': [nan, nan, nan, nan, nan]}


### Step 7: Data Normalization and Merging

In [7]:
# 1. Normalize the gene symbols in the index of the expression data and save the result.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Cervical_Cancer/gene_data/GSE192897.csv'
# to_csv does not create missing parent directories; without this the write fails
# on a fresh checkout where 'gene_data/' does not exist yet.
os.makedirs(os.path.dirname(gene_csv_path), exist_ok=True)
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
# NOTE: selected_clinical_data is only assigned in Step 2 when trait_row is not None,
# so this cell relies on a trait row having been found.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and any demographic attributes in the data are severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Cervical_Cancer')

# 4. Save the cohort information.
save_cohort_info('GSE192897', './preprocessed/Cervical_Cancer/cohort_info.json', True, True, trait_biased, merged_data)

# 5. If the trait is not severely biased, save the merged data to a csv file.
if not trait_biased:
    csv_path = './preprocessed/Cervical_Cancer/GSE192897.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Cervical_Cancer', the least common label is '1.0' with 10 occurrences. This represents 29.41% of the dataset.
The distribution of the feature 'Cervical_Cancer' in this dataset is fine.

Quartiles for 'Age':
  25%: 37.25
  50% (Median): 41.0
  75%: 46.5
Min: 32.0
Max: 65.0
The distribution of the feature 'Age' in this dataset is fine.

A new JSON file was created at: ./preprocessed/Cervical_Cancer/cohort_info.json
