# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

USER = "Jiayi"
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'   
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = 'Cervical Cancer'

OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


# 2. Data preprocessing and selection

## 2.1. The TCGA Xena dataset

In TCGA Xena, there is either zero or one cohort related to the trait. We search the names of subdirectories to see if any matches the trait. If a match is found, we directly obtain the file paths.

In [2]:
dataset = 'TCGA'
dataset_dir = os.path.join(DATA_ROOT, dataset)
os.listdir(dataset_dir)[:10]

['TCGA_Adrenocortical_Cancer_(ACC)',
 'TCGA_Breast_Cancer_(BRCA)',
 'TCGA_Cervical_Cancer_(CESC)',
 'TCGA_Kidney_Chromophobe_(KICH)',
 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)',
 'TCGA_Lower_Grade_Glioma_(LGG)',
 'TCGA_Melanoma_(SKCM)',
 'TCGA_Mesothelioma_(MESO)',
 'TCGA_Testicular_Cancer_(TGCT)',
 'TCGA_Uterine_Carcinosarcoma_(UCS)']

In [3]:
trait_subdir = "TCGA_Cervical_Cancer_(CESC)"
cohort = 'Xena'
trait_type = 'binary'
is_available = True

cohort_dir = os.path.join(DATA_ROOT, dataset, trait_subdir)
clinical_data_file, genetic_data_file = xena_get_relevant_filepaths(cohort_dir)
clinical_data_file, genetic_data_file

('/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Cervical_Cancer_(CESC)\\TCGA.CESC.sampleMap_CESC_clinicalMatrix',
 '/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Cervical_Cancer_(CESC)\\TCGA.CESC.sampleMap_HiSeqV2_PANCAN.gz')

In [4]:
import pandas as pd

clinical_data = pd.read_csv(clinical_data_file, sep='\t', index_col=0)
genetic_data = pd.read_csv(genetic_data_file, compression='gzip', sep='\t', index_col=0)
age_col = gender_col = None

In [5]:
_, clinical_data_cols = check_rows_and_columns(clinical_data)
clinical_data_cols[:10]

['_INTEGRATION',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'additional_pharmaceutical_therapy',
 'additional_radiation_therapy',
 'additional_treatment_completion_success_outcome',
 'adjuvant_rad_therapy_prior_admin',
 'age_at_initial_pathologic_diagnosis']

Read all the column names in the clinical dataset, to find the columns that record information about age or gender.
Reference prompt:

In [6]:
f'''
Below is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:
candidate_age_cols = [col_name1, col_name2, ...]
candidate_gender_cols = [col_name1, col_name2, ...]
If no columns match a criterion, please provide an empty list.

Column names:
{clinical_data_cols}
'''

"\nBelow is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:\ncandidate_age_cols = [col_name1, col_name2, ...]\ncandidate_gender_cols = [col_name1, col_name2, ...]\nIf no columns match a criterion, please provide an empty list.\n\nColumn names:\n['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'additional_treatment_completion_success_outcome', 'adjuvant_rad_therapy_prior_admin', 'age_at_initial_pathologic_diagnosis', 'age_began_smoking_in_years', 'agent_total_dose_count', 'assessment_timepoint_category', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'birth_control_pill_history_usage_category', 'br

In [9]:
candidate_age_cols = ['age_at_initial_pathologic_diagnosis']
candidate_gender_cols = ['gender']


Choose a single column from the candidate columns that record age and gender information respectively.
If no column meets the requirement, keep 'age_col' or 'gender_col' to None

In [10]:
age_col = 'age_began_smoking_in_years'
gender_col = 'gender'
selected_clinical_data = xena_select_clinical_features(clinical_data, TRAIT, age_col=age_col, gender_col=gender_col)

In [11]:
genetic_data = normalize_gene_symbols_in_index(genetic_data)

In [12]:
genetic_data

Unnamed: 0,TCGA-EA-A3HT-01,TCGA-FU-A3HZ-01,TCGA-FU-A3TQ-01,TCGA-DS-A0VK-01,TCGA-UC-A7PG-01,TCGA-C5-A7UE-01,TCGA-C5-A1MJ-01,TCGA-UC-A7PI-01,TCGA-C5-A1MF-01,TCGA-EA-A5ZF-01,...,TCGA-JW-A5VI-01,TCGA-VS-A9UD-01,TCGA-C5-A1BQ-01,TCGA-MU-A8JM-01,TCGA-DS-A0VM-01,TCGA-MY-A5BF-01,TCGA-EA-A556-01,TCGA-IR-A3LA-01,TCGA-VS-AA62-01,TCGA-ZJ-AAXB-01
ARHGEF10L,0.664808,-0.870192,1.278708,-0.210892,0.920108,-0.716092,0.054008,1.438008,-0.102892,-0.678292,...,-0.108292,-0.458392,-0.413892,-1.007792,-0.549292,-0.255692,-1.172392,-0.146792,-1.306892,-0.492192
HIF3A,-3.218726,-2.502826,-2.714626,-0.763726,-0.467326,-4.477926,-0.463926,-1.472326,1.105974,2.024374,...,-3.011226,-2.142026,-2.121626,-4.116626,-3.194026,-4.054026,4.398574,-2.194926,-3.093226,0.063174
RNF17,0.577865,-0.531035,1.847665,2.327065,0.032765,-0.531035,-0.531035,-0.531035,1.457465,0.334265,...,-0.531035,0.930665,-0.123235,1.745865,1.218965,0.406265,-0.531035,-0.531035,0.054765,-0.531035
RNF10,-0.187472,-0.057972,-0.071672,-0.142172,0.148928,-0.214172,-0.073972,-0.196072,-0.199772,-0.428272,...,-0.471072,-0.269972,0.048828,0.292028,-0.252872,-0.067372,0.193628,-0.223072,-0.040772,0.230928
RNF11,0.552722,-0.123378,1.481922,0.176122,0.509522,0.367822,-0.677078,-0.947978,-1.280178,-0.095978,...,0.503822,-0.587878,0.283922,0.592722,-1.018078,0.971822,0.041822,0.012922,-0.434478,-0.072878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GNGT1,0.170210,1.443210,2.729510,-1.281390,0.002310,-0.644890,-1.281390,-1.281390,4.078610,-0.416090,...,1.661210,3.344110,3.013010,-1.281390,-1.281390,-1.281390,2.478010,-0.003090,1.421210,0.305710
TULP3,0.007523,-0.488077,0.415323,-0.644377,0.014623,-0.018677,-0.029077,-0.047377,-0.332877,0.111723,...,0.020823,-0.019477,0.668923,0.274423,-0.220377,0.215323,0.248423,0.704523,-0.101977,-0.849077
BCL6B,-0.686627,-2.017227,-1.286527,0.938473,-1.135727,-2.895427,0.855573,-2.903127,-1.584927,-0.258327,...,-0.775627,-1.650527,-1.165327,-0.596427,0.284873,0.572473,-0.361127,-2.012927,-0.930727,-0.373927
GSTK1,-1.185895,-0.997295,-0.252695,0.038605,0.028005,-1.335795,0.054005,0.716605,0.648805,-0.636095,...,-0.925495,0.676705,0.210605,-0.788095,0.490205,-1.347095,-0.309695,1.093805,0.746105,-0.192695


In [13]:
merged_data = selected_clinical_data.join(genetic_data.T).dropna()
merged_data.head()

Unnamed: 0_level_0,Cervical Cancer,Age,Gender,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,...,SLC7A10,PLA2G2C,TULP2,NPY5R,GNGT2,GNGT1,TULP3,BCL6B,GSTK1,SELP
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-BI-A0VR-01,1,22.0,0.0,-0.352292,-2.344926,0.592565,-0.256772,0.176622,-0.02901,-0.346094,...,-2.090786,-0.086682,-0.270678,-1.587117,0.725467,0.08141,0.103323,-0.890027,-0.151595,0.767467
TCGA-C5-A1BI-01,1,16.0,0.0,0.403208,-2.758326,0.462565,0.011628,0.168622,-0.05601,-0.784594,...,-2.090786,-0.086682,-0.016978,-1.587117,0.482567,0.29501,0.262323,-0.657127,-0.189895,0.948267
TCGA-C5-A1BJ-01,1,24.0,0.0,-0.314392,-4.098426,2.206165,-0.411172,0.083522,-0.28401,-0.401694,...,-1.562186,-0.086682,-0.748878,-0.682517,0.168067,2.51161,0.363123,0.377473,-0.686295,1.108467
TCGA-C5-A1BL-01,1,16.0,0.0,-0.455892,-1.485426,0.956565,-0.333872,0.021822,-0.14241,-0.503494,...,-2.090786,-0.086682,-0.211778,-1.587117,0.381667,2.33641,-0.953477,0.329873,-0.225695,-1.577533
TCGA-C5-A1BN-01,1,15.0,0.0,-1.712492,-1.736526,-0.093635,-0.467472,-0.217678,-0.56891,-2.295594,...,0.490214,-0.086682,0.023722,-1.149717,-2.535933,1.01021,0.157723,-0.763227,-1.524095,-1.707333


In [14]:
print(f"The merged dataset contains {len(merged_data)} samples.")
is_trait_biased, merge_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 88 samples.
For the feature 'Cervical Cancer', the least common label is '0' with 1 occurrences. This represents 1.14% of the dataset.
The distribution of the feature 'Cervical Cancer' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 16.0
  50% (Median): 18.0
  75%: 24.25
Min: 11.0
Max: 44.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 88 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.



True

In [15]:
merged_data.head()
if not is_trait_biased:
    merge_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

In [16]:
save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Cervical-Cancer\cohort_info.json


## 2.2. The GEO dataset

In [17]:
dataset = 'GEO'
trait_subdir = "Cervical-Cancer"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
os.listdir(trait_path)

['GSE137034', 'GSE172159', 'GSE192897']

Repeat the below steps for all the accession numbers

In [19]:
# No obvious trait
cohort = accession_num = "GSE137034"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Cervical-Cancer\\GSE137034\\GSE137034_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Cervical-Cancer\\GSE137034\\GSE137034-GPL10558_series_matrix.txt.gz')

In [24]:
# No obvious trait
cohort = accession_num = "GSE172159"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Cervical-Cancer\\GSE172159\\GSE172159_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Cervical-Cancer\\GSE172159\\GSE172159_series_matrix.txt.gz')

In [28]:
# Finished
cohort = accession_num = "GSE192897"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Cervical-Cancer\\GSE192897\\GSE192897_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Cervical-Cancer\\GSE192897\\GSE192897_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [29]:
from utils import *
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']    

background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
print(background_info)

!Series_title	"Identifying molecular changes in early cervical cancer samples of patients that developed metastasis"
!Series_summary	"Cervical cancer is one of the most common cancers in women worldwide. Patients  diagnosed with early-stage cervical cancer have a good prognosis, however, 10-20%  suffer from local or distant recurrent disease after primary treatment. Treatment options for  recurrent cervical cancer are limited. Therefore, it is crucial to identify factors that can  predict patients with an increased risk of recurrence to optimize treatment to prevent the  recurrence of cervical cancer. We aimed to identify biomarkers in early-stage primary  cervical cancer which recurred after surgery. Formalin-Fixed, Paraffin-Embedded surgical  specimens of 34 patients with early-stage cervical cancer (FIGO 2009 stage 1B1) and 7  healthy controls were analyzed. Targeted gene expression profiling using the PanCancer  IO 360 panel of NanoString Technology was performed. The findings were

In [30]:
clinical_data

Unnamed: 0,!Sample_geo_accession,GSM5768223,GSM5768224,GSM5768225,GSM5768226,GSM5768227,GSM5768228,GSM5768229,GSM5768230,GSM5768231,...,GSM5768261,GSM5768262,GSM5768263,GSM5768264,GSM5768265,GSM5768266,GSM5768267,GSM5768268,GSM5768269,GSM5768270
0,!Sample_characteristics_ch1,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,...,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix,tissue: cervix
1,!Sample_characteristics_ch1,stage at last follow-up: Alive,stage at last follow-up: Death,stage at last follow-up: Alive,stage at last follow-up: Death,stage at last follow-up: Alive,stage at last follow-up: Death,stage at last follow-up: Death,stage at last follow-up: Death,stage at last follow-up: Alive,...,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a,stage at last follow-up: n/a
2,!Sample_characteristics_ch1,age: 54,age: 37,age: 41,age: 60,age: 47,age: 59,age: 42,age: 41,age: 60,...,,,,,,,,,,
3,!Sample_characteristics_ch1,surgical approach: Open,surgical approach: Open,surgical approach: Open,surgical approach: Open,surgical approach: Open,surgical approach: Open,surgical approach: Open,surgical approach: Open,surgical approach: Open,...,,,,,,,,,,
4,!Sample_characteristics_ch1,histological subtype: adeno,histological subtype: squamous,histological subtype: squamous,histological subtype: squamous,histological subtype: squamous,histological subtype: squamous,histological subtype: squamous,histological subtype: squamous,histological subtype: adeno,...,,,,,,,,,,
5,!Sample_characteristics_ch1,lvsi: no,lvsi: yes,lvsi: yes,lvsi: yes,lvsi: yes,lvsi: yes,lvsi: yes,lvsi: yes,lvsi: yes,...,,,,,,,,,,
6,!Sample_characteristics_ch1,tnm-stage: 1b1,tnm-stage: 1b1 N1,tnm-stage: 1b1 N1,tnm-stage: 1b1 N1,tnm-stage: 1b1 N1,tnm-stage: 1b1,tnm-stage: 1b1 N1,tnm-stage: 1b1 N1,tnm-stage: 1b1,...,,,,,,,,,,
7,!Sample_characteristics_ch1,hpv: 18,hpv: 16,hpv: 16,hpv: 16,hpv: 18,hpv: 16,hpv: 16,hpv: 68 or 73,hpv: 18,...,,,,,,,,,,
8,!Sample_characteristics_ch1,adjuvant therapy: radiotherapy,adjuvant therapy: chemoradiation,adjuvant therapy: radiotherapy,adjuvant therapy: radiotherapy,adjuvant therapy: radiotherapy,adjuvant therapy: radiotherapy,adjuvant therapy: radiotherapy,adjuvant therapy: chemoradiation,adjuvant therapy: Follow up,...,,,,,,,,,,
9,!Sample_characteristics_ch1,time to recurrence: 90,time to recurrence: 89,time to recurrence: 22,time to recurrence: 39,time to recurrence: 45,time to recurrence: 23,time to recurrence: 13,time to recurrence: 10,time to recurrence: 57,...,,,,,,,,,,


Analyze the trait row:

In [31]:
tumor_stage_row = clinical_data.iloc[1]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1', 'stage at last follow-up: Alive',
       'stage at last follow-up: Death', 'stage at last follow-up: n/a'],
      dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [39]:
import numpy as np

trait_row = 1
age_row = None
gender_row = None

def convert_trait(trait):
    if (trait == 'stage at last follow-up: Alive'):
        return 1  
    elif (trait == 'stage at last follow-up: Death'):
        return 0  
    else:
        return None

def convert_age(age_string):
    if age_string == 'n.a.':
        return None
    try:
        age = int(age_string.split(': ')[1])
        return age
    except (ValueError, IndexError):
        return None

def convert_gender(gender_string):
    if (gender_string.lower() == 'sex: female' or gender_string.lower() == 'sex: f' or gender_string.lower() == 'gender: female' or gender_string.lower() == 'gender: f'):
        return 1
    elif (gender_string.lower() == 'sex: male' or gender_string.lower() == 'sex: m' or gender_string.lower() == 'gender: male' or gender_string.lower() == 'gender: m') :  # changeed 
        return 0
    else:
        return None

Check the processed clinical data:

In [40]:
selected_clinical_data = geo_select_clinical_features(clinical_data, TRAIT, trait_row, convert_trait, age_row=age_row,
                                                      convert_age=convert_age, gender_row=gender_row,
                                                      convert_gender=convert_gender)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM5768223,GSM5768224,GSM5768225,GSM5768226,GSM5768227,GSM5768228,GSM5768229,GSM5768230,GSM5768231,GSM5768232,...,GSM5768261,GSM5768262,GSM5768263,GSM5768264,GSM5768265,GSM5768266,GSM5768267,GSM5768268,GSM5768269,GSM5768270
Cervical Cancer,1,0,1,0,1,0,0,0,1,1,...,,,,,,,,,,


### Genetic data preprocessing and final filtering

Check the genetic data:

In [41]:
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM5768223,GSM5768224,GSM5768225,GSM5768226,GSM5768227,GSM5768228,GSM5768229,GSM5768230,GSM5768231,GSM5768232,...,GSM5768261,GSM5768262,GSM5768263,GSM5768264,GSM5768265,GSM5768266,GSM5768267,GSM5768268,GSM5768269,GSM5768270
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,13.640356,13.061804,13.092065,13.526674,13.281625,12.022688,12.160396,14.063438,12.870385,12.447663,...,14.199067,13.778855,13.403649,13.918247,12.970581,13.30094,14.335133,13.169667,13.206249,13.487752
ACVR1C,9.249743,8.866214,7.634368,5.429169,4.721611,5.490114,5.680905,7.724394,4.89919,7.749834,...,6.124174,8.942264,9.019673,8.51499,7.944484,8.11424,8.69508,8.326496,8.642206,8.776665
ADAM12,9.820059,8.941878,9.542529,8.634718,10.839687,10.468202,8.15432,11.134893,8.203524,8.99841,...,10.002951,10.182578,8.04392,8.746316,9.804788,9.473643,8.88942,8.718376,8.646253,8.947777
ADGRE1,8.676655,7.909935,3.895283,4.751097,5.265932,2.250648,2.680905,6.74598,4.247114,5.201397,...,6.95662,8.42769,8.959032,8.415718,7.203017,7.481022,8.873523,7.773343,8.126475,8.414095
ADM,9.667758,11.137318,8.982745,11.248071,11.71266,11.277325,9.404931,9.54104,8.613019,11.628081,...,7.834316,8.787395,10.178298,11.320235,12.156122,11.706041,9.992378,11.550529,11.574802,11.810985


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [42]:
requires_gene_mapping = True

if requires_gene_mapping:
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

gene_annotation.columns

{'ID': ['A2M', 'ABCF1', 'ACVR1C', 'ADAM12', 'ADGRE1'], 'ORF': ['A2M', 'ABCF1', 'ACVR1C', 'ADAM12', 'ADGRE1'], 'SPOT_ID': [nan, nan, nan, nan, nan]}


Index(['ID', 'ORF', 'SPOT_ID'], dtype='object')

In [43]:
if requires_gene_mapping:
    identifier_key = 'ID'
    gene_symbol_key = 'ORF'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [44]:
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM5768223,GSM5768224,GSM5768225,GSM5768226,GSM5768227,GSM5768228,GSM5768229,GSM5768230,GSM5768231,GSM5768232,...,GSM5768261,GSM5768262,GSM5768263,GSM5768264,GSM5768265,GSM5768266,GSM5768267,GSM5768268,GSM5768269,GSM5768270
A2M,13.640356,13.061804,13.092065,13.526674,13.281625,12.022688,12.160396,14.063438,12.870385,12.447663,...,14.199067,13.778855,13.403649,13.918247,12.970581,13.300940,14.335133,13.169667,13.206249,13.487752
ACVR1C,9.249743,8.866214,7.634368,5.429169,4.721611,5.490114,5.680905,7.724394,4.899190,7.749834,...,6.124174,8.942264,9.019673,8.514990,7.944484,8.114240,8.695080,8.326496,8.642206,8.776665
ADAM12,9.820059,8.941878,9.542529,8.634718,10.839687,10.468202,8.154320,11.134893,8.203524,8.998410,...,10.002951,10.182578,8.043920,8.746316,9.804788,9.473643,8.889420,8.718376,8.646253,8.947777
ADGRE1,8.676655,7.909935,3.895283,4.751097,5.265932,2.250648,2.680905,6.745980,4.247114,5.201397,...,6.956620,8.427690,8.959032,8.415718,7.203017,7.481022,8.873523,7.773343,8.126475,8.414095
ADM,9.667758,11.137318,8.982745,11.248071,11.712660,11.277325,9.404931,9.541040,8.613019,11.628081,...,7.834316,8.787395,10.178298,11.320235,12.156122,11.706041,9.992378,11.550529,11.574802,11.810985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WNT7B,8.658807,8.043519,9.147670,8.999025,9.509514,9.133291,9.090296,8.191057,8.370496,8.460669,...,5.920641,8.405438,8.379523,8.423601,8.362888,8.910706,8.522326,8.867926,9.199896,8.539053
ZAP70,9.460163,8.936603,9.210832,10.624910,8.440429,7.342818,7.389250,9.454091,9.274756,8.696453,...,6.905994,8.409170,9.456379,8.363397,8.502225,8.022817,8.947523,8.108691,8.515201,9.277278
ZC3H12A,10.003305,10.234015,8.041439,9.244232,10.005471,6.999586,8.833552,8.471473,10.103910,10.602276,...,7.679301,8.416607,10.285325,9.663853,9.051640,8.781196,8.932257,8.574907,8.773803,9.862241
ZEB1,10.902000,9.792213,10.508259,10.218377,10.708401,9.691748,9.611642,11.804588,9.012143,9.443618,...,12.231047,11.113905,12.040051,11.664685,11.327784,11.652582,11.941817,11.315543,10.882435,11.585546


Use selected clinical data and genetic data to generate the merged data:

In [45]:
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
is_available = True

merged_data

  merged_data = pd.concat([clinical_df, genetic_df], axis=0).T.dropna()


Unnamed: 0,Cervical Cancer,A2M,ACVR1C,ADAM12,ADGRE1,ADM,ADORA2A,AKT1,ALDOA,ALDOC,...,WNT2B,WNT3A,WNT4,WNT5A,WNT5B,WNT7B,ZAP70,ZC3H12A,ZEB1,ZEB2
GSM5768223,1.0,13.640356,9.249743,9.820059,8.676655,9.667758,9.17637,11.744508,13.710439,10.298586,...,9.851826,8.480627,9.652808,11.63846,9.435915,8.658807,9.460163,10.003305,10.902,10.909567
GSM5768224,0.0,13.061804,8.866214,8.941878,7.909935,11.137318,8.218886,11.58447,14.557877,10.447473,...,9.957588,7.701827,9.274299,10.786361,8.849471,8.043519,8.936603,10.234015,9.792213,10.590361
GSM5768225,1.0,13.092065,7.634368,9.542529,3.895283,8.982745,6.957567,11.952797,14.31239,8.759104,...,10.915691,4.71471,8.816248,10.897932,8.611951,9.14767,9.210832,8.041439,10.508259,10.907168
GSM5768226,0.0,13.526674,5.429169,8.634718,4.751097,11.248071,8.374027,11.426348,13.737963,7.946445,...,9.02312,7.061437,8.48228,10.979916,8.638622,8.999025,10.62491,9.244232,10.218377,11.176691
GSM5768227,1.0,13.281625,4.721611,10.839687,5.265932,11.71266,7.384576,12.203747,14.486759,7.736561,...,6.982139,5.306574,5.384576,10.951231,7.476499,9.509514,8.440429,10.005471,10.708401,10.756868
GSM5768228,0.0,12.022688,5.490114,10.468202,2.250648,11.277325,6.270548,11.888279,14.249865,9.651528,...,10.222192,7.080723,6.168186,11.8929,8.574579,9.133291,7.342818,6.999586,9.691748,9.456848
GSM5768229,0.0,12.160396,5.680905,8.15432,2.680905,9.404931,6.412708,11.170466,14.282543,9.161262,...,6.768367,4.121477,5.083003,9.701686,7.299814,9.090296,7.38925,8.833552,9.611642,9.085438
GSM5768230,0.0,14.063438,7.724394,11.134893,6.74598,9.54104,7.855953,12.178212,13.359815,8.958134,...,9.036136,5.257694,7.306937,11.754437,8.676484,8.191057,9.454091,8.471473,11.804588,11.997287
GSM5768231,1.0,12.870385,4.89919,8.203524,4.247114,8.613019,7.306007,11.793273,14.014116,8.952828,...,7.229836,6.700286,6.843481,10.613852,7.140198,8.370496,9.274756,10.10391,9.012143,9.787011
GSM5768232,1.0,12.447663,7.749834,8.99841,5.201397,11.628081,7.059378,11.93509,14.839154,9.999707,...,8.833665,5.523325,8.67713,10.924546,7.271786,8.460669,8.696453,10.602276,9.443618,10.224874


Check if the merged data biased or not:

In [46]:
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 34 samples.
For the feature 'Cervical Cancer', the least common label is '0.0' with 10 occurrences. This represents 29.41% of the dataset.
The distribution of the feature 'Cervical Cancer' in this dataset is fine.



False

Save the data as a csv file:

In [47]:
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
else:
    save_cohort_info(cohort, JSON_PATH, is_available)
merged_data.head()
if not is_trait_biased:
    merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)