# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

# Analyst identity plus the locations of the input data and generated output.
# NOTE(review): these are machine-specific absolute paths; adjust them before
# running the notebook on another machine.
USER = "Jiayi"
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = 'Uterine Carcinosarcoma'

# Outputs are grouped per user and per trait; whitespace in the trait name is
# replaced by '-' to form the directory name.
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
# exist_ok=True already makes this a no-op when the directory exists, so a
# separate os.path.exists() check is unnecessary.
os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


# 2. Data preprocessing and selection

## 2.1. The TCGA Xena dataset

In TCGA Xena, there is either zero or one cohort related to the trait. We search the names of subdirectories to see if any matches the trait. If a match is found, we directly obtain the file paths.

In [2]:
# Preview the cohort directories available under the TCGA data root.
dataset = 'TCGA'
dataset_dir = os.path.join(DATA_ROOT, dataset)
# os.listdir() returns entries in arbitrary order; sort them so the truncated
# preview is deterministic across runs and file systems.
sorted(os.listdir(dataset_dir))[:10]

['TCGA_Adrenocortical_Cancer_(ACC)',
 'TCGA_Breast_Cancer_(BRCA)',
 'TCGA_Cervical_Cancer_(CESC)',
 'TCGA_Kidney_Chromophobe_(KICH)',
 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)',
 'TCGA_Lower_Grade_Glioma_(LGG)',
 'TCGA_Melanoma_(SKCM)',
 'TCGA_Mesothelioma_(MESO)',
 'TCGA_Testicular_Cancer_(TGCT)',
 'TCGA_Uterine_Carcinosarcoma_(UCS)']

In [3]:
# Settings for the TCGA cohort directory that matches the trait.
trait_subdir = "TCGA_Uterine_Carcinosarcoma_(UCS)"
cohort, trait_type, is_available = 'Xena', 'binary', True

# Resolve the cohort directory and ask the helper for the clinical and genetic
# data file paths; the pair is echoed as the cell output.
cohort_dir = os.path.join(DATA_ROOT, dataset, trait_subdir)
clinical_data_file, genetic_data_file = xena_get_relevant_filepaths(cohort_dir)
(clinical_data_file, genetic_data_file)

('/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Uterine_Carcinosarcoma_(UCS)\\TCGA.UCS.sampleMap_UCS_clinicalMatrix',
 '/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Uterine_Carcinosarcoma_(UCS)\\TCGA.UCS.sampleMap_HiSeqV2_PANCAN.gz')

In [4]:
import pandas as pd

# Both files are tab-separated; the first column of each becomes the index.
clinical_data = pd.read_csv(clinical_data_file, index_col=0, sep='\t')
# The genetic data file is gzip-compressed.
genetic_data = pd.read_csv(genetic_data_file, index_col=0, sep='\t', compression='gzip')
# No age / gender column has been selected yet.
age_col = None
gender_col = None

In [5]:
# Keep only the second value returned by check_rows_and_columns(); the first is
# discarded. Judging by the cell output it holds the clinical column names
# (helper is defined in utils — confirm its exact return contract there).
_, clinical_data_cols = check_rows_and_columns(clinical_data)
# Preview the first ten entries only.
clinical_data_cols[:10]

['CDE_ID_3226963',
 '_INTEGRATION',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'additional_pharmaceutical_therapy',
 'additional_radiation_therapy',
 'age_at_initial_pathologic_diagnosis',
 'aln_pos_ihc']

Read all the column names in the clinical dataset, to find the columns that record information about age or gender.
Reference prompt:

In [6]:
# Reference prompt with the clinical column names interpolated. The cell only
# evaluates (and therefore displays) the string; nothing is sent anywhere from
# this cell, and the answer is entered by hand in the next cell.
f'''
Below is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:
candidate_age_cols = [col_name1, col_name2, ...]
candidate_gender_cols = [col_name1, col_name2, ...]
If no columns match a criterion, please provide an empty list.

Column names:
{clinical_data_cols}
'''

"\nBelow is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:\ncandidate_age_cols = [col_name1, col_name2, ...]\ncandidate_gender_cols = [col_name1, col_name2, ...]\nIf no columns match a criterion, please provide an empty list.\n\nColumn names:\n['CDE_ID_3226963', '_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'age_at_initial_pathologic_diagnosis', 'aln_pos_ihc', 'aln_pos_light_micro', 'anatomic_neoplasm_subdivision', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'birth_control_pill_history_usage_category', 'clinical_stage', 'colorectal_cancer', 'days_to_birth', 'days_to_collection', 'days_to_dea

In [7]:
# Candidate columns, entered by hand in the format requested by the reference
# prompt above. An empty list means "no matching column".
candidate_age_cols = ['age_at_initial_pathologic_diagnosis']
candidate_gender_cols = ['gender']


Choose a single column from each list of candidates to record age and gender, respectively.
If no column meets the requirement, leave 'age_col' or 'gender_col' set to None.

In [9]:
# Final choice of one age column and one gender column from the candidates.
age_col, gender_col = 'age_at_initial_pathologic_diagnosis', 'gender'

# Build the clinical feature table for the trait from the chosen columns.
selected_clinical_data = xena_select_clinical_features(
    clinical_data,
    TRAIT,
    age_col=age_col,
    gender_col=gender_col,
)

In [10]:
# Rebinds `genetic_data` to the helper's result, so the raw frame loaded above
# is no longer reachable under this name. Re-running this cell feeds the
# already-processed frame back into the helper.
# NOTE(review): presumably this standardizes the gene symbols in the index —
# confirm against utils.
genetic_data = normalize_gene_symbols_in_index(genetic_data)

In [11]:
# Display the processed genetic data (pandas truncates the rendering of large frames).
genetic_data

Unnamed: 0,TCGA-N7-A4Y5-01,TCGA-N5-A4RM-01,TCGA-N9-A4Q8-01,TCGA-N9-A4Q7-01,TCGA-N6-A4VD-01,TCGA-N5-A4RT-01,TCGA-N9-A4Q1-01,TCGA-N5-A4RD-01,TCGA-N8-A4PL-01,TCGA-ND-A4WC-01,...,TCGA-ND-A4WA-01,TCGA-N8-A56S-01,TCGA-ND-A4WF-01,TCGA-N5-A4RJ-01,TCGA-NA-A4QX-01,TCGA-N5-A4RS-01,TCGA-N5-A4RO-01,TCGA-NF-A5CP-01,TCGA-N6-A4VC-01,TCGA-N5-A4RU-01
ARHGEF10L,0.159808,-0.342892,-2.561692,0.814708,-0.747992,-1.216492,-1.183192,-1.075592,-0.139592,-0.308092,...,-0.399792,0.710908,-0.276192,0.275708,-0.697392,-0.258992,-1.682292,-0.213692,-1.378792,-1.113392
HIF3A,8.116374,3.251374,3.964374,2.101974,1.745474,4.559774,6.436574,5.400374,5.781974,2.870174,...,5.370274,7.416774,5.491974,5.896174,5.058874,4.639074,5.189674,0.857074,1.616374,4.505174
RNF17,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,0.023165,0.337765,-0.531035,0.006465,...,-0.531035,-0.531035,-0.531035,0.372565,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035
RNF10,0.491128,0.449828,-0.213572,0.247028,0.142628,-0.295572,-0.819172,-0.327472,0.092528,0.082328,...,-0.056772,0.087828,0.532928,-0.087372,0.173328,-1.143872,-0.096972,-0.306672,0.606228,0.110928
RNF11,0.221222,-0.508578,-0.239378,-1.285978,-0.141278,0.112322,0.777922,-0.623978,-1.300578,-0.339078,...,-0.258378,-0.149078,-0.143578,-0.719378,0.129022,-0.635478,-1.139278,-0.246478,0.172722,-0.160778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GNGT1,2.162310,2.528110,-1.281390,2.062210,-0.500290,1.902010,2.914810,3.925510,1.604410,1.498710,...,3.084910,2.572910,3.055810,1.251910,3.911510,4.970610,3.990210,1.539910,2.313210,0.151610
TULP3,1.156223,0.980423,0.125923,-0.115377,0.394023,0.373923,1.008823,0.600623,1.977023,0.193523,...,-0.100777,0.737523,0.622323,0.508223,1.000123,1.038223,0.882223,0.320623,1.178323,-0.238477
BCL6B,-1.218927,-0.407627,-1.056927,-1.821127,-1.103927,-0.457927,-0.592327,-0.892127,-3.291227,-0.344127,...,-0.937927,0.270373,0.005173,0.983573,-0.433727,-1.194427,-2.458227,0.992073,-1.211927,0.821373
GSTK1,-1.559495,-0.638195,-0.807795,1.332405,-0.030595,-1.583695,-1.032395,-0.581895,-0.764895,0.187505,...,0.145405,-0.545095,-0.330595,0.058705,-1.419095,-0.349695,-1.139895,-0.443995,-0.818895,-1.583695


In [12]:
# Transpose the genetic data so its columns become rows, join it onto the
# clinical features by index, then drop every row with a missing value.
merged_data = (
    selected_clinical_data
    .join(genetic_data.transpose())
    .dropna()
)
merged_data.head()

Unnamed: 0_level_0,Uterine Carcinosarcoma,Age,Gender,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,...,SLC7A10,PLA2G2C,TULP2,NPY5R,GNGT2,GNGT1,TULP3,BCL6B,GSTK1,SELP
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-N5-A4R8-01,1,65,0,-0.957792,6.814974,-0.531035,-0.200472,-0.535378,-1.42261,-0.539194,...,5.055914,-0.086682,0.017922,3.273883,-0.262233,1.95401,1.290723,-0.852727,-2.000395,-5.091733
TCGA-N5-A4RA-01,1,63,0,-0.436292,6.814374,-0.531035,0.387528,-0.035178,-1.14681,-0.246094,...,6.600414,-0.086682,0.234322,-1.013317,-1.696233,0.02041,0.931723,-2.614627,-0.800295,-5.525533
TCGA-N5-A4RD-01,1,69,0,-1.075592,5.400374,0.337765,-0.327472,-0.623978,-0.76911,0.301706,...,6.132114,0.412218,-0.748878,1.260083,0.190467,3.92551,0.600623,-0.892127,-0.581895,-3.565233
TCGA-N5-A4RF-01,1,68,0,-0.616892,4.298374,-0.531035,0.068628,-0.272078,-1.90441,-0.123494,...,5.434814,1.727818,-0.748878,-1.587117,2.072467,3.56111,0.549123,0.040173,-0.607895,-3.091133
TCGA-N5-A4RJ-01,1,61,0,0.275708,5.896174,0.372565,-0.087372,-0.719378,-1.19391,-0.032194,...,5.051014,-0.086682,-0.748878,0.710983,0.398067,1.25191,0.508223,0.983573,0.058705,0.255367


In [13]:
print(f"The merged dataset contains {len(merged_data)} samples.")
# Bind the helper's returned frame back to `merged_data`. Writing it to the
# misspelled `merge_data` only would leave the later save_cohort_info() call,
# which reads `merged_data`, working on the pre-check frame.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
# Keep the misspelled name as an alias so cells that still refer to it keep working.
merge_data = merged_data
is_trait_biased

The merged dataset contains 57 samples.
For the feature 'Uterine Carcinosarcoma', the least common label is '1' with 57 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Uterine Carcinosarcoma' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 62.0
  50% (Median): 68.0
  75%: 76.0
Min: 51
Max: 90
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0' with 57 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.



True

In [14]:
# Write the cohort to '<cohort>.csv' in OUTPUT_DIR only when the trait was not
# judged biased. index=False: the row index is not written to the file.
# (A bare `merged_data.head()` that preceded this `if` displayed nothing,
# because only a cell's last expression is rendered, so it was dropped.)
if not is_trait_biased:
    merge_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

In [15]:
# Record this cohort's status in the JSON file at JSON_PATH.
# NOTE(review): this passes `merged_data`, while the CSV cell above writes
# `merge_data` — confirm both names are meant to refer to the same frame.
save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Uterine-Carcinosarcoma\cohort_info.json


## 2.2. The GEO dataset

In [17]:
# List the GEO series (accession directories) stored for this trait.
dataset = 'GEO'
trait_subdir = "Uterine-Carcinosarcoma"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
# os.listdir() returns entries in arbitrary order; sort for a stable listing.
sorted(os.listdir(trait_path))

['GSE16680', 'GSE32507', 'GSE36133', 'GSE36138', 'GSE36139', 'GSE68950']

Repeat the steps below for each accession number.

In [18]:
# Author's status note for this cohort: "Biased"
# (presumably the outcome of the bias check further below — confirm).
accession_num = "GSE16680"
cohort = accession_num
# Locate the SOFT file and the series matrix file of the selected cohort.
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE16680\\GSE16680_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE16680\\GSE16680_series_matrix.txt.gz')

In [22]:
# Author's status note for this cohort: "Finished".
accession_num = "GSE32507"
cohort = accession_num
# Locate the SOFT file and the series matrix file of the selected cohort.
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE32507\\GSE32507_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE32507\\GSE32507_series_matrix.txt.gz')

In [36]:
# Author's status note for this cohort: "No gene mapping".
accession_num = "GSE36133"
cohort = accession_num
# Locate the SOFT file and the series matrix file of the selected cohort.
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE36133\\GSE36133_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE36133\\GSE36133_series_matrix.txt.gz')

In [48]:
# Author's status note for this cohort: "MemoryError"
# (a MemoryError is recorded in the gene-mapping cell's output further below).
accession_num = "GSE36138"
cohort = accession_num
# Locate the SOFT file and the series matrix file of the selected cohort.
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE36138\\GSE36138_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE36138\\GSE36138_series_matrix.txt.gz')

In [57]:
# Author's status note for this cohort: "No gene mapping".
accession_num = "GSE36139"
cohort = accession_num
# Locate the SOFT file and the series matrix file of the selected cohort.
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE36139\\GSE36139_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE36139\\GSE36139-GPL15308_series_matrix.txt.gz')

In [65]:
# Author's status note for this cohort: "No obvious trait".
accession_num = "GSE68950"
cohort = accession_num
# Locate the SOFT file and the series matrix file of the selected cohort.
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE68950\\GSE68950_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Uterine-Carcinosarcoma\\GSE68950\\GSE68950_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [66]:
from utils import *
# Line prefixes passed to the helper: one list for the series-level background
# lines, one for the per-sample clinical lines of the series matrix file.
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']

background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)
print(background_info)

!Series_title	"caArray_golub-00327: Sanger cell line Affymetrix gene expression project"
!Series_summary	"The microarray gene expression pattern was studied using 798 different cancer cell lines. The cancer cell lines are obtained from different centers. Annotation information were provided in the supplementary file."
!Series_overall_design	"golub-00327"
!Series_overall_design	"Assay Type: Gene Expression"
!Series_overall_design	"Provider: Affymetrix"
!Series_overall_design	"Array Designs: HT_HG-U133A"
!Series_overall_design	"Organism: Homo sapiens (ncbitax)"
!Series_overall_design	"Tissue Sites: leukemia, Urinary tract, Lung, BiliaryTract, Autonomic Ganglion, Thyroid gland, Stomach, Breast, Pancreas, Head and Neck, Lymphoma, Colorectal, Placenta, Liver, Brain, Bone, pleura, Skin, endometrium, Ovary, cervix, Oesophagus, Connective and Soft Tissue, Muscle, Kidney, Prostate, Adrenal Gland, Eye, Testis, Smooth Muscle Tissue, Vulva, Unknow"
!Series_overall_design	"Material Types: cell, syn

In [67]:
# Display the clinical table to pick the trait / age / gender rows by eye.
clinical_data

Unnamed: 0,!Sample_geo_accession,GSM1687570,GSM1687571,GSM1687572,GSM1687573,GSM1687574,GSM1687575,GSM1687576,GSM1687577,GSM1687578,...,GSM1688358,GSM1688359,GSM1688360,GSM1688361,GSM1688362,GSM1688363,GSM1688364,GSM1688365,GSM1688366,GSM1688367
0,!Sample_characteristics_ch1,cosmic id: 924101,cosmic id: 906800,cosmic id: 687452,cosmic id: 924100,cosmic id: 910924,cosmic id: 906798,cosmic id: 906797,cosmic id: 906797,cosmic id: 910922,...,cosmic id: 909781,cosmic id: 909782,cosmic id: 909782,cosmic id: 909784,cosmic id: 909785,cosmic id: 909785,cosmic id: 909904,cosmic id: 909905,cosmic id: 687592,cosmic id: 909907
1,!Sample_characteristics_ch1,disease state: L2 Acute Lymphoblastic Leukemia,disease state: NS Acute Lymphoblastic Leukemia,disease state: carcinoma,disease state: adenocarcinoma,disease state: adenocarcinoma,disease state: transitional cell carcinoma,disease state: transitional cell carcinoma,disease state: transitional cell carcinoma,disease state: clear cell renal cell carcinoma,...,disease state: renal cell carcinoma,disease state: retinoblastoma,disease state: retinoblastoma,disease state: malignant melanoma,disease state: follicular lymphoma,disease state: follicular lymphoma,disease state: carcinoma,disease state: glioblastoma multiforme,disease state: glioblastoma multiforme,disease state: ductal carcinoma
2,!Sample_characteristics_ch1,disease location: Hematopoietic and Lymphoid T...,disease location: Hematopoietic and Lymphoid T...,disease location: bladder,disease location: prostate,disease location: stomach,disease location: ureter,disease location: bladder,disease location: bladder,disease location: kidney,...,disease location: kidney,disease location: retina,disease location: retina,disease location: skin,disease location: lymph node,disease location: lymph node,disease location: pancreas,disease location: brain,disease location: temporal lobe,disease location: breast
3,!Sample_characteristics_ch1,organism part: Leukemia,organism part: Leukemia,organism part: Urinary tract,organism part: Prostate,organism part: Stomach,organism part: Urinary tract,organism part: Urinary tract,organism part: Urinary tract,organism part: Kidney,...,organism part: Kidney,organism part: Eye,organism part: Eye,organism part: Skin,organism part: Lymphoma,organism part: Lymphoma,organism part: Pancreas,organism part: Brain,organism part: Brain,organism part: Breast
4,!Sample_characteristics_ch1,sample: 736,sample: 494,sample: 7,sample: 746,sample: 439,sample: 168,sample: 152,sample: 37,sample: 450,...,sample: 470,sample: 246,sample: 246,sample: 714,sample: 482,sample: 49,sample: 234,sample: 41,sample: 397,sample: 726
5,!Sample_characteristics_ch1,cell line code: 749,cell line code: 493,cell line code: 505,cell line code: 760,cell line code: 437,cell line code: 151,cell line code: 134,cell line code: 134,cell line code: 449,...,cell line code: 469,cell line code: 231,cell line code: 231,cell line code: 727,cell line code: 481,cell line code: 481,cell line code: 219,cell line code: 401,cell line code: 390,cell line code: 738
6,!Sample_characteristics_ch1,supplier: DSMZ,supplier: DSMZ,supplier: ATCC,supplier: DSMZ,supplier: DSMZ,supplier: DSMZ,supplier: DSMZ,supplier: DSMZ,supplier: Unspecified,...,supplier: HSRRB,supplier: ATCC,supplier: ATCC,supplier: ATCC,supplier: DSMZ,supplier: DSMZ,supplier: DSMZ,supplier: HSRRB,supplier: HSRRB,supplier: ATCC
7,!Sample_characteristics_ch1,affy_batch: 1,affy_batch: 1,affy_batch: 2,affy_batch: 1,affy_batch: 1,affy_batch: 1,affy_batch: 1,affy_batch: 2,affy_batch: 1,...,affy_batch: 1,affy_batch: 1,affy_batch: 2,affy_batch: 1,affy_batch: 1,affy_batch: 2,affy_batch: 1,affy_batch: 2,affy_batch: 1,affy_batch: 1
8,!Sample_characteristics_ch1,crna plate: 8,crna plate: 6,crna plate: 11,crna plate: 8,crna plate: 5,crna plate: 2,crna plate: 2,crna plate: 12,crna plate: 5,...,crna plate: 5,crna plate: 3,crna plate: 12,crna plate: 8,crna plate: 5,crna plate: 12,crna plate: 3,crna plate: 12,crna plate: 4,crna plate: 8


Analyze the trait row:

In [68]:
# Take the second row of the clinical table (position 1) and list its distinct
# values to judge whether it can encode the trait.
# NOTE(review): in the output recorded for this cell the row holds
# 'disease state: ...' values, so the name `tumor_stage_row` looks like a
# leftover — TODO rename once no other cell depends on it.
tumor_stage_row = clinical_data.iloc[1]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1',
       'disease state: L2 Acute Lymphoblastic Leukemia',
       'disease state: NS Acute Lymphoblastic Leukemia',
       'disease state: carcinoma', 'disease state: adenocarcinoma',
       'disease state: transitional cell carcinoma',
       'disease state: clear cell renal cell carcinoma',
       'disease state: anaplastic carcinoma',
       'disease state: glioblastoma multiforme',
       'disease state: malignant melanoma',
       'disease state: rhabdomyosarcoma',
       'disease state: mucoepidermoid carcinoma',
       'disease state: squamous cell carcinoma',
       'disease state: renal cell carcinoma',
       'disease state: neuroblastoma',
       'disease state: Acute Lymphoblastic Leukemia',
       'disease state: M5 acute myeloid leukemia',
       'disease state: plasma cell myeloma',
       'disease state: L1 Acute T-Cell Lymphoblastic Leukemia',
       'disease state: astrocytoma',
       'disease state: B Acute Lymphoblastic Leukemia'

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [61]:

# Rows handed to geo_select_clinical_features() below; None marks a feature
# with no row selected for this cohort.
# NOTE(review): presumably these are row positions in `clinical_data` — confirm.
trait_row, age_row, gender_row = 0, None, None

def convert_trait(trait):
    """Encode a clinical cell as the binary trait label.

    Returns 1 only for the exact text 'primary site: endometrium' and 0 for
    every other value.
    """
    return 1 if trait == 'primary site: endometrium' else 0

def convert_age(age_string):
    """Parse an integer age from a 'label: value' clinical cell.

    Args:
        age_string: Cell content such as 'age: 65'.

    Returns:
        The integer found after the first ': ' separator, or None when the
        cell is not text (e.g. None or NaN), equals 'n.a.', has no ': '
        separator, or the value is not an integer.
    """
    # Non-text cells have no .split(); treat them as missing instead of
    # raising AttributeError.
    if not isinstance(age_string, str):
        return None
    if age_string == 'n.a.':
        return None
    try:
        return int(age_string.split(': ')[1])
    except (ValueError, IndexError):
        # ValueError: value is not an integer; IndexError: no ': ' separator.
        return None

def convert_gender(gender_string):
    """Encode a 'sex: ...' / 'gender: ...' clinical cell as a binary label.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        gender_string: Cell content such as 'Sex: Female' or 'gender: m'.

    Returns:
        1 for female ('female' / 'f'), 0 for male ('male' / 'm'), and None
        for any other value, including cells that are not text (e.g. None or
        NaN).
    """
    # Non-text cells have no .lower(); treat them as missing instead of
    # raising AttributeError.
    if not isinstance(gender_string, str):
        return None
    normalized = gender_string.strip().lower()
    if normalized in ('sex: female', 'sex: f', 'gender: female', 'gender: f'):
        return 1
    if normalized in ('sex: male', 'sex: m', 'gender: male', 'gender: m'):
        return 0
    return None

Check the processed clinical data:

In [62]:
# Build the clinical feature table from the chosen rows and converter functions.
selected_clinical_data = geo_select_clinical_features(
    clinical_data,
    TRAIT,
    trait_row,
    convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender,
)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM886835,GSM886836,GSM886837,GSM886838,GSM886839,GSM886840,GSM886841,GSM886842,GSM886843,GSM886844,...,GSM887742,GSM887743,GSM887744,GSM887745,GSM887746,GSM887747,GSM887748,GSM887749,GSM887750,GSM887751
Uterine Carcinosarcoma,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Genetic data preprocessing and final filtering

Check the genetic data:

In [63]:
# Load the genetic data from the series matrix file of the current cohort and
# preview the first rows to see what kind of identifiers the index uses.
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM886835,GSM886836,GSM886837,GSM886838,GSM886839,GSM886840,GSM886841,GSM886842,GSM886843,GSM886844,...,GSM887742,GSM887743,GSM887744,GSM887745,GSM887746,GSM887747,GSM887748,GSM887749,GSM887750,GSM887751
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100009676_at,6.1161,6.2052,6.1249,6.6154,5.4236,6.0204,6.3859,6.0518,5.5014,5.4835,...,5.7775,5.5271,6.5087,5.2957,5.0551,5.6623,5.4516,5.7055,5.2289,5.2136
10000_at,8.1556,6.6152,4.5676,4.3519,6.6723,6.6316,6.1898,7.5308,6.0786,7.6018,...,4.4568,5.3949,6.7777,6.4488,6.5771,6.9336,7.0296,4.6383,5.896,4.4004
10001_at,9.7864,9.9699,8.872,9.1376,10.029,9.6573,9.491,9.3053,9.5423,8.7085,...,8.6107,9.1372,9.5185,9.0581,9.834,9.1133,8.3857,8.5755,9.3064,9.0476
10002_at,3.7977,4.0304,3.8455,3.7085,3.6431,4.0679,3.8344,3.7223,3.9649,4.2291,...,3.726,4.3524,4.0484,4.2962,4.0643,4.106,3.5973,3.906,4.709,4.0903
10003_at,3.5458,3.8504,4.0458,3.9508,4.1589,4.1798,3.7039,3.6759,3.618,3.6748,...,3.5464,3.755,3.8436,3.8422,3.7768,3.8934,3.8004,4.0094,3.6699,3.8667


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [64]:
# Set by hand after inspecting the genetic data index above: True when the
# rows are not already labelled with gene symbols.
requires_gene_mapping = True

if requires_gene_mapping:
    # Read the annotation from the SOFT file and print a short preview of it.
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

# `gene_annotation` only exists when the branch above ran, so guard the
# column listing to avoid a NameError when no mapping is required.
gene_annotation.columns if requires_gene_mapping else None

{'ID': ['1_at', '10_at', '100_at', '1000_at', '10000_at'], 'ORF': ['1', '10', '100', '1000', '10000'], 'Description': ['alpha-1-B glycoprotein', 'N-acetyltransferase 2 (arylamine N-acetyltransferase)', 'adenosine deaminase', 'cadherin 2, type 1, N-cadherin (neuronal)', 'v-akt murine thymoma viral oncogene homolog 3 (protein kinase B, gamma)']}


Index(['ID', 'ORF', 'Description'], dtype='object')

In [56]:
# Map the genetic data rows to gene symbols using the chosen annotation columns.
# NOTE(review): the annotation preview above shows purely numeric values in
# 'ORF' — confirm that column really holds gene symbols.
# NOTE(review): the recorded run of this cell ended in a MemoryError.
if requires_gene_mapping:
    identifier_key, gene_symbol_key = 'ID', 'ORF'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

MemoryError: Unable to allocate 162. MiB for an array with shape (21254341, 1) and data type float64

In [32]:
# Rebinds `genetic_data` to the helper's result and displays it. Re-running
# this cell feeds the already-processed frame back into the helper.
# NOTE(review): presumably this standardizes the gene symbols in the index —
# confirm against utils.
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM804806,GSM804807,GSM804808,GSM804809,GSM804810,GSM804811,GSM804812,GSM804813,GSM804814,GSM804815,...,GSM804842,GSM804843,GSM804844,GSM804845,GSM804846,GSM804847,GSM804848,GSM804849,GSM804850,GSM804851
A1BG,-0.428374,-1.185519,0.905210,2.229184,-0.807230,-1.049002,-0.788316,0.264672,-0.209491,-0.711295,...,-0.267915,3.873424,2.243029,1.406519,-1.114534,-0.738209,2.528905,3.867601,-0.190160,2.753432
A1BG-AS1,0.112597,-2.545402,0.345880,2.294041,-1.484570,-2.047867,-1.833521,0.289472,-0.083492,-0.891364,...,0.349333,1.197180,2.109467,1.263221,-1.091778,-0.798120,2.082527,0.933188,0.148014,2.245506
A1CF,-0.414572,1.155139,0.204161,1.504030,-0.382042,-0.231401,-0.411998,0.456773,0.943821,1.132380,...,0.474575,0.438384,3.451386,-0.047563,-1.147372,-0.428677,-2.186289,-1.243680,-0.042116,-2.034372
A2M,-1.598132,1.704536,-1.966787,2.845671,-0.677535,-1.352631,0.799846,2.531783,-1.624713,0.691356,...,0.239463,0.176425,0.871290,0.008356,-0.008356,0.070050,2.499900,-0.278881,1.293716,1.761829
A2ML1,-0.518708,-0.038120,0.456291,0.202503,0.341105,0.115908,-0.303251,0.185458,1.257013,0.833159,...,0.330738,0.591233,3.242414,0.461115,-0.455732,-0.314194,-1.701203,-2.183079,-0.342340,-2.171190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,1.133136,-0.275506,1.333713,-1.295064,-2.577468,2.349671,-1.714859,-3.270926,-2.733860,-1.295885,...,-3.277940,0.084062,0.138241,1.957550,-0.441590,0.420463,-4.701589,1.435610,-1.461164,-4.537579
ZYG11B,0.220626,0.242326,0.234571,0.625795,-0.150644,-0.156697,-0.340128,0.306694,-0.270027,-0.403181,...,-1.379437,-0.171540,-1.479160,0.156511,0.443674,-0.593189,0.641436,0.494374,-0.555147,-0.049674
ZYX,-0.634988,0.121760,-0.194830,1.395852,-1.072702,-0.492642,-0.378221,1.186319,-0.190907,-0.269548,...,-1.017909,0.279231,0.103642,-0.334219,-0.389594,0.005596,2.668182,2.161142,1.830761,3.209167
ZZEF1,0.394825,0.374459,0.132402,0.041250,0.402375,0.242625,0.056147,-0.190490,0.399305,0.309058,...,-0.113832,-0.182934,0.602217,-0.232841,-0.079149,-0.002652,-1.439755,-0.456133,-0.135392,0.026198


Use selected clinical data and genetic data to generate the merged data:

In [33]:
# Combine the clinical features and the genetic data into a single table.
merged_data = geo_merge_clinical_genetic_data(
    selected_clinical_data,
    genetic_data,
)
# Reaching this point means the cohort produced usable data.
is_available = True

merged_data

Unnamed: 0,Uterine Carcinosarcoma,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
GSM804806,1.0,-0.428374,0.112597,-0.414572,-1.598132,-0.518708,-0.535701,-0.002069,-0.439697,1.063816,...,0.450995,0.924672,1.158071,-1.622256,-0.265457,1.133136,0.220626,-0.634988,0.394825,0.058989
GSM804807,1.0,-1.185519,-2.545402,1.155139,1.704536,-0.03812,-0.203013,0.038813,0.094731,-0.858164,...,-0.860528,0.738762,-0.051341,-0.870751,0.44917,-0.275506,0.242326,0.12176,0.374459,-0.414844
GSM804808,1.0,0.90521,0.34588,0.204161,-1.966787,0.456291,0.118227,-0.125198,-0.11767,0.191601,...,-0.25226,0.440445,-0.285612,0.301058,0.333735,1.333713,0.234571,-0.19483,0.132402,-0.250578
GSM804809,1.0,2.229184,2.294041,1.50403,2.845671,0.202503,-0.192439,2.072114,0.065774,-0.172029,...,-0.855826,-0.030349,0.471311,0.735769,1.17436,-1.295064,0.625795,1.395852,0.04125,0.587149
GSM804810,0.0,-0.80723,-1.48457,-0.382042,-0.677535,0.341105,0.708834,-0.136926,0.4389,1.311179,...,-0.38515,-0.205619,-2.443651,-0.599441,0.757772,-2.577468,-0.150644,-1.072702,0.402375,-0.111271
GSM804811,0.0,-1.049002,-2.047867,-0.231401,-1.352631,0.115908,0.958485,0.20513,0.523745,0.961093,...,0.236782,0.991536,1.121354,-0.499197,0.22096,2.349671,-0.156697,-0.492642,0.242625,0.006584
GSM804812,0.0,-0.788316,-1.833521,-0.411998,0.799846,-0.303251,1.216622,-0.644191,0.039782,0.429529,...,0.062859,-0.174568,0.051341,0.378341,0.24236,-1.714859,-0.340128,-0.378221,0.056147,0.233355
GSM804813,0.0,0.264672,0.289472,0.456773,2.531783,0.185458,0.313938,0.135998,-0.491464,-0.250708,...,-0.56069,-0.88283,-3.38325,1.279682,0.072676,-3.270926,0.306694,1.186319,-0.19049,-0.006584
GSM804814,0.0,-0.209491,-0.083492,0.943821,-1.624713,1.257013,0.435924,0.782212,0.704644,-0.67727,...,-0.489612,0.285653,-0.199854,-0.677793,0.177832,-2.73386,-0.270027,-0.190907,0.399305,-0.08675
GSM804815,0.0,-0.711295,-0.891364,1.13238,0.691356,0.833159,-0.674847,0.002069,0.539719,1.217782,...,-0.388823,0.550212,0.093551,-0.36494,0.475117,-1.295885,-0.403181,-0.269548,0.309058,-0.108851


Check whether the merged data is biased:

In [34]:
# The trait is encoded as 0/1 by convert_trait, hence 'binary'.
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
# `merged_data` is rebound to the frame returned by the helper, so re-running
# this cell operates on the already-processed frame.
is_trait_biased, merged_data = judge_and_remove_biased_features(
    merged_data,
    TRAIT,
    trait_type=trait_type,
)
is_trait_biased

The merged dataset contains 46 samples.
For the feature 'Uterine Carcinosarcoma', the least common label is '1.0' with 14 occurrences. This represents 30.43% of the dataset.
The distribution of the feature 'Uterine Carcinosarcoma' in this dataset is fine.



False

Save the data as a csv file:

In [35]:
if is_available:
    # Record the cohort together with its bias verdict and the merged data.
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
    # Write the CSV only for a cohort that is available AND not biased;
    # otherwise a stale `merged_data` left over from a previously processed
    # cohort could be saved under this cohort's name.
    # index=False: the row index is not written to the file.
    if not is_trait_biased:
        merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)
else:
    # Unavailable cohorts are recorded without any data.
    save_cohort_info(cohort, JSON_PATH, is_available)