# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

# Who runs the analysis and which trait is processed.
USER = "Jiayi"
# Machine-specific locations of the input data and of the generated outputs.
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = 'Testicular Cancer'

# Outputs live in <OUTPUT_ROOT>/<USER>/<trait with whitespace replaced by '-'>.
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
# exist_ok=True already makes this a no-op for an existing directory, so no
# separate (and racy) os.path.exists() check is needed.
os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


# 2. Data preprocessing and selection

## 2.1. The TCGA Xena dataset

In TCGA Xena, there is either zero or one cohort related to the trait. We search the names of subdirectories to see if any matches the trait. If a match is found, we directly obtain the file paths.

In [2]:
dataset = 'TCGA'
dataset_dir = os.path.join(DATA_ROOT, dataset)
# os.listdir() returns entries in arbitrary order; sort them so the preview of
# the first ten cohort directories is the same on every run and machine.
sorted(os.listdir(dataset_dir))[:10]

['TCGA_Adrenocortical_Cancer_(ACC)',
 'TCGA_Breast_Cancer_(BRCA)',
 'TCGA_Cervical_Cancer_(CESC)',
 'TCGA_Kidney_Chromophobe_(KICH)',
 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)',
 'TCGA_Lower_Grade_Glioma_(LGG)',
 'TCGA_Melanoma_(SKCM)',
 'TCGA_Mesothelioma_(MESO)',
 'TCGA_Testicular_Cancer_(TGCT)',
 'TCGA_Uterine_Carcinosarcoma_(UCS)']

In [3]:
# Subdirectory of the TCGA dataset chosen by hand from the listing above.
trait_subdir = "TCGA_Testicular_Cancer_(TGCT)"
# Cohort label; later cells use it for the CSV file name and the cohort-info entry.
cohort = 'Xena'
# Passed to judge_and_remove_biased_features as `trait_type` in a later cell.
trait_type = 'binary'
is_available = True

cohort_dir = os.path.join(DATA_ROOT, dataset, trait_subdir)
# The helper returns the clinical file path and the genetic file path, in that order.
clinical_data_file, genetic_data_file = xena_get_relevant_filepaths(cohort_dir)
# Last expression: display both paths.
clinical_data_file, genetic_data_file

('/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Testicular_Cancer_(TGCT)\\TCGA.TGCT.sampleMap_TGCT_clinicalMatrix',
 '/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Testicular_Cancer_(TGCT)\\TCGA.TGCT.sampleMap_HiSeqV2_PANCAN.gz')

In [4]:
import pandas as pd

# Both files are tab-separated; the first column becomes the row index.
clinical_data = pd.read_csv(clinical_data_file, sep='\t', index_col=0)
# The genetic file is gzip-compressed.
genetic_data = pd.read_csv(genetic_data_file, compression='gzip', sep='\t', index_col=0)
# Default to "no age / gender column"; a later cell overrides these once columns are chosen.
age_col = gender_col = None

In [5]:
# Only the second return value (the column names) is used; the first is discarded.
_, clinical_data_cols = check_rows_and_columns(clinical_data)
# Preview the first ten column names.
clinical_data_cols[:10]

['_INTEGRATION',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'age_at_initial_pathologic_diagnosis',
 'bcr_followup_barcode',
 'bcr_patient_barcode',
 'bcr_sample_barcode',
 'bilateral_diagnosis_timing_type']

Read all the column names in the clinical dataset, to find the columns that record information about age or gender.
Reference prompt:

In [9]:
# Render the reference prompt with the clinical column names interpolated.
# The string is only displayed as the cell output; it is not assigned to a variable.
f'''
Below is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:
candidate_age_cols = [col_name1, col_name2, ...]
candidate_gender_cols = [col_name1, col_name2, ...]
If no columns match a criterion, please provide an empty list.

Column names:
{clinical_data_cols}
'''

"\nBelow is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:\ncandidate_age_cols = [col_name1, col_name2, ...]\ncandidate_gender_cols = [col_name1, col_name2, ...]\nIf no columns match a criterion, please provide an empty list.\n\nColumn names:\n['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'age_at_initial_pathologic_diagnosis', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'bilateral_diagnosis_timing_type', 'clinical_M', 'clinical_N', 'clinical_T', 'clinical_stage', 'days_to_bilateral_tumor_dx', 'days_to_birth', 'days_to_collection', 'days_to_death', 'days_to_initial_pathologic_diagnosis', 'days_to_last_followup', 'days_to_new_tumor_event_after_initial_treatmen

In [10]:
# Candidate columns entered by hand, in the format requested by the reference prompt above.
candidate_age_cols = ['age_at_initial_pathologic_diagnosis']
candidate_gender_cols = ['gender']


From the candidate columns, choose one column that records age and one that records gender.
If no column meets the requirement, leave `age_col` or `gender_col` set to `None`.

In [13]:
# Final choice of one age column and one gender column from the candidates.
age_col = 'age_at_initial_pathologic_diagnosis'
gender_col = 'gender'
# Build the clinical table for TRAIT; the merged output further down shows it
# contributes the 'Testicular Cancer', 'Age' and 'Gender' columns.
selected_clinical_data = xena_select_clinical_features(clinical_data, TRAIT, age_col=age_col, gender_col=gender_col)

In [14]:
# Rebinds genetic_data to the helper's result, so re-running this cell feeds it
# the already-normalized frame rather than the frame loaded from disk.
genetic_data = normalize_gene_symbols_in_index(genetic_data)

In [15]:
# Display the normalized table (genes as rows, samples as columns; the notebook shows a truncated view).
genetic_data

Unnamed: 0,TCGA-2G-AAHP-01,TCGA-4K-AAAL-01,TCGA-2G-AALP-01,TCGA-2G-AAGI-05,TCGA-ZM-AA0N-01,TCGA-2G-AAGI-01,TCGA-2G-AALT-01,TCGA-2G-AAKO-01,TCGA-2G-AAFG-01,TCGA-2G-AAG6-01,...,TCGA-2G-AAKG-05,TCGA-2G-AAG9-01,TCGA-XE-AANR-01,TCGA-YU-A94I-01,TCGA-2G-AAFO-01,TCGA-2G-AALR-01,TCGA-SN-A84X-01,TCGA-XE-A8H5-01,TCGA-XE-AAOD-01,TCGA-2G-AAHT-01
ARHGEF10L,-1.064292,-0.550492,-0.357592,-2.170992,-1.099292,0.287308,1.117908,-1.294892,-0.312792,0.377908,...,-1.620992,-0.223592,-1.464992,-0.650692,-0.680692,0.141308,0.086308,-1.450792,-1.183792,-0.825592
HIF3A,2.004674,2.885874,3.188374,1.747974,1.870874,5.475574,4.570574,0.672174,0.712774,4.982474,...,1.884474,2.449574,1.259574,2.959974,1.472474,4.936574,4.065074,1.933374,1.659474,0.928274
RNF17,1.875265,9.089365,0.116965,9.729765,9.669065,0.576865,-0.531035,9.999865,9.417865,-0.531035,...,2.382765,9.357565,9.222665,-0.531035,7.320565,0.316965,3.934365,9.994165,8.364465,9.605365
RNF10,-0.002472,0.790228,0.375328,0.640328,0.777128,-0.097272,-0.082872,0.591028,-0.105672,0.197328,...,-0.256172,0.515428,0.598928,-0.035872,0.400628,0.221928,0.298228,0.427928,0.438728,0.413928
RNF11,-0.615978,0.056122,-0.587578,0.514822,-0.449078,-0.798378,-0.413378,-0.223078,0.662422,-0.389178,...,-0.825278,0.179822,-0.506078,-0.504578,-0.595578,0.047122,-0.107178,-0.019278,-0.180378,-0.427978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GNGT1,0.104310,-1.281390,0.152010,-0.837090,-1.281390,0.877510,0.715610,0.145810,-0.480790,0.644310,...,0.572410,-0.707890,-0.303290,-1.281390,0.207210,-0.433390,0.024010,0.781010,0.009010,-0.744090
TULP3,2.237623,1.823623,2.064423,2.454823,2.661123,1.331623,1.351723,2.375923,1.729723,2.133523,...,0.697823,3.338823,2.536723,1.265223,1.816023,1.164723,1.600223,3.572423,1.750023,3.034123
BCL6B,-0.107927,0.450673,-1.355327,0.676473,-0.923627,1.177673,2.341973,0.678373,-0.196027,0.711873,...,0.482273,0.308173,0.226273,0.562773,0.612773,0.253773,0.755673,-0.254427,0.718673,2.040473
GSTK1,-1.888995,-0.786595,-0.779395,-1.329995,-0.866495,-0.688895,0.415005,-0.762395,-1.205195,-0.357695,...,-0.163395,-1.039795,-1.417595,-1.395195,-0.661895,-1.282895,-0.642795,-2.410995,-0.671495,-1.830395


In [16]:
# Transpose the genetic table so samples become rows, join it onto the clinical
# table by index, then drop every row that still contains a missing value.
merged_data = selected_clinical_data.join(genetic_data.T).dropna()
merged_data.head()

Unnamed: 0_level_0,Testicular Cancer,Age,Gender,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,...,SLC7A10,PLA2G2C,TULP2,NPY5R,GNGT2,GNGT1,TULP3,BCL6B,GSTK1,SELP
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2G-AAEW-01,1,31.0,1.0,-1.247192,1.370174,9.255665,0.322728,-0.485378,-0.61291,1.315306,...,3.498714,-0.086682,4.207622,-0.237217,1.597767,0.23401,2.696223,3.097773,-1.468595,-0.043933
TCGA-2G-AAEX-01,1,38.0,1.0,-1.743692,1.814274,9.680665,0.636028,0.108222,-1.26731,1.206606,...,3.790214,0.487618,5.504822,0.846083,0.797867,0.02141,2.778623,0.329973,-1.447895,1.108067
TCGA-2G-AAF1-01,1,28.0,1.0,-1.397192,1.187274,9.598765,0.838328,-0.096978,-1.04111,1.560006,...,3.421714,-0.086682,4.813122,-0.005417,1.629067,0.64101,2.337123,2.008273,-1.256195,0.153367
TCGA-2G-AAF4-01,1,30.0,1.0,-0.994992,1.597874,9.971765,0.339728,-0.098878,-1.05091,0.858706,...,4.097814,-0.086682,4.213022,2.064083,1.162567,0.53161,3.066823,-0.735627,-1.617995,-0.710533
TCGA-2G-AAF6-01,1,28.0,1.0,-0.498792,2.274374,8.821565,0.247628,0.408722,-0.65491,1.862706,...,4.072914,-0.086682,5.124822,1.812383,0.734067,-0.88259,2.702423,-1.043427,-1.584895,-0.367333


In [18]:
print(f"The merged dataset contains {len(merged_data)} samples.")
# The helper returns a bias flag for the trait plus a data frame (presumably the
# input with biased features removed - confirm in utils). Bind that frame back
# to `merged_data`, as the GEO section of this notebook does, so that every later
# cell works on the same frame instead of the unfiltered one.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
# Backward-compatible alias for cells that still use the earlier misspelled name.
merge_data = merged_data
is_trait_biased

The merged dataset contains 139 samples.
For the feature 'Testicular Cancer', the least common label is '1' with 139 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Testicular Cancer' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 26.0
  50% (Median): 31.0
  75%: 37.0
Min: 14.0
Max: 67.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 139 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.



True

In [19]:
# Persist the cohort only when the trait distribution is usable.
# index=False: the row index is not written to the CSV.
if not is_trait_biased:
    merge_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)
# Preview the frame returned by the bias check. It must be the last expression
# of the cell to be rendered; placed before the `if` it was silently discarded.
merge_data.head()

In [20]:
# Record the cohort using the frame returned by judge_and_remove_biased_features
# (`merge_data`), matching the GEO section, which also passes the post-check frame.
save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merge_data)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Testicular-Cancer\cohort_info.json


## 2.2. The GEO dataset

In [22]:
dataset = 'GEO'
trait_subdir = "Testicular-Cancer"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
# os.listdir() returns entries in arbitrary order; sort for a reproducible listing.
sorted(os.listdir(trait_path))

['GSE12630',
 'GSE1818',
 'GSE28094',
 'GSE3921',
 'GSE42647',
 'GSE59520',
 'GSE62523']

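Repeat the steps below for each accession number. Every cell in this group rebinds the same variables (`cohort`, `cohort_dir`, `soft_file`, `matrix_file`), so run only the cell for the cohort you want to process before continuing.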

In [23]:
# Finished
# Select cohort GSE12630 and locate its SOFT file and series-matrix file.
# This rebinds cohort / cohort_dir / soft_file / matrix_file, which the
# preprocessing cells further down read.
cohort = accession_num = "GSE12630"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE12630\\GSE12630_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE12630\\GSE12630-GPL7280_series_matrix.txt.gz')

In [114]:
# Empty
# Select cohort GSE1818 and locate its SOFT file and series-matrix file.
# This rebinds cohort / cohort_dir / soft_file / matrix_file, which the
# preprocessing cells further down read.
cohort = accession_num = "GSE1818"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE1818\\GSE1818_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE1818\\GSE1818_series_matrix.txt.gz')

In [121]:
# Biased
# Select cohort GSE28094 and locate its SOFT file and series-matrix file.
# This rebinds cohort / cohort_dir / soft_file / matrix_file, which the
# preprocessing cells further down read.
cohort = accession_num = "GSE28094"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE28094\\GSE28094_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE28094\\GSE28094_series_matrix.txt.gz')

In [143]:
# No trait information
# Select cohort GSE3921 and locate its SOFT file and series-matrix file.
# This rebinds cohort / cohort_dir / soft_file / matrix_file, which the
# preprocessing cells further down read.
cohort = accession_num = "GSE3921"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE3921\\GSE3921_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE3921\\GSE3921-GPL1283_series_matrix.txt.gz')

In [146]:
# No trait information
# Select cohort GSE42647 and locate its SOFT file and series-matrix file.
# This rebinds cohort / cohort_dir / soft_file / matrix_file, which the
# preprocessing cells further down read.
cohort = accession_num = "GSE42647"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE42647\\GSE42647_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE42647\\GSE42647-GPL10558_series_matrix.txt.gz')

In [151]:
# No gene mapping
# Select cohort GSE59520 and locate its SOFT file and series-matrix file.
# This rebinds cohort / cohort_dir / soft_file / matrix_file, which the
# preprocessing cells further down read.
cohort = accession_num = "GSE59520"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE59520\\GSE59520_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE59520\\GSE59520_series_matrix.txt.gz')

In [161]:
# No gene mapping
# Select cohort GSE62523 and locate its SOFT file and series-matrix file.
# This rebinds cohort / cohort_dir / soft_file / matrix_file, which the
# preprocessing cells further down read. As the last selection cell, it is the
# one in effect after a top-to-bottom run.
cohort = accession_num = "GSE62523"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE62523\\GSE62523_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Testicular-Cancer\\GSE62523\\GSE62523_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [162]:
from utils import *
# Line prefixes in the series-matrix file that carry the study description ...
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
# ... and those that carry per-sample accession IDs and characteristics.
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']    

# Reads the matrix file selected by the most recently executed cohort-selection cell.
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
print(background_info)

!Series_title	"Gene expression profiles of HMEC-1 after exposure to the chemotherapeutic drugs bleomycin and cisplatin with untreated samples as control"
!Series_summary	"Chemotherapy-related endothelial damage contributes to the early development of cardiovascular morbidity in testicular cancer patients. We aimed to identify relevant mechanisms of and search for candidate biomarkers for this endothelial damage."
!Series_summary	"Human micro-vascular endothelial cells (HMEC-1) were exposed to bleomycin or cisplatin with untreated samples as control. 18k cDNA microarrays were used. Gene expression differences were analysed at single gene level and in gene sets clustered in biological pathways and validated by qRT-PCR. Protein levels of a candidate biomarker were measured in testicular cancer patient plasma before, during and after bleomycin-etoposide-cisplatin chemotherapy, and related to endothelial damage biomarkers (von Willebrand Factor (vWF), high-sensitivity C-Reactive Protein (hs

In [163]:
# Display the clinical table: one row per '!Sample_characteristics_ch1' line, one column per sample.
clinical_data

Unnamed: 0,!Sample_geo_accession,GSM1528220,GSM1528221,GSM1528222,GSM1528223,GSM1528224,GSM1528225,GSM1528226,GSM1528227,GSM1528228,...,GSM1528314,GSM1528315,GSM1528316,GSM1528317,GSM1528318,GSM1528319,GSM1528320,GSM1528321,GSM1528322,GSM1528323
0,!Sample_characteristics_ch1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,...,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1,cell line: HMEC-1
1,!Sample_characteristics_ch1,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...,cell type: human microvascular endothelial cel...


In [100]:
import pandas as pd

def parse_age(age_str):
    """Collapse an age range such as 'age: 60-70' into its rounded-up midpoint.

    Args:
        age_str: One cell of the age characteristics row, e.g. 'age: 65',
            'age: 60-70' or 'age: -'. Non-string cells (None / NaN) are accepted.

    Returns:
        'age: <midpoint>' (midpoint rounded up to an integer) when the value is
        a well-formed 'age: <start>-<end>' range; otherwise the input unchanged.
    """
    # Pass through anything that is not a range: missing cells, the 'age: -'
    # placeholder, and plain values without a dash.
    if not isinstance(age_str, str) or age_str == "age: -" or '-' not in age_str:
        return age_str
    try:
        age_range = age_str.split('age: ')[1]
        start_age, end_age = map(int, age_range.split('-'))
    except (IndexError, ValueError):
        # Contains a dash but is not 'age: <int>-<int>'; leave it untouched
        # instead of aborting the whole row conversion.
        return age_str
    # Integer ceiling of the mean; avoids relying on numpy being in scope.
    median_age = -((start_age + end_age) // -2)
    return f'age: {median_age}'
    
# Rewrite the row at position 2 in place. In the cohort whose output is shown
# below that row holds the 'age: ...' characteristics; the position is
# hard-coded, so verify it for any other cohort.
clinical_data.iloc[2] = clinical_data.iloc[2].apply(parse_age)
clinical_data

Unnamed: 0,!Sample_geo_accession,GSM319520,GSM319521,GSM319522,GSM319523,GSM319524,GSM319525,GSM319526,GSM319527,GSM319528,...,GSM319599,GSM319600,GSM319601,GSM319602,GSM319603,GSM319604,GSM319605,GSM319606,GSM319607,GSM319608
0,!Sample_characteristics_ch1,Anatomical sites: Liver,Anatomical sites: Bladder,Anatomical sites: Bladder,Anatomical sites: Liver,Anatomical sites: Liver,Anatomical sites: Bladder,Anatomical sites: Bladder,Anatomical sites: Prostate,Anatomical sites: Bladder,...,Anatomical sites: Liver,Anatomical sites: Pancreas,Anatomical sites: Liver,Anatomical sites: Liver,Anatomical sites: Stomach,Anatomical sites: Retroperiteum,Anatomical sites: Pancreas,Anatomical sites: Lymph node,Anatomical sites: Liver,Anatomical sites: Liver
1,!Sample_characteristics_ch1,gender: F,gender: M,gender: M,gender: M,gender: F,gender: M,gender: M,gender: M,gender: M,...,gender: F,gender: F,gender: M,gender: M,gender: M,gender: M,gender: F,gender: M,gender: F,gender: M
2,!Sample_characteristics_ch1,age: 65,age: 65,age: 65,age: 55,age: 55,age: 55,age: 45,age: 65,age: 65,...,age: 96,age: 49,age: 66,age: 88,age: 70,age: 78,age: 67,age: 70,age: 68,age: 64
3,!Sample_characteristics_ch1,TNM: G4,TNM: G4,TNM: G4,TNM: G4,TNM: G4,TNM: G4,TNM: G4,TNM: Gleason 7,TNM: G4,...,TNM: 3,TNM: 4,TNM: 3,TNM: 3,TNM: 4,TNM: 2,TNM: 4,TNM: 3,TNM: 3,TNM: 3


Analyze the trait row:

In [165]:
# Inspect the distinct values of the row at position 1. Despite the variable
# name, this is whichever characteristic the current cohort lists second
# (the 'cell type' row in the output shown below).
tumor_stage_row = clinical_data.iloc[1]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1',
       'cell type: human microvascular endothelial cell line'],
      dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [157]:
# Row positions handed to geo_select_clinical_features in the next cell.
# None marks age / gender as not provided for this cohort.
trait_row = 0
age_row = None
gender_row = None

def convert_trait(trait):
    """Map a sample-type characteristic to the binary trait label.

    Args:
        trait: Value of the trait row for one sample, e.g.
            'sample type: serum from patient with seminoma'.

    Returns:
        1 when the value is one of the listed tumor sample types, otherwise 0.
    """
    # The previous `trait == a or b or ...` chain compared only the first
    # literal; the remaining non-empty strings are always truthy, so every
    # sample was labelled 1. Membership in a tuple tests each label.
    tumor_sample_types = (
        'sample type: serum from patient with yolk sac tumor',
        'sample type: serum from patient with teratoma',
        'sample type: serum from patient with embryonal carcinoma',
        'sample type: serum from patient with embryonal carcinoma + yolk sac tumor',
        'sample type: serum from patient with embryonal carcinoma + teratoma',
        'sample type: serum from patient with seminoma',
    )
    return 1 if trait in tumor_sample_types else 0

def convert_age(age_string):
    """Extract an integer age from a '<key>: <number>' characteristic.

    Args:
        age_string: Value of the age row for one sample, e.g. 'age: 65'.

    Returns:
        The age as an int, or None when the value is missing (not a string),
        equals 'n.a.' (case-insensitive), has no ': ' separator, or the part
        after the separator is not an integer.
    """
    # Missing cells arrive as None / NaN, which have no .lower(); treat them as unknown.
    if not isinstance(age_string, str):
        return None
    if age_string.lower() == 'n.a.':
        return None
    try:
        return int(age_string.split(': ')[1])
    except (ValueError, IndexError):
        return None

def convert_gender(gender_string):
    """Map a sex/gender characteristic to a binary code.

    Args:
        gender_string: Value of the gender row for one sample, e.g. 'gender: F'
            or 'Sex: male' (matched case-insensitively).

    Returns:
        1 for female, 0 for male, None for anything else (including missing,
        non-string cells).
    """
    # NOTE(review): the TCGA cohort output earlier in this notebook shows
    # Gender == 1.0 for every testicular-cancer sample, which suggests the Xena
    # helper encodes male as 1 - the opposite of this mapping. Confirm which
    # convention downstream analysis expects before combining cohorts.
    if not isinstance(gender_string, str):
        return None
    normalized = gender_string.lower()
    if normalized in ('sex: female', 'sex: f', 'gender: female', 'gender: f'):
        return 1
    if normalized in ('sex: male', 'sex: m', 'gender: male', 'gender: m'):
        return 0
    return None

Check the processed clinical data:

In [158]:
# Build the per-sample clinical table from the chosen rows and converters.
# age_row / gender_row may be None (see the cell above).
selected_clinical_data = geo_select_clinical_features(clinical_data, TRAIT, trait_row, convert_trait, age_row=age_row,
                                                      convert_age=convert_age, gender_row=gender_row,
                                                      convert_gender=convert_gender)
selected_clinical_data.head()


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



Unnamed: 0,GSM1438703,GSM1438704,GSM1438705,GSM1438706,GSM1438707,GSM1438708,GSM1438709,GSM1438710,GSM1438711,GSM1438712,...,GSM1438728,GSM1438729,GSM1438730,GSM1438731,GSM1438732,GSM1438733,GSM1438734,GSM1438735,GSM1438736,GSM1438737
Testicular Cancer,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Genetic data preprocessing and final filtering

Check the genetic data:

In [159]:
# Load the expression table from the series-matrix file of the selected cohort.
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM1438703,GSM1438704,GSM1438705,GSM1438706,GSM1438707,GSM1438708,GSM1438709,GSM1438710,GSM1438711,GSM1438712,...,GSM1438728,GSM1438729,GSM1438730,GSM1438731,GSM1438732,GSM1438733,GSM1438734,GSM1438735,GSM1438736,GSM1438737
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hsa-let-7a#-002307_B,5.842278,5.18699,-8.020862,4.256442,4.612815,5.217033,-0.603717,5.435251,3.867738,5.321154,...,5.441058,3.046009,6.003657,6.135078,5.37042,6.212634,5.985714,5.214142,4.578491,5.079505
hsa-let-7a-000377_A,-3.222858,-6.503963,-4.64349,0.949426,-8.546852,-6.698218,-6.865445,-5.403104,0.101707,-8.29768,...,-9.240047,-8.201252,-7.662351,-8.063188,-4.46865,-7.213025,-6.777511,-5.475506,0.180072,0.408909
hsa-let-7b#-002404_B,-0.643702,5.445452,5.223694,4.519038,4.873827,5.475362,6.689619,-0.715029,3.764839,5.282783,...,5.487121,0.279509,6.047222,6.178059,-1.007834,6.25527,6.029359,3.958808,3.294935,3.818193
hsa-let-7b-002619_A,-8.538878,-12.678577,-8.127248,-6.813449,-10.991424,-11.159446,-11.013781,-10.235487,-10.001887,-10.271321,...,-11.753343,-10.54094,-11.43528,-11.226297,-8.359788,-11.168254,-9.711721,-11.853189,-10.120176,-5.492286
hsa-let-7c#-002405_B,5.866882,5.189973,4.959877,4.228723,4.596853,5.221008,6.480926,5.446426,5.114929,6.221021,...,5.199384,6.517703,5.780546,5.916303,5.126416,5.996417,5.762011,5.032583,4.548834,4.93012


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [160]:
# Set by hand after inspecting the row identifiers of genetic_data above.
requires_gene_mapping = True

# Reset first so an annotation left over from a previously processed cohort is
# never shown or reused when mapping is switched off.
gene_annotation = None
if requires_gene_mapping:
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

# Show the annotation columns so the identifier / symbol keys can be picked in
# the next cell; renders nothing when no annotation was loaded.
gene_annotation.columns if gene_annotation is not None else None

{'ID': ['hsa-let-7a#-002307_B', 'hsa-let-7a-000377_A', 'hsa-let-7b#-002404_B', 'hsa-let-7b-002619_A', 'hsa-let-7c#-002405_B'], 'miRNA_ID': ['hsa-let-7a', 'hsa-let-7a', 'hsa-let-7b', 'hsa-let-7b', 'hsa-let-7c']}


Index(['ID', 'miRNA_ID'], dtype='object')

In [141]:
if requires_gene_mapping:
    # Column names of gene_annotation holding the row identifier and the gene symbol.
    # NOTE(review): these keys are cohort-specific. The annotation preview shown
    # above lists only 'ID' and 'miRNA_ID', so 'Symbol' would not exist for that
    # cohort - check against gene_annotation.columns before running.
    identifier_key = 'ID'
    gene_symbol_key = 'Symbol'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    # Rebinds genetic_data to the mapped table.
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [142]:
# Rebinds genetic_data to the helper's result, then displays it
# (the output below is indexed by gene symbol).
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM694425,GSM694426,GSM694427,GSM694428,GSM694429,GSM694430,GSM694431,GSM694432,GSM694433,GSM694434,...,GSM696043,GSM696044,GSM696045,GSM696046,GSM696047,GSM696048,GSM696049,GSM696050,GSM696051,GSM696052
AATK,0.763330,0.462200,0.642997,0.478480,0.662853,0.542033,0.634990,0.866760,0.907427,0.769250,...,0.586590,0.868620,0.902693,0.082113,0.844367,0.215980,0.262597,0.217850,0.693370,0.873490
ABCA1,0.055445,0.064085,0.030930,0.026370,0.052145,0.041280,0.040625,0.119855,0.031850,0.044510,...,0.486990,0.270730,0.091300,0.034815,0.035905,0.025070,0.061780,0.030165,0.036575,0.021745
ABCB1,0.029260,0.033270,0.021860,0.289410,0.029440,0.030050,0.045110,0.631570,0.025270,0.431280,...,0.149060,0.112050,0.815110,0.023870,0.859140,0.070090,0.025860,0.264610,0.140650,0.019740
ABCB4,0.917533,0.923730,0.820663,0.412440,0.831660,0.892687,0.892533,0.867287,0.913630,0.906713,...,0.535997,0.142330,0.468137,0.494043,0.430917,0.483337,0.537487,0.561760,0.781263,0.925660
ABCC2,0.644555,0.878110,0.798410,0.125310,0.631210,0.894670,0.709640,0.938555,0.947535,0.938880,...,0.899720,0.899600,0.897065,0.136490,0.566495,0.446390,0.628845,0.604705,0.806435,0.954325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZIM3,0.509543,0.622737,0.807350,0.426720,0.908350,0.734730,0.908807,0.666313,0.943337,0.920040,...,0.952477,0.112767,0.796520,0.499650,0.893730,0.711397,0.622020,0.714327,0.854890,0.953750
ZMYND10,0.043255,0.054505,0.042720,0.072095,0.072390,0.027790,0.323365,0.665285,0.077090,0.318725,...,0.920440,0.742160,0.418530,0.106025,0.041645,0.060450,0.062975,0.045760,0.068870,0.048235
ZNF215,0.206340,0.115455,0.056785,0.036935,0.565590,0.085890,0.073600,0.078520,0.081040,0.130275,...,0.354250,0.167925,0.506625,0.083815,0.795100,0.075735,0.090965,0.291520,0.138770,0.148765
ZNF264,0.248280,0.420710,0.193010,0.137505,0.663030,0.243080,0.186635,0.317830,0.351480,0.233920,...,0.445250,0.156665,0.282680,0.101420,0.366665,0.086490,0.120520,0.074880,0.227085,0.207445


Use selected clinical data and genetic data to generate the merged data:

In [111]:
# Combine clinical features and expression values; the output below has one row
# per sample, with trait / Age / Gender followed by the gene columns.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
# Set by hand: marks this cohort as available for the save step at the end.
is_available = True

merged_data

Unnamed: 0,Testicular Cancer,Age,Gender,A2M,A4GALT,A4GNT,AAAS,AACS,AADAC,AAK1,...,ZSCAN2,ZSWIM1,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYX,ZZEF1,ZZZ3
GSM319520,0.0,65.0,1.0,1.822356,-0.786868,-0.237055,0.167821,0.261164,1.618413,-0.518816,...,-0.15557,0.260321,0.567675,0.523961,0.949124,-0.232081,0.569597,0.576858,0.135833,0.584064
GSM319521,0.0,65.0,0.0,1.559114,-0.651795,-0.846372,0.193443,0.539628,0.135394,-0.286925,...,-0.15805,-0.520908,0.646224,0.620909,1.084455,-0.106472,0.037375,0.55263,-0.182003,0.805389
GSM319522,0.0,65.0,0.0,1.659479,-0.020965,-0.258145,0.050393,0.588356,0.349506,-0.425712,...,-1.04737,-0.236573,0.636459,0.61771,0.974492,-0.386168,0.259272,0.733197,0.151216,0.807542
GSM319523,0.0,55.0,0.0,2.259253,-0.234597,-0.034119,0.166468,0.176863,1.803988,-0.443632,...,-0.126293,0.063562,0.530385,0.213963,0.570189,-0.194726,0.361144,0.289863,-0.553081,0.790123
GSM319524,0.0,55.0,1.0,1.943809,-0.567291,-0.128662,0.278019,0.511934,1.529365,-0.453456,...,-0.55456,0.060705,0.630919,0.41537,0.951013,-0.222035,0.375667,0.558456,0.002504,0.893576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM319604,0.0,78.0,0.0,1.278891,-0.486858,-0.298108,0.195726,0.634411,0.851169,-0.426483,...,-1.104783,0.082673,0.534862,0.184389,0.641579,-0.429776,0.41384,0.574886,-0.067703,0.651082
GSM319605,0.0,67.0,1.0,1.409083,-0.811472,-0.237879,0.393832,0.700658,0.919927,-0.224048,...,-0.982962,-0.138692,0.684017,0.205511,0.776389,-0.326719,0.35569,0.581518,-0.041377,0.700126
GSM319606,0.0,70.0,0.0,1.147403,-1.046497,-0.086671,0.334379,0.336263,0.107917,-0.17182,...,-0.911489,0.096947,0.572695,0.156804,0.169242,-1.046497,0.429288,0.363679,0.078083,0.631074
GSM319607,0.0,68.0,1.0,1.726922,-0.645779,-0.063774,0.241135,0.455403,0.961991,-0.231324,...,-1.014851,0.039325,0.664898,0.723621,1.166881,-0.86736,0.691084,0.537276,0.095422,0.715506


Check whether the merged data is biased:

In [112]:
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
# Returns a bias flag for the trait and a data frame that is bound back to
# merged_data, so re-running this cell operates on the already-processed frame.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 88 samples.
For the feature 'Testicular Cancer', the least common label is '1.0' with 10 occurrences. This represents 11.36% of the dataset.
The distribution of the feature 'Testicular Cancer' in this dataset is fine.

Quartiles for 'Age':
  25%: 44.75
  50% (Median): 55.0
  75%: 65.25
Min: 15.0
Max: 96.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 35 occurrences. This represents 39.77% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



False

Save the data as a csv file:

In [113]:
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
    # Write the CSV only for an available cohort whose trait is not biased.
    # Keeping this inside the branch prevents a stale merged_data /
    # is_trait_biased from an earlier cohort being saved under this cohort's name.
    # index=False: the row index is not written to the CSV.
    if not is_trait_biased:
        merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)
else:
    save_cohort_info(cohort, JSON_PATH, is_available)
# Preview as the last expression so the notebook renders it; nothing is shown
# for an unavailable cohort.
merged_data.head() if is_available else None