# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

# Run configuration.
# NOTE(review): DATA_ROOT and OUTPUT_ROOT are absolute, machine-specific paths;
# they must be edited before this notebook can run on another machine.
USER = "Jiayi"
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = 'Kidney Chromophobe'

# Outputs go to <OUTPUT_ROOT>/<USER>/<TRAIT with whitespace runs replaced by '-'>.
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
# exist_ok=True already makes this a no-op when the directory exists, so a
# separate os.path.exists() pre-check is redundant.
os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


# 2. Data preprocessing and selection

## 2.1. The TCGA Xena dataset

In TCGA Xena, there is either zero or one cohort related to the trait. We search the names of subdirectories to see if any matches the trait. If a match is found, we directly obtain the file paths.

In [2]:
# Point at the TCGA section of the data root and preview the first ten entries in it.
dataset = 'TCGA'
dataset_dir = os.path.join(DATA_ROOT, dataset)
# NOTE(review): os.listdir returns entries in arbitrary order, so this preview
# is not guaranteed to be identical across machines.
os.listdir(dataset_dir)[:10]

['TCGA_Adrenocortical_Cancer_(ACC)',
 'TCGA_Breast_Cancer_(BRCA)',
 'TCGA_Cervical_Cancer_(CESC)',
 'TCGA_Kidney_Chromophobe_(KICH)',
 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)',
 'TCGA_Lower_Grade_Glioma_(LGG)',
 'TCGA_Melanoma_(SKCM)',
 'TCGA_Mesothelioma_(MESO)',
 'TCGA_Testicular_Cancer_(TGCT)',
 'TCGA_Uterine_Carcinosarcoma_(UCS)']

In [3]:
# Cohort selection for TCGA Xena: the subdirectory name is hard-coded to the
# entry of the listing above that matches TRAIT.
trait_subdir = "TCGA_Kidney_Chromophobe_(KICH)"
cohort = 'Xena'  # later used as the CSV file stem and passed to save_cohort_info
trait_type = 'binary'  # later forwarded to judge_and_remove_biased_features
is_available = True

cohort_dir = os.path.join(DATA_ROOT, dataset, trait_subdir)
# Resolve the clinical matrix file and the gene-expression file inside the cohort directory.
clinical_data_file, genetic_data_file = xena_get_relevant_filepaths(cohort_dir)
clinical_data_file, genetic_data_file

('/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Kidney_Chromophobe_(KICH)\\TCGA.KICH.sampleMap_KICH_clinicalMatrix',
 '/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Kidney_Chromophobe_(KICH)\\TCGA.KICH.sampleMap_HiSeqV2_PANCAN.gz')

In [4]:
import pandas as pd

# Both files are tab-separated and use their first column as the row index;
# the expression file is additionally gzip-compressed.
clinical_data = pd.read_csv(clinical_data_file, sep='\t', index_col=0)
genetic_data = pd.read_csv(genetic_data_file, compression='gzip', sep='\t', index_col=0)
# Placeholders until the age/gender columns are chosen a few cells below.
age_col = gender_col = None

In [5]:
# Keep only the second return value (the column names); the first one is discarded.
_, clinical_data_cols = check_rows_and_columns(clinical_data)
clinical_data_cols[:10]

['_INTEGRATION',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'additional_pharmaceutical_therapy',
 'additional_radiation_therapy',
 'additional_surgery_locoregional_procedure',
 'additional_surgery_metastatic_procedure',
 'age_at_initial_pathologic_diagnosis']

Read all the column names in the clinical dataset, to find the columns that record information about age or gender.
Reference prompt:

In [6]:
# Reference prompt: asks for candidate age and gender columns among the clinical
# column names interpolated at the end. The cell only renders the string.
f'''
Below is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:
candidate_age_cols = [col_name1, col_name2, ...]
candidate_gender_cols = [col_name1, col_name2, ...]
If no columns match a criterion, please provide an empty list.

Column names:
{clinical_data_cols}
'''

"\nBelow is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:\ncandidate_age_cols = [col_name1, col_name2, ...]\ncandidate_gender_cols = [col_name1, col_name2, ...]\nIf no columns match a criterion, please provide an empty list.\n\nColumn names:\n['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'additional_surgery_locoregional_procedure', 'additional_surgery_metastatic_procedure', 'age_at_initial_pathologic_diagnosis', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'clinical_M', 'days_to_additional_surgery_metastatic_procedure', 'days_to_birth', 'days_to_death', 'days_to_initial_pathologic_diagnosis

In [7]:
# Candidate columns, presumably pasted in from the answer to the reference prompt above.
candidate_age_cols = ['age_at_initial_pathologic_diagnosis']
candidate_gender_cols = ['gender']


From the candidate columns, choose a single column that records age and a single column that records gender.
If no column meets the requirement, leave 'age_col' or 'gender_col' set to None.

In [8]:
# Final choice of the age and gender columns for this cohort.
age_col = 'age_at_initial_pathologic_diagnosis'
gender_col = 'gender'
# NOTE(review): judging by the merged preview further down, the result is indexed
# by sample ID and has the columns TRAIT, 'Age' and 'Gender'.
selected_clinical_data = xena_select_clinical_features(clinical_data, TRAIT, age_col=age_col, gender_col=gender_col)

In [9]:
# NOTE(review): this rebinds genetic_data, so re-running the cell feeds already-normalized
# data back into the normalizer — presumably harmless, but confirm.
genetic_data = normalize_gene_symbols_in_index(genetic_data)

In [10]:
# Display the normalized expression matrix (rows: gene symbols, columns: TCGA sample IDs).
genetic_data

Unnamed: 0,TCGA-KN-8419-01,TCGA-KL-8346-01,TCGA-KN-8422-01,TCGA-KN-8431-11,TCGA-KN-8430-11,TCGA-KM-8440-01,TCGA-KO-8414-01,TCGA-KL-8323-01,TCGA-KM-8639-01,TCGA-KO-8415-11,...,TCGA-KN-8421-01,TCGA-KO-8417-01,TCGA-KM-8438-01,TCGA-KL-8340-01,TCGA-KO-8406-01,TCGA-KO-8408-01,TCGA-KM-8443-01,TCGA-KM-8442-01,TCGA-KL-8332-11,TCGA-KL-8327-01
ARHGEF10L,0.257008,0.964108,0.816608,1.059108,1.249508,0.780508,0.980108,-0.295492,2.236608,0.908808,...,0.826308,0.301308,-0.360592,0.154508,0.552708,0.850708,0.726308,0.377608,0.691208,1.040008
HIF3A,-1.608126,-2.427326,-2.976826,1.754774,2.650674,-2.327726,-0.067726,-3.487426,-0.698526,1.916374,...,-1.335726,-1.632126,0.457674,-1.470526,-1.885926,-0.121726,-1.197026,-3.668326,0.663774,0.647674
RNF17,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,0.014365,...,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035
RNF10,0.890128,1.293828,0.925828,0.546928,0.702228,0.581128,1.434328,0.359428,0.797228,0.330828,...,0.701928,0.747728,1.093628,0.640128,0.393228,1.213428,1.045428,0.668028,0.035528,0.602928
RNF11,0.952122,0.350422,0.932922,0.568222,0.330522,0.872422,0.283722,0.850422,0.617222,0.726922,...,0.505022,0.668122,0.657722,0.770122,0.867222,0.240622,0.627322,0.675022,1.234822,0.143422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GNGT1,1.217210,1.124410,-0.352490,-0.340290,-0.531490,4.614510,2.826310,-0.797090,0.631510,-0.735990,...,1.273010,2.212910,-0.710790,5.190910,3.372210,-1.281390,-1.281390,1.955710,0.246610,1.454210
TULP3,-1.385877,-1.782277,-1.814177,-0.256377,-0.236177,-1.816577,-0.695777,-1.328077,-0.343577,-0.293877,...,-0.527977,-1.313977,-1.251077,-0.998277,-1.109277,-1.842577,-1.526577,-1.119977,-0.389377,-1.161377
BCL6B,1.992473,0.141973,0.097473,1.239873,0.437773,-0.504027,0.164073,0.933073,-0.169027,1.522073,...,-2.038527,0.217073,2.418173,0.946373,0.921673,0.736373,-0.095527,0.793373,0.742973,1.130973
GSTK1,1.805805,1.411305,2.360905,0.652505,0.646505,1.609705,2.030605,2.015005,0.610805,0.746705,...,1.141605,1.702505,2.053205,1.577805,1.480605,2.513905,1.527905,0.928605,2.015505,2.087105


In [11]:
# Transpose the expression matrix so that samples become rows, then join it onto
# the clinical table by index.
# NOTE(review): dropna() removes every sample that has a missing value in any
# column — confirm that this is the intended filtering.
merged_data = selected_clinical_data.join(genetic_data.T).dropna()
merged_data.head()

Unnamed: 0_level_0,Kidney Chromophobe,Age,Gender,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,...,SLC7A10,PLA2G2C,TULP2,NPY5R,GNGT2,GNGT1,TULP3,BCL6B,GSTK1,SELP
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-KL-8323-01,1,57,0,-0.295492,-3.487426,-0.531035,0.359428,0.850422,0.58019,-0.739994,...,-2.090786,0.397618,-0.748878,0.731583,-1.308333,-0.79709,-1.328077,0.933073,2.015005,0.025067
TCGA-KL-8324-01,1,67,0,0.581408,0.368474,-0.531035,1.217628,0.626922,0.06679,-0.058894,...,-2.090786,-0.086682,-0.193178,0.513683,-0.912933,-0.32549,-1.035777,0.221073,1.600605,2.456767
TCGA-KL-8324-11,0,67,0,1.119008,2.198374,-0.531035,0.341628,0.681022,0.46319,0.057806,...,-0.837786,-0.086682,0.006622,5.587783,0.113667,-0.03259,0.051323,1.403473,0.786105,3.303867
TCGA-KL-8325-01,1,56,0,0.572008,-1.889926,-0.531035,0.683828,0.971922,1.55029,-0.150194,...,-2.090786,-0.086682,-0.748878,0.414383,-2.250633,1.13531,-1.257677,-1.076027,1.424005,-0.464633
TCGA-KL-8326-01,1,69,1,0.259208,-0.380726,-0.531035,0.992728,0.311022,0.65699,0.050806,...,-0.295086,-0.086682,-0.748878,1.982883,-0.845733,1.40521,-1.536277,0.100373,1.883005,0.013067


In [12]:
print(f"The merged dataset contains {len(merged_data)} samples.")
# Bind the returned frame back to `merged_data` (as the GEO section does) so that
# later cells reading `merged_data`, e.g. save_cohort_info, see the frame returned
# by the bias check rather than the pre-check one.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
# Backward-compatible alias: the following cell writes the CSV from `merge_data`.
merge_data = merged_data
is_trait_biased

The merged dataset contains 91 samples.
For the feature 'Kidney Chromophobe', the least common label is '0' with 25 occurrences. This represents 27.47% of the dataset.
The distribution of the feature 'Kidney Chromophobe' in this dataset is fine.

Quartiles for 'Age':
  25%: 42.5
  50% (Median): 51.0
  75%: 62.0
Min: 17
Max: 86
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0' with 39 occurrences. This represents 42.86% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



False

In [13]:
# Persist the cohort only when the trait passed the bias check.
# (The former leading `merged_data.head()` was dropped: it was not the last
# expression of the cell, so its result was never displayed.)
# NOTE(review): index=False omits the sample-ID index from the CSV — confirm
# that downstream readers do not need it.
if not is_trait_biased:
    merge_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

In [14]:
# Record this cohort's availability and bias status in cohort_info.json.
# NOTE(review): make sure `merged_data` here is the frame returned by the bias
# check (the CSV in the previous cell is written from `merge_data`).
save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Kidney-Chromophobe\cohort_info.json


## 2.2. The GEO dataset

In [15]:
# Switch to the GEO section; each entry in the trait folder is named after a GEO
# series accession (GSE...).
dataset = 'GEO'
trait_subdir = "Kidney-Chromophobe"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
os.listdir(trait_path)

['GSE11024',
 'GSE11151',
 'GSE11447',
 'GSE144082',
 'GSE14670',
 'GSE15641',
 'GSE17746',
 'GSE19949',
 'GSE19982',
 'GSE26574',
 'GSE3',
 'GSE40911',
 'GSE40912',
 'GSE40914',
 'GSE4125',
 'GSE42977',
 'GSE57162',
 'GSE6280',
 'GSE68606',
 'GSE8271',
 'GSE95425']

Repeat the steps below for each accession number.

In [16]:
# Status (author's note): finished — this cohort has been processed.
# Select the GEO series and locate its SOFT and series-matrix files.
# NOTE(review): every cohort-selection cell overwrites cohort, soft_file and
# matrix_file, so the later steps act on whichever of these cells ran last.
cohort = accession_num = "GSE11024"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE11024\\GSE11024_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE11024\\GSE11024_series_matrix.txt.gz')

In [31]:
# Status (author's note): finished — this cohort has been processed.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE11151"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE11151\\GSE11151_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE11151\\GSE11151_series_matrix.txt.gz')

In [44]:
# Status (author's note): biased — presumably the trait failed the bias check for this cohort.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE11447"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE11447\\GSE11447_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE11447\\GSE11447_series_matrix.txt.gz')

In [48]:
# Status (author's note): biased — presumably the trait failed the bias check for this cohort.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE144082"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE144082\\GSE144082_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE144082\\GSE144082_series_matrix.txt.gz')

In [54]:
# Status (author's note): biased — presumably the trait failed the bias check for this cohort.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE14670"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE14670\\GSE14670_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE14670\\GSE14670-GPL2641_series_matrix.txt.gz')

In [57]:
# Status (author's note): biased — presumably the trait failed the bias check for this cohort.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE15641"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE15641\\GSE15641_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE15641\\GSE15641_series_matrix.txt.gz')

In [61]:
# Status (author's note): no trait — presumably no usable trait field was found in this cohort's clinical data.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE17746"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE17746\\GSE17746_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE17746\\GSE17746-GPL9064_series_matrix.txt.gz')

In [65]:
# Status (author's note): no obvious trait — presumably no field in this cohort's clinical data clearly encodes the trait.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE19949"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE19949\\GSE19949_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE19949\\GSE19949-GPL3921_series_matrix.txt.gz')

In [68]:
# Status (author's note): finished — this cohort has been processed.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE19982"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE19982\\GSE19982_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE19982\\GSE19982_series_matrix.txt.gz')

In [81]:
# Status (author's note): finished — this cohort has been processed.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE26574"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE26574\\GSE26574_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE26574\\GSE26574_series_matrix.txt.gz')

In [94]:
# Status (author's note): the clinical data extracted for this cohort is empty.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE3"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE3\\GSE3_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE3\\GSE3-GPL10_series_matrix.txt.gz')

In [97]:
# Status (author's note): no trait — presumably no usable trait field was found in this cohort's clinical data.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE40911"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE40911\\GSE40911_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE40911\\GSE40911_series_matrix.txt.gz')

In [100]:
# Status (author's note): biased — presumably the trait failed the bias check for this cohort.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE40912"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE40912\\GSE40912_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE40912\\GSE40912_series_matrix.txt.gz')

In [104]:
# Status (author's note): no trait — presumably no usable trait field was found in this cohort's clinical data.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE40914"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE40914\\GSE40914_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE40914\\GSE40914-GPL3985_series_matrix.txt.gz')

In [107]:
# Status (author's note): biased — presumably the trait failed the bias check for this cohort.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE4125"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE4125\\GSE4125_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE4125\\GSE4125-GPL2649_series_matrix.txt.gz')

In [110]:
# Status (author's note): finished — this cohort has been processed.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE42977"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE42977\\GSE42977_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE42977\\GSE42977_series_matrix.txt.gz')

In [123]:
# Status (author's note): no obvious trait — presumably no field in this cohort's clinical data clearly encodes the trait.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE57162"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE57162\\GSE57162_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE57162\\GSE57162_series_matrix.txt.gz')

In [129]:
# Status (author's note): finished — this cohort has been processed.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE6280"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE6280\\GSE6280_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE6280\\GSE6280-GPL96_series_matrix.txt.gz')

In [142]:
# Status (author's note): no trait — presumably no usable trait field was found in this cohort's clinical data.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE68606"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE68606\\GSE68606_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE68606\\GSE68606_series_matrix.txt.gz')

In [151]:
# Status (author's note): "Not a gzipped file (b'!S')" — the matrix file found for this
# cohort ends in .txt rather than .txt.gz, so it is presumably uncompressed and
# could not be read as gzip.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE8271"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE8271\\GSE8271_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE8271\\GSE8271-GPL2004_series_matrix.txt')

In [153]:
# Status (author's note): no trait — presumably no usable trait field was found in this cohort's clinical data.
# Select the GEO series and locate its SOFT and series-matrix files.
cohort = accession_num = "GSE95425"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE95425\\GSE95425_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Kidney-Chromophobe\\GSE95425\\GSE95425_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [154]:
from utils import *
# Line prefixes that select series-level background text and per-sample clinical
# rows from the series matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']    

# NOTE(review): matrix_file is whatever the most recently executed
# cohort-selection cell left behind.
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
print(background_info)

!Series_title	"Cell-type specific gene programs of the normal human nephron define kidney cancer subtypes"
!Series_summary	"Comprehensive transcriptome studies of cancers often rely on corresponding normal tissue samples to serve as a transcriptional reference. In this study we performed in-depth analyses of normal kidney tissue transcriptomes from TCGA and demonstrate that the histological variability in cellularity, inherent in the kidney architecture, lead to considerable transcriptional differences between samples. This should be considered when comparing expression profiles of normal and cancerous kidney tissues. We exploited these differences to define renal cell-specific gene signatures and used these as framework to analyze renal cell carcinoma (RCC) ontogeny. Chromophobe RCCs express FOXI1-driven genes that define collecting duct intercalated cells whereas HNF-regulated genes, specific for proximal tubule cells, are an integral part of clear cell and papillary RCC transcriptom

In [155]:
# Show the raw clinical table: one row per sample-characteristics field, one column per GSM sample.
clinical_data

Unnamed: 0,!Sample_geo_accession,GSM2510512,GSM2510513,GSM2510514,GSM2510515,GSM2510516,GSM2510517,GSM2510518,GSM2510519,GSM2510520,...,GSM2510555,GSM2510556,GSM2510557,GSM2510558,GSM2510559,GSM2510560,GSM2510561,GSM2510562,GSM2510563,GSM2510564
0,!Sample_characteristics_ch1,patient id: R099,patient id: R099,patient id: R099,patient id: R099,patient id: R099,patient id: R116,patient id: R116,patient id: R116,patient id: R116,...,patient id: R164,patient id: R164,patient id: R164,patient id: R164,patient id: R164,patient id: R164,patient id: R164,patient id: R164,patient id: R164,patient id: R164
1,!Sample_characteristics_ch1,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,...,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue,patient type: Normal kidney tissue
2,!Sample_characteristics_ch1,sampling depth: cortex,sampling depth: cortex/medulla,sampling depth: medulla,sampling depth: medulla,sampling depth: cortex,sampling depth: cortex,sampling depth: cortex,sampling depth: cortex,sampling depth: cortex,...,sampling depth: medulla,sampling depth: medulla,sampling depth: cortex,sampling depth: cortex,sampling depth: cortex/medulla,sampling depth: medulla,sampling depth: cortex/medulla,sampling depth: cortex,sampling depth: cortex,sampling depth: medulla


Analyze the trait row:

In [156]:
# List the distinct values in characteristics row 1 to judge whether it can encode the trait.
# NOTE(review): despite the variable name, for the cohort displayed above this row
# holds 'patient type', not a tumor stage. unique() also returns the
# '!Sample_characteristics_ch1' label because the first column is part of the row.
tumor_stage_row = clinical_data.iloc[1]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1',
       'patient type: Normal kidney tissue'], dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [133]:
# Row positions in clinical_data for each attribute; None presumably marks an
# attribute the cohort does not record.
# NOTE(review): this cell's execution count (133) is lower than that of the
# clinical_data preview above (155), so these values were presumably chosen for a
# different cohort — re-check them before re-running.
trait_row = 0
age_row = None
gender_row = None

def convert_trait(trait):
    """Encode a raw trait value as a binary label.

    Returns 1 only when ``trait`` equals the exact string 'tumor kidney'
    (case-sensitive, no prefix stripping); every other value maps to 0.
    """
    return 1 if trait == 'tumor kidney' else 0

def convert_age(age_string):
    """Parse an integer age out of a 'label: value' characteristics string.

    Args:
        age_string: Raw cell value such as 'age: 57'. Non-string values
            (e.g. NaN or None from a missing cell) are accepted.

    Returns:
        The integer after the first ': ' separator, or None when the input is
        not a string, has no ': ' separator (this covers the bare 'n.a.'
        marker), or the value part is not an integer literal.
    """
    # Missing cells can arrive as float NaN / None; calling .split on them
    # would raise AttributeError, so treat them as "unknown".
    if not isinstance(age_string, str):
        return None
    try:
        return int(age_string.split(': ')[1])
    except (ValueError, IndexError):
        # IndexError: no ': ' separator; ValueError: non-integer value part.
        return None

def convert_gender(gender_string):
    """Encode a 'sex: ...' / 'gender: ...' characteristics string as a binary label.

    Args:
        gender_string: Raw cell value; matching is case-insensitive.
            Non-string values (e.g. NaN or None from a missing cell) are accepted.

    Returns:
        1 for female ('sex: female', 'sex: f', 'gender: female', 'gender: f'),
        0 for male ('sex: male', 'sex: m', 'gender: male', 'gender: m'),
        None for anything else, including non-string input.
    """
    # Missing cells can arrive as float NaN / None; .lower() on them would
    # raise AttributeError, so treat them as "unknown".
    if not isinstance(gender_string, str):
        return None
    normalized = gender_string.lower()  # lower-case once instead of per comparison
    if normalized in ('sex: female', 'sex: f', 'gender: female', 'gender: f'):
        return 1
    if normalized in ('sex: male', 'sex: m', 'gender: male', 'gender: m'):
        return 0
    return None

Check the processed clinical data:

In [134]:
# Build the clinical table from the chosen row indices and converter functions;
# the preview below shows one row per selected attribute and one column per sample.
selected_clinical_data = geo_select_clinical_features(clinical_data, TRAIT, trait_row, convert_trait, age_row=age_row,
                                                      convert_age=convert_age, gender_row=gender_row,
                                                      convert_gender=convert_gender)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM144461,GSM144462,GSM144463,GSM144464,GSM144465,GSM144466,GSM144467,GSM144468,GSM144469,GSM144470,GSM144471,GSM144472,GSM144473,GSM144474,GSM144475,GSM144476,GSM144477,GSM144478,GSM144479,GSM144480
Kidney Chromophobe,0,0,0,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1


### Genetic data preprocessing and final filtering

Check the genetic data:

In [135]:
# Load the expression matrix from the series matrix file; in the preview below the
# rows are indexed by 'ID' (probe-style identifiers) and the columns are GSM samples.
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM144461,GSM144462,GSM144463,GSM144464,GSM144465,GSM144466,GSM144467,GSM144468,GSM144469,GSM144470,GSM144471,GSM144472,GSM144473,GSM144474,GSM144475,GSM144476,GSM144477,GSM144478,GSM144479,GSM144480
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1007_s_at,2448.330886,2088.19113,2378.890584,1501.130673,771.981443,4088.512902,2674.766372,2286.899182,2255.434749,4876.216458,3433.645343,2689.300832,2689.834147,3059.908622,2561.649765,1811.87492,3602.765213,2531.526362,3454.985698,3151.208379
1053_at,269.267608,255.284169,130.82107,594.034183,319.061124,341.086623,365.735131,219.48353,329.812076,216.788189,199.533818,324.270167,206.305211,190.849472,597.407031,545.976134,305.906321,451.409917,314.37746,1375.650135
117_at,150.751626,89.611468,158.252438,213.004294,116.68002,93.676357,128.268059,150.931762,179.156876,205.794808,181.775719,119.492901,146.26018,91.6822,158.098088,101.154749,55.827173,69.107962,121.030158,81.037344
121_at,3467.996211,3847.292272,5602.932328,842.191072,1196.802806,4141.360747,5117.171706,2718.372649,2631.707567,3926.912036,3590.686255,3265.272459,3282.760092,3882.516064,4932.537734,3609.311177,4432.245655,3881.644349,4720.401807,4589.636116
1255_g_at,37.933956,11.354336,8.648698,246.040227,2.941446,32.068664,10.235054,13.217591,6.237206,11.20548,10.448371,62.219285,64.184627,49.127612,40.457592,38.888257,37.35368,61.525593,33.006454,51.429233


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [136]:
# Set to False when the expression matrix is already indexed by gene symbols.
requires_gene_mapping = True

if requires_gene_mapping:
    # Read the platform annotation from the SOFT file and print a short preview
    # so the identifier and gene-symbol columns can be picked by eye.
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

# Only touch gene_annotation when it was loaded above; with the flag off the name
# may be undefined (or left over from a previous cohort).
gene_annotation.columns if requires_gene_mapping else None

{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds', "X51757 /FEATURE=cds /DEFINITION=HSP70B Human heat-shock protein HSP70B' gene", 'X69699 /FEATURE= /DEFINITION=HSPAX8A H.sapiens

Index(['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Target Description',
       'Representative Public ID', 'Gene Title', 'Gene Symbol',
       'ENTREZ_GENE_ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function'],
      dtype='object')

In [137]:
if requires_gene_mapping:
    # Annotation columns holding the row identifier and the gene symbol,
    # chosen from the column list printed above.
    identifier_key = 'ID'
    gene_symbol_key = 'Gene Symbol'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    # NOTE(review): this rebinds genetic_data, so re-running the cell would apply
    # the mapping to already-mapped data.
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [138]:
# Normalize the gene symbols in the index, then display the result
# (rows: gene symbols, columns: GSM samples).
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM144461,GSM144462,GSM144463,GSM144464,GSM144465,GSM144466,GSM144467,GSM144468,GSM144469,GSM144470,GSM144471,GSM144472,GSM144473,GSM144474,GSM144475,GSM144476,GSM144477,GSM144478,GSM144479,GSM144480
A1CF,393.944297,419.312700,839.088737,336.467647,304.352824,335.174207,459.603000,1346.091671,1345.810652,445.784383,660.345296,361.051493,413.444966,1107.773843,347.994816,274.886142,306.428206,267.995520,290.416351,512.066390
A2M,4719.007492,3439.718868,10632.142690,1682.869608,2528.103999,2393.567193,1469.914264,9117.143657,6980.921708,3251.944599,3579.008811,3119.766026,3835.602773,9127.886413,103.539432,89.829983,72.924917,91.767363,102.311943,131.196707
A4GALT,49.958838,79.301499,90.511041,55.524387,16.177150,224.417580,64.379765,194.931170,168.587980,177.297132,70.003244,86.315557,25.267943,154.023404,93.232293,151.859925,109.757082,124.494964,107.948949,116.635868
A4GNT,131.615662,49.544167,119.047538,88.923424,69.388208,80.916960,166.375412,116.424861,22.586034,165.208876,206.573198,131.353374,85.458122,112.478623,158.033502,67.860074,116.941110,60.102529,123.193870,161.910441
AAAS,131.478372,131.223627,38.636492,189.905822,181.276047,158.221706,113.384153,136.317584,297.286839,45.135663,47.571577,126.026207,219.367983,166.142089,181.547875,100.878947,42.981564,25.980267,23.553774,63.487871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDB,10.558654,33.947600,46.070455,43.445391,61.595873,39.224029,37.271771,91.102073,42.849965,82.402914,91.435893,52.568486,16.065046,106.432828,15.418238,17.135592,48.544495,32.215183,60.254770,95.287158
ZXDC,423.033883,340.566859,373.343082,217.649532,523.625683,289.331912,287.891936,395.398262,479.461459,86.904721,136.707254,313.826599,318.848531,127.976212,55.265391,82.592402,176.259601,171.470474,166.607842,69.168220
ZYX,921.478534,506.799871,990.485936,848.028158,784.246503,1656.972634,1332.797022,733.009275,973.694145,260.131155,201.479456,969.376201,939.347579,1149.685743,2501.301051,1619.250402,2691.822218,1965.301338,2535.658897,2752.516539
ZZEF1,176.096677,111.808237,87.818233,98.616717,156.785762,193.936914,75.116985,137.827812,136.597780,61.583107,96.379313,100.145365,143.097354,124.260339,122.797954,106.893822,129.684998,126.550386,128.342315,149.553053


Use selected clinical data and genetic data to generate the merged data:

In [139]:
# Combine clinical labels and expression values; in the output below each row is
# one GSM sample, with the trait column first and gene columns after it.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
# Mark the cohort as usable now that the merge has produced a frame.
is_available = True

merged_data

Unnamed: 0,Kidney Chromophobe,A1CF,A2M,A4GALT,A4GNT,AAAS,AACS,AADAC,AAGAB,AAK1,...,ZSWIM1,ZSWIM8,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYX,ZZEF1,ZZZ3
GSM144461,0.0,393.944297,4719.007492,49.958838,131.615662,131.478372,462.966645,7.547504,104.462909,121.518358,...,157.94693,307.996299,575.516348,262.633939,1459.738442,10.558654,423.033883,921.478534,176.096677,1280.653366
GSM144462,0.0,419.3127,3439.718868,79.301499,49.544167,131.223627,447.606918,11.506281,145.003755,130.319579,...,128.761465,355.480853,592.043995,170.986463,957.9861,33.9476,340.566859,506.799871,111.808237,990.276121
GSM144463,0.0,839.088737,10632.14269,90.511041,119.047538,38.636492,380.26753,16.028805,185.39833,236.420183,...,206.321747,452.471203,337.618206,14.578266,325.414536,46.070455,373.343082,990.485936,87.818233,607.497664
GSM144464,1.0,336.467647,1682.869608,55.524387,88.923424,189.905822,546.137019,12.986112,260.016757,126.58685,...,144.294869,321.884265,631.321929,538.712478,1768.975867,43.445391,217.649532,848.028158,98.616717,611.688165
GSM144465,1.0,304.352824,2528.103999,16.17715,69.388208,181.276047,410.124951,8.106628,205.610911,152.169285,...,127.444978,487.996451,570.303431,389.849283,1080.700123,61.595873,523.625683,784.246503,156.785762,710.38415
GSM144466,1.0,335.174207,2393.567193,224.41758,80.91696,158.221706,446.913434,12.5573,251.25977,180.410165,...,118.930734,399.247372,416.674309,101.277905,325.019757,39.224029,289.331912,1656.972634,193.936914,672.479055
GSM144467,1.0,459.603,1469.914264,64.379765,166.375412,113.384153,495.138195,10.760307,306.258201,188.590295,...,191.973947,183.822971,439.882323,96.552485,380.991633,37.271771,287.891936,1332.797022,75.116985,416.914697
GSM144468,1.0,1346.091671,9117.143657,194.93117,116.424861,136.317584,295.533127,16.908484,140.985456,170.954997,...,192.403265,445.919467,456.478396,6.491022,348.142671,91.102073,395.398262,733.009275,137.827812,405.237446
GSM144469,1.0,1345.810652,6980.921708,168.58798,22.586034,297.286839,317.869036,9.872151,151.278739,135.6812,...,30.741197,488.605292,401.334453,50.910698,350.587775,42.849965,479.461459,973.694145,136.59778,526.876845
GSM144470,1.0,445.784383,3251.944599,177.297132,165.208876,45.135663,371.403622,21.059056,254.574584,166.202472,...,168.860554,173.28927,824.131863,80.999068,239.980663,82.402914,86.904721,260.131155,61.583107,452.282034


Check whether the merged data is biased:

In [140]:
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
# The returned frame replaces merged_data, so the save step below works on the
# frame that came out of the bias check.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 20 samples.
For the feature 'Kidney Chromophobe', the least common label is '0.0' with 6 occurrences. This represents 30.00% of the dataset.
The distribution of the feature 'Kidney Chromophobe' in this dataset is fine.



False

Save the data as a csv file:

In [141]:
# Record the cohort in cohort_info.json and, when it is usable and unbiased,
# write the merged data to <OUTPUT_DIR>/<cohort>.csv.
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
    # Only write the CSV for an available cohort: otherwise is_trait_biased and
    # merged_data may be undefined or left over from a previously processed cohort.
    # NOTE(review): index=False omits the sample-ID index from the CSV — confirm
    # that downstream readers do not need it.
    if not is_trait_biased:
        merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)
else:
    save_cohort_info(cohort, JSON_PATH, is_available)