# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

USER = "Jiayi"
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'   
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = 'Mesothelioma'

OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


# 2. Data preprocessing and selection

## 2.1. The TCGA Xena dataset

In TCGA Xena, there is either zero or one cohort related to the trait. We search the names of subdirectories to see if any matches the trait. If a match is found, we directly obtain the file paths.

In [2]:
dataset = 'TCGA'
dataset_dir = os.path.join(DATA_ROOT, dataset)
os.listdir(dataset_dir)[:10]

['TCGA_Adrenocortical_Cancer_(ACC)',
 'TCGA_Breast_Cancer_(BRCA)',
 'TCGA_Cervical_Cancer_(CESC)',
 'TCGA_Kidney_Chromophobe_(KICH)',
 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)',
 'TCGA_Lower_Grade_Glioma_(LGG)',
 'TCGA_Melanoma_(SKCM)',
 'TCGA_Mesothelioma_(MESO)',
 'TCGA_Testicular_Cancer_(TGCT)',
 'TCGA_Uterine_Carcinosarcoma_(UCS)']

In [3]:
trait_subdir = "TCGA_Mesothelioma_(MESO)"
cohort = 'Xena'
trait_type = 'binary'
is_available = True

cohort_dir = os.path.join(DATA_ROOT, dataset, trait_subdir)
clinical_data_file, genetic_data_file = xena_get_relevant_filepaths(cohort_dir)
clinical_data_file, genetic_data_file

('/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Mesothelioma_(MESO)\\TCGA.MESO.sampleMap_MESO_clinicalMatrix',
 '/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Mesothelioma_(MESO)\\TCGA.MESO.sampleMap_HiSeqV2_PANCAN.gz')

In [4]:
import pandas as pd

clinical_data = pd.read_csv(clinical_data_file, sep='\t', index_col=0)
genetic_data = pd.read_csv(genetic_data_file, compression='gzip', sep='\t', index_col=0)
age_col = gender_col = None

In [5]:
_, clinical_data_cols = check_rows_and_columns(clinical_data)
clinical_data_cols[:10]

['_INTEGRATION',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'additional_pharmaceutical_therapy',
 'additional_radiation_therapy',
 'age_at_initial_pathologic_diagnosis',
 'asbestos_exposure_age',
 'asbestos_exposure_age_last']

Read all the column names in the clinical dataset, to find the columns that record information about age or gender.
Reference prompt:

In [6]:
f'''
Below is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:
candidate_age_cols = [col_name1, col_name2, ...]
candidate_gender_cols = [col_name1, col_name2, ...]
If no columns match a criterion, please provide an empty list.

Column names:
{clinical_data_cols}
'''

"\nBelow is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:\ncandidate_age_cols = [col_name1, col_name2, ...]\ncandidate_gender_cols = [col_name1, col_name2, ...]\nIf no columns match a criterion, please provide an empty list.\n\nColumn names:\n['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'age_at_initial_pathologic_diagnosis', 'asbestos_exposure_age', 'asbestos_exposure_age_last', 'asbestos_exposure_source', 'asbestos_exposure_type', 'asbestos_exposure_years', 'assessment_timepoint_category', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'creatinine_norm_range_lower', 'creatinine_norm_range_u

In [7]:
candidate_age_cols = ['age_at_initial_pathologic_diagnosis']
candidate_gender_cols = ['gender']


Choose a single column from the candidate columns that record age and gender information respectively.
If no column meets the requirement, keep 'age_col' or 'gender_col' to None

In [8]:
age_col = 'age_at_initial_pathologic_diagnosis'
gender_col = 'gender'
selected_clinical_data = xena_select_clinical_features(clinical_data, TRAIT, age_col=age_col, gender_col=gender_col)

In [9]:
genetic_data = normalize_gene_symbols_in_index(genetic_data)

In [10]:
genetic_data

Unnamed: 0,TCGA-SH-A7BC-01,TCGA-LK-A4NY-01,TCGA-3H-AB3L-01,TCGA-3U-A98F-01,TCGA-NQ-A638-01,TCGA-MQ-A6BS-01,TCGA-ZN-A9VV-01,TCGA-3H-AB3X-01,TCGA-3U-A98G-01,TCGA-TS-A7PB-01,...,TCGA-YS-A95C-01,TCGA-MQ-A4LM-01,TCGA-3H-AB3T-01,TCGA-UT-A88D-01,TCGA-MQ-A4KX-01,TCGA-3U-A98I-01,TCGA-TS-A7OU-01,TCGA-LK-A4O6-01,TCGA-TS-A7P7-01,TCGA-ZN-A9VW-01
ARHGEF10L,-0.566692,-1.148192,0.200408,-2.360992,-0.098292,-0.386092,-1.352492,0.503108,-0.662192,-1.328692,...,-0.833992,-1.056392,-0.505092,-0.207492,0.072908,-0.578292,0.354308,-0.479092,-1.472692,-0.856492
HIF3A,-3.993126,-0.560126,-4.117426,2.860974,-1.254426,-1.119526,1.532374,-1.579526,-0.520126,-1.517626,...,-3.770726,-2.890926,-5.554726,-0.867826,-2.518126,-1.904426,-3.805626,0.799274,3.710074,-0.225026
RNF17,-0.124735,-0.531035,-0.531035,-0.055435,-0.047735,0.034365,-0.531035,-0.531035,-0.531035,-0.531035,...,0.620765,-0.531035,-0.531035,1.192765,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035
RNF10,0.743328,0.631928,0.786128,0.011228,0.930628,0.492728,0.161328,0.271728,-0.002872,0.686128,...,1.299528,-0.037572,0.036928,-0.110272,1.019828,0.159728,1.244628,0.156228,0.730128,0.141428
RNF11,0.085322,-0.046578,-0.033878,-0.855678,-0.103878,0.332522,0.252122,-0.106878,0.498622,0.189322,...,-0.638678,-0.081278,-0.100678,0.005822,-0.070778,0.338022,0.530122,-0.568078,0.219122,0.302922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GNGT1,-1.281390,1.454410,-0.390690,2.342010,2.171210,-0.715990,0.100810,0.254810,-1.281390,-1.281390,...,-1.281390,-1.281390,-1.281390,1.475110,-1.281390,-1.281390,-0.611990,-1.281390,-1.281390,-1.281390
TULP3,0.134123,0.154123,0.113323,1.073923,-0.391177,0.763823,0.077423,-0.024977,0.468523,-0.419677,...,-0.209177,0.134323,-1.045577,0.414723,-0.498677,0.493123,-0.536977,0.235023,0.834123,0.455123
BCL6B,-0.633227,-1.092527,0.166873,-0.200927,-1.125027,-1.323527,-2.129627,-1.716927,3.365873,-1.964427,...,-1.842127,1.431473,1.057773,0.249673,-1.179527,1.599373,-0.951527,0.721473,1.862473,0.223173
GSTK1,0.519305,0.420705,0.070905,-0.438495,0.260005,0.039505,-0.650595,0.232705,-0.073595,0.885805,...,0.577505,0.401905,0.222905,-0.274595,0.420605,-0.354695,0.629005,0.526505,-0.112595,-0.243395


In [11]:
merged_data = selected_clinical_data.join(genetic_data.T).dropna()
merged_data.head()

Unnamed: 0_level_0,Mesothelioma,Age,Gender,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,...,SLC7A10,PLA2G2C,TULP2,NPY5R,GNGT2,GNGT1,TULP3,BCL6B,GSTK1,SELP
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-3H-AB3K-01,1,64,1,0.160908,-3.157426,-0.531035,0.483128,-0.152778,0.65969,0.001606,...,-0.068986,-0.086682,-0.061978,-0.900217,0.803567,-1.28139,-0.551677,-2.035927,0.175405,0.164067
TCGA-3H-AB3L-01,1,60,1,0.200408,-4.117426,-0.531035,0.786128,-0.033878,0.10959,-0.456594,...,-1.573486,-0.086682,-0.748878,-1.074117,0.139267,-0.39069,0.113323,0.166873,0.070905,0.942767
TCGA-3H-AB3M-01,1,53,0,-1.009592,-2.663326,-0.068835,0.828928,-0.737678,0.77649,-0.002594,...,1.636814,-0.086682,0.062822,-1.124917,0.242467,-0.46969,0.307823,-1.703127,-0.213695,1.932667
TCGA-3H-AB3O-01,1,58,1,-0.147492,-2.266526,-0.531035,0.274528,-0.311878,0.31179,-1.012494,...,-0.528386,-0.086682,1.031722,0.384783,3.061667,-1.28139,-0.580077,0.947073,1.408105,2.816367
TCGA-3H-AB3S-01,1,69,1,0.282008,-2.042126,-0.531035,0.825228,-0.210878,0.07039,-0.070994,...,-2.090786,-0.086682,-0.748878,-1.587117,-0.441533,-1.28139,0.417023,-0.154627,-0.004295,-0.111033


In [12]:
print(f"The merged dataset contains {len(merged_data)} samples.")
is_trait_biased, merge_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 87 samples.
For the feature 'Mesothelioma', the least common label is '1' with 87 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Mesothelioma' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 57.0
  50% (Median): 64.0
  75%: 69.0
Min: 28
Max: 81
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0' with 16 occurrences. This represents 18.39% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



True

In [13]:
merged_data.head()
if not is_trait_biased:
    merge_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

In [14]:
save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Mesothelioma\cohort_info.json


## 2.2. The GEO dataset

In [15]:
dataset = 'GEO'
trait_subdir = "Mesothelioma"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
os.listdir(trait_path)

['GSE107754',
 'GSE112154',
 'GSE117668',
 'GSE131027',
 'GSE163720',
 'GSE163721',
 'GSE163722',
 'GSE172159',
 'GSE40345',
 'GSE42977',
 'GSE51024',
 'GSE52788',
 'GSE57464',
 'GSE64738',
 'GSE67487',
 'GSE68950',
 'GSE73161',
 'GSE73162',
 'GSE99362']

Repeat the below steps for all the accession numbers

In [16]:
# No obvious trait
cohort = accession_num = "GSE107754"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE107754\\GSE107754_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE107754\\GSE107754_series_matrix.txt.gz')

In [22]:
# Biased
cohort = accession_num = "GSE112154"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE112154\\GSE112154_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE112154\\GSE112154_series_matrix.txt.gz')

In [35]:
# No gene mapping
cohort = accession_num = "GSE117668"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE117668\\GSE117668_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE117668\\GSE117668_series_matrix.txt.gz')

In [43]:
# Finished
cohort = accession_num = "GSE131027"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE131027\\GSE131027_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE131027\\GSE131027_series_matrix.txt.gz')

In [56]:
# No obvious trait
cohort = accession_num = "GSE163720"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE163720\\GSE163720_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE163720\\GSE163720_series_matrix.txt.gz')

In [67]:
# Biased
cohort = accession_num = "GSE163721"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE163721\\GSE163721_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE163721\\GSE163721_series_matrix.txt.gz')

In [71]:
# No obvious trait
cohort = accession_num = "GSE163722"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE163722\\GSE163722_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE163722\\GSE163722-GPL11532_series_matrix.txt.gz')

In [75]:
# No obvious trait
cohort = accession_num = "GSE172159"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE172159\\GSE172159_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE172159\\GSE172159_series_matrix.txt.gz')

In [79]:
# No gene mapping
cohort = accession_num = "GSE40345"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE40345\\GSE40345_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE40345\\GSE40345_series_matrix.txt.gz')

In [91]:
# Biased
cohort = accession_num = "GSE42977"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE42977\\GSE42977_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE42977\\GSE42977_series_matrix.txt.gz')

In [105]:
# Finished
cohort = accession_num = "GSE51024"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE51024\\GSE51024_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE51024\\GSE51024_series_matrix.txt.gz')

In [129]:
# No obvious trait
cohort = accession_num = "GSE52788"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE52788\\GSE52788_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE52788\\GSE52788-GPL3718_series_matrix.txt.gz')

In [133]:
# Biased
cohort = accession_num = "GSE57464"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE57464\\GSE57464_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE57464\\GSE57464_series_matrix.txt.gz')

In [146]:
# No obvious traits
cohort = accession_num = "GSE64738"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE64738\\GSE64738_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE64738\\GSE64738_series_matrix.txt.gz')

In [150]:
# Biased
cohort = accession_num = "GSE67487"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE67487\\GSE67487_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE67487\\GSE67487_series_matrix.txt.gz')

In [154]:
# No obvious traits
cohort = accession_num = "GSE68950"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE68950\\GSE68950_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE68950\\GSE68950_series_matrix.txt.gz')

In [170]:
# No gene mapping
cohort = accession_num = "GSE73161"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE73161\\GSE73161_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE73161\\GSE73161_series_matrix.txt.gz')

In [174]:
# No gene mapping
cohort = accession_num = "GSE73162"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE73162\\GSE73162_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE73162\\GSE73162-GPL20275_series_matrix.txt.gz')

In [185]:
# No gene mapping
cohort = accession_num = "GSE99362"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE99362\\GSE99362_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Mesothelioma\\GSE99362\\GSE99362_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [186]:
from utils import *
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']    

background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
print(background_info)

!Series_title	"microRNA expression profiling in diffuse malignant peritoneal mesothelioma"
!Series_summary	"Diffuse malignant peritoneal mesothelioma (DMPM) is a rapidly lethal malignancy. The comprehension of the molecular and cellular features of DMPM is of utmost importance for the fruitful management of the disease, especially in patients who fail standard treatments and have a poor prognosis due to the lack of effective alternative therapeutic options. In this context, we previously found that telomerase activity (TA), which accounts for the limitless proliferative potential of cancer cells, is prognostic for disease relapse and cancer-related death in DMPM patients. Consequently, the identification of factors involved in telomerase activation/regulation may pave the way towards the development of novel therapeutic interventions for the disease. In the present study, miRNA expression profiling was carried out in a series of DMPM tissue specimens, previously characterized for the o

In [187]:
clinical_data

Unnamed: 0,!Sample_geo_accession,GSM2642980,GSM2642981,GSM2642982,GSM2642983,GSM2642984,GSM2642985,GSM2642986,GSM2642987,GSM2642988,...,GSM2643028,GSM2643029,GSM2643030,GSM2643031,GSM2643032,GSM2643033,GSM2643034,GSM2643035,GSM2643036,GSM2643037
0,!Sample_characteristics_ch1,tissue: normal peritoneum,tissue: normal peritoneum,tissue: normal peritoneum,tissue: normal peritoneum,tissue: normal peritoneum,cell line: cell line derived from the tumor of...,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,...,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen,tissue: DMPM frozen tumor specimen
1,!Sample_characteristics_ch1,telomerase activity: NA,telomerase activity: NA,telomerase activity: NA,telomerase activity: NA,telomerase activity: NA,telomerase activity: 1,telomerase activity: 1,telomerase activity: 1,telomerase activity: 1,...,telomerase activity: 1,telomerase activity: 0,telomerase activity: 0,telomerase activity: 1,telomerase activity: 1,telomerase activity: 0,telomerase activity: 1,telomerase activity: 1,telomerase activity: 0,telomerase activity: 1


Analyze the trait row:

In [189]:
tumor_stage_row = clinical_data.iloc[0]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1', 'tissue: normal peritoneum',
       'cell line: cell line derived from the tumor of a DMPM patient',
       'tissue: DMPM frozen tumor specimen',
       'cell line: normal mesothelial cell line (MES-F)'], dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [190]:
trait_row = 0
age_row = None
gender_row = None

def convert_trait(trait):
    if (trait == 'tissue: normal peritoneum' or 'cell line: normal mesothelial cell line (MES-F)'):
        return 0
    else:
        return 1

def convert_age(age_string):
    if age_string == 'n.a.':
        return None
    try:
        age = int(age_string.split(': ')[1])
        return age
    except (ValueError, IndexError):
        return None

def convert_gender(gender_string):
    if (gender_string.lower() == 'sex: female' or gender_string.lower() == 'sex: f' or gender_string.lower() == 'gender: female' or gender_string.lower() == 'gender: f'):
        return 1
    elif (gender_string.lower() == 'sex: male' or gender_string.lower() == 'sex: m' or gender_string.lower() == 'gender: male' or gender_string.lower() == 'gender: m') :  # changeed 
        return 0
    else:
        return None

Check the processed clinical data:

In [191]:
selected_clinical_data = geo_select_clinical_features(clinical_data, TRAIT, trait_row, convert_trait, age_row=age_row,
                                                      convert_age=convert_age, gender_row=gender_row,
                                                      convert_gender=convert_gender)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM2642980,GSM2642981,GSM2642982,GSM2642983,GSM2642984,GSM2642985,GSM2642986,GSM2642987,GSM2642988,GSM2642989,...,GSM2643028,GSM2643029,GSM2643030,GSM2643031,GSM2643032,GSM2643033,GSM2643034,GSM2643035,GSM2643036,GSM2643037
Mesothelioma,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Genetic data preprocessing and final filtering

Check the genetic data:

In [192]:
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM2642980,GSM2642981,GSM2642982,GSM2642983,GSM2642984,GSM2642985,GSM2642986,GSM2642987,GSM2642988,GSM2642989,...,GSM2643028,GSM2643029,GSM2643030,GSM2643031,GSM2643032,GSM2643033,GSM2643034,GSM2643035,GSM2643036,GSM2643037
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ILMN_3166935,13.300435,12.087595,12.63877,11.822171,8.716875,10.726839,13.549784,13.385954,12.357774,12.288539,...,11.345022,10.93205,11.485401,12.556914,13.09515,12.363069,10.784803,12.331233,12.35769,12.815934
ILMN_3166938,14.868356,13.778544,14.673963,14.599408,14.019286,14.253204,14.721321,14.656522,14.932162,14.942375,...,14.939675,14.8482,14.249257,14.41529,14.719228,14.53756,15.084427,14.504822,14.263423,14.344565
ILMN_3166940,9.591793,10.39594,10.214005,10.62165,11.244769,11.397247,10.451249,10.580562,10.463574,10.340009,...,11.274713,11.501298,12.444835,10.485318,9.86439,11.608942,10.675793,11.528785,12.224715,10.665295
ILMN_3166941,14.045577,13.192169,13.924135,13.356833,10.616447,13.102322,14.23061,14.794105,13.658476,13.805841,...,13.499657,13.514177,13.43426,13.971901,13.802834,13.711218,13.353869,14.128303,14.017096,13.907098
ILMN_3166943,11.24229,9.787049,10.759074,10.491387,10.793983,9.499666,9.606616,9.864562,10.373289,10.269825,...,9.639816,9.951462,9.377465,9.7348,10.392706,10.295869,10.114188,10.528065,9.387953,9.635768


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [193]:
requires_gene_mapping = True

if requires_gene_mapping:
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

gene_annotation.columns

{'ID': ['ILMN_3167151', 'ILMN_3167958', 'ILMN_3167403', 'ILMN_3167819', 'ILMN_3167659'], 'SYMBOL': ['ILMN_3167151', 'ILMN_3167958', 'ILMN_3167403', 'ILMN_3167819', 'ILMN_3167659'], 'ILMN_Gene': ['hsa-miR-553', 'HS_124', 'hsa-miR-320d,hsa-miR-320b,hsa-miR-320a,hsa-miR-320c', 'hsa-miR-522', 'hsa-miR-1537'], 'Search_Key': ['hsa-miR-553', 'HS_124', 'hsa-miR-320d,hsa-miR-320b,hsa-miR-320a,hsa-miR-320c', 'hsa-miR-522', 'hsa-miR-1537'], 'SEQUENCE': ['AAAACGGTGAGATTTTGTT', 'AAAAGAACATGGGTTGAG', 'AAAAGCTGGGTTGAGAGG', 'AAAATGGTTCCCTTTAGAGT', 'AAACCGTCTAGTTACAGTTGT'], 'TargetMatureSeqs': ['AAAACGGTGAGATTTTGTTTT', 'AAAAGAACATGGGTTGAG', 'AAAAGCTGGGTTGAGAGGA,AAAAGCTGGGTTGAGAGGGCAA,AAAAGCTGGGTTGAGAGGGCGA,AAAAGCTGGGTTGAGAGGGT', 'AAAATGGTTCCCTTTAGAGTGT', 'AAAACCGTCTAGTTACAGTTGT'], 'TargetMatureName': ['hsa-miR-553', 'HS_124', 'hsa-miR-320d,hsa-miR-320b,hsa-miR-320a,hsa-miR-320c', 'hsa-miR-522', 'hsa-miR-1537'], 'miRNA_ID': ['hsa-miR-553', nan, 'hsa-miR-320d,hsa-miR-320b,hsa-miR-320a,hsa-miR-320c', 'hsa

Index(['ID', 'SYMBOL', 'ILMN_Gene', 'Search_Key', 'SEQUENCE',
       'TargetMatureSeqs', 'TargetMatureName', 'miRNA_ID', 'SPOT_ID',
       'NumTargets', 'TargetMatureVersion', 'OriginalMatureSeq',
       'OriginalMatureName', 'Source', 'Array_Address_Id', 'Illumicode',
       'Oligo', 'U3_Seq', 'Ploidy', 'Species', 'Probe_MatchOrder',
       'Chromosome', 'Probe_Coordinates', 'Probe_Chr_Orientation'],
      dtype='object')

In [183]:
if requires_gene_mapping:
    identifier_key = 'ID'
    gene_symbol_key = 'gene_symbol'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [194]:
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM2642980,GSM2642981,GSM2642982,GSM2642983,GSM2642984,GSM2642985,GSM2642986,GSM2642987,GSM2642988,GSM2642989,...,GSM2643028,GSM2643029,GSM2643030,GSM2643031,GSM2643032,GSM2643033,GSM2643034,GSM2643035,GSM2643036,GSM2643037


Use selected clinical data and genetic data to generate the merged data:

In [143]:
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
is_available = True

merged_data

Unnamed: 0,Mesothelioma,A2ML1,A4GALT,ABCA4,ABCC6,ABHD14A,ABHD14A-ACY1,ABHD14B,ABHD5,ABHD6,...,ZNF607,ZNF622,ZNF624,ZNF658,ZNF660,ZNF682,ZNF727,ZNF786,ZNF804B,ZRANB3
GSM1383156,1.0,-0.23,-0.22,-0.47,0.33,-0.254483,-0.11,0.0036,-0.213743,0.01,...,0.0,-0.09,0.1,-0.14,-0.639792,-0.32,0.04,0.2,-0.42,-0.57
GSM1383157,1.0,-0.11,0.82,0.89,0.06,-0.054483,0.115833,0.0796,-0.525965,-1.05,...,0.08,0.03,0.58,-0.85,-0.181667,0.28,0.12,-0.63,-0.53,-0.54
GSM1383158,1.0,0.53,1.11,0.37,1.14,0.48931,0.5775,0.5884,0.539123,0.0,...,0.82,0.34,0.96,0.71,0.099375,0.29,0.9,0.92,0.14,0.19
GSM1383159,1.0,-0.06,0.37,0.25,0.86,0.122414,0.346944,0.2892,-0.330643,-0.67,...,-0.03,1.27,0.84,0.45,-0.116667,0.17,0.76,-0.22,-0.4,0.27
GSM1383160,1.0,-0.15,-0.71,-0.65,0.12,-0.348276,-0.270833,-0.1732,-0.814444,0.08,...,-0.09,0.16,-0.87,-0.35,-0.746667,0.29,-0.14,0.01,-0.16,-0.15
GSM1383161,1.0,-0.42,-0.27,0.01,0.78,0.04,0.158611,0.1964,-0.138889,-0.26,...,0.61,0.89,-0.27,0.41,-0.169167,0.49,0.64,0.54,0.76,0.77
GSM1383162,1.0,-0.74,-0.43,-0.58,0.21,-0.898276,-0.730833,-0.6884,-0.377251,-0.54,...,-0.56,0.1,-0.04,0.64,-0.284792,-0.43,0.04,-0.83,-0.2,-0.28
GSM1383163,0.0,-0.35,0.02,0.26,0.43,0.034138,0.075278,0.114,-0.029942,0.05,...,-0.09,0.01,0.02,-0.02,0.029792,-0.2,0.0,-0.03,-0.19,0.01
GSM2159884,1.0,-0.09,-0.31,0.2,0.48,0.04931,0.152222,0.1388,-0.089474,-0.32,...,0.07,0.45,-0.08,0.23,-0.118125,-0.03,0.46,0.12,0.39,0.24
GSM2159885,1.0,-0.23,-0.21,0.37,0.25,0.053103,0.008611,0.1328,-0.032339,0.08,...,0.48,0.88,-0.15,0.21,-0.076667,-0.22,0.32,0.2,0.33,0.23


Check if the merged data biased or not:

In [144]:
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 34 samples.
For the feature 'Mesothelioma', the least common label is '0.0' with 1 occurrences. This represents 2.94% of the dataset.
The distribution of the feature 'Mesothelioma' in this dataset is severely biased.



True

Save the data as a csv file:

In [145]:
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
else:
    save_cohort_info(cohort, JSON_PATH, is_available)
merged_data.head()
if not is_trait_biased:
    merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)