# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

# Analyst name and trait under study; both become part of the output path.
USER = "Jiayi"
# NOTE(review): machine-specific absolute paths — adjust before running on another machine.
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = 'Lower Grade Glioma'

# Results live under <OUTPUT_ROOT>/<USER>/<trait with whitespace runs replaced by '-'>.
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
# exist_ok=True already makes this a no-op when the directory exists,
# so a separate os.path.exists() check is unnecessary.
os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


# 2. Data preprocessing and selection

## 2.1. The TCGA Xena dataset

In TCGA Xena, there is either zero or one cohort related to the trait. We search the names of subdirectories to see if any matches the trait. If a match is found, we directly obtain the file paths.

In [2]:
# Show the first ten entries found under the TCGA data directory.
dataset = 'TCGA'
dataset_dir = os.path.join(DATA_ROOT, dataset)
tcga_entries = os.listdir(dataset_dir)
tcga_entries[:10]

['TCGA_Adrenocortical_Cancer_(ACC)',
 'TCGA_Breast_Cancer_(BRCA)',
 'TCGA_Cervical_Cancer_(CESC)',
 'TCGA_Kidney_Chromophobe_(KICH)',
 'TCGA_Kidney_Papillary_Cell_Carcinoma_(KIRP)',
 'TCGA_Lower_Grade_Glioma_(LGG)',
 'TCGA_Melanoma_(SKCM)',
 'TCGA_Mesothelioma_(MESO)',
 'TCGA_Testicular_Cancer_(TGCT)',
 'TCGA_Uterine_Carcinosarcoma_(UCS)']

In [3]:
# Sub-directory picked by hand from the listing above because its name matches the trait.
trait_subdir = "TCGA_Lower_Grade_Glioma_(LGG)"
# Cohort label; later used as the CSV file stem and passed to save_cohort_info.
cohort = 'Xena'
# Passed to judge_and_remove_biased_features as trait_type.
trait_type = 'binary'
is_available = True

cohort_dir = os.path.join(DATA_ROOT, dataset, trait_subdir)
# The helper returns the clinical-matrix path and the expression-matrix path, in that order.
clinical_data_file, genetic_data_file = xena_get_relevant_filepaths(cohort_dir)
clinical_data_file, genetic_data_file

('/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Lower_Grade_Glioma_(LGG)\\TCGA.LGG.sampleMap_LGG_clinicalMatrix',
 '/Users/legion/Desktop/Courses/IS389/data\\TCGA\\TCGA_Lower_Grade_Glioma_(LGG)\\TCGA.LGG.sampleMap_HiSeqV2_PANCAN.gz')

In [4]:
import pandas as pd

# Both Xena files are tab-separated with the row identifier in the first column;
# only the expression matrix is gzip-compressed.
xena_read_options = dict(sep='\t', index_col=0)
clinical_data = pd.read_csv(clinical_data_file, **xena_read_options)
genetic_data = pd.read_csv(genetic_data_file, compression='gzip', **xena_read_options)
# Covariate columns are not chosen yet; they are set in a later cell.
age_col = None
gender_col = None

In [5]:
# check_rows_and_columns returns a pair; only the second element is kept and is
# used below as the list of clinical column names.
_, clinical_data_cols = check_rows_and_columns(clinical_data)
# Preview the first ten column names.
clinical_data_cols[:10]

['_INTEGRATION',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'additional_pharmaceutical_therapy',
 'additional_radiation_therapy',
 'additional_surgery_locoregional_procedure',
 'additional_surgery_metastatic_procedure',
 'age_at_initial_pathologic_diagnosis']

Read all the column names in the clinical dataset, to find the columns that record information about age or gender.
Reference prompt:

In [6]:
# Reference prompt for picking candidate age/gender columns. The cell's value is
# the rendered prompt text with the full list of clinical column names interpolated.
f'''
Below is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:
candidate_age_cols = [col_name1, col_name2, ...]
candidate_gender_cols = [col_name1, col_name2, ...]
If no columns match a criterion, please provide an empty list.

Column names:
{clinical_data_cols}
'''

"\nBelow is a list of column names from a biomedical dataset. Please examine it and identify the columns that are likely to contain information about patients' age. Additionally, please do the same for columns that may hold data on patients' gender. Please provide your answer by strictly following this format, without redundant words:\ncandidate_age_cols = [col_name1, col_name2, ...]\ncandidate_gender_cols = [col_name1, col_name2, ...]\nIf no columns match a criterion, please provide an empty list.\n\nColumn names:\n['_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'additional_surgery_locoregional_procedure', 'additional_surgery_metastatic_procedure', 'age_at_initial_pathologic_diagnosis', 'animal_insect_allergy_history', 'animal_insect_allergy_types', 'asthma_history', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'days_to_additional_surgery_locoregional_procedure', '

In [7]:
# Hard-coded candidate columns, written in the answer format the prompt asks for.
candidate_age_cols = ['age_at_initial_pathologic_diagnosis']
candidate_gender_cols = ['gender']


From the candidate columns, choose a single column for age and a single column for gender.
If no column meets the requirement, leave `age_col` or `gender_col` as None.

In [8]:
# One column chosen per covariate from the candidate lists.
age_col = 'age_at_initial_pathologic_diagnosis'
gender_col = 'gender'
# Build the clinical table for the trait; in the merged output shown later it
# contributes the 'Lower Grade Glioma', 'Age' and 'Gender' columns.
selected_clinical_data = xena_select_clinical_features(clinical_data, TRAIT, age_col=age_col, gender_col=gender_col)

In [9]:
# Rebind genetic_data to the helper's result.
# NOTE(review): re-running this cell feeds the already-normalized frame back in —
# presumably harmless, but confirm the helper is idempotent.
genetic_data = normalize_gene_symbols_in_index(genetic_data)

In [10]:
# Display the expression matrix after symbol normalization
# (gene symbols as the row index, TCGA sample barcodes as columns).
genetic_data

Unnamed: 0,TCGA-E1-5319-01,TCGA-HT-7693-01,TCGA-CS-6665-01,TCGA-S9-A7J2-01,TCGA-FG-A6J3-01,TCGA-FG-6688-01,TCGA-S9-A6TX-01,TCGA-VM-A8C8-01,TCGA-HT-A74L-01,TCGA-S9-A7QY-01,...,TCGA-HT-7483-01,TCGA-HT-7687-01,TCGA-TM-A84G-01,TCGA-E1-A7Z6-01,TCGA-DU-7014-01,TCGA-HT-A74O-01,TCGA-DU-A76O-01,TCGA-DU-A5TW-01,TCGA-HT-7857-01,TCGA-FG-5962-01
ARHGEF10L,0.256808,1.944408,0.323208,1.093608,0.808408,0.999308,0.975108,1.795608,1.522108,0.720008,...,1.967008,0.300808,0.192708,1.406008,0.700708,2.647808,2.005508,2.097808,0.358508,0.526108
HIF3A,4.895874,4.962574,-1.908126,-0.744826,-2.776226,4.063474,5.198274,3.007474,5.320174,2.826174,...,-0.469026,1.635574,3.304574,6.879274,3.960974,1.864074,0.761774,3.381074,-1.957226,3.988474
RNF17,-0.531035,-0.531035,-0.531035,-0.065335,-0.531035,-0.531035,-0.531035,-0.531035,-0.531035,1.444865,...,-0.531035,-0.531035,-0.531035,0.425565,-0.531035,0.032465,-0.531035,-0.531035,-0.531035,-0.531035
RNF10,-0.560472,0.119828,-0.240672,-0.510072,0.038228,-0.298372,-0.364772,-0.509872,-0.630572,-0.385472,...,-0.225872,-0.799172,-1.303872,-0.068072,-0.036672,0.160928,-0.155172,-0.377372,-0.097372,-0.441172
RNF11,0.034422,0.718222,0.318222,-0.791778,-0.808078,0.324922,-0.685278,0.387022,-0.427078,-0.014078,...,0.702722,0.181722,-0.350178,0.579722,0.335422,-0.194778,-0.074678,0.082822,0.087522,0.594722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GNGT1,-1.281390,-1.281390,-1.281390,-1.281390,-1.281390,-1.281390,-1.281390,-0.677990,-1.281390,-1.281390,...,-1.281390,-1.281390,-1.281390,-0.725190,-1.281390,-1.281390,-1.281390,-1.281390,-1.281390,-1.281390
TULP3,0.086323,0.633423,-0.743977,0.032023,1.087723,-0.049377,-0.079777,-0.170077,-0.187077,-0.145077,...,-0.166277,-0.662477,-0.289077,0.196323,-0.201477,-0.136577,-0.864777,-0.396977,-0.381577,-0.628677
BCL6B,0.028173,0.167973,0.546073,-1.305127,-0.318227,-0.186927,0.341773,-1.553027,-0.576727,-1.726127,...,-0.206127,0.039273,0.162273,-0.966827,-0.151927,-0.431527,-0.931827,-0.674327,0.260773,-1.236427
GSTK1,-1.294695,-1.360295,-1.143995,-1.485295,-0.573495,-0.069795,-1.353595,-1.173395,-1.181495,-0.702795,...,-1.676795,-1.353095,-1.520095,-1.312995,-1.088295,-0.003995,-1.109095,-0.680295,-0.047695,-1.578195


In [11]:
# Transpose the expression matrix so samples become rows, attach it to the
# clinical table on the sample index, and drop every row with a missing value.
expression_by_sample = genetic_data.T
merged_data = selected_clinical_data.join(expression_by_sample).dropna()
merged_data.head()

Unnamed: 0_level_0,Lower Grade Glioma,Age,Gender,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,...,SLC7A10,PLA2G2C,TULP2,NPY5R,GNGT2,GNGT1,TULP3,BCL6B,GSTK1,SELP
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-CS-4938-01,1,31.0,0.0,1.675408,2.256574,-0.025535,-0.129672,0.512522,1.00569,0.980606,...,2.145814,-0.086682,-0.748878,-0.411617,0.508467,-1.28139,0.587923,-1.931327,-1.200995,-5.525533
TCGA-CS-4941-01,1,67.0,1.0,1.677108,1.531874,-0.103335,-0.044972,1.044022,0.68629,0.715906,...,3.563814,-0.086682,2.294922,2.043683,0.703167,-1.28139,-0.211777,-0.691827,-0.067395,-4.274433
TCGA-CS-4942-01,1,44.0,0.0,1.451408,0.339574,0.227665,-0.359972,0.479422,0.83969,0.928806,...,2.061014,-0.086682,-0.748878,2.006583,1.287867,-1.28139,-0.521377,-1.996927,-1.020895,-5.096833
TCGA-CS-4943-01,1,37.0,1.0,1.159508,1.747574,0.193265,0.049128,0.345022,0.20049,0.736706,...,-0.864186,-0.086682,-0.748878,-0.862817,0.231767,-1.28139,0.760023,-1.260727,-2.100995,-4.541333
TCGA-CS-4944-01,1,50.0,1.0,0.847608,3.430774,-0.531035,-0.503272,0.820222,1.21089,0.934706,...,3.457214,-0.086682,-0.159678,1.123883,0.491667,-1.28139,-0.713077,-1.252627,-1.109495,-3.516133


In [12]:
print(f"The merged dataset contains {len(merged_data)} samples.")
# The helper returns the trait-bias flag together with a data frame
# (presumably with biased covariates removed — confirm in utils).
# Bind the result to `merged_data`, as the GEO section does, so that the later
# save_cohort_info call receives the filtered frame rather than the pre-check one.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
# The original cell bound the result to the misspelled name `merge_data`, which
# the CSV-saving cell still reads; keep that name pointing at the same frame.
merge_data = merged_data
is_trait_biased

The merged dataset contains 529 samples.
For the feature 'Lower Grade Glioma', the least common label is '1' with 529 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Lower Grade Glioma' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 32.0
  50% (Median): 41.0
  75%: 53.0
Min: 14.0
Max: 87.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 238 occurrences. This represents 44.99% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



True

In [13]:
# Export the bias-checked frame only when the trait distribution is usable.
# (The former leading `merged_data.head()` was removed: it was not the cell's
# last expression, so it displayed nothing.)
# NOTE(review): index=False drops the sample IDs from the CSV — confirm intended.
if not is_trait_biased:
    merge_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

In [14]:
# Record this cohort's availability and bias verdict, together with the merged
# frame, in the JSON file at JSON_PATH.
save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Lower-Grade-Glioma\cohort_info.json


## 2.2. The GEO dataset

In [15]:
# Switch to the GEO data set; `dataset` and `trait_subdir` are rebound here and
# no longer refer to the TCGA values set earlier.
dataset = 'GEO'
trait_subdir = "Lower-Grade-Glioma"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
# Each entry is a GEO series accession directory for this trait.
os.listdir(trait_path)

['GSE107850',
 'GSE111627',
 'GSE14',
 'GSE143843',
 'GSE145510',
 'GSE15309',
 'GSE19578',
 'GSE24072',
 'GSE28271',
 'GSE35158',
 'GSE4058',
 'GSE74567']

Repeat the steps below for each accession number.

In [16]:
# Author's screening note for GSE107850: no obvious traits.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE107850"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE107850\\GSE107850_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE107850\\GSE107850_series_matrix.txt.gz')

In [19]:
# Author's screening note for GSE111627: biased.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE111627"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE111627\\GSE111627_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE111627\\GSE111627_series_matrix.txt.gz')

In [23]:
# Author's screening note for GSE14: no obvious traits.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE14"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE14\\GSE14_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE14\\GSE14-GPL11_series_matrix.txt.gz')

In [27]:
# Author's screening note for GSE143843: biased.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE143843"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE143843\\GSE143843_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE143843\\GSE143843-GPL21145_series_matrix.txt.gz')

In [33]:
# Author's screening note for GSE145510: no gene mapping.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE145510"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE145510\\GSE145510_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE145510\\GSE145510_series_matrix.txt.gz')

In [41]:
# Author's screening note for GSE15309: no obvious traits.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE15309"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE15309\\GSE15309_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE15309\\GSE15309_series_matrix.txt.gz')

In [44]:
# Author's screening note for GSE19578: MemoryError.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE19578"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE19578\\GSE19578_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE19578\\GSE19578-GPL3718_series_matrix.txt.gz')

In [52]:
# Author's screening note for GSE24072: finished.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE24072"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE24072\\GSE24072_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE24072\\GSE24072_series_matrix.txt.gz')

In [65]:
# Author's screening note for GSE28271: finished.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE28271"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE28271\\GSE28271_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE28271\\GSE28271_series_matrix.txt.gz')

In [79]:
# Author's screening note for GSE35158: biased.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE35158"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE35158\\GSE35158_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE35158\\GSE35158_series_matrix.txt.gz')

In [93]:
# Author's screening note for GSE4058: no traits.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE4058"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE4058\\GSE4058_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE4058\\GSE4058-GPL182_series_matrix.txt.gz')

In [96]:
# Author's screening note for GSE74567: no obvious traits.
# Only the most recently run cohort cell determines soft_file / matrix_file downstream.
accession_num = "GSE74567"
cohort = accession_num
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE74567\\GSE74567_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Lower-Grade-Glioma\\GSE74567\\GSE74567_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [97]:
from utils import *
# Line prefixes in the series-matrix file that carry the study description ...
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
# ... and the per-sample accession and characteristics rows.
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]

# Split the matrix file of the currently selected cohort into background text
# and a clinical table, then print the background for manual review.
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)
print(background_info)

!Series_title	"Astrocytoma cell lines transduced to either express GFAP isoforms or to knockdown GFAP isoforms"
!Series_summary	"Modulation of the GFAP cytoskeleton in astrocytoma cells alters processes involved in extracellular matrix remodelling and cell-cell signalling – a transcriptome analysis"
!Series_summary	"Astrocytomas grade IV are malignant brain tumours with no effective treatment and a five year survival rate of only 5%. Expression of Glial Fibrillary Acidic Protein (GFAP) is lower in high astrocytoma grade, but the expression of the splice isoform GFAPδ is similar in low and high-grade astrocytomas. Thus the ratio of GFAPδ/α is increased in high-grade astrocytomas. We studied transcriptome changes in astrocytoma cell lines resulting from an induced alteration of GFAP isoform expression. GFAPα or GFAPδ were increased or decreased by recombinant expression or shRNA mediated knockdown of GFAPpan or GFAPα. We find that the most prominent effects are induced by the modulations

In [98]:
# Display the clinical table: one row per sample-characteristics field,
# one column per GSM sample accession.
clinical_data

Unnamed: 0,!Sample_geo_accession,GSM1923085,GSM1923086,GSM1923087,GSM1923088,GSM1923089,GSM1923090,GSM1923091,GSM1923092,GSM1923093,...,GSM1923115,GSM1923116,GSM1923117,GSM1923118,GSM1923119,GSM1923120,GSM1923121,GSM1923122,GSM1923123,GSM1923124
0,!Sample_characteristics_ch1,tumour line: U373,tumour line: U373,tumour line: U373,tumour line: U373,tumour line: U373,tumour line: U373,tumour line: U373,tumour line: U373,tumour line: U373,...,tumour line: U251,tumour line: U251,tumour line: U251,tumour line: U251,tumour line: U251,tumour line: U251,tumour line: U251,tumour line: U251,tumour line: U251,tumour line: U251
1,!Sample_characteristics_ch1,construct: shRNA GFAPpan,construct: shRNA NTC,construct: shRNA GFAPalpha,construct: shRNA GFAPalpha,construct: shRNA GFAPpan,construct: shRNA NTC,construct: shRNA NTC,construct: shRNA GFAPalpha,construct: shRNA NTC,...,construct: recombinant GFAPdelta - IRES - mCherry,construct: recombinant GFAPalpha - IRES - GFP,construct: recombinant mCherry,construct: recombinant mCherry,construct: recombinant GFAPalpha - IRES - GFP,construct: recombinant GFAPdelta - IRES - mCherry,construct: recombinant GFAPalpha - IRES - GFP,construct: recombinant GFAPdelta - IRES - mCherry,construct: recombinant mCherry,construct: recombinant mCherry
2,!Sample_characteristics_ch1,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,...,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide,substrate: PDMS coated with YIGSR peptide


Analyze the trait row:

In [101]:
# Pull one characteristics row by position and list its distinct values to judge
# whether it can encode the trait.
# NOTE(review): the variable is called tumor_stage_row, but in the output shown
# row 2 holds 'substrate: ...' values — confirm the index for the cohort in use.
tumor_stage_row = clinical_data.iloc[2]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1',
       'substrate: PDMS coated with YIGSR peptide'], dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [83]:
# Positional row indices into clinical_data for the trait, age and gender fields;
# None means the cohort has no usable row for that covariate.
# NOTE(review): these must match the cohort currently loaded — re-check after
# switching accession numbers.
trait_row = 0
age_row = None
gender_row = None

def convert_trait(trait):
    """Encode a sample-characteristics value as the binary trait label.

    Returns 1 only for the exact string 'tumor type: diffuse astrocytic glioma';
    every other value maps to 0.
    """
    positive_label = 'tumor type: diffuse astrocytic glioma'
    return 1 if trait == positive_label else 0

def convert_age(age_string):
    """Parse an age from a '<label>: <integer>' characteristics string.

    Args:
        age_string: Raw cell value, e.g. 'age: 45'.

    Returns:
        The age as an int, or None when the value is 'n.a.', is not a string
        (for example a NaN or None cell), has no ': ' separator, or the part
        after the separator is not an integer.
    """
    # Non-string cells would otherwise raise AttributeError on .split().
    if not isinstance(age_string, str) or age_string == 'n.a.':
        return None
    try:
        return int(age_string.split(': ')[1])
    except (ValueError, IndexError):
        return None

def convert_gender(gender_string):
    """Encode a 'sex: ...' / 'gender: ...' characteristics string as binary.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        gender_string: Raw cell value, e.g. 'Sex: Female' or 'gender: M'.

    Returns:
        1 for female, 0 for male, None for any other value, including
        non-string cells such as NaN or None.
    """
    # Non-string cells would otherwise raise AttributeError on .lower().
    if not isinstance(gender_string, str):
        return None
    label = gender_string.strip().lower()
    if label in ('sex: female', 'sex: f', 'gender: female', 'gender: f'):
        return 1
    if label in ('sex: male', 'sex: m', 'gender: male', 'gender: m'):
        return 0
    return None

Check the processed clinical data:

In [84]:
# Convert the chosen characteristics rows into numeric trait / age / gender
# features using the row indices and converter functions defined above.
selected_clinical_data = geo_select_clinical_features(
    clinical_data,
    TRAIT,
    trait_row,
    convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender,
)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM864095,GSM864096,GSM864097,GSM864098,GSM864099,GSM864100,GSM864101,GSM864102,GSM864103,GSM864104,...,GSM864166,GSM864167,GSM864168,GSM864169,GSM864170,GSM864171,GSM864172,GSM864173,GSM864174,GSM864175
Lower Grade Glioma,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Genetic data preprocessing and final filtering

Check the genetic data:

In [85]:
# Read the expression table from the series-matrix file of the selected cohort;
# in the output shown the rows are probe IDs (ILMN_...) and the columns are GSM samples.
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM864095,GSM864096,GSM864097,GSM864098,GSM864099,GSM864100,GSM864101,GSM864102,GSM864103,GSM864104,...,GSM864166,GSM864167,GSM864168,GSM864169,GSM864170,GSM864171,GSM864172,GSM864173,GSM864174,GSM864175
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ILMN_1651209,7.08126,7.02804,7.04731,6.89578,7.54783,7.05696,7.02078,7.08857,6.93991,7.23049,...,8.39485,8.31617,7.57317,6.61048,7.14696,6.64119,6.9018,7.34592,6.85502,6.88195
ILMN_1651228,12.2664,11.8163,12.2526,12.7677,12.5551,12.591,12.4412,12.6828,12.068,12.2492,...,12.7194,12.8489,12.8557,12.7538,11.3313,11.6043,11.2992,11.6019,7.72358,11.1593
ILMN_1651229,11.5187,11.1847,10.522,10.8389,10.725,11.0925,11.2993,9.72791,10.8801,10.725,...,10.158,9.88498,9.72949,9.72235,8.09054,8.93811,9.89586,7.60621,8.16145,8.77942
ILMN_1651235,11.0662,11.1127,12.095,12.0406,11.6247,11.7356,11.4617,12.1964,11.8319,11.6046,...,11.2291,11.577,11.6054,11.6087,7.03031,7.28178,7.3876,7.25692,10.2219,7.19244
ILMN_1651236,6.71231,6.80484,7.7136,6.76902,6.83577,6.98686,6.77434,6.89079,6.8258,7.22916,...,6.96177,6.65941,7.13952,7.2605,7.03282,7.09629,7.05044,7.23764,7.0974,7.16042


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [86]:
# Set to True when the expression rows are probe IDs that must be mapped to
# gene symbols, and to False when they are already gene symbols.
requires_gene_mapping = True

if requires_gene_mapping:
    # Load the platform annotation from the SOFT file and preview it so the
    # identifier and gene-symbol columns can be chosen in the next cell.
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)
    # Kept inside the guard: gene_annotation is only assigned on this branch,
    # so listing its columns unconditionally raises NameError (or shows a
    # previous cohort's annotation) when requires_gene_mapping is False.
    print(gene_annotation.columns)

{'ID': ['ILMN_3166687', 'ILMN_3165566', 'ILMN_3164811', 'ILMN_3165363', 'ILMN_3166511'], 'Transcript': ['ILMN_333737', 'ILMN_333646', 'ILMN_333584', 'ILMN_333628', 'ILMN_333719'], 'Species': ['ILMN Controls', 'ILMN Controls', 'ILMN Controls', 'ILMN Controls', 'ILMN Controls'], 'Source': ['ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls', 'ILMN_Controls'], 'Search_Key': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'ILMN_Gene': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'Source_Reference_ID': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'RefSeq_ID': [nan, nan, nan, nan, nan], 'Entrez_Gene_ID': [nan, nan, nan, nan, nan], 'GI': [nan, nan, nan, nan, nan], 'Accession': ['DQ516750', 'DQ883654', 'DQ668364', 'DQ516785', 'DQ854995'], 'Symbol': ['ERCC-00162', 'ERCC-00071', 'ERCC-00009', 'ERCC-00053', 'ERCC-00144'], 'Protein_Product': [nan, nan, nan, nan, nan], 'Array_Address_Id': [5270161.0, 426059

Index(['ID', 'Transcript', 'Species', 'Source', 'Search_Key', 'ILMN_Gene',
       'Source_Reference_ID', 'RefSeq_ID', 'Entrez_Gene_ID', 'GI', 'Accession',
       'Symbol', 'Protein_Product', 'Array_Address_Id', 'Probe_Type',
       'Probe_Start', 'SEQUENCE', 'Chromosome', 'Probe_Chr_Orientation',
       'Probe_Coordinates', 'Cytoband', 'Definition', 'Ontology_Component',
       'Ontology_Process', 'Ontology_Function', 'Synonyms',
       'Obsolete_Probe_Id', 'GB_ACC'],
      dtype='object')

In [87]:
if requires_gene_mapping:
    # Annotation columns holding the probe identifier and the gene symbol.
    identifier_key, gene_symbol_key = 'ID', 'Symbol'
    gene_mapping = get_gene_mapping(
        gene_annotation, identifier_key, gene_symbol_key
    )
    # Re-express the probe-level table using the probe-to-symbol mapping.
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [88]:
# Normalize the gene symbols in the index and display the result
# (gene symbols as rows, GSM samples as columns in the output shown).
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM864095,GSM864096,GSM864097,GSM864098,GSM864099,GSM864100,GSM864101,GSM864102,GSM864103,GSM864104,...,GSM864166,GSM864167,GSM864168,GSM864169,GSM864170,GSM864171,GSM864172,GSM864173,GSM864174,GSM864175
A1BG,7.26299,7.33984,7.05604,7.35527,7.29422,6.98433,7.63733,7.25926,6.92810,7.51760,...,7.09568,8.34684,7.36993,7.63071,7.35173,7.37455,7.13238,6.62549,6.74157,6.91874
A1CF,7.00445,7.25328,6.81456,7.24100,6.83460,7.03233,6.92788,6.61342,7.18611,6.97340,...,6.97962,6.94430,7.15381,6.60018,6.98928,7.08978,7.19660,6.88211,7.23039,7.21305
A2M,12.66320,12.12180,12.21760,12.68280,12.40220,12.28430,12.31000,12.47900,12.69650,12.61970,...,12.01350,11.18670,11.92090,12.46340,10.83410,7.03836,7.48536,6.95512,7.31170,9.08426
A2ML1,8.89724,8.02318,6.53002,7.77120,8.58752,7.73412,7.72119,8.66294,8.26156,8.55367,...,8.50179,9.39235,6.77108,8.56073,7.57273,8.88461,8.48545,7.35455,7.73330,7.18958
A3GALT2,6.58877,6.59165,6.65354,7.26082,6.81609,6.72278,7.18236,6.84640,6.66252,6.64315,...,7.06416,8.10593,7.52797,7.00872,7.14662,6.60719,6.89761,6.94392,6.96191,6.77036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,6.99325,8.22443,10.40700,10.15350,6.76882,7.33668,7.48604,8.74448,9.16049,6.61523,...,6.64472,6.52377,6.57874,6.61732,7.23060,7.18165,7.21305,6.94335,7.46759,8.14860
ZYG11B,11.20560,11.12960,11.71370,11.31940,11.25370,11.28910,11.29440,11.65360,11.27060,11.27740,...,11.67890,11.97270,11.58070,11.22320,10.06810,8.17997,7.31621,10.77050,7.10785,8.81633
ZYX,9.65265,8.76356,8.46257,9.92270,10.60330,8.77365,9.56414,9.67786,9.09486,10.20100,...,9.32799,10.18560,8.12342,9.55623,8.95185,10.33370,9.04514,8.45426,9.24264,8.22080
ZZEF1,9.66498,9.18799,8.17041,7.38441,9.76300,9.90751,9.71319,9.61425,10.00860,9.81844,...,7.48423,6.94775,8.85493,8.71635,9.09447,10.72160,9.26359,8.52914,8.79593,7.38667


Use selected clinical data and genetic data to generate the merged data:

In [89]:
# Combine the clinical features with the gene-level expression data; in the
# output shown the result has one row per GSM sample.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
# Mark the cohort as usable; read by the final saving cell.
is_available = True

merged_data

Unnamed: 0,Lower Grade Glioma,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAA1,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
GSM864095,0.0,7.26299,7.00445,12.66320,8.89724,6.58877,10.49320,6.90447,7.52565,9.24826,...,8.76605,7.13261,9.71837,10.35020,10.13400,6.99325,11.20560,9.65265,9.66498,9.71133
GSM864096,1.0,7.33984,7.25328,12.12180,8.02318,6.59165,7.29236,6.60890,7.28486,9.19508,...,9.37420,8.34295,11.35850,9.94582,10.47930,8.22443,11.12960,8.76356,9.18799,10.01200
GSM864097,1.0,7.05604,6.81456,12.21760,6.53002,6.65354,7.16371,7.17169,8.86592,8.22297,...,9.13227,9.17095,10.78920,7.21657,9.40957,10.40700,11.71370,8.46257,8.17041,10.26000
GSM864098,1.0,7.35527,7.24100,12.68280,7.77120,7.26082,7.14636,6.74501,8.58840,7.42137,...,9.72717,7.92411,10.61190,10.04980,10.71390,10.15350,11.31940,9.92270,7.38441,9.54446
GSM864099,1.0,7.29422,6.83460,12.40220,8.58752,6.81609,8.86649,7.10791,7.00043,9.20572,...,9.57788,8.13050,10.87190,8.95779,11.38540,6.76882,11.25370,10.60330,9.76300,10.24570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM864171,1.0,7.37455,7.08978,7.03836,8.88461,6.60719,9.88613,7.51475,7.46655,10.68630,...,7.32621,7.85855,9.01334,7.32259,8.77541,7.18165,8.17997,10.33370,10.72160,7.89890
GSM864172,1.0,7.13238,7.19660,7.48536,8.48545,6.89761,8.16306,6.76468,7.83845,8.44184,...,8.35084,7.17168,8.71861,7.99772,9.49643,7.21305,7.31621,9.04514,9.26359,7.77520
GSM864173,1.0,6.62549,6.88211,6.95512,7.35455,6.94392,7.69466,6.99440,6.87859,7.76972,...,8.15137,8.16665,8.58264,8.38555,7.65254,6.94335,10.77050,8.45426,8.52914,7.45045
GSM864174,1.0,6.74157,7.23039,7.31170,7.73330,6.96191,8.30797,6.77792,8.04829,8.39400,...,8.27122,7.43365,8.50119,7.88391,8.95584,7.46759,7.10785,9.24264,8.79593,7.68737


Check whether the merged data is biased:

In [90]:
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
# The helper returns the trait-bias flag and a data frame, which replaces merged_data.
# NOTE(review): re-running this cell passes the already-processed frame back in.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 81 samples.
For the feature 'Lower Grade Glioma', the least common label is '0.0' with 1 occurrences. This represents 1.23% of the dataset.
The distribution of the feature 'Lower Grade Glioma' in this dataset is severely biased.



True

Save the data as a csv file:

In [91]:
if is_available:
    # Record the cohort together with its bias verdict and merged data.
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
    # Export the merged table only for an available cohort whose trait is not
    # biased. This used to run outside the availability check, where
    # merged_data / is_trait_biased could be undefined or left over from a
    # previously processed cohort.
    # NOTE(review): index=False drops the sample IDs from the CSV — confirm intended.
    if not is_trait_biased:
        merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)
else:
    # Unavailable cohorts are recorded with the availability flag only.
    save_cohort_info(cohort, JSON_PATH, is_available)