# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

USER = "Jiayi"
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'   
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = "Creutzfeldt-Jakob Disease"

OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


## The GEO dataset

In [2]:
dataset = 'GEO'
trait_subdir = "Creutzfeldt-Jakob-Disease"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
os.listdir(trait_path)

['GSE62699', 'GSE87629']

Repeat the below steps for all the accession numbers

In [3]:
# No gene mapping
cohort = accession_num = "GSE62699"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Creutzfeldt-Jakob-Disease\\GSE62699\\GSE62699_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Creutzfeldt-Jakob-Disease\\GSE62699\\GSE62699-GPL16384_series_matrix.txt.gz')

In [13]:
# No obvious trait
cohort = accession_num = "GSE87629"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Creutzfeldt-Jakob-Disease\\GSE87629\\GSE87629_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Creutzfeldt-Jakob-Disease\\GSE87629\\GSE87629-GPL10558_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [14]:
from utils import *
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']    

background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
print(background_info)

!Series_title	"Genome-wide analysis of B and T cell gene expression during a six-week gluten challenge in patients with celiac disease"
!Series_summary	"Dietary gluten proteins (prolamins) from wheat, rye, and barley are the driving forces behind celiac disease, an organ-specific autoimmune disorder that targets both the small intestine and organs outside the gut. In the small intestine, gluten induces inflammation and a typical morphological change of villous atrophy and crypt hyperplasia. Gut lesions improve and heal when gluten is excluded from the diet and the disease relapses when patients consume gluten. Oral immune tolerance towards gluten may be kept for years or decades before breaking tolerance in genetically susceptible individuals. Celiac disease provides a unique opportunity to study autoimmunity and the transition in immune cells as gluten breaks oral tolerance. Seventy-three celiac disease patients on a long-term gluten-free diet ingested a known amount of gluten daily f

In [15]:
clinical_data.head()

Unnamed: 0,!Sample_geo_accession,GSM2335776,GSM2335777,GSM2335778,GSM2335779,GSM2335780,GSM2335781,GSM2335782,GSM2335783,GSM2335784,...,GSM2335877,GSM2335878,GSM2335879,GSM2335880,GSM2335881,GSM2335882,GSM2335883,GSM2335884,GSM2335885,GSM2335921
0,!Sample_characteristics_ch1,individual: celiac patient AA,individual: celiac patient AA,individual: celiac patient AB,individual: celiac patient AB,individual: celiac patient AC,individual: celiac patient AC,individual: celiac patient AD,individual: celiac patient AD,individual: celiac patient AE,...,individual: celiac patient DR,individual: celiac patient DW,individual: celiac patient DW,individual: celiac patient DX,individual: celiac patient DX,individual: celiac patient DY,individual: celiac patient DY,individual: celiac patient DZ,individual: celiac patient DZ,individual: celiac patient T
1,!Sample_characteristics_ch1,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...,disease state: biopsy confirmed celiac disease...
2,!Sample_characteristics_ch1,treatment: control,treatment: 6 weeks gluten challenge,treatment: control,treatment: 6 weeks gluten challenge,treatment: control,treatment: 6 weeks gluten challenge,treatment: control,treatment: 6 weeks gluten challenge,treatment: control,...,treatment: 6 weeks gluten challenge,treatment: control,treatment: 6 weeks gluten challenge,treatment: control,treatment: 6 weeks gluten challenge,treatment: control,treatment: 6 weeks gluten challenge,treatment: control,treatment: 6 weeks gluten challenge,treatment: control
3,!Sample_characteristics_ch1,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,...,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood,tissue: peripheral whole blood
4,!Sample_characteristics_ch1,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,...,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells,cell type: purified pool of B and T cells


Analyze the trait row:

In [18]:
tumor_stage_row = clinical_data.iloc[3]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1', 'tissue: peripheral whole blood'],
      dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [7]:
trait_row = 0
age_row = None
gender_row = None

def convert_trait(trait):
    if (trait == 'diagnosis: Control'):
        return 0
    else:
        return 1


def convert_age(age_string):
    if age_string.lower() == 'n.a.':
        return None
    try:
        age = int(age_string.split(':')[1])/12
        age = round(age, 0)
        return age
    except (ValueError, IndexError):
        return None

def convert_gender(gender_string):
    if (gender_string.lower() == 'sex: female' or gender_string.lower() == 'sex: f' or gender_string.lower() == 'gender: female' or gender_string.lower() == 'gender: f'):
        return 1
    elif (gender_string.lower() == 'sex: male' or gender_string.lower() == 'sex: m' or gender_string.lower() == 'gender: male' or gender_string.lower() == 'gender: m') :  # changeed 
        return 0
    else:
        return None

Check the processed clinical data:

In [8]:
selected_clinical_data = geo_select_clinical_features(clinical_data, TRAIT, trait_row, convert_trait, age_row=age_row,
                                                      convert_age=convert_age, gender_row=gender_row,
                                                      convert_gender=convert_gender)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM1531652,GSM1531653,GSM1531654,GSM1531655,GSM1531656,GSM1531657,GSM1531658,GSM1531659,GSM1531660,GSM1531661,...,GSM1531678,GSM1531679,GSM1531680,GSM1531681,GSM1531682,GSM1531683,GSM1531684,GSM1531685,GSM1531686,GSM1531687
Creutzfeldt-Jakob Disease,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0


### Genetic data preprocessing and final filtering

Check the genetic data:

In [9]:
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM1531652,GSM1531653,GSM1531654,GSM1531655,GSM1531656,GSM1531657,GSM1531658,GSM1531659,GSM1531660,GSM1531661,...,GSM1531678,GSM1531679,GSM1531680,GSM1531681,GSM1531682,GSM1531683,GSM1531684,GSM1531685,GSM1531686,GSM1531687
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14q0_st,7.68542,7.69338,8.05731,8.03301,7.41483,7.87933,7.61217,7.80203,7.81174,8.22454,...,7.97129,8.30401,7.80492,7.72724,7.80544,7.76272,7.97463,7.87084,7.74456,7.95184
14qI-1_st,5.27731,5.3207,4.65591,4.48387,4.84928,4.88192,4.69104,5.04664,4.8552,4.60723,...,4.8469,4.52504,4.72786,4.56759,5.10662,5.04491,4.90743,4.88524,5.29804,5.21946
14qI-1_x_st,4.55933,4.59639,4.69234,4.80472,4.89809,4.83344,4.81458,4.96099,4.79273,4.90661,...,4.67695,4.3715,4.5266,4.77767,4.70989,4.74682,4.71377,4.5393,4.71029,4.75213
14qI-2_st,4.67843,4.29837,4.42294,4.40797,4.53978,4.50486,4.60397,4.48906,4.5116,4.49264,...,4.39827,4.49887,4.47253,4.35049,4.45511,4.50782,4.98151,4.62313,4.46442,4.38716
14qI-3_x_st,6.65881,6.33247,6.53892,6.50822,6.67872,6.08335,6.1272,6.33515,6.37796,6.23352,...,6.67494,6.30227,6.28161,6.05855,6.15381,6.25624,5.8467,6.01996,6.07394,6.11179


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [10]:
requires_gene_mapping = True

if requires_gene_mapping:
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

gene_annotation.columns

{'ID': ['1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at'], 'GB_ACC': ['U48705', 'M87338', 'X51757', 'X69699', 'L36861'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database', 'GenBank', 'Affymetrix Proprietary Database'], 'Target Description': ['U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds', 'M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds', "X51757 /FEATURE=cds /DEFINITION=HSP70B Human heat-shock protein HSP70B' gene", 'X69699 /FEATURE= /DEFINITION=HSPAX8A H.sapiens

Index(['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Target Description',
       'Representative Public ID', 'Gene Title', 'Gene Symbol',
       'ENTREZ_GENE_ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function'],
      dtype='object')

In [11]:
if requires_gene_mapping:
    identifier_key = 'ID'
    gene_symbol_key = 'Gene Symbol'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [12]:
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM1531652,GSM1531653,GSM1531654,GSM1531655,GSM1531656,GSM1531657,GSM1531658,GSM1531659,GSM1531660,GSM1531661,...,GSM1531678,GSM1531679,GSM1531680,GSM1531681,GSM1531682,GSM1531683,GSM1531684,GSM1531685,GSM1531686,GSM1531687


Use selected clinical data and genetic data to generate the merged data:

In [46]:
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
is_available = True

merged_data

Unnamed: 0,Coronary artery disease,Gender,A1CF,A2M,A2ML1,AAAS,AACS,AADAT,AAGAB,AAK1,...,ZSWIM6,ZSWIM7,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZZEF1,ZZZ3
GSM3188436,1.0,0.0,4.303746,4.658333,4.127707,7.417824,6.324911,3.665135,5.750955,8.740156,...,8.267012,8.622072,6.511444,5.276233,2.765249,7.955894,3.021423,6.195087,7.666162,6.685938
GSM3188437,1.0,0.0,3.864738,5.006374,4.15872,7.130532,6.181629,3.338409,5.849932,8.35817,...,8.543126,8.324926,7.095449,5.176791,3.325044,7.976143,3.084119,6.139374,7.57507,6.360326
GSM3188438,1.0,0.0,3.625849,5.018469,4.170842,7.552365,6.117605,3.538927,6.097752,8.067759,...,8.579405,8.531927,6.416341,5.41787,3.438114,7.913347,3.071192,6.160805,7.734392,6.078101
GSM3188439,1.0,0.0,2.606136,5.531297,3.799715,7.700452,6.0094,3.692063,5.732469,8.310598,...,8.082617,8.230054,6.799698,5.506264,2.951082,7.974691,3.529228,6.137104,7.540814,6.454944
GSM3188440,1.0,0.0,3.753605,4.990348,4.097827,7.175898,6.14125,3.423539,5.830323,7.929667,...,8.484641,8.155123,6.836687,5.443652,3.425032,7.950576,3.190696,5.938086,7.502443,6.17524
GSM3188441,1.0,0.0,3.869394,5.579674,4.110009,7.198859,6.126123,3.479282,6.109326,8.421108,...,8.367078,8.174617,6.765051,5.656255,3.187053,8.021766,3.12484,5.975827,7.58019,6.206856
GSM3188442,1.0,0.0,3.978041,5.287926,4.116779,7.059702,6.100074,3.381674,5.655645,8.434831,...,8.549734,8.018434,6.915468,5.748913,3.353418,8.111779,3.085333,6.044921,7.583205,6.291254
GSM3188443,1.0,0.0,3.964216,4.991422,4.215373,7.533977,6.208076,3.679147,5.507761,8.412982,...,8.332112,8.471732,6.902637,5.675422,3.178719,7.987204,3.296036,6.076884,7.574004,6.53365
GSM3188444,1.0,0.0,4.224023,5.519005,3.882698,7.031556,6.092556,3.540713,5.990248,8.256269,...,8.839863,8.628709,6.812532,5.279751,2.934744,8.028546,2.894363,6.343047,7.621562,6.38721
GSM3188445,1.0,0.0,3.846065,6.423629,4.163691,7.272274,6.378582,3.549975,6.079201,8.475148,...,8.378584,8.070188,6.636148,5.50724,2.942788,7.933273,2.766647,5.914579,7.645101,6.051843


Check if the merged data biased or not:

In [47]:
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 31 samples.
For the feature 'Coronary artery disease', the least common label is '0.0' with 7 occurrences. This represents 22.58% of the dataset.
The distribution of the feature 'Coronary artery disease' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 31 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.



False

Save the data as a csv file:

In [19]:
is_available = False

if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
else:
    save_cohort_info(cohort, JSON_PATH, is_available)
merged_data.head()
if not is_trait_biased:
    merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Creutzfeldt-Jakob-Disease\cohort_info.json


NameError: name 'merged_data' is not defined