# 1. Basic setup

In [33]:
import os
import sys

sys.path.append('..')
from utils import *

# Run configuration for this notebook.
USER = "Jiayi"
# NOTE(review): machine-specific absolute paths; adjust them when running elsewhere.
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = "Coronary artery disease"

# Outputs live under <OUTPUT_ROOT>/<USER>/<TRAIT with whitespace replaced by '-'>.
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
# exist_ok=True already makes this a no-op for an existing directory, so a
# separate os.path.exists() check is unnecessary (and racy).
os.makedirs(OUTPUT_DIR, exist_ok=True)


## The GEO dataset

In [34]:
# Locate the folder that holds the GEO cohorts downloaded for this trait.
dataset = 'GEO'
trait_subdir = "Coronary-artery-disease"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
# os.listdir returns entries in arbitrary order; sort them so the displayed
# accession list is the same on every run and platform.
sorted(os.listdir(trait_path))

['GSE115733', 'GSE156357']

Repeat the steps below for each accession number.

In [35]:
# Status: finished.
# Select the cohort and resolve its SOFT and series-matrix file paths.
accession_num = "GSE115733"
cohort = accession_num
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Coronary-artery-disease\\GSE115733\\GSE115733_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Coronary-artery-disease\\GSE115733\\GSE115733_series_matrix.txt.gz')

In [49]:
# Status: no obvious trait found for this cohort.
# NOTE(review): this rebinds cohort, cohort_dir, soft_file and matrix_file,
# replacing the values assigned for GSE115733; any cell run after this one
# operates on GSE156357.
accession_num = "GSE156357"
cohort = accession_num
cohort_dir = os.path.join(trait_path, cohort)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Coronary-artery-disease\\GSE156357\\GSE156357_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Coronary-artery-disease\\GSE156357\\GSE156357_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [50]:
from utils import *

# Line prefixes handed to the loader: series-level description lines for the
# background text, per-sample lines for the clinical table.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]

background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)
print(background_info)

!Series_title	"Supplementation with Probiotic Lactobacillus plantarum 299v  in Men With Stable Coronary Artery Disease Suppresses Systemic Inflammation"
!Series_summary	"Recent clinical trials demonstrate the efficacy of treatment strategies to reduce cardiovascular events in patients with coronary artery disease (CAD) that focus in reducing inflammatory signaling. Emerging data implicate the gut microbiota as a critical regulator of systemic inflammation. We recently demonstrated that supplementation with Lactobacillus plantarum 299v (Lp299v) improved vascular endothelial function in men with stable CAD. In this study we investigated whether the favorable effects of Lp299v on vascular health are due in part to coordinated suppression of systemic inflammation. We applied pre- and post-Lp299v supplementation plasma from these patients to peripheral blood mononuclear cells of a healthy donor to determine the transcriptional response to this intervention."
!Series_overall_design	"UPN119 c

In [51]:
# Preview the raw clinical table: one row per sample-characteristics field,
# one column per GSM sample.
clinical_data.head()

Unnamed: 0,!Sample_geo_accession,GSM4729476,GSM4729477,GSM4729478,GSM4729479,GSM4729480,GSM4729481,GSM4729482,GSM4729483,GSM4729484,...,GSM4729504,GSM4729505,GSM4729506,GSM4729507,GSM4729508,GSM4729509,GSM4729510,GSM4729511,GSM4729512,GSM4729513
0,!Sample_characteristics_ch1,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,...,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells,responder cells: UPN119 cells
1,!Sample_characteristics_ch1,treatment: stimulated with PRE Sub001 plasma,treatment: stimulated with POST Sub001 plasma,treatment: stimulated with PRE Sub003 plasma,treatment: stimulated with POST Sub003 plasma,treatment: stimulated with PRE Sub004 plasma,treatment: stimulated with POST Sub004 plasma,treatment: stimulated with PRE Sub005 plasma,treatment: stimulated with POST Sub005 plasma,treatment: stimulated with PRE Sub006 plasma,...,treatment: stimulated with PRE Sub0018 plasma,treatment: stimulated with POST Sub0018 plasma,treatment: stimulated with PRE Sub0019 plasma,treatment: stimulated with POST Sub0019 plasma,treatment: stimulated with PRE Sub0020 plasma,treatment: stimulated with POST Sub0020 plasma,treatment: stimulated with PRE Sub0021 plasma,treatment: stimulated with POST Sub0021 plasma,treatment: stimulated with PRE Sub0022 plasma,treatment: stimulated with POST Sub0022 plasma


Analyze the trait row:

In [53]:
# List the distinct values in clinical row 1 to judge whether it encodes the trait.
# NOTE(review): the name 'tumor_stage_row' looks like a leftover from another
# notebook; in this cohort the row holds 'treatment: ...' values.
tumor_stage_row = clinical_data.iloc[1]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1',
       'treatment: stimulated with PRE Sub001 plasma',
       'treatment: stimulated with POST Sub001 plasma',
       'treatment: stimulated with PRE Sub003 plasma',
       'treatment: stimulated with POST Sub003 plasma',
       'treatment: stimulated with PRE Sub004 plasma',
       'treatment: stimulated with POST Sub004 plasma',
       'treatment: stimulated with PRE Sub005 plasma',
       'treatment: stimulated with POST Sub005 plasma',
       'treatment: stimulated with PRE Sub006 plasma',
       'treatment: stimulated with POST Sub006 plasma',
       'treatment: stimulated with PRE Sub007 plasma',
       'treatment: stimulated with POST Sub007 plasma',
       'treatment: stimulated with PRE Sub008 plasma',
       'treatment: stimulated with POST Sub008 plasma',
       'treatment: stimulated with PRE Sub009 plasma',
       'treatment: stimulated with POST Sub009 plasma',
       'treatment: stimulated with PRE Sub0010 plasma',
       'treatment:

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [40]:
# Row indices into clinical_data for each feature; they are passed to
# geo_select_clinical_features together with the converters defined below.
# NOTE(review): the clinical_data preview shown earlier (GSM4729... samples)
# lists only rows 0 and 1, with row 0 holding 'responder cells: ...', while the
# processed output further down lists GSM3188... samples. These indices appear
# to have been chosen for a different cohort than the one previewed; confirm
# before re-running the notebook top to bottom.
trait_row = 0
age_row = None  # no age row selected; presumably makes the helper skip age — TODO confirm
gender_row = 2

def convert_trait(trait):
    """Map a raw 'diagnosis: ...' characteristic cell to a binary trait label.

    Args:
        trait: One cell taken from the trait row of the clinical table.

    Returns:
        0 for the healthy-control label, 1 for any other string, and None when
        the cell is not a string (e.g. NaN / None), so that missing entries are
        not silently counted as cases.
    """
    if not isinstance(trait, str):
        return None
    # Compare case-insensitively and ignore surrounding whitespace so that
    # formatting differences cannot turn a control into a case.
    if trait.strip().lower() == 'diagnosis: healthy control':
        return 0
    # NOTE(review): every other string is treated as a case; confirm that the
    # cohort's trait row contains no third category.
    return 1


def convert_age(age_string):
    """Parse a 'label: value' age cell into a rounded age.

    Args:
        age_string: One cell taken from the age row of the clinical table.

    Returns:
        The integer after the first ':' divided by 12 and rounded to zero
        decimals (a float), or None when the cell is not a string, equals
        'n.a.' (case-insensitive), has no ':' or has a non-integer value.
    """
    # Missing cells arrive as NaN / None rather than str; calling .lower() on
    # them would raise AttributeError, which the except clause does not cover.
    if not isinstance(age_string, str):
        return None
    if age_string.strip().lower() == 'n.a.':
        return None
    try:
        # NOTE(review): dividing by 12 presumes the raw value is in months — confirm
        # against the cohort's metadata before enabling an age row.
        age = int(age_string.split(':')[1]) / 12
    except (ValueError, IndexError):
        return None
    return round(age, 0)

def convert_gender(gender_string):
    """Map a raw 'sex: ...' / 'gender: ...' cell to a binary gender code.

    Args:
        gender_string: One cell taken from the gender row of the clinical table.

    Returns:
        1 for female, 0 for male, and None for any other value, including
        non-string cells such as NaN / None.
    """
    # Missing cells arrive as NaN / None rather than str; guard before .lower().
    if not isinstance(gender_string, str):
        return None
    label = gender_string.strip().lower()
    if label in ('sex: female', 'sex: f', 'gender: female', 'gender: f'):
        return 1
    if label in ('sex: male', 'sex: m', 'gender: male', 'gender: m'):
        return 0
    return None

Check the processed clinical data:

In [41]:
# Build the processed clinical table from the chosen rows and their converters.
selected_clinical_data = geo_select_clinical_features(
    clinical_data,
    TRAIT,
    trait_row,
    convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender,
)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)
  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM3188436,GSM3188437,GSM3188438,GSM3188439,GSM3188440,GSM3188441,GSM3188442,GSM3188443,GSM3188444,GSM3188445,...,GSM3188457,GSM3188458,GSM3188459,GSM3188460,GSM3188461,GSM3188462,GSM3188463,GSM3188464,GSM3188465,GSM3188466
Coronary artery disease,1,1,1,1,1,1,1,1,1,1,...,1,1,1,0,0,0,0,0,0,0
Gender,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Genetic data preprocessing and final filtering

Check the genetic data:

In [42]:
# Load the expression matrix from the series matrix file; the preview shows
# one row per probe ID and one column per GSM sample.
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM3188436,GSM3188437,GSM3188438,GSM3188439,GSM3188440,GSM3188441,GSM3188442,GSM3188443,GSM3188444,GSM3188445,...,GSM3188457,GSM3188458,GSM3188459,GSM3188460,GSM3188461,GSM3188462,GSM3188463,GSM3188464,GSM3188465,GSM3188466
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(+)E1A_r60_1,17.906179,17.948768,17.945564,17.849567,17.882431,17.965614,17.915101,17.934513,17.913933,17.885777,...,17.947725,17.978621,17.94167,17.963373,17.916048,17.930924,17.94991,17.908101,17.930042,17.866173
(+)E1A_r60_3,5.752783,5.917641,5.673246,5.533523,5.675315,5.864788,5.604841,5.820733,5.24953,5.535571,...,5.651631,6.220919,5.492687,5.72354,5.265002,5.322426,5.52431,5.406193,5.911039,5.693445
(+)E1A_r60_a104,1.642499,1.954251,1.700297,1.542966,3.286404,1.660298,1.686856,1.792349,1.846673,1.662439,...,1.917395,1.710143,1.503942,2.188707,1.829612,1.82961,1.56756,1.590049,2.126818,1.459233
(+)E1A_r60_a107,5.203293,5.536191,5.299424,5.093644,5.154304,5.109153,5.230224,5.334619,5.335169,4.895817,...,5.498666,5.584147,4.880614,5.37466,5.351413,5.21399,5.00768,4.955956,5.246613,4.94564
(+)E1A_r60_a135,8.193956,8.613137,8.351687,8.214264,8.077422,8.281637,8.224431,8.32359,8.162486,7.934074,...,8.415754,8.66108,7.968473,8.403389,8.204766,8.115829,8.085488,7.996021,8.260667,7.941801


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [43]:
# Set manually after inspecting the row IDs of genetic_data: True when they are
# probe identifiers rather than gene symbols.
requires_gene_mapping = True

if requires_gene_mapping:
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

# Show the annotation columns so the identifier and gene-symbol keys can be
# chosen. Guarded because gene_annotation is only created when mapping is
# required; an unguarded reference would raise NameError otherwise.
gene_annotation.columns if requires_gene_mapping else None

{'ID': ['GE_BrightCorner', 'DarkCorner', 'hsa_circ_0037705', 'hsa_circ_0004246', 'hsa_circ_0049281'], 'CONTROL_TYPE': ['pos', 'pos', 'FALSE', 'FALSE', 'FALSE'], 'TYPE': [nan, nan, 'circRNA', 'circRNA', 'circRNA'], 'CIRCRNA_ID': [nan, nan, 'hsa_circ_0037705', 'hsa_circ_0004246', 'hsa_circ_0049281'], 'CHROM': [nan, nan, 'chr16', 'chr8', 'chr19'], 'HSA_HG19_CIRCRNA_START': [nan, nan, 4404542.0, 145745084.0, 10687873.0], 'HSA_HG19_CIRCRNA_END': [nan, nan, 4463405.0, 145746908.0, 10694746.0], 'HSA_HG19_CIRCRNA_STRAND': [nan, nan, '-', '+', '-'], 'GENOMIC_LENGTH': [nan, nan, 58863.0, 1824.0, 6873.0], 'SPLICED_SEQ_LENGTH': [nan, nan, 3334.0, 1553.0, 1005.0], 'BEST_TRANSCRIPT': [nan, nan, 'NM_024535', 'NM_014665', 'NM_005498'], 'GB_ACC': [nan, nan, 'NM_024535', 'NM_014665', 'NM_005498'], 'GENE_SYMBOL': [nan, nan, 'CORO7', 'LRRC14', 'AP1M2'], 'SEQUENCE': [nan, nan, 'TTTCAGCAATTAAACTTTTTTAAGCTGGCCATCCTGGATCAGTGACATTCGAGCAGGAAC', 'GTCTCTTTAGGTGGCCGTCTGCCCGGCCCAGCACCATGCACACGCTTGTGTTCTTGAGCA', 'AG

Index(['ID', 'CONTROL_TYPE', 'TYPE', 'CIRCRNA_ID', 'CHROM',
       'HSA_HG19_CIRCRNA_START', 'HSA_HG19_CIRCRNA_END',
       'HSA_HG19_CIRCRNA_STRAND', 'GENOMIC_LENGTH', 'SPLICED_SEQ_LENGTH',
       'BEST_TRANSCRIPT', 'GB_ACC', 'GENE_SYMBOL', 'SEQUENCE', 'SPOT_ID'],
      dtype='object')

In [44]:
if requires_gene_mapping:
    # Annotation columns holding the probe identifier and the gene symbol,
    # picked from gene_annotation.columns.
    identifier_key = 'ID'
    gene_symbol_key = 'GENE_SYMBOL'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    # NOTE(review): genetic_data is overwritten here, so re-running this cell
    # feeds already-mapped data back into apply_gene_mapping.
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [45]:
# NOTE(review): presumably standardizes the gene symbols used as the row index —
# confirm against utils. The result replaces genetic_data.
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM3188436,GSM3188437,GSM3188438,GSM3188439,GSM3188440,GSM3188441,GSM3188442,GSM3188443,GSM3188444,GSM3188445,...,GSM3188457,GSM3188458,GSM3188459,GSM3188460,GSM3188461,GSM3188462,GSM3188463,GSM3188464,GSM3188465,GSM3188466
A1CF,4.303746,3.864738,3.625849,2.606136,3.753605,3.869394,3.978041,3.964216,4.224023,3.846065,...,2.865641,4.111839,4.029210,4.051465,3.301221,4.220185,3.775258,3.712203,4.108376,3.859673
A2M,4.658333,5.006374,5.018469,5.531297,4.990348,5.579674,5.287926,4.991422,5.519005,6.423629,...,6.105759,4.726825,5.617011,5.576377,5.501579,5.775806,5.748089,5.997114,4.846222,5.033945
A2ML1,4.127707,4.158720,4.170842,3.799715,4.097827,4.110009,4.116779,4.215373,3.882698,4.163691,...,3.949613,4.325596,4.099885,3.900159,3.850512,4.083462,4.373176,4.226218,4.302424,4.164832
AAAS,7.417824,7.130532,7.552365,7.700452,7.175898,7.198859,7.059702,7.533977,7.031556,7.272274,...,7.527175,6.948296,7.509040,6.785121,7.159783,7.226576,7.047342,7.763064,6.974612,7.586978
AACS,6.324911,6.181629,6.117605,6.009400,6.141250,6.126123,6.100074,6.208076,6.092556,6.378582,...,6.138481,6.125191,6.114697,6.296190,6.267744,6.364349,6.236709,6.102275,6.416129,6.196540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,7.955894,7.976143,7.913347,7.974691,7.950576,8.021766,8.111779,7.987204,8.028546,7.933273,...,8.019139,8.208267,8.052435,8.041334,8.070085,8.041505,8.211853,7.947054,8.183173,8.090283
ZYG11A,3.021423,3.084119,3.071192,3.529228,3.190696,3.124840,3.085333,3.296036,2.894363,2.766647,...,3.060047,2.942156,3.098041,3.263122,2.918022,3.013529,3.219150,3.171273,3.346364,3.457542
ZYG11B,6.195087,6.139374,6.160805,6.137104,5.938086,5.975827,6.044921,6.076884,6.343047,5.914579,...,6.187632,6.020802,5.865893,5.863600,6.145421,5.962614,5.813261,6.128784,6.064534,5.960804
ZZEF1,7.666162,7.575070,7.734392,7.540814,7.502443,7.580190,7.583205,7.574004,7.621562,7.645101,...,7.713456,7.935561,7.679000,7.936546,7.642127,7.820722,7.839113,7.663779,7.741242,7.713401


Use selected clinical data and genetic data to generate the merged data:

In [46]:
# Combine the processed clinical features with the gene-level expression table.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
# Hard-coded flag read by the save cell; set manually for this cohort.
is_available = True

merged_data

Unnamed: 0,Coronary artery disease,Gender,A1CF,A2M,A2ML1,AAAS,AACS,AADAT,AAGAB,AAK1,...,ZSWIM6,ZSWIM7,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZZEF1,ZZZ3
GSM3188436,1.0,0.0,4.303746,4.658333,4.127707,7.417824,6.324911,3.665135,5.750955,8.740156,...,8.267012,8.622072,6.511444,5.276233,2.765249,7.955894,3.021423,6.195087,7.666162,6.685938
GSM3188437,1.0,0.0,3.864738,5.006374,4.15872,7.130532,6.181629,3.338409,5.849932,8.35817,...,8.543126,8.324926,7.095449,5.176791,3.325044,7.976143,3.084119,6.139374,7.57507,6.360326
GSM3188438,1.0,0.0,3.625849,5.018469,4.170842,7.552365,6.117605,3.538927,6.097752,8.067759,...,8.579405,8.531927,6.416341,5.41787,3.438114,7.913347,3.071192,6.160805,7.734392,6.078101
GSM3188439,1.0,0.0,2.606136,5.531297,3.799715,7.700452,6.0094,3.692063,5.732469,8.310598,...,8.082617,8.230054,6.799698,5.506264,2.951082,7.974691,3.529228,6.137104,7.540814,6.454944
GSM3188440,1.0,0.0,3.753605,4.990348,4.097827,7.175898,6.14125,3.423539,5.830323,7.929667,...,8.484641,8.155123,6.836687,5.443652,3.425032,7.950576,3.190696,5.938086,7.502443,6.17524
GSM3188441,1.0,0.0,3.869394,5.579674,4.110009,7.198859,6.126123,3.479282,6.109326,8.421108,...,8.367078,8.174617,6.765051,5.656255,3.187053,8.021766,3.12484,5.975827,7.58019,6.206856
GSM3188442,1.0,0.0,3.978041,5.287926,4.116779,7.059702,6.100074,3.381674,5.655645,8.434831,...,8.549734,8.018434,6.915468,5.748913,3.353418,8.111779,3.085333,6.044921,7.583205,6.291254
GSM3188443,1.0,0.0,3.964216,4.991422,4.215373,7.533977,6.208076,3.679147,5.507761,8.412982,...,8.332112,8.471732,6.902637,5.675422,3.178719,7.987204,3.296036,6.076884,7.574004,6.53365
GSM3188444,1.0,0.0,4.224023,5.519005,3.882698,7.031556,6.092556,3.540713,5.990248,8.256269,...,8.839863,8.628709,6.812532,5.279751,2.934744,8.028546,2.894363,6.343047,7.621562,6.38721
GSM3188445,1.0,0.0,3.846065,6.423629,4.163691,7.272274,6.378582,3.549975,6.079201,8.475148,...,8.378584,8.070188,6.636148,5.50724,2.942788,7.933273,2.766647,5.914579,7.645101,6.051843


Check whether the merged data is biased:

In [47]:
# convert_trait encodes the trait as 0/1, hence 'binary'.
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
# Rebinds merged_data to the frame returned by the helper (presumably with
# biased covariates removed — TODO confirm against utils).
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 31 samples.
For the feature 'Coronary artery disease', the least common label is '0.0' with 7 occurrences. This represents 22.58% of the dataset.
The distribution of the feature 'Coronary artery disease' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 31 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Gender' in this dataset is severely biased.



False

Save the data as a CSV file:

In [48]:
# Record the cohort's status and, for a usable cohort, write the merged table.
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
    # The CSV is written only inside this branch: when the cohort is not
    # available, merged_data / is_trait_biased are either undefined or left
    # over from a previously processed cohort and must not be saved under
    # this cohort's name.
    if not is_trait_biased:
        # NOTE(review): the merged_data preview shows the GSM sample accessions
        # in the leading (index) position; index=False leaves them out of the
        # CSV — confirm this is intended.
        merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)
else:
    save_cohort_info(cohort, JSON_PATH, is_available)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Coronary-artery-disease\cohort_info.json
