# Data Prep
This notebook downloads the GEO series (GSE) datasets into a temp dir, loads them into Python, and cleans the data so that it is ready for differential gene expression analysis and for the ML model.
<br>
<p>Daniel Thompson 
<p>Dhwanil Patel
<p>Radiyana Mancheva
<p>5/18/22
    
Files being saved to temp dir

- GPL_GSE112366_data.pkl
- GSE112366_phenotype_data.pkl
- GSE112366_data.pkl

- GPL_GSE179285_data.pkl
- GSE179285_phenotype_data.pkl
- GSE179285_data.pkl

- validation_znorm.pkl



# Import Packages

In [2]:
% load_ext autoreload
% autoreload 2
import sys,os; sys.path.append(os.environ['BMESAHMETDIR']); 
import bmes

# bmes.pipinstall('Bio','biopython')
# bmes.pipinstall('GEOparse')
# bmes.pipinstall('pandasql')

import GEOparse
import pandas as pd
import numpy as np
import re

import mygene

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'bmes'

In [None]:
# Project working directory inside the bmes temp dir; every pickle written by this notebook goes here.
PATH = os.path.join(bmes.tempdir(), "BMES543_crohns_project")

# Create the directory if it is missing. exist_ok=True makes this safe to re-run and avoids the
# check-then-create race of a separate isdir() test; missing parent directories are created too.
os.makedirs(PATH, exist_ok=True)

# Uncomment to show where the files are written.
# print(PATH)

# Load Data

## GSE112366
- save gpl, gsm in df and phenotype data
- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112366

In [None]:
# Fetch and parse the GSE112366 series with GEOparse, using the bmes temp dir as the download directory.
# The resulting `gse` object is read by the next cell (samples in gse.gsms, platform in gse.gpls).
gse = GEOparse.get_GEO(geo="GSE112366", destdir=bmes.tempdir());

In [None]:
# Adapted from Dr. Sacan's in-class example.

# Map each sample accession (GSM id) to its title; the title names the group the sample belongs to.
gsm = list(gse.gsms.values())
sampletable = {
    sample.metadata["geo_accession"][0]: sample.metadata["title"][0]
    for sample in gsm
}

# Data for each GSM is available in gse.gsms[...].table.
# Combine the GSMs into a single data table: the first sample contributes its whole table
# (with VALUE renamed to its GSM id); every later sample contributes only its renamed VALUE column.
gsm_frames = []
reference_ids = None
for gsmid, sample in gse.gsms.items():
    gsmdata = sample.table.rename(columns={'VALUE': gsmid})
    if reference_ids is None:
        reference_ids = gsmdata['ID_REF']
        gsm_frames.append(gsmdata)
    else:
        # Columns are joined row by row, so every sample must list the same probes in the same order.
        assert reference_ids.equals(gsmdata['ID_REF']), f"{gsmid}: probe ids differ from the first sample"
        gsm_frames.append(gsmdata[gsmid])

# Concatenate once: calling pd.concat inside the loop re-copies the growing frame for every sample.
gsedata = pd.concat(gsm_frames, axis=1)

# Save the expression table, the phenotype table and the platform (GPL) annotation as pickles.
gsedata.to_pickle(os.path.join(PATH, "GSE112366_data.pkl"))
phenotype_data = gse.phenotype_data
phenotype_data.to_pickle(os.path.join(PATH, "GSE112366_phenotype_data.pkl"))
gpl = list(gse.gpls.values())[0].table  # table of the first platform listed for this series
gpl.to_pickle(os.path.join(PATH, "GPL_GSE112366_data.pkl"))


## GSE179285
- save gpl, gsm in df and phenotype data
- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE179285

In [None]:
# Taken from Dr. Sacan's in-class example.

# Fetch and parse the GSE179285 series, using the bmes temp dir as the download directory.
gse = GEOparse.get_GEO(geo="GSE179285", destdir=bmes.tempdir())

# Map each sample accession (GSM id) to its title.
gsm = list(gse.gsms.values())
sampletable = {
    sample.metadata["geo_accession"][0]: sample.metadata["title"][0]
    for sample in gsm
}

# Data for each GSM is available in gse.gsms[...].table.
# Combine the GSMs into a single data table: the first sample contributes its whole table
# (with VALUE renamed to its GSM id); every later sample contributes only its renamed VALUE column.
gsm_frames = []
reference_ids = None
for gsmid, sample in gse.gsms.items():
    gsmdata = sample.table.rename(columns={'VALUE': gsmid})
    if reference_ids is None:
        reference_ids = gsmdata['ID_REF']
        gsm_frames.append(gsmdata)
    else:
        # Columns are joined row by row, so every sample must list the same probes in the same order.
        assert reference_ids.equals(gsmdata['ID_REF']), f"{gsmid}: probe ids differ from the first sample"
        gsm_frames.append(gsmdata[gsmid])

# Concatenate once: calling pd.concat inside the loop re-copies the growing frame for every sample.
gsedata = pd.concat(gsm_frames, axis=1)

# Save the expression table, the phenotype table and the platform (GPL) annotation as pickles.
gsedata.to_pickle(os.path.join(PATH, "GSE179285_data.pkl"))
phenotype_data = gse.phenotype_data
phenotype_data.to_pickle(os.path.join(PATH, "GSE179285_phenotype_data.pkl"))
gpl = list(gse.gpls.values())[0].table  # table of the first platform listed for this series
gpl.to_pickle(os.path.join(PATH, "GPL_GSE179285_data.pkl"))



## GSE52746
- the saved DataFrame has its samples relabeled by group, its probes mapped to gene symbols, and its values z-normalized
- verification dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52746

In [None]:
# Taken from Dr. Sacan's in-class example.

# Fetch and parse the GSE52746 series, using the bmes temp dir as the download directory.
gse = GEOparse.get_GEO(geo="GSE52746", destdir=bmes.tempdir())

# Map each sample accession (GSM id) to its title.
gsm = list(gse.gsms.values())
sampletable = {
    sample.metadata["geo_accession"][0]: sample.metadata["title"][0]
    for sample in gsm
}

# Data for each GSM is available in gse.gsms[...].table.
# Combine the GSMs into a single data table: the first sample contributes its whole table
# (with VALUE renamed to its GSM id); every later sample contributes only its renamed VALUE column.
gsm_frames = []
reference_ids = None
for gsmid, sample in gse.gsms.items():
    gsmdata = sample.table.rename(columns={'VALUE': gsmid})
    if reference_ids is None:
        reference_ids = gsmdata['ID_REF']
        gsm_frames.append(gsmdata)
    else:
        # Columns are joined row by row, so every sample must list the same probes in the same order.
        assert reference_ids.equals(gsmdata['ID_REF']), f"{gsmid}: probe ids differ from the first sample"
        gsm_frames.append(gsmdata[gsmid])

# Concatenate once: calling pd.concat inside the loop re-copies the growing frame for every sample.
gsedata = pd.concat(gsm_frames, axis=1)

In [None]:
# Per-sample metadata table for the series loaded above.
phenotype_data = gse.phenotype_data

# Index the expression table by probe id. The guard keeps the cell re-runnable: once ID_REF is the
# index it is no longer a column, and an unconditional set_index('ID_REF') would raise KeyError.
if gsedata.index.name != 'ID_REF':
    gsedata = gsedata.set_index('ID_REF')

# Annotation table of the first platform (GPL) listed for this series.
gpl = list(gse.gpls.values())[0].table

In [None]:
# Map each platform probe id (GPL "ID" column) to the value in its "ENTREZ_GENE_ID" column.
# zip pairs the two columns row by row, so the mapping is correct whatever index `gpl` carries
# (looking rows up with .loc[position] only works when the index is the default 0..n-1),
# and it avoids one slow .loc lookup per probe. If an id occurs more than once, the last row wins.
probe_id_dict = dict(zip(gpl["ID"], gpl["ENTREZ_GENE_ID"]))

In [None]:
# Some probes are controls / cannot be found in the GPL table, or have no usable Entrez id;
# label those "control" so they can be dropped later.
entrez_id_list = []

for probe_id in gsedata.index:
    try:
        entrez_id_list.append(str(int(probe_id_dict[probe_id])))
    except (KeyError, TypeError, ValueError, OverflowError):
        # KeyError: probe is not in the GPL table.
        # TypeError / ValueError / OverflowError: the annotation is missing (None/NaN), is not a
        # single integer id, or is infinite, so int() cannot convert it.
        # Anything else (including KeyboardInterrupt) is deliberately not swallowed.
        entrez_id_list.append("control")

In [None]:
# Match Entrez gene ids to gene symbols using the mygene package.
# https://sulab.org/2013/10/quick-id-mapping-using-mygene-info/
mg = mygene.MyGeneInfo()

# Query each distinct Entrez id once. "control" placeholders are not gene ids, so they are not sent.
unique_ids = list(dict.fromkeys(eid for eid in entrez_id_list if eid != "control"))
if unique_ids:
    out = mg.querymany(unique_ids, scopes='entrezgene', fields='symbol', species='human')
else:
    out = []

# Index the hits by the query term they answer instead of relying on their position in `out`:
# a term may come back with no symbol or with more than one hit, which would shift every later
# symbol onto the wrong probe. The first symbol returned for a term is kept.
symbol_by_id = {}
for hit in out:
    symbol = hit.get("symbol")
    if symbol is not None:
        symbol_by_id.setdefault(str(hit.get("query")), symbol)

# One entry per probe, in the same order as entrez_id_list; ids without a symbol become "control".
genes = [symbol_by_id.get(eid, "control") for eid in entrez_id_list]

In [None]:
# Reorganize the table: replace the probe-id index with gene symbols and drop the control rows.

# Move ID_REF out of the index and discard it (new frame, no in-place mutation).
gsedata = gsedata.reset_index().drop(columns="ID_REF")

# `genes` holds one label per probe row, in row order.
if len(genes) != len(gsedata):
    raise ValueError(f"expected {len(gsedata)} gene labels, got {len(genes)}")
gsedata.index = genes

# Remove every row labelled "control"; errors="ignore" keeps this from failing when there are none.
gsedata = gsedata.drop(index="control", errors="ignore")

In [None]:
# Keep only the samples of interest and relabel them by group.
# (phenotype_data["source_name_ch1"].unique() lists the available source names.)

# source name -> [label prefix, number of samples labelled so far]
group = {'Human colon biopsies from active CD patient without anti-TNF therapy': ["E_", 0],
         'Human colon biopsies from non-inflammatory control': ["C_", 0]}

# Labels are assigned by position, so the phenotype rows must be in the same order as the
# expression columns; fail loudly rather than silently mislabel samples.
if list(phenotype_data.index) != list(gsedata.columns):
    raise ValueError("phenotype_data rows are not aligned with gsedata columns; "
                     "re-run the notebook from the GSE52746 load cell")

new_gsm_labels = []
for g in phenotype_data["source_name_ch1"]:
    if g in group:
        prefix, count = group[g]
        new_gsm_labels.append(prefix + str(count))  # e.g. "E_0", "E_1", ... / "C_0", "C_1", ...
        group[g][1] = count + 1
    else:
        new_gsm_labels.append("NA")  # sample from a group that is not used

gsedata.columns = new_gsm_labels
# Drop every unused sample; errors="ignore" keeps this from failing when all samples are used.
gsedata = gsedata.drop(columns="NA", errors="ignore")

In [None]:
# Z-normalize each column of the table and save the result.

def _zscore(column):
    """Return the column centred on its mean and scaled by its population (ddof=0) standard deviation."""
    return (column - column.mean()) / column.std(ddof=0)


norm_gsedata = gsedata.apply(_zscore)

validation_path = os.path.join(PATH, "validation_znorm.pkl")
norm_gsedata.to_pickle(validation_path)