# In [54]:
#import modules for handling data
import GEOparse
import pandas as pd
import numpy as np
import math
import seaborn
import matplotlib.pyplot as plt
import os
from scipy import stats

def get_gsedata(geo_id, log2val, keepProbeId, loc):
    """Fetch a GEO series and return a per-sample z-normalized expression table.

    Parameters
    ----------
    geo_id
        Accession passed to ``GEOparse.get_GEO`` as ``geo``.
    log2val
        ``1`` when the ``VALUE`` column is already log2-transformed; any other
        value applies ``log2(x + 1)`` to the table.
    keepProbeId
        ``1`` keeps the probe IDs as the row index; any other value replaces
        them with the ``'Gene Symbol'`` column of the first platform table.
    loc
        Position inside each sample's ``'characteristics_ch1'`` metadata list
        of the entry that describes the sample's group.

    Returns
    -------
    pandas.DataFrame
        Rows are probes (or gene symbols), columns are samples labelled
        ``'normal'`` or ``'tumor'``; every column is standardized to zero mean
        and unit standard deviation.
    """
    gse = GEOparse.get_GEO(geo=geo_id)

    # expression table: rows are probes, columns are samples
    gsedata = gse.pivot_samples('VALUE')
    gsedata.columns.name = None
    gsedata.index.name = None

    # if the data are not log2 values then convert
    if log2val != 1:
        gsedata = np.log2(gsedata + 1)

    # if the user chooses not to keep the probe IDs
    if keepProbeId != 1:
        # the first platform's annotation table maps probe ID -> gene symbol
        gpl = next(iter(gse.gpls.values())).table
        gene_dict = gpl.set_index('ID')['Gene Symbol'].to_dict()
        gsedata.index = gsedata.index.map(gene_dict)

    # Z-normalize each sample. This is done while the column labels are still
    # the unique sample names, so no arithmetic has to align duplicate labels.
    gsedata = (gsedata - gsedata.mean()) / gsedata.std()

    # Look every column's sample up by name rather than relying on the
    # iteration order of gse.gsms matching the column order of the pivoted
    # table (the pivot orders its columns by sample name).
    samples_by_name = {sample.name: sample for sample in gse.gsms.values()}
    labels = []
    for sample_name in gsedata.columns:
        description = samples_by_name[sample_name].metadata['characteristics_ch1'][loc]
        # anything that does not mention 'normal' is treated as tumor
        labels.append('normal' if 'normal' in description.lower() else 'tumor')

    # assign a fresh set of labels instead of mutating the Index buffer in place
    gsedata.columns = labels

    return gsedata

def get_fold_change(gsedata, p_threshold=0.05, fc_threshold=2.5):
    """Return the genes that differ significantly between tumor and normal.

    Parameters
    ----------
    gsedata
        Expression table whose columns are labelled ``'normal'`` or ``'tumor'``
        (one column per sample) and whose rows are genes.
        NOTE(review): the fold change below is computed as 2 ** (difference of
        means), which presumes log2-scale values -- confirm this is still the
        intended interpretation for z-normalized input.
    p_threshold
        Genes are kept only when their two-sample t-test p-value is below this.
    fc_threshold
        Genes are kept only when the absolute signed fold change exceeds this.

    Returns
    -------
    pandas.DataFrame
        One ``'Fold Change'`` column, indexed by gene (index name
        ``'Gene Symbols'``). Ratios below 1 are reported as negative
        reciprocals, e.g. 0.25 becomes -4.

    Raises
    ------
    ValueError
        If ``gsedata`` has no ``'normal'`` or no ``'tumor'`` column.
    """
    is_normal = gsedata.columns == 'normal'
    is_tumor = gsedata.columns == 'tumor'
    if not is_normal.any() or not is_tumor.any():
        raise ValueError("gsedata needs at least one 'normal' and one 'tumor' column")

    # boolean-mask selection always yields a DataFrame, even for a single sample
    normal = gsedata.loc[:, is_normal]
    tumor = gsedata.loc[:, is_tumor]

    # per-gene t-test across the samples of each group (samples lie along axis 1)
    _, p = stats.ttest_ind(
        normal.to_numpy(dtype=float), tumor.to_numpy(dtype=float), axis=1
    )

    # NaN p-values compare False and are therefore dropped
    significant = np.asarray(p) < p_threshold

    # average expression per gene within each group, significant genes only
    normal_avg = normal.loc[significant].mean(axis=1)
    tumor_avg = tumor.loc[significant].mean(axis=1)

    # fold change = 2 ^ (tumor_avg - normal_avg)
    fold_change = (2 ** (tumor_avg - normal_avg)).to_frame('Fold Change')
    fold_change = fold_change.rename_axis('Gene Symbols')

    # convert fold changes that are less than 1 to negative fold changes
    fold_change = fold_change.where(fold_change >= 1, -1 / fold_change)

    # keep only the genes whose fold change magnitude exceeds the threshold
    return fold_change.loc[fold_change['Fold Change'].abs() > fc_threshold]
