In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

import joblib

In [2]:
import matplotlib.pyplot as plt
import gseapy as gp

In [3]:
# Load the pickled methylation data (X) and the per-sample phenotype table (pheno).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load files from a trusted source.
X = joblib.load('./AML_data/meth.pkl')
pheno = joblib.load('./AML_data/pheno.pkl')

In [4]:
# Load the collection of CpG IDs (columns of X) that the one-vs-rest tests below iterate over.
# NOTE(review): this is also a pickle; only load it from a trusted source.
unionindices = joblib.load('./AML_data/unionindices.pkl')
len(unionindices)

1300

In [5]:
# Create a new column for genotypes to merge some groups together

# NaNs, 'no result' and any genotype that is not listed explicitly will form one group ('No result')

In [6]:
# Raw genotype labels that are pooled into the single 'MLL rearranged' group.
mll = ['other 11q23/MLL', 't(9;11)', 't(10;11)','t(11;19)']
# Raw genotype labels that are kept unchanged. Note that 'other clon abn' is kept
# here but is filtered out of the analysis set in a later cell.
other = ['normal', 'mono 7', 'inv(16)','other clon abn', '3q21q26', 't(8;21)', 'sole+8', 't(15;17)']

In [7]:
# Map every raw genotype label to its analysis group:
#   - any label listed in `mll`   -> 'MLL rearranged'
#   - any label listed in `other` -> kept unchanged
#   - everything else (including 'no result' and missing values) -> 'No result'
# The original explicit `== 'no result'` branch produced the same value as the
# fallback branch, so the two are merged here.
finalgenotype = [
    'MLL rearranged' if genotype in mll
    else genotype if genotype in other
    else 'No result'
    for genotype in pheno.genotype
]
        

In [8]:
finalgenotype

['normal',
 'MLL rearranged',
 'normal',
 'No result',
 'mono 7',
 'inv(16)',
 'normal',
 'normal',
 'MLL rearranged',
 'other clon abn',
 'other clon abn',
 'MLL rearranged',
 'normal',
 'MLL rearranged',
 'MLL rearranged',
 'MLL rearranged',
 'other clon abn',
 'MLL rearranged',
 'normal',
 'No result',
 'MLL rearranged',
 'normal',
 'MLL rearranged',
 '3q21q26',
 'normal',
 'other clon abn',
 'other clon abn',
 'inv(16)',
 'No result',
 't(8;21)',
 'sole+8',
 'sole+8',
 'normal',
 'normal',
 'No result',
 'other clon abn',
 'mono 7',
 't(8;21)',
 'normal',
 'inv(16)',
 'No result',
 't(8;21)',
 'MLL rearranged',
 'MLL rearranged',
 'normal',
 'No result',
 'normal',
 't(15;17)',
 'No result',
 'other clon abn',
 'normal',
 'other clon abn',
 'inv(16)',
 'inv(16)',
 't(8;21)',
 'MLL rearranged',
 't(8;21)',
 't(8;21)',
 'normal',
 'normal',
 't(8;21)',
 't(15;17)',
 'No result',
 'MLL rearranged',
 'MLL rearranged',
 'MLL rearranged',
 'inv(16)',
 'No result',
 't(8;21)',
 'No result

In [9]:
# Work on a copy so the loaded phenotype table `pheno` itself is not modified.
phenodf = pheno.copy()

In [10]:
# Attach the grouped labels; `finalgenotype` was built by iterating over
# pheno.genotype, so it has one entry per row in the same order.
phenodf['finalgenotype'] = finalgenotype

In [11]:
# Samples whose group is 'No result' or 'other clon abn'; they are set aside
# here and removed from `phenodf` in a later cell.
unknowndf = phenodf[phenodf['finalgenotype'].isin(['No result', 'other clon abn'])]

In [12]:
# Methylation rows of the set-aside samples: level 1 of X's index is matched
# against the phenotype table's index.
Xun = X[X.index.get_level_values(1).isin(unknowndf.index)]

In [13]:
# Keep only the samples that belong to a defined genotype group
# (the complement of `unknowndf`). This rebinds `phenodf`.
phenodf = phenodf[~phenodf['finalgenotype'].isin(['No result', 'other clon abn'])]

In [14]:
phenodf.shape

(99, 5)

In [15]:
# Restrict the methylation data to the retained samples as well, i.e. drop the
# 'No result' and 'other clon abn' samples (matched on index level 1).
Xnew = X[X.index.get_level_values(1).isin(phenodf.index)]

In [16]:
Xnew.shape

(99, 406830)

In [17]:
phenodf

Unnamed: 0_level_0,sample.type,FAB,genotype,relapse,finalgenotype
public_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AML_001,diagnostic,M2,normal,True,normal
AML_002,diagnostic,M5,t(11;19),True,MLL rearranged
AML_003,diagnostic,M1,normal,False,normal
AML_005,diagnostic,M2,mono 7,True,mono 7
AML_006,diagnostic,M4,inv(16),False,inv(16)
...,...,...,...,...,...
AML_122,diagnostic,M2,normal,False,normal
AML_033,diagnostic,M6,normal,True,normal
AML_123,diagnostic,M4,inv(16),False,inv(16)
AML_124,diagnostic,M2,normal,False,normal


In [18]:
# Fill missing beta values with a median-strategy SimpleImputer fitted on Xnew
# itself, then rebuild the frame so the original row index and CpG column
# labels are kept. This rebinds `Xnew`.
# NOTE(review): assumes the imputer returns as many columns as Xnew has — confirm
# that no CpG column is entirely NaN.
imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(Xnew)
Xnew = pd.DataFrame(
    imputer.transform(Xnew),
    index=Xnew.index,
    columns=Xnew.columns,
)

# Find the most significant CpGs in a One-VS-Rest Approach

In [19]:
# P-value: each subtype vs. all other subtypes (one-vs-rest)

In [20]:
def dmsOneVsRest(data, indices, classes, phenodata, subtype):
    """Compare one subtype against all remaining samples for each CpG.

    For every column label in ``indices`` the samples are split into the
    subtype of interest and the rest, and the two groups are compared with
    a two-sided Mann-Whitney U test.

    Parameters
    ----------
    data : pandas.DataFrame
        Values to test, one column per CpG. Level 1 of its row index must
        hold the sample IDs used as the index of ``phenodata``.
    indices : iterable
        Column labels of ``data`` to test.
    classes : scalar
        The single subtype label that forms the first group.
    phenodata : pandas.DataFrame
        Phenotype table indexed by sample ID.
    subtype : pandas.Series or array-like
        Subtype label of every row of ``phenodata`` (same length and order).

    Returns
    -------
    list of list
        One row per CpG:
        ``[classes, index, mean_sub, mean_rest, median_sub, median_rest,
        std_sub, std_rest, pvalue]``.
    """
    # The group membership does not depend on the CpG, so build the boolean
    # mask once instead of recomputing it twice per loop iteration.
    subtype_samples = phenodata[subtype == classes].index
    in_subtype = data.index.get_level_values(1).isin(subtype_samples)

    pvals = []
    for index in indices:
        values = data[index]
        data1 = values[in_subtype]     # samples of the subtype of interest
        data2 = values[~in_subtype]    # all remaining samples

        # Request the two-sided test explicitly: older SciPy releases
        # default to a one-sided p-value when `alternative` is omitted.
        _, pvalue = mannwhitneyu(data1, data2, alternative='two-sided')

        pvals.append([classes, index,
                      data1.mean(), data2.mean(),
                      data1.median(), data2.median(),
                      data1.std(), data2.std(),
                      pvalue])
    return pvals
        

In [21]:
phenodf.finalgenotype.unique()

array(['normal', 'MLL rearranged', 'mono 7', 'inv(16)', '3q21q26',
       't(8;21)', 'sole+8', 't(15;17)'], dtype=object)

In [22]:
# Run the one-vs-rest test for every genotype group and collect all CpGs that
# remain significant after multiple-testing correction into `stats_all`.
stats_all = []
for c in phenodf.finalgenotype.unique():
    # The ANSI escape codes print the heading in bold.
    print('\033[1m' + '{} vs REST'.format(c.upper()) + '\033[0m')
    print('-----------------------------------')
    data = dmsOneVsRest(Xnew, unionindices, c, phenodf, phenodf.finalgenotype)
    statistics = pd.DataFrame(data, columns = ['subtype','CpG_id', 'Meanbetasub', 'Meanbetaother',
                                               'Medianbetasub', 'Medianbetaother',
                                               'stdbetasub', 'stdbetaother',
                                               'p-value'])
    # Benjamini-Hochberg FDR correction, applied within this subtype's set of tests only.
    statistics['Adjusted p-value'] = multipletests(statistics['p-value'], method = 'fdr_bh')[1]
    
    # Keep only CpGs with an adjusted p-value below 0.05.
    statistics = statistics[statistics['Adjusted p-value']<0.05]

    # Only the 15 best hits are printed; an empty frame is printed when nothing passes.
    print(statistics.nsmallest(15, 'Adjusted p-value'))
    # All significant rows (not just the top 15) are stored, ordered by adjusted p-value.
    stats_all.extend(statistics.sort_values(by='Adjusted p-value').values)
    
    
    print('-----------------------------------')
    

[1mNORMAL vs REST[0m
-----------------------------------
     subtype      CpG_id  Meanbetasub  Meanbetaother  Medianbetasub  \
69    normal  cg01423695     0.839200       0.387464         0.8750   
116   normal  cg02538833     0.757233       0.381652         0.8815   
142   normal  cg03009030     0.750367       0.310406         0.8245   
532   normal  cg11749010     0.761700       0.335507         0.7885   
1015  normal  cg21820873     0.884333       0.560594         0.9550   
257   normal  cg05445326     0.813367       0.434420         0.9170   
468   normal  cg10301695     0.453500       0.728609         0.4385   
39    normal  cg00787180     0.659867       0.335884         0.7400   
281   normal  cg06046490     0.422700       0.128348         0.4320   
976   normal  cg21090033     0.821233       0.479217         0.9200   
199   normal  cg04193820     0.478400       0.218304         0.4320   
16    normal  cg00340855     0.751333       0.451957         0.8185   
821   normal  cg17

Empty DataFrame
Columns: [subtype, CpG_id, Meanbetasub, Meanbetaother, Medianbetasub, Medianbetaother, stdbetasub, stdbetaother, p-value, Adjusted p-value]
Index: []
-----------------------------------
[1mT(8;21) vs REST[0m
-----------------------------------
      subtype      CpG_id  Meanbetasub  Meanbetaother  Medianbetasub  \
373   t(8;21)  cg08166720     0.193895       0.911687          0.142   
578   t(8;21)  cg12654519     0.904316       0.337262          0.914   
742   t(8;21)  cg15769475     0.200684       0.928837          0.163   
751   t(8;21)  cg16034168     0.133105       0.949862          0.121   
1257  t(8;21)  cg26786253     0.143579       0.912475          0.104   
25    t(8;21)  cg00502209     0.242737       0.945800          0.236   
700   t(8;21)  cg14795528     0.076895       0.832000          0.066   
435   t(8;21)  cg09481537     0.173316       0.821937          0.116   
873   t(8;21)  cg18771737     0.900579       0.260537          0.910   
608   t(8;21)  cg1

In [23]:
stats_all

[array(['normal', 'cg01423695', 0.839199960231781, 0.38746377825737, 0.875,
        0.2460000067949295, 0.12918534874916077, 0.3142096698284149,
        1.8066738440194036e-09, 2.348675997225225e-06], dtype=object),
 array(['normal', 'cg11749010', 0.7616999745368958, 0.33550727367401123,
        0.7884999513626099, 0.13600000739097595, 0.22185955941677094,
        0.32846805453300476, 3.219119624939113e-08, 1.2152301115020317e-05],
       dtype=object),
 array(['normal', 'cg03009030', 0.7503665685653687, 0.31040582060813904,
        0.8244999647140503, 0.1850000023841858, 0.19594308733940125,
        0.2974041700363159, 2.548750754711883e-08, 1.2152301115020317e-05],
       dtype=object),
 array(['normal', 'cg02538833', 0.7572334408760071, 0.38165217638015747,
        0.8815000057220459, 0.30300000309944153, 0.2304828017950058,
        0.2756745219230652, 3.739169573852405e-08, 1.2152301115020317e-05],
       dtype=object),
 array(['normal', 'cg21820873', 0.8843333721160889, 0.56059426

In [24]:
# Display names for the result columns, in the same order as the values stored
# in each `stats_all` row (the nine fields from dmsOneVsRest followed by the
# adjusted p-value).
columns = ['Subtype','CpG ID', 'Mean B-value subtype', 'Mean B-value other','Median B-value subtype', 'Median B-value other',
           'Std B-value subtype', 'Std B-value other','p-value', 'Adjusted p-value']

In [25]:
# One row per (subtype, significant CpG) pair collected in `stats_all`.
finaldata = pd.DataFrame(stats_all, columns = columns)
finaldata

Unnamed: 0,Subtype,CpG ID,Mean B-value subtype,Mean B-value other,Median B-value subtype,Median B-value other,Std B-value subtype,Std B-value other,p-value,Adjusted p-value
0,normal,cg01423695,0.839200,0.387464,0.8750,0.246,0.129185,0.314210,1.806674e-09,0.000002
1,normal,cg11749010,0.761700,0.335507,0.7885,0.136,0.221860,0.328468,3.219120e-08,0.000012
2,normal,cg03009030,0.750367,0.310406,0.8245,0.185,0.195943,0.297404,2.548751e-08,0.000012
3,normal,cg02538833,0.757233,0.381652,0.8815,0.303,0.230483,0.275675,3.739170e-08,0.000012
4,normal,cg21820873,0.884333,0.560594,0.9550,0.518,0.164890,0.321850,4.708303e-08,0.000012
...,...,...,...,...,...,...,...,...,...,...
3389,t(15;17),cg25279778,0.956500,0.691874,0.9570,0.931,0.005260,0.338166,1.198455e-02,0.048170
3390,t(15;17),cg04115740,0.823500,0.519316,0.9015,0.554,0.181931,0.279946,1.228658e-02,0.048997
3391,t(15;17),cg26471497,0.826750,0.424832,0.8165,0.430,0.116363,0.333455,1.228703e-02,0.048997
3392,t(15;17),cg14839087,0.068250,0.420379,0.0650,0.310,0.017173,0.347067,1.257359e-02,0.049834


In [26]:
finaldata['Subtype'].value_counts()

MLL rearranged    873
t(8;21)           723
inv(16)           571
normal            569
mono 7            330
t(15;17)          328
Name: Subtype, dtype: int64

In [27]:
## Keep only subtype-specific CpGs: any CpG ID that occurs in more than one row
## (i.e. is significant for several subtypes) is removed entirely (keep=False).
finaldata2 = (
    finaldata
    .drop_duplicates(subset='CpG ID', keep=False)
    .reset_index(drop=True)
)
finaldata2.head()

Unnamed: 0,Subtype,CpG ID,Mean B-value subtype,Mean B-value other,Median B-value subtype,Median B-value other,Std B-value subtype,Std B-value other,p-value,Adjusted p-value
0,normal,cg15139588,0.216867,0.464783,0.054,0.425,0.273001,0.334755,0.000214,0.001621
1,normal,cg15605858,0.443967,0.222043,0.3755,0.147,0.321264,0.201276,0.000876,0.004379
2,normal,cg02390319,0.419733,0.090754,0.2515,0.026,0.397723,0.17005,0.00214,0.008245
3,normal,cg02571816,0.5031,0.320261,0.515,0.223,0.33659,0.303788,0.011292,0.03008
4,normal,cg17400476,0.4072,0.284435,0.3805,0.159,0.305329,0.279711,0.021221,0.049177


In [28]:
finaldata2.shape

(146, 10)

In [29]:
finaldata2['Subtype'].value_counts()

MLL rearranged    59
t(8;21)           27
mono 7            24
inv(16)           22
t(15;17)           8
normal             6
Name: Subtype, dtype: int64

**sole+8 and 3q21q26 do not appear in the results: neither subtype had a CpG passing the adjusted p-value < 0.05 cutoff.**

In [30]:
finaldata2.Subtype.unique()

array(['normal', 'MLL rearranged', 'mono 7', 'inv(16)', 't(8;21)',
       't(15;17)'], dtype=object)

# CpG Annotation
**The annotation CSV file is available upon request, together with the raw IDAT files.**

In [31]:
# Load the CpG annotation table, using its 'Index' column as the row index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the truncated warning left under this cell looks like a pandas
# DtypeWarning (columns with mixed types) — confirm it disappears with this option.
df = pd.read_csv('./AML_data/annotation2.csv', index_col = 'Index', low_memory=False)


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [32]:
df.head(50)

Unnamed: 0_level_0,TargetID,ProbeID_A,ProbeID_B,ILMNID,ADDRESSA_ID,ALLELEA_PROBESEQ,ADDRESSB_ID,ALLELEB_PROBESEQ,INFINIUM_DESIGN_TYPE,NEXT_BASE,...,REGULATORY_FEATURE_NAME,REGULATORY_FEATURE_GROUP,DHS,NAME,UCSC_REFGENE_NAME,BWA.hit,SNP,SNP.distance,minor.allele.freq,pass.filter
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,cg00000029,14782418,14782418,cg00000029,14782418,AACTATACTAACRAAAAAATATCCAAAAAACACTAACRTATAAAAA...,,,II,,...,16:53467838-53469685,Promoter_Associated,True,cg00000029,RBL2,False,,,,True
2,cg00000108,12709357,12709357,cg00000108,12709357,ATACAATAAAACAAACCTAAAATAATCCTAACTCCRCTATCATCCT...,,,II,,...,,,False,cg00000108,C3orf35;C3orf35,False,rs9857774,16.0,0.056801,True
3,cg00000109,59755374,59755374,cg00000109,59755374,CAATACTAACAAACACATATACCCCCCCACAAATCTTAACTTCTAA...,,,II,,...,,,False,cg00000109,FNDC3B;FNDC3B,False,rs9864492,17.0,0.010135,True
4,cg00000165,12637463,12637463,cg00000165,12637463,CAAAATCTATTAATACAATAACTTTTAATAAAACAACTAAAACACA...,,,II,,...,,,False,cg00000165,,False,rs76771611,25.0,0.027778,True
5,cg00000236,12649348,12649348,cg00000236,12649348,TATAACRTCATATTAAAAAAAACRATCTAACCCACCAATTTATACA...,,,II,,...,,,False,cg00000236,VDAC3;VDAC3,False,,,,True
6,cg00000289,18766346,18766346,cg00000289,18766346,ATCTACTATATTCATTTCTCCAATCTCATATCCATTTTAATATAAA...,,,II,,...,,,False,cg00000289,ACTN1;ACTN1;ACTN1,False,,,,True
7,cg00000292,43764508,43764508,cg00000292,43764508,AAAACATTAATTACCAACCRCTCTTCCAAAAAACACTTACCATTAA...,,,II,,...,,,False,cg00000292,ATP2A1;ATP2A1,False,rs62037371,31.0,0.333333,True
8,cg00000321,62789509,62789509,cg00000321,62789509,ATAAATACCCAATAAACCTAACTAAACTCCCTAAAAAACRAAACRA...,,,II,,...,,,False,cg00000321,SFRP1,False,,,,True
9,cg00000363,16661505,16661505,cg00000363,16661505,RTCTTAACTTAACTTAATTTTCTCCTTAATCTAAAAAACTTTCCCT...,,,II,,...,,,False,cg00000363,,False,,,,True
10,cg00000622,11642304,38691301,cg00000622,11642304,CAACAAAAAAAAACCCCAAAAACAAAATATACATAATACAACACTA...,38691301.0,CGACGAAAAAAAACCCCGAAAACGAAATATACGTAATACGACACTA...,I,T,...,15:23033272-23034847,Promoter_Associated,False,cg00000622,NIPA2;NIPA2;NIPA2;NIPA2,False,,,,True


In [33]:
df.columns

Index(['TargetID', 'ProbeID_A', 'ProbeID_B', 'ILMNID', 'ADDRESSA_ID',
       'ALLELEA_PROBESEQ', 'ADDRESSB_ID', 'ALLELEB_PROBESEQ',
       'INFINIUM_DESIGN_TYPE', 'NEXT_BASE', 'COLOR_CHANNEL',
       'FORWARD_SEQUENCE', 'GENOME_BUILD', 'CHR', 'MAPINFO', 'SOURCESEQ',
       'CHROMOSOME_36', 'COORDINATE_36', 'CHROMOSOME_37', 'COORDINATE_37',
       'STRAND', 'PROBE_SNPS', 'PROBE_SNPS_10', 'RANDOM_LOCI', 'METHYL27_LOCI',
       'UCSC_REFGENE_ACCESSION', 'UCSC_REFGENE_GROUP', 'UCSC_CPG_ISLANDS_NAME',
       'RELATION_TO_UCSC_CPG_ISLAND', 'PHANTOM', 'DMR', 'ENHANCER',
       'HMM_ISLAND', 'REGULATORY_FEATURE_NAME', 'REGULATORY_FEATURE_GROUP',
       'DHS', 'NAME', 'UCSC_REFGENE_NAME', 'BWA.hit', 'SNP', 'SNP.distance',
       'minor.allele.freq', 'pass.filter'],
      dtype='object')

In [34]:
# Annotation columns of interest for the subtype-specific CpGs in finaldata2.
# Rows and columns are selected in one .loc call; .copy() detaches the result from df.
mydf = df.loc[
    df['TargetID'].isin(finaldata2['CpG ID']),
    ['TargetID', 'CHR', 'UCSC_REFGENE_NAME', 'RELATION_TO_UCSC_CPG_ISLAND'],
].copy()

In [35]:
mydf['UCSC_REFGENE_NAME'].isna().sum()

41

In [36]:
mydf[mydf['UCSC_REFGENE_NAME'].isna()]

Unnamed: 0_level_0,TargetID,CHR,UCSC_REFGENE_NAME,RELATION_TO_UCSC_CPG_ISLAND
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1780,cg00077285,15,,
3075,cg00135497,4,,Island
38185,cg01863042,13,,
47587,cg02330874,10,,S_Shore
48875,cg02391713,6,,Island
63485,cg03124146,7,,Island
71269,cg03526459,1,,Island
88831,cg04425920,10,,Island
99171,cg04972745,10,,Island
118152,cg05991492,16,,N_Shore


In [37]:
# Drop the CpGs that have no gene name in the annotation (UCSC_REFGENE_NAME is NaN).
finaldf = mydf.loc[~mydf['UCSC_REFGENE_NAME'].isna()]

In [38]:
finaldf

Unnamed: 0_level_0,TargetID,CHR,UCSC_REFGENE_NAME,RELATION_TO_UCSC_CPG_ISLAND
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1945,cg00084338,6,DLL1,N_Shore
7172,cg00339556,5,MARCH11,Island
8373,cg00396667,5,PITX1,Island
16415,cg00776960,15,IGDCC4,Island
36683,cg01791874,5,MARCH11,Island
...,...,...,...,...
468066,cg26827876,19,KLK4,N_Shelf
469931,cg26929700,16,ZNF423,Island
470993,cg26988138,19,GNG7,N_Shelf
471995,cg27049766,19,ZNF154;ZNF154,Island


In [39]:
# Attach the annotated gene name to each subtype-specific CpG: first keep the
# rows whose CpG ID has a gene annotation, then inner-join on the CpG identifier.
# The result carries both key columns ('CpG ID' and 'TargetID').
mergeddata = (
    finaldata2.loc[finaldata2['CpG ID'].isin(finaldf['TargetID'])]
    .merge(
        finaldf[['TargetID', 'UCSC_REFGENE_NAME']],
        how='inner',
        left_on='CpG ID',
        right_on='TargetID',
    )
)

In [40]:
mergeddata

Unnamed: 0,Subtype,CpG ID,Mean B-value subtype,Mean B-value other,Median B-value subtype,Median B-value other,Std B-value subtype,Std B-value other,p-value,Adjusted p-value,TargetID,UCSC_REFGENE_NAME
0,normal,cg15139588,0.216867,0.464783,0.0540,0.425,0.273001,0.334755,0.000214,0.001621,cg15139588,ZNF793;ZNF793
1,normal,cg15605858,0.443967,0.222043,0.3755,0.147,0.321264,0.201276,0.000876,0.004379,cg15605858,APBA2;APBA2
2,normal,cg02390319,0.419733,0.090754,0.2515,0.026,0.397723,0.170050,0.002140,0.008245,cg02390319,PRDM16;PRDM16
3,normal,cg02571816,0.503100,0.320261,0.5150,0.223,0.336590,0.303788,0.011292,0.030080,cg02571816,PPP1R14A
4,normal,cg03745383,0.521267,0.640898,0.5575,0.635,0.294348,0.257976,0.021415,0.049191,cg03745383,ACCN1
...,...,...,...,...,...,...,...,...,...,...,...,...
100,t(15;17),cg02409722,0.937250,0.687263,0.9390,0.760,0.011236,0.235247,0.000908,0.021009,cg02409722,SETD7
101,t(15;17),cg23553912,0.863500,0.513179,0.9060,0.519,0.117140,0.314718,0.006923,0.036583,cg23553912,SCHIP1
102,t(15;17),cg02796568,0.497000,0.793211,0.4990,0.900,0.225687,0.193896,0.007271,0.037372,cg02796568,SYNE1;SYNE1
103,t(15;17),cg02300154,0.742250,0.277495,0.7095,0.064,0.173700,0.316659,0.008377,0.039661,cg02300154,WBSCR17


In [41]:
mergeddata.Subtype.unique()

array(['normal', 'MLL rearranged', 'mono 7', 'inv(16)', 't(8;21)',
       't(15;17)'], dtype=object)

In [42]:
# Collapse repeated gene symbols inside each ';'-separated annotation string.
# Only the gene column is needed, so the loop no longer zips in 'CpG ID'.
newgenes = []
for gene in mergeddata['UCSC_REFGENE_NAME']:
    # e.g. 'PLAUR;PLAUR;PLAUR' -> 'PLAUR'; distinct symbols are kept,
    # sorted alphabetically and re-joined with ';'
    genename = sorted(set(gene.split(';')))
    newgenes.append(';'.join(genename))
print(newgenes)

['ZNF793', 'APBA2', 'PRDM16', 'PPP1R14A', 'ACCN1', 'KIAA1755', 'PLAUR', 'PER3', 'ASB2', 'KLK4', 'BARHL2', 'L1TD1', 'ARPC1B', 'ST8SIA6', 'NKX6-2', 'WNT5A', 'HOXA5', 'NFIX', 'SNED1', 'TNXB', 'MSX2', 'MAPK8IP1', 'SNED1', 'BNIP3', 'CASR', 'HECW1', 'HOXA5', 'PCDHA1;PCDHA2;PCDHA3;PCDHA4;PCDHA5;PCDHA6;PCDHA7;PCDHA8', 'TNXB', 'PITX1', 'KCNN1', 'TNXB', 'TMEM132D', 'NPSR1', 'LOC732275', 'NOM1', 'SPEG', 'TNXB', 'EDARADD', 'THBS4', 'HOOK2', 'LOC254559', 'DCC', 'BCL2', 'ZNF577', 'ZNF577', 'ZNF154', 'ARRB2', 'SKI', 'ZNF577', 'ZNF154', 'ERCC3', 'RPTOR', 'FBXO47', 'ZNF577', 'ARRB2', 'DLL1', 'C1orf86;LOC100128003', 'ZNF154', 'CYP1A1', 'PLD6', 'PLD6', 'DPF3', 'IFLTD1', 'BAHCC1', 'LEPR', 'AFAP1', 'PRHOXNB', 'ANK1', 'SHISA6', 'PRDM16', 'MUC4', 'C22orf34', 'LY96', 'ZNF423', 'GNG7', 'CNTD2', 'ZNF423', 'SMTNL2', 'TUSC1', 'PDLIM3', 'CYP27C1', 'PCDHA1;PCDHA10;PCDHA11;PCDHA2;PCDHA3;PCDHA4;PCDHA5;PCDHA6;PCDHA7;PCDHA8;PCDHA9', 'RYR2', 'TACSTD2', 'FBXL7', 'VSTM2A', 'MARCH11', 'IGSF21', 'MARCH11', 'MARCH11', 'TTBK1

In [43]:
print(len(newgenes))

105


In [44]:
# `newgenes` was built by iterating over mergeddata's rows in order, so it
# lines up positionally with the frame. This adds a column to `mergeddata` in place.
mergeddata['Gene'] = newgenes

In [45]:
# NOTE: this rebinds `columns` (previously the column names of `finaldata`) to the
# column order of the gene-level table, which adds 'Gene' after 'CpG ID'.
columns = ['Subtype','CpG ID', 'Gene', 'Mean B-value subtype', 'Mean B-value other','Median B-value subtype', 'Median B-value other',
           'Std B-value subtype', 'Std B-value other','p-value', 'Adjusted p-value']

In [46]:
# Final per-CpG table: selecting `columns` leaves out the merge helper columns
# 'TargetID' and 'UCSC_REFGENE_NAME'.
geneinfo = mergeddata[columns]
geneinfo

Unnamed: 0,Subtype,CpG ID,Gene,Mean B-value subtype,Mean B-value other,Median B-value subtype,Median B-value other,Std B-value subtype,Std B-value other,p-value,Adjusted p-value
0,normal,cg15139588,ZNF793,0.216867,0.464783,0.0540,0.425,0.273001,0.334755,0.000214,0.001621
1,normal,cg15605858,APBA2,0.443967,0.222043,0.3755,0.147,0.321264,0.201276,0.000876,0.004379
2,normal,cg02390319,PRDM16,0.419733,0.090754,0.2515,0.026,0.397723,0.170050,0.002140,0.008245
3,normal,cg02571816,PPP1R14A,0.503100,0.320261,0.5150,0.223,0.336590,0.303788,0.011292,0.030080
4,normal,cg03745383,ACCN1,0.521267,0.640898,0.5575,0.635,0.294348,0.257976,0.021415,0.049191
...,...,...,...,...,...,...,...,...,...,...,...
100,t(15;17),cg02409722,SETD7,0.937250,0.687263,0.9390,0.760,0.011236,0.235247,0.000908,0.021009
101,t(15;17),cg23553912,SCHIP1,0.863500,0.513179,0.9060,0.519,0.117140,0.314718,0.006923,0.036583
102,t(15;17),cg02796568,SYNE1,0.497000,0.793211,0.4990,0.900,0.225687,0.193896,0.007271,0.037372
103,t(15;17),cg02300154,WBSCR17,0.742250,0.277495,0.7095,0.064,0.173700,0.316659,0.008377,0.039661


In [47]:
geneinfo.Gene

0        ZNF793
1         APBA2
2        PRDM16
3      PPP1R14A
4         ACCN1
         ...   
100       SETD7
101      SCHIP1
102       SYNE1
103     WBSCR17
104     C21orf7
Name: Gene, Length: 105, dtype: object

In [48]:
#geneinfo.to_csv('geneCpgList.txt', index = False)