# Part 1. Data Mining in Preparation for Machine Learning


In [1]:
import pandas as pd
import numpy as np
from IPython.display import Markdown, display
import joblib

#### In this part, the DNA methylation data (β-values) are filtered according to our initial setup stored in `filteredProbes.pkl`, which lists all CpGs retained after filtering (N = 406830):
##### Initially, 485512 CpG probes and 168 samples were available
Empirical filtering derived from
#### [Nordlund, J., Bäcklin, C. L., et al. (2013)](https://pubmed.ncbi.nlm.nih.gov/24063430/) Genome-wide signatures of differential DNA methylation in pediatric acute lymphoblastic leukemia. *Genome Biology*, 14(9):R105. doi: 10.1186/gb-2013-14-9-r105.

led to 435941 CpGs, while applying the 10% NA rule (excluding all probes with 10% or more missing values) followed by Multi-Dimensional Scaling for outlier removal created the final dataframe of 406830 CpGs and 142 samples.



In [2]:
# Directory holding the data generated by the methylprep library
path = '../data/'

# Beta-value matrix produced by methylprep, and the probes kept by our filtering
meth = joblib.load(f'{path}beta_values.pkl')
fprobes = joblib.load('./AML_data/filteredProbes.pkl')

In [3]:
# Flip the matrix so that each row is a sample and each column is a CpG
methdf = meth.transpose()

In [4]:
# Dimensions after the transpose: (rows = samples, columns = CpGs)
methdf.shape

(142, 485512)

## Apply the CpG filter

In [5]:
# Keep only the entries listed in fprobes (loaded from filteredProbes.pkl).
# NOTE(review): assumes fprobes is a list-like of CpG column labels that all
# exist in methdf — confirm against how filteredProbes.pkl was written.
methdf = methdf[fprobes] 

In [6]:
# Dimensions after applying the CpG filter
methdf.shape

(142, 406830)

# Align the methylation dataframe with the phenotypes based on the samplesheet's information that links the Sentrix ID & Position with the sample IDs

In [7]:
# Sample-sheet metadata file produced after running the methylprep command
samples = pd.read_pickle(f'{path}sample_sheet_meta_data.pkl')

# Phenotype table
phenodf = pd.read_csv('./AML_data/phenotypes.csv')

In [8]:
# Inspect the sample-sheet metadata as loaded
samples

Unnamed: 0,Sentrix_ID,Sentrix_Position,Sample_Group,Sample_Name,Sample_Plate,Sample_Type,Sub_Type,Sample_Well,Pool_ID,GSM_ID,Control,Sample_ID
0,7539308036,R01C01,,AML_001.2_1R01C01,AML1_METH1_PL10_200ng_120703.1,Unknown,,E11,,,False,7539308036_R01C01
1,7539308036,R01C02,,AML_002.2_1R01C02,AML1_METH1_PL10_200ng_120703.1,Unknown,,C12,,,False,7539308036_R01C02
2,7539308036,R02C01,,AML_003.2_1R02C01,AML1_METH1_PL10_200ng_120703.1,Unknown,,F11,,,False,7539308036_R02C01
3,7539308036,R02C02,,AML_004_r.2_1R02C02,AML1_METH1_PL10_200ng_120703.1,Unknown,,D12,,,False,7539308036_R02C02
4,7539308036,R03C01,,AML_005.2_1R03C01,AML1_METH1_PL10_200ng_120703.1,Unknown,,G11,,,False,7539308036_R03C01
...,...,...,...,...,...,...,...,...,...,...,...,...
137,7766148110,R04C02,,AML_122.2_14R04C02,GU4_AML1_PL15_200ng_120703.1,Unknown,,F6,,,False,7766148110_R04C02
138,7766148110,R05C01,,AML_033.2_14R05C01,GU4_AML1_PL15_200ng_120703.1,Unknown,,A6,,,False,7766148110_R05C01
139,7766148110,R05C02,,AML_123.2_14R05C02,GU4_AML1_PL15_200ng_120703.1,Unknown,,G6,,,False,7766148110_R05C02
140,7766148110,R06C01,,AML_124.2_14R06C01,GU4_AML1_PL15_200ng_120703.1,Unknown,,B6,,,False,7766148110_R06C01


### Parse the sample IDs from the `Sample_Name` column so that they match the IDs used by the other dataframes (e.g. `phenodf`), and attach them to the methylation dataframe

In [9]:
# The ID is everything before the first '.' in Sample_Name,
# e.g. 'AML_001.2_1R01C01' -> 'AML_001'
ids = [sample_name.partition('.')[0] for sample_name in samples['Sample_Name']]

In [10]:
# Store the parsed IDs as an extra 'IDs' column and list the resulting columns
samples = samples.assign(IDs=ids)
samples.columns

Index(['Sentrix_ID', 'Sentrix_Position', 'Sample_Group', 'Sample_Name',
       'Sample_Plate', 'Sample_Type', 'Sub_Type', 'Sample_Well', 'Pool_ID',
       'GSM_ID', 'Control', 'Sample_ID', 'IDs'],
      dtype='object')

In [11]:
# Inspect the sample sheet again, now including the parsed 'IDs' column
samples

Unnamed: 0,Sentrix_ID,Sentrix_Position,Sample_Group,Sample_Name,Sample_Plate,Sample_Type,Sub_Type,Sample_Well,Pool_ID,GSM_ID,Control,Sample_ID,IDs
0,7539308036,R01C01,,AML_001.2_1R01C01,AML1_METH1_PL10_200ng_120703.1,Unknown,,E11,,,False,7539308036_R01C01,AML_001
1,7539308036,R01C02,,AML_002.2_1R01C02,AML1_METH1_PL10_200ng_120703.1,Unknown,,C12,,,False,7539308036_R01C02,AML_002
2,7539308036,R02C01,,AML_003.2_1R02C01,AML1_METH1_PL10_200ng_120703.1,Unknown,,F11,,,False,7539308036_R02C01,AML_003
3,7539308036,R02C02,,AML_004_r.2_1R02C02,AML1_METH1_PL10_200ng_120703.1,Unknown,,D12,,,False,7539308036_R02C02,AML_004_r
4,7539308036,R03C01,,AML_005.2_1R03C01,AML1_METH1_PL10_200ng_120703.1,Unknown,,G11,,,False,7539308036_R03C01,AML_005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,7766148110,R04C02,,AML_122.2_14R04C02,GU4_AML1_PL15_200ng_120703.1,Unknown,,F6,,,False,7766148110_R04C02,AML_122
138,7766148110,R05C01,,AML_033.2_14R05C01,GU4_AML1_PL15_200ng_120703.1,Unknown,,A6,,,False,7766148110_R05C01,AML_033
139,7766148110,R05C02,,AML_123.2_14R05C02,GU4_AML1_PL15_200ng_120703.1,Unknown,,G6,,,False,7766148110_R05C02,AML_123
140,7766148110,R06C01,,AML_124.2_14R06C01,GU4_AML1_PL15_200ng_120703.1,Unknown,,B6,,,False,7766148110_R06C01,AML_124


**Add the IDs to the methylation dataframe via an inner join with the sample dataframe on the Sample/Illumina IDs, which keeps only the intersection of the two dataframes**

In [12]:
# Copy the row index into a regular 'Sample_ID' column so it can be used as the
# merge key; the column is dropped from methdf again right after the merge.
methdf['Sample_ID'] = methdf.index


In [13]:
# Inner-join the methylation values with the parsed IDs on the Illumina Sample_ID.
# validate='one_to_one' makes pandas raise a MergeError if a Sample_ID is repeated
# on either side, instead of silently duplicating sample rows in the result.
new = methdf.merge(
    samples[['IDs', 'Sample_ID']],
    how='inner',
    on='Sample_ID',
    validate='one_to_one',
).set_index('Sample_ID')  # Sample_ID becomes the named row index again

# Remove the temporary merge key from the original methylation frame
methdf.drop(columns='Sample_ID', inplace=True)


In [14]:
# Show the ID attached to each Sample_ID row
new['IDs']

Sample_ID
7539308036_R01C01      AML_001
7539308036_R01C02      AML_002
7539308036_R02C01      AML_003
7539308036_R02C02    AML_004_r
7539308036_R03C01      AML_005
                       ...    
7766148110_R04C02      AML_122
7766148110_R05C01      AML_033
7766148110_R05C02      AML_123
7766148110_R06C01      AML_124
7766148110_R06C02      AML_125
Name: IDs, Length: 142, dtype: object

In [15]:
# Inspect the phenotype table as loaded from phenotypes.csv
phenodf

Unnamed: 0,public_id,sample.type,FAB,genotype,relapse
0,AML_001,diagnostic,M2,normal,True
1,AML_002,diagnostic,M5,t(11;19),True
2,AML_003,diagnostic,M1,normal,False
3,AML_004_r,relapse,M5,,True
4,AML_005,diagnostic,M2,mono 7,True
...,...,...,...,...,...
137,AML_122,diagnostic,M2,normal,False
138,AML_033,diagnostic,M6,normal,True
139,AML_123,diagnostic,M4,inv(16),False
140,AML_124,diagnostic,M2,normal,False


In [16]:
# Index the phenotypes by their public ID
phenodf = phenodf.set_index('public_id')

# Append 'IDs' to the existing row index, giving a two-level index on `new`
new = new.set_index('IDs', append=True)

In [17]:
# First rows of the phenotype table, now indexed by public_id
phenodf.head()

Unnamed: 0_level_0,sample.type,FAB,genotype,relapse
public_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AML_001,diagnostic,M2,normal,True
AML_002,diagnostic,M5,t(11;19),True
AML_003,diagnostic,M1,normal,False
AML_004_r,relapse,M5,,True
AML_005,diagnostic,M2,mono 7,True


In [18]:
# Show the resulting two-level row index of the methylation frame
new.index

MultiIndex([('7539308036_R01C01',   'AML_001'),
            ('7539308036_R01C02',   'AML_002'),
            ('7539308036_R02C01',   'AML_003'),
            ('7539308036_R02C02', 'AML_004_r'),
            ('7539308036_R03C01',   'AML_005'),
            ('7539308036_R03C02',   'AML_006'),
            ('7539308036_R04C01',   'AML_007'),
            ('7668610011_R02C01',   'AML_008'),
            ('7668610011_R02C02',   'AML_009'),
            ('7668610011_R03C01',   'AML_010'),
            ...
            ('7766148110_R01C02', 'AML_093_r'),
            ('7766148110_R02C02',   'AML_118'),
            ('7766148110_R03C01',   'AML_119'),
            ('7766148110_R03C02',   'AML_120'),
            ('7766148110_R04C01',   'AML_121'),
            ('7766148110_R04C02',   'AML_122'),
            ('7766148110_R05C01',   'AML_033'),
            ('7766148110_R05C02',   'AML_123'),
            ('7766148110_R06C01',   'AML_124'),
            ('7766148110_R06C02',   'AML_125')],
           names=['Samp

In [19]:
########## Save the data ##############

In [20]:
# Saving is currently disabled; uncomment the lines below to write both frames
# to ./AML_data/ with joblib.
#joblib.dump(phenodf, './AML_data/pheno.pkl')
#joblib.dump(new,'./AML_data/meth.pkl')
