In [1]:
#1. imports
import os
import pandas as pd
import scipy.io as sio
import glob
import re
import numpy as np
import time
import matplotlib.pyplot as plt

from pyimzml.ImzMLParser import ImzMLParser

In [2]:
#2. paths and parameters
DataPath = 'M:/'

In [3]:
#3. get all relevant folders
full_paths = []
folders=[]
TMA_IDs = []
for r, d, f in os.walk(DataPath):
    for folder in d:
        if 'Patient' in folder:
            full_paths.append(r+'/'+folder + '/')
            folders.append(folder)
            Pos1 = os.path.basename(r).find('UniSA',0,100)
            Pos2 = os.path.basename(r).find('imzml',0,100)
            TMA_ID = os.path.basename(r)[Pos1+6:Pos2-1]
            TMA_IDs.append(TMA_ID)     
SourceData_df = pd.DataFrame()
SourceData_df['TMA_ID'] = TMA_IDs
SourceData_df['folder'] = folders
SourceData_df['TMA_path'] = full_paths
SourceData_df.to_csv('../MetaData/WEHI-codes.csv')

In [4]:
#4. get list of tumour and normal imzML files that are not in METS folders, by Patient ID
imzML_df = pd.DataFrame()
CC=0
for Path in SourceData_df['TMA_path']:
    Files = glob.glob(Path + '*.imzML')
    ThisTMA = SourceData_df['TMA_ID'].values[CC]
    CC=CC+1
    for File in Files:
        imzml_name = os.path.basename(File)
        if Path.find('METS',0,-1) == -1 and imzml_name.find('tumour',0,-1)>= 0:
            PatientID = re.findall('\d+', imzml_name )[0]
            imzML_df.at[PatientID,'TMA_ID'] = ThisTMA
            imzML_df.at[PatientID,'TumourFileName'] = os.path.basename(File)
            imzML_df.at[PatientID,'TumourFile'] =File
        elif Path.find('METS',0,-1) == -1 and imzml_name.find('normal',0,-1)>= 0 and  imzml_name.find('tumour',0,-1)== -1:
            PatientID = re.findall('\d+', imzml_name )[0]
            imzML_df.at[PatientID,'TMA_ID'] = ThisTMA
            imzML_df.at[PatientID,'NormalFileName'] = os.path.basename(File)
            imzML_df.at[PatientID,'NormalFile'] = File
imzML_df = imzML_df.where((pd.notnull(imzML_df)), None)
imzML_df=imzML_df.sort_index()

In [5]:
#5. get list of METS by patient ID
imzML_METS_df = pd.DataFrame()
CC=0
for Path in SourceData_df['TMA_path']:
    Files = glob.glob(Path + '*.imzML')
    ThisTMA = SourceData_df['TMA_ID'].values[CC]
    CC=CC+1
    for File in Files:
        imzml_name = os.path.basename(File)
        if Path.find('METS',0,-1) != -1:
            aa=re.findall('\d+', imzml_name )
            if len(aa)>0:
                PatientID = re.findall('\d+', imzml_name )[0]
                imzML_METS_df.at[PatientID,'TMA_ID'] = ThisTMA
                imzML_METS_df.at[PatientID,'FileName'] = os.path.basename(File)
                imzML_METS_df.at[PatientID,'File'] =File
imzML_METS_df = imzML_METS_df.where((pd.notnull(imzML_METS_df)), None)
imzML_METS_df=imzML_METS_df.sort_index()

In [8]:
#6. get the first and last mz points for all per-patient data and find the maximum and minimum mzs bin in all the data
def ProcessFile(FileName,SavePath,DataKind,TMA_ID,PatientID,DoSave):
   # print(FileName)
    p = ImzMLParser(FileName)
    mzs, intens = p.getspectrum(0)
    mzs = np.zeros(len(intens),'float32')
    intensities = np.zeros((len(p.coordinates),len(intens)),'float32')
    coords = np.zeros((len(p.coordinates),3),'float32')
    for idx, (x,y,z) in enumerate(p.coordinates):
        #assume mzs same for all
        mzs,intensities[idx,:] = p.getspectrum(idx)
        coords[idx,:] = [x,y,z]
    first_point = mzs[0]
    last_point = mzs[-1]
    if DoSave == True:
        OutFileName = SavePath+PatientID + '_' + DataKind + '_' + TMA_ID+ '.mat'
        sio.savemat(OutFileName, {'mzs':mzs,'coords':coords,'intensities':intensities})
    return intensities.shape[1],first_point,last_point

start = time.time()
for index, row in imzML_df.iterrows():
    if row['NormalFile'] != None:
        this_len,first_point,last_point = ProcessFile(row['NormalFile'],None,'Normal',row['TMA_ID'],index,False)
        imzML_df.at[index,'num_normal_mzs_points']=this_len
        imzML_df.at[index,'first_mzs_normal']=first_point
        imzML_df.at[index,'last_mzs_normal']=last_point
    if row['TumourFile'] != None:
        this_len,first_point,last_point = ProcessFile(row['TumourFile'],None,'Tumour',row['TMA_ID'],index,False)
        imzML_df.at[index,'num_tumour_mzs_points']=this_len
        imzML_df.at[index,'first_mzs_tumour']=first_point
        imzML_df.at[index,'last_mzs_tumour']=last_point
end = time.time()
print(end - start)

#repeat for METS: get the first and last mz points
start = time.time()
for index, row in imzML_METS_df.iterrows():
    if row['FileName'] != None:
        this_len,first_point,last_point = ProcessFile(row['File'],None,'METS',row['TMA_ID'],index,False)
        imzML_METS_df.at[index,'num_mzs_points']=this_len
        imzML_METS_df.at[index,'first_mzs']=first_point
        imzML_METS_df.at[index,'last_mzs']=last_point
end = time.time()
print(end - start)
Overall_min_mzs = min(min(imzML_df['first_mzs_normal']),min(imzML_df['first_mzs_tumour']),min(imzML_METS_df['first_mzs']))
Overall_max_mzs = max(max(imzML_df['last_mzs_normal']),max(imzML_df['last_mzs_tumour']), max(imzML_METS_df['last_mzs']))
print(Overall_min_mzs,Overall_max_mzs)
#takes around 15 minutes approx

1257.0535917282104
86.11969661712646
799.7571411132812 4496.83984375


In [9]:
#7. remove nan's and ensure number of points are integers
imzML_df['num_normal_mzs_points']=imzML_df['num_normal_mzs_points'].fillna(0)
imzML_df['num_tumour_mzs_points']=imzML_df['num_tumour_mzs_points'].fillna(0)
imzML_df['num_normal_mzs_points']=imzML_df['num_normal_mzs_points'].astype('int')
imzML_df['num_tumour_mzs_points']=imzML_df['num_tumour_mzs_points'].astype('int')
imzML_METS_df['num_mzs_points']=imzML_METS_df['num_mzs_points'].fillna(0)
imzML_METS_df['num_mzs_points']=imzML_METS_df['num_mzs_points'].astype('int')

In [10]:
#8. functions for binning mzs data to reduce size
def GetBinnedData(BinEdges,FileName):
    p = ImzMLParser(FileName)
    mzs, intens = p.getspectrum(0)
    mzs = np.zeros(len(intens),'float32')
    Binned_intensities = np.zeros((len(p.coordinates),len(BinEdges)),'float32')
    coords = np.zeros((len(p.coordinates),3),'float32')
    for idx, (x,y,z) in enumerate(p.coordinates):
        #assume mzs same for all
        mzs,intensities = p.getspectrum(idx)
        Mzs_binning_indices = np.digitize(mzs, BinEdges)
        Binned_intensities[idx,:] = [intensities[Mzs_binning_indices == i].sum() for i in range(1, len(BinEdges)+1)]
        coords[idx,:] = [x,y,z]
    return mzs,Binned_intensities

def BinAndSaveSpectraTumourNormal(BinEdges,data_df):
    data_df=data_df.astype(object)
    #this loop takes over 10 hours to run!
    start = time.time()
    for index, row in data_df.iterrows():
        print(index)
        if row['NormalFile'] != None:
            this_mzs,this_intensities = GetBinnedData(BinEdges,row['NormalFile'])
            data_df.at[index,'binned_normal'] =''
            data_df=data_df.astype(object)
            data_df.at[index,'binned_normal'] = [this_intensities]
        if row['TumourFile'] != None:
            this_mzs,this_intensities = GetBinnedData(BinEdges,row['TumourFile'])
            data_df.at[index,'binned_tumour'] =''
            data_df=data_df.astype(object)
            data_df.at[index,'binned_tumour'] = [this_intensities]
        end = time.time()
        print(end - start)
    #save as a h5 file
    data_df.to_hdf('./BinnedData/imzML_dz'+str(dz)+'_df.h5', key='imzML_df', mode='w')
    return data_df

def BinAndSaveSpectraMETS(BinEdges,data_df):
    data_df=data_df.astype(object)
    start = time.time()
    for index, row in data_df.iterrows():
        print(index)
        if row['File'] != None:
            this_mzs,this_intensities = GetBinnedData(BinEdges,row['File'])
            data_df.at[index,'binned_mets'] =''
            data_df=data_df.astype(object)
            data_df.at[index,'binned_mets'] = [this_intensities]
        end = time.time()
        print(end - start)
    #save as a h5 file
    data_df.to_hdf('./BinnedData/imzML_METS_dz'+str(dz)+'_df.h5', key='imzML_METS_df', mode='w')
    return data_df

In [11]:
dz=3
if not os.path.exists('BinnedData'):
    os.mkdir('BinnedData')
#get the boundaries of the bins
BinEdges = np.linspace(Overall_min_mzs-dz,Overall_max_mzs,int((Overall_max_mzs-Overall_min_mzs)/dz))
#bin the TMA data
imzML_df = BinAndSaveSpectraTumourNormal(BinEdges,imzML_df)
#bin the METS data
imzML_METS_df = BinAndSaveSpectraMETS(BinEdges,imzML_METS_df)


573
116.13486981391907
574
292.1132607460022
575
491.7425091266632
576
722.978129863739
577
884.0174734592438
578
1028.2977437973022
579
1225.3128769397736
580
1335.6797869205475
581
1432.13489985466
582
1604.4044871330261
583
1710.9919347763062
584
1808.7720539569855
585
2000.2946546077728
586
2146.7026653289795
587
2264.312158346176
588
2444.2412972450256
589
2566.4130680561066
590
2741.66641163826
591
2767.5890867710114
592
2956.0713834762573
593
3118.848118543625
594
3139.683875322342
595
3159.039906978607
596
3260.4989235401154
597
3400.6404869556427
598
3433.971253633499
599
3595.304308652878
600
3771.7823779582977
601
3937.6931722164154
602
4013.8343069553375
603
4138.993068933487
604
4263.1356654167175
605
4398.962809801102
606
4543.026063919067
607
4669.633362770081
608
4831.495440721512
609
4943.684972524643
610
5075.227090597153
611
5196.24275636673
612
5380.891746997833
613
5573.081613063812
614
5689.587443828583
615
5791.59494137764
616
5857.878485202789
617
5928.206413745

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->Index(['TMA_ID', 'NormalFileName', 'NormalFile', 'TumourFileName',
       'TumourFile', 'num_normal_mzs_points', 'first_mzs_normal',
       'last_mzs_normal', 'num_tumour_mzs_points', 'first_mzs_tumour',
       'last_mzs_tumour', 'binned_normal', 'binned_tumour'],
      dtype='object')]

  encoding=encoding,


001


  % (accession, raw_name, name)


32.09218072891235
002
53.35930824279785
003
54.36561369895935
004
85.09443736076355
005
104.44768214225769
006
115.63077807426453
007
142.36029720306396
008
177.50531125068665
009
199.87648510932922
010
204.11215567588806
011
229.9889576435089
012
246.06297135353088
013
271.38425397872925
014
273.1934187412262
015
292.27538776397705
016
319.1814351081848
017
342.71051049232483
018
354.19081139564514
019
376.69064235687256
020
396.555517911911
021
430.0080580711365
022
431.00239872932434
023
439.220419883728
024
468.652713060379
025
484.89427852630615
026
504.41009044647217
027
505.23887038230896
028
530.4384818077087
029
544.3373136520386
030
562.9492173194885
031
569.0802226066589
032
583.983283996582
033
609.2572178840637
034
627.6554644107819
035
640.7247424125671
036
670.1220600605011
037
671.7454998493195
038
699.620730638504
039
729.6056568622589
040
753.5015172958374
041
762.0649015903473
042
775.763338804245
043
789.8349347114563
044
814.9280092716217
045
831.5247383117676
046


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->Index(['TMA_ID', 'FileName', 'File', 'num_mzs_points', 'first_mzs', 'last_mzs',
       'binned_mets'],
      dtype='object')]

  encoding=encoding,
