# Breast cancer PSI-MS - Data preprocessing

Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import netCDF4 as nc
import os
import math

In [2]:
def correction(mass_start, mass_end, spectrum):
    """Return the column index of the strongest signal inside an m/z window.

    Parameters
    ----------
    mass_start, mass_end : float
        m/z values delimiting the search window. Both are looked up on the
        m/z axis ``spectrum[0, :]`` with ``np.isclose``. The window is
        half-open: the column holding ``mass_end`` itself is not searched.
    spectrum : numpy.ndarray
        Two-row array; row 0 is the m/z axis, row 1 the intensities.

    Returns
    -------
    numpy integer
        Index (into the columns of ``spectrum``) of the highest intensity
        within the window. On ties the first (lowest m/z) column wins.

    Raises
    ------
    IndexError
        If ``mass_start`` or ``mass_end`` is not found on the m/z axis.
    ValueError
        If the window contains no columns.
    """
    # Find possible range
    window_start = np.where(np.isclose(spectrum[0, :], mass_start))[0][0]
    window_end = np.where(np.isclose(spectrum[0, :], mass_end))[0][0]
    # Find the position of maximum signal in the possible range.
    # The search is restricted to the window itself: looking the maximum
    # value up in the whole intensity row could return a column outside the
    # window that merely has an equal (or numerically close) intensity.
    offset = np.argmax(spectrum[1, window_start:window_end])
    # Return the index of the correct peak position
    return window_start + offset

File directory / parameters

In [3]:
# --- Input / output locations ------------------------------------------
# Folder that is joined with both file names below.
Path = r'D:\Breast Cancer PSIMS\Codes and figures'
# Raw netCDF file to read and the CSV file that will be written.
Filename, Filename_output = (
    'C678_benign_500-1000.cdf',
    'C678_benign_500-1000.csv',
)

# --- Sample annotation --------------------------------------------------
# Exported as the 'ID' and 'Type' columns of the CSV.
# NOTE(review): Label 0 presumably encodes "benign" -- confirm the coding.
ID, Label = 'C678', 0

# --- Processing ---------------------------------------------------------
# Spacing of the output bins along the m/z axis.
bin_size = 1

Import data

In [4]:
# The file is only read here, so open it read-only ('r') rather than in
# append mode, and let the context manager close it even if a variable is
# missing. Slicing with [:] loads the values into memory, so the arrays stay
# usable after the file is closed (`data` remains bound, but closed).
with nc.Dataset(os.path.join(Path, Filename), 'r') as data:
    inten = data.variables['intensity_values'][:]
    mass = data.variables['mass_values'][:]
    # Total ion current: sum of the per-scan totals, used for normalisation.
    TIC = sum(data.variables['total_intensity'][:])

# Regular m/z grid with 0.05 spacing spanning the measured range (the upper
# end is exclusive). Grid and measured masses are both rounded to 2 decimals
# so that they can be compared with each other afterwards.
ms_list = np.arange(np.min(mass), np.max(mass), 0.05)
ms_list = np.around(ms_list, 2)
mass = np.around(mass, 2)

# Row 0: m/z grid, row 1: intensities (filled in by the calibration cell).
spectrum = np.zeros((2, len(ms_list)))
spectrum[0, :] = ms_list

Mass calibration

In [6]:
# --- Sum every measured point onto the m/z grid ---------------------------
# A point contributes only if its (rounded) m/z is exactly equal to a grid
# value. This is done in one pass instead of rescanning the whole `mass`
# array for every grid point. Masked (missing) points are skipped.
valid = ~(np.ma.getmaskarray(mass) | np.ma.getmaskarray(inten))
mz_points = np.ma.getdata(mass)[valid]
inten_points = np.ma.getdata(inten)[valid]

# ms_list is sorted, so searchsorted gives the only grid slot that could
# match; the clip keeps values beyond the last grid point in range.
slot = np.minimum(np.searchsorted(ms_list, mz_points), len(ms_list) - 1)
on_grid = ms_list[slot] == mz_points
spectrum[1, :] = np.bincount(slot[on_grid],
                             weights=inten_points[on_grid],
                             minlength=len(ms_list))

# --- Locate the four reference peaks --------------------------------------
# Each call returns the grid index of the strongest signal in the window.
ref1 = correction(518, 519, spectrum)
ref2 = correction(725, 726.5, spectrum)
ref3 = correction(782, 784, spectrum)
ref4 = correction(808, 809.5, spectrum)

Obs_peak = [ref1, ref2, ref3, ref4]
# Expected m/z of the same four peaks.
Cal_peak = [518.32, 725.56, 782.57, 808.58]

# Calculate average mass error (observed minus expected)
error = (sum(ms_list[Obs_peak]) - sum(Cal_peak)) / len(Cal_peak)
# Calculate number of positions to move (one position = 0.05 m/z).
# NOTE(review): np.ceil rounds towards +inf, so a tiny positive error still
# shifts by one position while an error down to -0.05 shifts by none;
# rounding to the nearest position may be what is intended -- confirm.
error_parameter = np.ceil(error / 0.05)

n_points = spectrum.shape[1]
shift = int(abs(error_parameter))

if error_parameter > 0:
    # m/z is higher than expected: move intensities `shift` slots down.
    spectrum_corrected = np.zeros((2, n_points))
    # Keep the complete m/z axis; the last `shift` slots simply have no
    # signal (previously their m/z was left at a placeholder 0).
    spectrum_corrected[0, :] = ms_list
    spectrum_corrected[1, :-shift] = spectrum[1, shift:]

elif error_parameter < 0:
    # m/z is lower than expected: move intensities `shift` slots up.
    error_parameter = error_parameter * (-1)
    spectrum_corrected = np.zeros((2, n_points + shift))
    spectrum_corrected[0, :n_points] = ms_list
    # Extend the axis past the last grid point so that the shifted-out
    # intensities carry their real m/z instead of a placeholder 0.
    spectrum_corrected[0, n_points:] = np.around(
        ms_list[-1] + 0.05 * np.arange(1, shift + 1), 2)
    spectrum_corrected[1, shift:] = spectrum[1, :]

else:
    # No calibration required
    spectrum_corrected = spectrum

Binning & normalization

In [7]:
# Number of 0.05-wide grid points per output bin. Round before converting:
# a quotient such as 0.15/0.05 evaluates just below 3 in floating point and
# plain int() would truncate it to 2.
bin_parameter = int(round(bin_size / 0.05))
# Bin centres: start 1.5 above the first whole m/z and stop (exclusive)
# 3.5 below the last one.
ms_list_bin = np.arange(math.ceil(min(ms_list)) + 1.5,
                        math.floor(max(ms_list)) - 3.5,
                        bin_size)

# Row 0: bin centres, row 1: TIC-normalised summed intensity of each bin.
spectrum_final = np.zeros((2, len(ms_list_bin)))
spectrum_final[0, :] = ms_list_bin
# Number of grid points taken below the centre; the window taken above it is
# one point shorter because the slice end is exclusive.
cal_range = bin_parameter // 2

for i, centre in enumerate(ms_list_bin):
    matches = np.flatnonzero(np.isclose(centre, spectrum_corrected[0, :]))
    if matches.size == 0:
        raise IndexError(
            f"bin centre {centre} is not present on the corrected m/z axis")
    index = matches[0]
    # Clamp at 0: a negative start would wrap around to the end of the array
    # and silently yield an empty window.
    lower = max(index - cal_range, 0)
    spectrum_final[1, i] = sum(spectrum_corrected[1, lower:index + cal_range]) / TIC

Export

In [15]:
# Spectrum as a one-row table: one column per bin centre (m/z), holding the
# normalised intensity of that bin.
df_spectrum = pd.DataFrame(
    spectrum_final[1, :].reshape(1, -1),
    columns=spectrum_final[0, :],
)

# Sample annotation as a one-row table (ID and label).
df_info = pd.DataFrame({'ID': [ID], 'Type': [Label]})

# Put the annotation columns in front of the spectrum columns and write the
# single resulting row to a CSV file, without the row index.
df_final = pd.concat((df_info, df_spectrum), axis=1, join='inner')
output_path = os.path.join(Path, Filename_output)
df_final.to_csv(output_path, index=False)