In [4]:
import pandas as pd
import numpy as np

In [7]:
# Read the CSV file
df = pd.read_csv(r"all_aligned_no_background.csv")

# Select only the rows where 'Class' is not 'other'
# NOTE(review): the input file name suggests background rows were already removed
# upstream — confirm; this cell only drops the 'other' class.
df = df[df['Class'] != 'other']

# Get the unique classes in the 'Class' column
unique_classes = df['Class'].unique()

print(unique_classes)

['adenocarcinoma' 'benign mucosa' 'submucosa' 'tissue' 'smooth muscle'
 'inflammatory cells' 'serosa']


In [None]:
# Save the current `df` as a CSV file in the working directory.
# index=False keeps the DataFrame index out of the file.
df.to_csv(r"all_aligned_no_background_others.csv", index=False)

In [5]:
# total ion current normalization (spectrum based)
def tic_normalize(peaks):
    """Total ion current (TIC) normalization, applied per spectrum (row).

    Each row of ``peaks`` is divided by the sum of that row, so every
    spectrum with a non-zero total sums to 1 afterwards.

    Parameters
    ----------
    peaks : array-like, 2-D
        Intensity matrix; rows are summed along ``axis=1`` to get the TIC.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``peaks``. Rows whose total ion
        current is exactly 0 are returned as all zeros.
    """
    peaks = np.asarray(peaks, dtype=float)
    tot_ion_cur = np.sum(peaks, axis=1)
    # Start from zeros (not np.empty) so rows that are skipped below hold a
    # defined value instead of uninitialized memory.
    peaks_ticn = np.zeros(peaks.shape)
    has_signal = tot_ion_cur != 0
    # Divide only the rows with a non-zero total; the trailing axis lets the
    # per-row totals broadcast across the columns.
    peaks_ticn[has_signal] = peaks[has_signal] / tot_ion_cur[has_signal, np.newaxis]
    return peaks_ticn

def ion_minmax_normalize(peaks):
    """Min-max normalization, applied per ion (column).

    Each column is rescaled to ``(x - min) / (max - min)`` using that
    column's own minimum and maximum, mapping it onto [0, 1].

    Parameters
    ----------
    peaks : array-like, 2-D
        Intensity matrix; min/max are taken along ``axis=0``.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``peaks``. Columns that are
        constant (max == min) are returned as all zeros.
    """
    peaks = np.asarray(peaks, dtype=float)
    # Start from zeros (not np.empty) so constant columns, which are skipped
    # below, hold a defined value instead of uninitialized memory.
    peaks_ionorm = np.zeros(peaks.shape)
    if peaks.size == 0:
        # np.max / np.min cannot reduce over an empty axis; nothing to scale.
        return peaks_ionorm
    max_ion_int = np.max(peaks, axis=0)
    min_ion_int = np.min(peaks, axis=0)
    varying = max_ion_int != min_ion_int
    # Rescale only the columns that actually vary, avoiding division by zero.
    col_range = max_ion_int[varying] - min_ion_int[varying]
    peaks_ionorm[:, varying] = (peaks[:, varying] - min_ion_int[varying]) / col_range
    return peaks_ionorm

In [7]:
# Load the filtered dataset and preview the first rows.
# NOTE(review): this is a hardcoded absolute path on one user's machine, so the cell
# will not run elsewhere. The file name matches the CSV written by the earlier
# `to_csv` cell — presumably the same file; confirm and switch to a relative path.
df = pd.read_csv(r"C:\Users\jenni\Documents\GitHub\dc-DeepMSI\all_aligned_no_background_others.csv")
print(df.head())

                                  Slide           Class   X   Y  89.0243  \
0  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  88  89    115.0   
1  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  88  90    105.0   
2  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  89  87    180.0   
3  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  89  88     68.0   
4  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  89  89     88.0   

   96.9674  115.0208  124.0079  145.0609  157.0312  ...  1331.7322  1373.7587  \
0      0.0      67.0     532.0      18.0       NaN  ...        0.0        0.0   
1     19.0      92.0     273.0      22.0       NaN  ...        0.0        0.0   
2     84.0      70.0     332.0      41.0       NaN  ...       22.0        0.0   
3      0.0      28.0     294.0      41.0       NaN  ...        0.0        0.0   
4     19.0     143.0     253.0      78.0       NaN  ...        0.0        0.0   

   1374.76  1375.7596  1417.78  1418.7822  1419.7805  14

In [8]:
# Split the DataFrame
df1 = df.iloc[:, :4]
df2 = df.iloc[:, 4:]

# Convert the second part of the DataFrame to a numpy array
peaks = df2.to_numpy()
peaks = np.nan_to_num(peaks)

# Apply the normalization functions
peaks_ticn = tic_normalize(peaks)
peaks_ionorm = ion_minmax_normalize(peaks_ticn)

# Convert the normalized numpy array back to a DataFrame
df2_normalized = pd.DataFrame(peaks_ionorm, columns=df2.columns)

# Concatenate the two DataFrames
df_normalized = pd.concat([df1, df2_normalized], axis=1)

# Save the resulting DataFrame to a new CSV file
df_normalized.to_csv('all_aligned_no_background_others_preprocessed.csv', index=False)

print(df_normalized.head())

                                  Slide           Class   X   Y   89.0243  \
0  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  88  89  0.011928   
1  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  88  90  0.013172   
2  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  89  87  0.012428   
3  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  89  88  0.005192   
4  2021 03 30 colon 0720931-3 Analyte 5  adenocarcinoma  89  89  0.005283   

    96.9674  115.0208  124.0079  145.0609  157.0312  ...  1331.7322  \
0  0.000000  0.031504  0.137820  0.009702       0.0  ...   0.000000   
1  0.024782  0.052320  0.085535  0.014342       0.0  ...   0.000000   
2  0.060303  0.021911  0.057253  0.014711       0.0  ...   0.015974   
3  0.000000  0.009693  0.056070  0.016269       0.0  ...   0.000000   
4  0.011861  0.038923  0.037939  0.024337       0.0  ...   0.000000   

   1373.7587  1374.76  1375.7596   1417.78  1418.7822  1419.7805  1461.8021  \
0        0.0      0.0   0.00000