In [1]:
import os
import numpy as np
import pandas as pd
import itertools
import pickle
from combinations import process_data_fusions
from preprocess import process_region,process_data, pool_data, generate_patient_file_name,process_files,process_and_combine_files

In [2]:
### Each folder represents a different cell type; please perform this preprocessing step for each folder.
### Every visible .csv file in input_folder is read, run through process_data,
### and written under the same file name into output_folder.
input_folder = '/Users/laying/Desktop/P1_cancer_differentiation/Data/rawdata/wbcs'
output_folder = '/Users/laying/Desktop/P1_cancer_differentiation/Data/preprocessed_data/wbcs'

# np.savetxt does not create missing directories, so make sure the target exists.
os.makedirs(output_folder, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a reproducible run order.
for filename in sorted(os.listdir(input_folder)):
    # Skip hidden files (names starting with '.') and anything that is not a CSV.
    if filename.startswith('.') or not filename.endswith(".csv"):
        continue
    file_path = os.path.join(input_folder, filename)
    df = pd.read_csv(file_path)
    # NOTE(review): region1/region2 look like wavenumber windows and the remaining
    # keywords like baseline/smoothing/normalisation settings -- confirm against
    # preprocess.process_data before changing any of them.
    processed_data = process_data(
        df,
        region1=(1000, 1700),
        region2=(2820, 3000),
        iterations1=70,
        iterations2=50,
        smoothing_window=3,
        window_length=17,
        polynomial=5,
        apply_savgol=True,
        norm_method='vector',
    )
    output_file_path = os.path.join(output_folder, filename)
    np.savetxt(output_file_path, processed_data, delimiter=",")

In [3]:
### Location of the clinical metadata workbook
metadata_path = '/Users/laying/Desktop/P1_cancer_differentiation/Data/rawdata/immune_meta_data.xlsx'

### Cell-type codes: 1 = WBCs, 2 = Granus, 5 = CD4+, 6 = CD8+
### Each entry of `folders` is a 4-tuple:
###   (preprocessed-data folder, '<code>_filename', 'label of cancer subtypes', code)
### NOTE(review): the two strings are presumably metadata column names -- confirm
### against process_and_combine_files.
preprocessed_root = "/Users/laying/Desktop/P1_cancer_differentiation/Data/preprocessed_data"
label_column = 'label of cancer subtypes'
folders = [
    (f"{preprocessed_root}/{subfolder}", f"{code}_filename", label_column, code)
    for subfolder, code in (("cd8", 6), ("cd4", 5), ("granus", 2), ("wbcs", 1))
]

### Combine the data of all listed folders with the clinical metadata.
### The call returns two values: the combined table and the wavenumber information.
combined_df, wavenumber = process_and_combine_files(
    folders,
    metadata_path,
    pool_data,
    generate_patient_file_name,
)
print(combined_df)
print(wavenumber)

  all_data.loc[len(all_data)] = [None] + file_names
  all_data.fillna("filename", inplace=True)
  all_data.loc[len(all_data)] = [None] + file_names
  all_data.fillna("filename", inplace=True)
  all_data.loc[len(all_data)] = [None] + file_names
  all_data.fillna("filename", inplace=True)
  all_data.loc[len(all_data)] = [None] + file_names
  all_data.fillna("filename", inplace=True)


       group     patient  cells 2998.56365 2996.63531 2994.70698 2992.77864  \
0        2.0  Patient#40      6  -0.000073   0.001881   0.003009   0.003701   
1        2.0  Patient#40      6   0.000191   0.001643   0.002955   0.003909   
2        2.0  Patient#40      6  -0.000123   0.001473   0.002646   0.003655   
3        2.0  Patient#40      6  -0.000565   0.002279   0.003865   0.004872   
4        2.0  Patient#40      6    0.00025   0.001082   0.002253   0.003461   
...      ...         ...    ...        ...        ...        ...        ...   
22267    2.0  Patient#59      1   0.000274     0.0001   0.000418   0.000927   
22268    2.0  Patient#59      1  -0.000266   0.000946   0.001705   0.002092   
22269    2.0  Patient#59      1   0.000019   0.000266   0.000757   0.001327   
22270    2.0  Patient#59      1   0.000502   0.000111   0.000646   0.001562   
22271    2.0  Patient#59      1  -0.000234   0.001086   0.002156   0.002896   

      2990.8503 2988.92197 2986.99363  ... 1018.161

In [4]:
##### Filter patients that are shared across all cell types
## number1 represents WBCs, number2 represents Granus, number5 represents CD4+, number6 represents CD8+
metadata = pd.read_excel(metadata_path)
cell_types = [1, 2, 5, 6]


def _patient_ids_in_column(table, column):
    """Collect the integer patient ids found in one filename column.

    For every non-null string entry, the digits located before the first 'P'
    are joined and converted to int. Non-string entries are ignored, and so are
    entries with no digit before the first 'P' (previously these raised a
    ValueError on int('')).

    Parameters
    ----------
    table : pandas.DataFrame
        Table holding the filename column.
    column : str
        Name of the column to scan.

    Returns
    -------
    set of int
        The distinct patient ids found in the column.
    """
    ids = set()
    for entry in table[column].dropna():
        if not isinstance(entry, str):
            continue
        digits = ''.join(filter(str.isdigit, entry.split('P')[0]))
        if digits:
            ids.add(int(digits))
    return ids


# Keep only the ids that appear in the filename column of every cell type.
common_patients = set.intersection(*[
    _patient_ids_in_column(metadata, f'{cell_type}_filename')
    for cell_type in cell_types
])

# expand=False makes str.extract return a Series (one capture group) instead of
# a one-column DataFrame, which is the natural thing to assign to a column.
combined_df['patient_id'] = (
    combined_df['patient'].str.extract(r'Patient#(\d+)', expand=False).astype(int)
)

# .copy() gives an independent frame, so later edits to the filtered data cannot
# trigger SettingWithCopyWarning or be confused with combined_df.
combined_df_filter = combined_df[combined_df['patient_id'].isin(common_patients)].copy()
print(combined_df_filter)

       group     patient  cells 2998.56365 2996.63531 2994.70698 2992.77864  \
256      2.0  Patient#46      6   0.000071   0.002322    0.00443   0.006326   
257      2.0  Patient#46      6  -0.001008   0.003546   0.005772   0.006599   
258      2.0  Patient#46      6  -0.000635   0.001413    0.00483   0.008176   
259      2.0  Patient#46      6  -0.000745   0.003976   0.006079   0.006977   
260      2.0  Patient#46      6  -0.000743   0.005389   0.009726    0.01253   
...      ...         ...    ...        ...        ...        ...        ...   
22267    2.0  Patient#59      1   0.000274     0.0001   0.000418   0.000927   
22268    2.0  Patient#59      1  -0.000266   0.000946   0.001705   0.002092   
22269    2.0  Patient#59      1   0.000019   0.000266   0.000757   0.001327   
22270    2.0  Patient#59      1   0.000502   0.000111   0.000646   0.001562   
22271    2.0  Patient#59      1  -0.000234   0.001086   0.002156   0.002896   

      2990.8503 2988.92197 2986.99363  ... 1016.233

In [5]:
### Bring every subpopulation into one consistent structure so they can be integrated
# Keep only the rows whose group value is 1 or 2.
df = combined_df_filter[combined_df_filter['group'].isin([1,2])]
cell_mapping={1: 'WBCs', 2: 'Granus', 5: 'CD4+', 6: 'CD8+'}
cell_types = [1,2,5,6]

# Only single-cell-type combinations are active; uncomment a line below to add
# pairs, triples, or the full four-way combination.
all_combinations = list(itertools.combinations(cell_types, 1))
# all_combinations.extend(itertools.combinations(cell_types, 2))
# all_combinations.extend(itertools.combinations(cell_types, 3))
# all_combinations.extend(itertools.combinations(cell_types, 4))

# Key: comma-joined cell names of the combination; value: output of process_data_fusions.
processed_data_dict = {}
for cell_ids in all_combinations:
    fusion_label = ','.join(cell_mapping[cell_id] for cell_id in cell_ids)
    processed_data_dict[fusion_label] = process_data_fusions(df, cell_ids)

## Show results
for comb_name, processed_data in processed_data_dict.items():
    print(f"combinations: {comb_name}")
    print(processed_data)
    print()

combinations: WBCs
         patient  group 2998.56365_1 2996.63531_1 2994.70698_1 2992.77864_1  \
0     Patient#13    1.0     0.000109    -0.000045     0.000136      0.00037   
1     Patient#13    1.0    -0.000037     0.000254     0.000507     0.000631   
2     Patient#13    1.0     0.000027     0.000202     0.000383     0.000463   
3     Patient#13    1.0    -0.000036     0.000158     0.000398     0.000534   
4     Patient#13    1.0     0.000033    -0.000022     0.000195     0.000431   
...          ...    ...          ...          ...          ...          ...   
5371   Patient#8    2.0    -0.000496     0.001232     0.001833     0.001786   
5372   Patient#8    2.0     0.000111     0.000671     0.001138     0.001456   
5373   Patient#8    2.0    -0.000031     0.000483     0.000911     0.001162   
5374   Patient#8    2.0    -0.000542     0.001235     0.001982     0.002068   
5375   Patient#8    2.0     0.000055     0.000472     0.000999     0.001451   

     2990.8503_1 2988.92197_1 29

In [6]:
### Save those data to Results for the classification model
data_path = '/Users/laying/Desktop/P1_cancer_differentiation/Results'

# open(..., 'wb') fails when the directory is missing, so create it up front.
os.makedirs(data_path, exist_ok=True)

# (pickle file name, object to serialise) -- one dump per entry.
for pkl_name, payload in (
    ('combined_df_filter.pkl', combined_df_filter),
    ('processed_data_dict.pkl', processed_data_dict),
):
    output_file = os.path.join(data_path, pkl_name)
    with open(output_file,'wb') as f:
        pickle.dump(payload,f)